In [None]:
try:
  import polars as pl
except:
  !pip install polars
  import polars as pl

from polars import LazyFrame
from google.colab import files
import datetime as dt
import numpy as np

In [None]:
# Open the Colab upload dialog; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

# If nothing was uploaded, indexing the first key would fail with an opaque
# IndexError, so stop here with an actionable message instead.
if not uploaded:
  raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used by the rest of the notebook.
file_name = next(iter(uploaded))

print(file_name)

Saving 202306-divvy-tripdata.csv to 202306-divvy-tripdata.csv
202306-divvy-tripdata.csv


In [None]:
# Build a lazy query over the uploaded CSV; nothing is read until `.collect()`.
df: LazyFrame = pl.scan_csv(
    file_name,
    infer_schema = True,
    # Number of leading rows sampled when inferring the column dtypes.
    infer_schema_length = 1000
)

# Preview the first rows. Limiting before collecting lets polars stop reading
# early instead of materialising the entire file just to display a few rows.
df.head().collect()

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
str,str,str,str,str,str,str,str,f64,f64,f64,f64,str
"""6F1682AC40EB6F71""","""electric_bike""","""2023-06-05 13:34:12""","""2023-06-05 14:31:56""",,,,,41.91,-87.69,41.91,-87.7,"""member"""
"""622A1686D64948EB""","""electric_bike""","""2023-06-05 01:30:22""","""2023-06-05 01:33:06""",,,,,41.94,-87.65,41.94,-87.65,"""member"""
"""3C88859D926253B4""","""electric_bike""","""2023-06-20 18:15:49""","""2023-06-20 18:32:05""",,,,,41.95,-87.68,41.92,-87.63,"""member"""
"""EAD8A5E0259DEC88""","""electric_bike""","""2023-06-19 14:56:00""","""2023-06-19 15:00:35""",,,,,41.99,-87.65,41.98,-87.66,"""member"""
"""5A36F21930D6A55C""","""electric_bike""","""2023-06-19 15:03:34""","""2023-06-19 15:07:16""",,,,,41.98,-87.66,41.99,-87.65,"""member"""


In [None]:
# Preview the first ten rides whose start and end stations are fully
# identified, i.e. none of the four station columns is null.
(
    df
    .select(
        "start_station_name",
        "start_station_id",
        "end_station_name",
        "end_station_id",
    )
    .filter(
        # Multiple predicates passed to `filter` are combined with AND.
        pl.col("start_station_id").is_not_null(),
        pl.col("end_station_id").is_not_null(),
        pl.col("start_station_name").is_not_null(),
        pl.col("end_station_name").is_not_null(),
    )
    .head(10)
    .collect()
)

start_station_name,start_station_id,end_station_name,end_station_id
str,str,str,str
"""California Ave & Milwaukee Ave""","""13084""","""California Ave & Division St""","""13256"""
"""Cottage Grove Ave & 51st St""","""TA1309000067""","""Cottage Grove Ave & 51st St""","""TA1309000067"""
"""Western Ave & Roscoe St""","""15634""","""Western Ave & Roscoe St""","""15634"""
"""Cottage Grove Ave & 51st St""","""TA1309000067""","""Cottage Grove Ave & 51st St""","""TA1309000067"""
"""Cottage Grove Ave & 51st St""","""TA1309000067""","""Cottage Grove Ave & 51st St""","""TA1309000067"""
"""California Ave & Milwaukee Ave""","""13084""","""California Ave & Milwaukee Ave""","""13084"""
"""Cottage Grove Ave & 51st St""","""TA1309000067""","""Cottage Grove Ave & 51st St""","""TA1309000067"""
"""Cottage Grove Ave & 51st St""","""TA1309000067""","""Cottage Grove Ave & 51st St""","""TA1309000067"""


In [None]:
'''
Calculate the following analytics/problems.

 - Convert all data types to the correct ones.
 - Count the number of bike rides per day.
 - Calculate the average, maximum, and minimum number of rides per week in the dataset.
 - For each day, calculate how many more or fewer rides there were than on the same day of the previous week.
'''

# Converting data types.
# Schema inference already reads the id/name/category columns as strings and
# the coordinates as floats, so only the two timestamp columns need parsing.
# `with_columns` replaces just those two and leaves every other column as is.
results: LazyFrame = df.with_columns(
    pl.col("started_at", "ended_at").str.to_datetime()
)

# results.head().collect()

# Determining bikes per day.
# Ride counts per rideable type and calendar day, restricted to rideable types
# whose name contains "bike". The restriction only involves a grouping key, so
# applying it before the aggregation yields the same groups and counts.
no_bike_rides_per_day: LazyFrame = (
  results
    .filter(pl.col("rideable_type").str.contains("bike"))
    .group_by(
      "rideable_type",
      pl.col("started_at").dt.date().alias("date"),
    )
    # Counts the non-null ride ids in each (type, day) group.
    .agg(trip_count = pl.col("ride_id").count())
)

# no_bike_rides_per_day.sort("rideable_type", "trip_count", descending = True).collect()

# Calculate the average, max, and minimum number of rides per week of the dataset.
# Each ride is assigned to a week by truncating its start time to a "1w"
# boundary; the rides are then counted per week.
# NOTE(review): the first and last weeks of the file may be only partially
# covered, which would pull the minimum and the average down — confirm whether
# partial weeks should be excluded.
weekly_data: LazyFrame = results.group_by(
    pl.col("started_at").dt.truncate("1w").alias("week_start")
).agg(rides_per_week = pl.len())

# Collapse the per-week counts into a single row of summary statistics.
weekly_stats: LazyFrame = weekly_data.select(
    avg_rides_per_week = pl.col("rides_per_week").mean(),
    max_rides_per_week = pl.col("rides_per_week").max(),
    min_rides_per_week = pl.col("rides_per_week").min(),
)

# Number of rides started on each calendar day (all rideable types together).
daily_trips: LazyFrame = results.group_by(
    pl.col("started_at").dt.date().alias("ride_date")
).agg(trip_count = pl.len())

# For each day, compare its ride count with the count from seven days earlier.
# Re-key every daily count to the date seven days later, so that joining on
# `ride_date` places last week's count next to the current day's count.
last_week_counts: LazyFrame = daily_trips.select(
    (pl.col("ride_date") + pl.duration(days=7)).alias("ride_date"),
    pl.col("trip_count").alias("trips_last_week"),
)

comparison: LazyFrame = (
    daily_trips
    .join(last_week_counts, on="ride_date", how="left")
    .with_columns(
        # The counts are unsigned, so cast to a signed type before subtracting
        # to allow negative differences. Days with no count from the previous
        # week (e.g. the first seven days of the data) keep a null difference:
        # treating the missing baseline as zero rides would report the day's
        # whole total as an increase over last week.
        (
            pl.col("trip_count").cast(pl.Int64)
            - pl.col("trips_last_week").cast(pl.Int64)
        ).alias("diff_vs_last_week")
    )
    .sort("ride_date")
)

comparison.collect()

ride_date,trip_count,trips_last_week,diff_vs_last_week
date,u32,u32,i64
2023-06-01,43,,43
2023-06-02,40,,40
2023-06-03,33,,33
2023-06-04,26,,26
2023-06-05,31,,31
…,…,…,…
2023-06-26,21,58,-37
2023-06-27,20,28,-8
2023-06-28,35,29,6
2023-06-29,26,53,-27
