In [2]:
import pandas as pd

# Load the trip records from the cleaned 2025-01 parquet file; every later
# cell reads from (and adds columns to) this frame.
# NOTE: the last cell writes the frame back to this same path, so after one
# full run the file on disk already contains the derived columns.
df = pd.read_parquet("../data/clean_yellow_tripdata_2025-01.parquet")

In [5]:
# Derived per-trip features. Every column is produced by a plain assignment
# (no inplace edits), so re-running this cell always yields the same result.

# Values produced by dividing by zero that must not reach the aggregations.
non_finite = [float('inf'), -float('inf')]

# Trip duration in minutes
df['trip_duration_min'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Tip as a percentage of total_amount.
# A zero total_amount gives inf (tip > 0) or NaN (0 / 0); both are stored as 0
# so they cannot distort the tip_pct means computed in the KPI cells.
# Fix: the fill previously targeted a non-existent 'trip_pct' column, which
# left the NaNs in tip_pct untouched.
df['tip_pct'] = (df['tip_amount'] / df['total_amount'] * 100).replace(non_finite, 0).fillna(0)

# Trip speed (mph)
# A zero duration gives inf (distance > 0) or NaN (0 / 0); both become 0.
# Fix: replace() was previously given set literals instead of a list/mapping,
# which does not express "turn +inf and -inf into 0".
df['trip_speed_mph'] = (df['trip_distance'] / (df['trip_duration_min'] / 60)).replace(non_finite, 0).fillna(0)

# Flag long trips (>10 miles)
df['long_trip'] = df['trip_distance'] > 10

# Flag high tip (>20%)
df['high_tip'] = df['tip_pct'] > 20

In [6]:
# Hourly roll-ups: both tables are built from one grouping on pickup_hour.
by_hour = df.groupby('pickup_hour')

# Sum of total_amount for each pickup hour.
revenue_hour = by_hour['total_amount'].sum().reset_index()
revenue_hour.to_csv("../data/revenue_by_hour.csv", index=False)

# Number of rows for each pickup hour, exported as column num_trips.
trips_hour = by_hour.size().rename('num_trips').reset_index()
trips_hour.to_csv("../data/trips_by_hour.csv", index=False)

In [8]:
# Per-vendor KPIs: revenue sum/mean, trip count, mean tip percentage and the
# share of long trips. long_trip is boolean, so its mean is a fraction; it is
# scaled by 100 to become a percentage before export.
vendor_spec = {
    'total_revenue': ('total_amount', 'sum'),
    'avg_revenue': ('total_amount', 'mean'),
    'num_trips': ('total_amount', 'count'),
    'avg_tip_pct': ('tip_pct', 'mean'),
    'long_trip_pct': ('long_trip', 'mean'),
}
vendor_kpis = (
    df.groupby('VendorID')
    .agg(**vendor_spec)
    .reset_index()
    .assign(long_trip_pct=lambda kpi: kpi['long_trip_pct'] * 100)
)
vendor_kpis.to_csv("../data/vendor_kpis.csv", index=False)

In [9]:
# Per-pickup-zone KPIs keyed on PULocationID: mean distance, share of long
# trips, mean tip percentage and trip count.
# long_trip_ratio is scaled by 100 below, so the exported value is a
# percentage even though the column is called "ratio".
zone_groups = df.groupby('PULocationID')
zone_kpis = zone_groups.agg(
    avg_distance=pd.NamedAgg(column='trip_distance', aggfunc='mean'),
    long_trip_ratio=pd.NamedAgg(column='long_trip', aggfunc='mean'),
    avg_tip_pct=pd.NamedAgg(column='tip_pct', aggfunc='mean'),
    num_trips=pd.NamedAgg(column='trip_distance', aggfunc='count'),
)
zone_kpis = zone_kpis.reset_index()
zone_kpis = zone_kpis.assign(long_trip_ratio=zone_kpis['long_trip_ratio'] * 100)

zone_kpis.to_csv("../data/zone_kpis.csv", index=False)

In [10]:
# Compare the two is_weekend groups: mean total amount, mean distance,
# mean tip percentage and number of trips.
weekend_spec = {
    'avg_total': ('total_amount', 'mean'),
    'avg_distance': ('trip_distance', 'mean'),
    'avg_tip_pct': ('tip_pct', 'mean'),
    'num_trips': ('total_amount', 'count'),
}
weekend_groups = df.groupby('is_weekend')
weekday_kpis = weekend_groups.agg(**weekend_spec).reset_index()

weekday_kpis.to_csv("../data/weekday_vs_weekend.csv", index=False)

In [11]:
# Persist the frame, including the derived columns, as parquet and as CSV.
# The parquet target is the same file the first cell reads, so it is
# overwritten in place.
df.to_parquet(path="../data/clean_yellow_tripdata_2025-01.parquet", index=False)
df.to_csv(path_or_buf="../data/clean_yellow_tripdata_2025-01.csv", index=False)