In [1]:
import pandas as pd

# Read the March 2024 green-taxi trip records and print the frame's
# (row count, column count) tuple as a quick sanity check.
df = pd.read_parquet(path='data/green_tripdata_2024-03.parquet')
print((len(df.index), len(df.columns)))


(57457, 20)


In [5]:
import webbrowser
from pathlib import Path

import pandas as pd
from evidently.metrics import ColumnQuantileMetric, ColumnMissingValuesMetric
from evidently.report import Report

# Load March 2024 data
df = pd.read_parquet("data/green_tripdata_2024-03.parquet")

# Build a report with two column-level metrics:
#   - the 0.5 quantile (median) of fare_amount
#   - the missing-values summary of passenger_count
report = Report(metrics=[
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
    ColumnMissingValuesMetric(column_name="passenger_count"),
])

# The same frame is passed as both reference and current data, so the
# report describes this single dataset rather than a comparison.
report.run(reference_data=df, current_data=df)

# Save to HTML (single path variable so save and open cannot drift apart)
report_path = Path("taxi_report_q2.html")
report.save_html(str(report_path))

# Open the report. webbrowser.open() is only documented to work with URLs;
# passing a bare filename is platform-dependent, so hand it an absolute
# file:// URI instead. The call's boolean return value is the cell output.
webbrowser.open(report_path.resolve().as_uri())


True

In [13]:
import pandas as pd
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric
from tqdm import tqdm

# Purpose: compute the median (0.5 quantile) of fare_amount for every pickup
# day in March 2024 with Evidently, then report the largest daily median.

# Load data
df = pd.read_parquet("data/green_tripdata_2024-03.parquet")

# Convert pickup datetime
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

# Row filters, built as boolean masks over the freshly loaded frame:
#   - pickups inside March 2024 only (half-open interval [03-01, 04-01))
#   - fare_amount strictly between 0 and 200
#   - passenger_count strictly between 0 and 8
in_march = (
    (df['lpep_pickup_datetime'] >= '2024-03-01')
    & (df['lpep_pickup_datetime'] < '2024-04-01')
)
plausible_trip = (
    (df["fare_amount"] > 0) & (df["fare_amount"] < 200) &
    (df["passenger_count"] > 0) & (df["passenger_count"] < 8)
)

# Apply both filters at once and derive the grouping key with .assign(), so
# the new column lands on a fresh frame instead of being written into a
# filtered slice.
df = df[in_march & plausible_trip].assign(
    pickup_date=lambda trips: trips['lpep_pickup_datetime'].dt.date
)

# Compute daily 0.5 quantile
daily_medians = []

# groupby() partitions the frame in a single pass and yields the groups in
# sorted key order, replacing a full-frame boolean mask per day.
for day, df_day in tqdm(df.groupby('pickup_date')):
    report = Report(metrics=[
        ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)
    ])
    # Each day's rows serve as both reference and current data; only the
    # "current" value is read back below.
    report.run(reference_data=df_day, current_data=df_day)

    result = report.as_dict()
    quantile_val = result['metrics'][0]['result']['current']['value']
    daily_medians.append((day, quantile_val))

# Results
daily_df = pd.DataFrame(daily_medians, columns=["date", "median_fare"])
max_median = daily_df["median_fare"].max()

print(daily_df.sort_values(by="median_fare", ascending=False).head())  # Optional for debug
print(f"\n✅ Final Max median fare during March 2024: {max_median}")


100%|██████████| 31/31 [00:01<00:00, 29.17it/s]

          date  median_fare
2   2024-03-03         14.2
29  2024-03-30         14.2
23  2024-03-24         14.2
9   2024-03-10         14.2
13  2024-03-14         14.2

✅ Final Max median fare during March 2024: 14.2



