In [1]:
# model1_baseline.py

import pandas as pd
from sklearn.metrics import mean_absolute_error
import mlflow

# ──────────────────────────────────────────────────────────────────────────────
# Parquet file that main() passes to load_and_aggregate().
# NOTE(review): this is an absolute path on one user's machine; it must be
# edited before the script can run anywhere else.
PARQUET_PATH = "/Users/kaushalshivaprakash/Desktop/project3/data/processed/cleaned_citibike/citibike_2023_top3.parquet"
# Experiment name that main() passes to mlflow.set_experiment().
EXPERIMENT   = "CitiBike_Forecasting"
# Fraction of the aggregated rows, counted from the start, that main() assigns
# to the training split; the remaining rows form the test split.
TRAIN_FRAC   = 0.8
# ──────────────────────────────────────────────────────────────────────────────

def load_and_aggregate(path):
    """Load trip records from a Parquet file and count them per hour.

    Args:
        path: Location of a Parquet file readable by ``pd.read_parquet``. The
            file must contain a ``started_at`` column with datetime-like
            values, because the ``.dt`` accessor is applied to it.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns: ``datetime``
        (``started_at`` floored to the hour) and ``count`` (number of rows
        that fall in that hour), sorted ascending by ``datetime``. Only hours
        that contain at least one row appear; hours without any rows are not
        filled in with a zero count.
    """
    df = pd.read_parquet(path)
    # Use the lowercase "h" alias: the uppercase "H" spelling is deprecated
    # since pandas 2.2 (FutureWarning) and is slated for removal.
    df["datetime"] = df["started_at"].dt.floor("h")
    agg = df.groupby("datetime").size().reset_index(name="count")
    # Keep the rows in chronological order so a positional split downstream
    # separates earlier hours from later ones.
    agg = agg.sort_values("datetime").reset_index(drop=True)
    return agg

def train_test_split_ts(df, frac):
    """Split a DataFrame positionally into a leading and a trailing part.

    No shuffling takes place: the first ``int(len(df) * frac)`` rows become
    the first part and every remaining row becomes the second part.

    Args:
        df: DataFrame to split; row order is preserved.
        frac: Share of the rows that goes into the first part.

    Returns:
        A ``(train, test)`` tuple of positional slices of ``df``.
    """
    cutoff = int(len(df) * frac)
    leading_rows = df.iloc[:cutoff]
    trailing_rows = df.iloc[cutoff:]
    return leading_rows, trailing_rows

def main():
    """Score a constant train-mean baseline and record the run in MLflow.

    Steps: select the MLflow experiment, load the hourly counts, split them
    positionally into train and test parts, predict the train mean of
    ``count`` for every test row, compute the mean absolute error, log the
    parameters and the metric in a run named ``baseline_mean``, and print
    the MAE.
    """
    mlflow.set_experiment(EXPERIMENT)
    hourly = load_and_aggregate(PARQUET_PATH)
    train_part, test_part = train_test_split_ts(hourly, TRAIN_FRAC)
    n_train, n_test = len(train_part), len(test_part)

    # The baseline forecast is one constant: the mean hourly count seen in
    # the training part, repeated once per test row.
    mean_count = train_part["count"].mean()
    constant_forecast = [mean_count for _ in range(n_test)]
    mae = mean_absolute_error(test_part["count"], constant_forecast)

    with mlflow.start_run(run_name="baseline_mean"):
        mlflow.log_param("model_type", "baseline_mean")
        mlflow.log_param("train_count", n_train)
        mlflow.log_param("test_count", n_test)
        mlflow.log_metric("mae", mae)

    print(f"[baseline] MAE = {mae:.2f}")

# Run the baseline only when this file is executed as a script; importing it
# as a module does not trigger main().
if __name__ == "__main__":
    main()

2025/05/10 12:46:28 INFO mlflow.tracking.fluent: Experiment with name 'CitiBike_Forecasting' does not exist. Creating a new experiment.


[baseline] MAE = 31.20
