In [2]:
#!/usr/bin/env python3
"""
train_and_save_models.py

Loads cleaned Citibike data, then:
  1) trains and logs a baseline mean model
  2) trains and logs a LightGBM on 28 lag features
  3) trains and logs a LightGBM on top-10 importance features

All runs go to your DagsHub MLflow server, and each trained model is also saved locally under `models/`.
The best-performing model (lowest MAE) is additionally saved as `best_model.pkl`.
"""

import os
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor
from lightgbm import LGBMRegressor
import mlflow
import joblib
from pathlib import Path

# ──────────────────────────────────────────────────────────────────────────────
# DagsHub MLflow settings
os.environ["MLFLOW_TRACKING_USERNAME"] = "kaushal-shivaprakashan"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "b01d7b8c94b982d47d0224ea469bbfe4b8870ff6"
mlflow.set_tracking_uri("https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow")
mlflow.set_experiment("CitiBike_Remote_Experiment")

# Data & split config
PARQUET_PATH = "/Users/kaushalshivaprakash/Desktop/project3/data/processed/cleaned_citibike/citibike_2023_top3.parquet"
TRAIN_FRAC   = 0.8
MAX_LAG      = 28
TOP_K        = 10

# ──────────────────────────────────────────────────────────────────────────────
# Determine models directory (works in scripts and notebooks)
try:
    BASE_DIR = Path(__file__).parent
except NameError:
    BASE_DIR = Path.cwd()

MODEL_DIR = BASE_DIR / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
print("Saving models into:", MODEL_DIR)

# ──────────────────────────────────────────────────────────────────────────────

def load_and_agg(path):
    df = pd.read_parquet(path)
    df["datetime"] = df["started_at"].dt.floor("H")
    agg = (
        df
        .groupby("datetime")
        .size()
        .reset_index(name="count")
        .sort_values("datetime")
    )
    return agg

def train_test_split_ts(df, frac):
    idx = int(len(df) * frac)
    return df.iloc[:idx], df.iloc[idx:]

def log_baseline(train, test):
    run_name = "baseline_mean"
    with mlflow.start_run(run_name=run_name):
        mean_val = train["count"].mean()
        preds = [mean_val] * len(test)
        mae = mean_absolute_error(test["count"], preds)
        mlflow.log_param("model_type", run_name)
        mlflow.log_metric("mae", mae)

        # Wrap into DummyRegressor so we can save it
        dummy = DummyRegressor(strategy="constant", constant=mean_val)
        dummy.fit(train[["count"]], train["count"])
        out_path = MODEL_DIR / f"{run_name}.pkl"
        joblib.dump(dummy, out_path)
        print(f"[{run_name}] MAE = {mae:.4f}, saved to {out_path}")
        return dummy, mae

def log_lag_model(df):
    run_name = "lgbm_28lag"
    df_lag = df.copy()
    for lag in range(1, MAX_LAG + 1):
        df_lag[f"lag_{lag}"] = df_lag["count"].shift(lag)
    df_lag = df_lag.dropna().reset_index(drop=True)

    train, test = train_test_split_ts(df_lag, TRAIN_FRAC)
    feats = [f"lag_{i}" for i in range(1, MAX_LAG + 1)]
    X_train, y_train = train[feats], train["count"]
    X_test,  y_test  = test[feats],  test["count"]

    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model_type", run_name)
        mlflow.log_param("num_lags", MAX_LAG)
        model = LGBMRegressor(random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)

        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(model, "model")

        out_path = MODEL_DIR / f"{run_name}.pkl"
        joblib.dump(model, out_path)
        print(f"[{run_name}] MAE = {mae:.4f}, saved to {out_path}")
        return model, mae

def log_topk_model(df):
    run_name = "lgbm_top10_imp"
    df_lag = df.copy()
    for lag in range(1, MAX_LAG + 1):
        df_lag[f"lag_{lag}"] = df_lag["count"].shift(lag)
    df_lag = df_lag.dropna().reset_index(drop=True)

    train, test = train_test_split_ts(df_lag, TRAIN_FRAC)
    feats = [f"lag_{i}" for i in range(1, MAX_LAG + 1)]
    X_train, y_train = train[feats], train["count"]
    X_test,  y_test  = test[feats], test["count"]

    # Fit a base LGBM to get importances
    base = LGBMRegressor(random_state=42)
    base.fit(X_train, y_train)
    importances = pd.Series(base.feature_importances_, index=feats)
    top_feats = importances.nlargest(TOP_K).index.tolist()

    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model_type", run_name)
        mlflow.log_param("num_lags", MAX_LAG)
        mlflow.log_param("selected_feats", top_feats)
        model = LGBMRegressor(random_state=42)
        model.fit(X_train[top_feats], y_train)
        preds = model.predict(X_test[top_feats])
        mae = mean_absolute_error(y_test, preds)

        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(model, "model")

        out_path = MODEL_DIR / f"{run_name}.pkl"
        joblib.dump(model, out_path)
        print(f"[{run_name}] MAE = {mae:.4f}, saved to {out_path}")
        return model, mae

def main():
    df = load_and_agg(PARQUET_PATH)
    train, test = train_test_split_ts(df, TRAIN_FRAC)

    baseline_model, mae_baseline = log_baseline(train, test)
    lag_model,      mae_lag      = log_lag_model(df)
    topk_model,     mae_topk     = log_topk_model(df)

    # Choose best by lowest MAE
    metrics = {
        "baseline_mean": mae_baseline,
        "lgbm_28lag":    mae_lag,
        "lgbm_top10_imp": mae_topk,
    }
    models = {
        "baseline_mean": baseline_model,
        "lgbm_28lag":    lag_model,
        "lgbm_top10_imp": topk_model,
    }
    best_name = min(metrics, key=metrics.get)
    best_model = models[best_name]
    best_path  = MODEL_DIR / "best_model.pkl"
    joblib.dump(best_model, best_path)
    print(f"✅ Best model '{best_name}' (MAE={metrics[best_name]:.4f}) saved to {best_path}")

if __name__ == "__main__":
    main()

Saving models into: /Users/kaushalshivaprakash/Desktop/project3/models
[baseline_mean] MAE = 31.1982, saved to /Users/kaushalshivaprakash/Desktop/project3/models/baseline_mean.pkl
🏃 View run baseline_mean at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/891aaaf99d2b471e9b7ee4fae8f8ae23
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413




[lgbm_28lag] MAE = 8.2161, saved to /Users/kaushalshivaprakash/Desktop/project3/models/lgbm_28lag.pkl
🏃 View run lgbm_28lag at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/98e13fbf89ca4f0c9161741afe70986c
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5326
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 28
[LightGBM] [Info] Start training from score 45.662413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 6582, number of used features: 10
[LightGBM] [Info] Star



[lgbm_top10_imp] MAE = 8.3349, saved to /Users/kaushalshivaprakash/Desktop/project3/models/lgbm_top10_imp.pkl
🏃 View run lgbm_top10_imp at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0/runs/74c909cf936444f79ba1bcc37591cea4
🧪 View experiment at: https://dagshub.com/kaushal-shivaprakashan/final_project.mlflow/#/experiments/0
✅ Best model 'lgbm_28lag' (MAE=8.2161) saved to /Users/kaushalshivaprakash/Desktop/project3/models/best_model.pkl
