In [8]:
# train_and_save.py

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from joblib import dump

# ─── Helper functions at top‐level so they’re pickleable ────────────────
def select_hour_bucket(df):
    """Return a one-column DataFrame containing only ``hour_bucket``.

    Kept as a named, module-level function (rather than a lambda) so that a
    ``FunctionTransformer`` wrapping it can be pickled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has an ``hour_bucket`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the ``hour_bucket`` column (still 2-D).

    Raises
    ------
    KeyError
        If ``df`` has no ``hour_bucket`` column.
    """
    wanted_columns = ["hour_bucket"]
    return df[wanted_columns]

def extract_datetime_parts(df):
    """Expand the ``hour_bucket`` datetime column into calendar components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``hour_bucket`` column is datetime-like (the ``.dt``
        accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        Columns ``year``, ``month``, ``day`` and ``hour`` (in that order),
        indexed like ``df``.
    """
    stamps = df["hour_bucket"].dt
    # Each output column is the same-named attribute of the .dt accessor.
    parts = {field: getattr(stamps, field) for field in ("year", "month", "day", "hour")}
    return pd.DataFrame(parts)

def select_station_id(df):
    """Return a one-column DataFrame containing only ``start_station_id``.

    Kept as a named, module-level function (rather than a lambda) so that a
    ``FunctionTransformer`` wrapping it can be pickled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a ``start_station_id`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the ``start_station_id`` column (still 2-D).

    Raises
    ------
    KeyError
        If ``df`` has no ``start_station_id`` column.
    """
    wanted_columns = ["start_station_id"]
    return df[wanted_columns]

# ─── 1) Paths ────────────────────────────────────────────────────────────
# The project root defaults to the original author's location, but can be
# relocated by exporting CITIBIKE_PROJECT_ROOT before starting the kernel, so
# this cell also runs on machines without that exact directory layout.
# NOTE: any other cell that hard-codes these locations must point at the same
# root for the saved artifacts to be found.
PROJECT_ROOT = os.environ.get(
    "CITIBIKE_PROJECT_ROOT",
    "/Users/kaushalshivaprakash/Desktop/project3",
)

# Ride-level input data (Parquet).
PARQUET_PATH = os.path.join(
    PROJECT_ROOT, "data", "processed", "cleaned_citibike", "citibike_2023_top3.parquet"
)
# Directory that receives the pickled preprocessing pipeline and model.
MODEL_DIR = os.path.join(PROJECT_ROOT, "pipelines", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# ─── 2) Load raw ride‐level data ─────────────────────────────────────────
# Read the ride-level table and make sure the start timestamp is a datetime.
df = pd.read_parquet(PARQUET_PATH)
df["started_at"] = pd.to_datetime(df["started_at"])
# Truncate each start time to the top of its hour. The lower-case "h" alias is
# used because the upper-case "H" alias is deprecated since pandas 2.2; both
# floor to the same values.
df["hour_bucket"] = df["started_at"].dt.floor("h")

# ─── 3) Aggregate to hourly counts per station ──────────────────────────
# One row per (station, hour); target_trips is the number of non-null ride_id
# values that fall in that group.
agg = (
    df.groupby(["start_station_id", "hour_bucket"])["ride_id"]
    .count()
    .rename("target_trips")
    .reset_index()
)

# ─── 4) Prepare features X and target y ─────────────────────────────────
# Features are the two grouping keys; the target is the hourly ride count.
X = agg.loc[:, ["start_station_id", "hour_bucket"]]
y = agg.loc[:, "target_trips"]

# ─── 5) Build preprocessing pipeline ───────────────────────────────────
# Datetime branch: isolate hour_bucket, then expand it into year/month/day/hour.
datetime_pipeline = Pipeline(steps=[
    ("select_dt", FunctionTransformer(func=select_hour_bucket, validate=False)),
    ("extract", FunctionTransformer(func=extract_datetime_parts, validate=False)),
])

# Station branch: isolate start_station_id (again via a named function so the
# transformer stays pickleable), then one-hot encode it. Station ids unseen at
# fit time are ignored rather than raising.
station_pipeline = Pipeline(steps=[
    ("select_station", FunctionTransformer(func=select_station_id, validate=False)),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each input column to its branch and concatenate the results.
preprocessor = ColumnTransformer([
    ("dt_feats", datetime_pipeline, ["hour_bucket"]),
    ("loc_feats", station_pipeline, ["start_station_id"]),
])

# ─── 6) Full modeling pipeline ─────────────────────────────────────────
# Preprocessing followed by a gradient-boosted regressor with a fixed seed.
pipeline = Pipeline(steps=[
    ("preproc", preprocessor),
    ("estimator", LGBMRegressor(random_state=42, learning_rate=0.1, n_estimators=100)),
])

# ─── 7) Train/test split, fit, and validate ─────────────────────────────
# Hold out 20% of the (station, hour) rows for validation; the seed fixes the split.
# NOTE(review): the split is random rather than chronological — confirm that is
# intended for time-indexed data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit on the training rows, predict the held-out rows, and report the mean
# absolute error between predictions and true counts.
val_preds = pipeline.fit(X_train, y_train).predict(X_val)
mae = (y_val - val_preds).abs().mean()
print(f"Validation MAE: {mae:.3f}")

# ─── 8) Save artifacts ─────────────────────────────────────────────────
feature_pipeline_path = os.path.join(MODEL_DIR, "feature_pipeline.pkl")
model_path            = os.path.join(MODEL_DIR, "best_model.pkl")

# Both artifacts are read back from the fitted pipeline's own steps. Taking the
# preprocessor from `pipeline.named_steps` (like the estimator below) guarantees
# the saved object is the fitted step, instead of relying on the outer
# `preprocessor` variable having been fitted in place by `pipeline.fit`.
# NOTE(review): the preprocessor wraps module-level helper functions, which
# pickle stores by reference — presumably the same functions must be importable
# under the same module path wherever this file is loaded; confirm before
# loading it from a separate process.
dump(pipeline.named_steps["preproc"], feature_pipeline_path)
print(f"✔ Saved feature pipeline → {feature_pipeline_path}")

dump(pipeline.named_steps["estimator"], model_path)
print(f"✔ Saved model            → {model_path}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 17593, number of used features: 6
[LightGBM] [Info] Start training from score 16.700335
Validation MAE: 5.534
✔ Saved feature pipeline → /Users/kaushalshivaprakash/Desktop/project3/pipelines/models/feature_pipeline.pkl
✔ Saved model            → /Users/kaushalshivaprakash/Desktop/project3/pipelines/models/best_model.pkl




In [12]:
# inference_pipeline.py
"""
Standalone batch inference script reading Parquet input directly.
"""

import pandas as pd
import joblib
import os
from sklearn.pipeline import Pipeline as SklearnPipeline

# ── 1) CONFIGURE YOUR PATHS HERE ────────────────────────────────────────
# The project root defaults to the original author's location, but can be
# relocated by exporting CITIBIKE_PROJECT_ROOT before starting the kernel.
# NOTE: whatever wrote the .pkl artifacts must have used the same root.
PROJECT_ROOT = os.environ.get(
    "CITIBIKE_PROJECT_ROOT",
    "/Users/kaushalshivaprakash/Desktop/project3",
)

# Pickled preprocessing pipeline and model.
FEATURE_PIPELINE = os.path.join(PROJECT_ROOT, "pipelines", "models", "feature_pipeline.pkl")
MODEL_PATH       = os.path.join(PROJECT_ROOT, "pipelines", "models", "best_model.pkl")
# Ride-level Parquet input to score.
INPUT_PARQUET    = os.path.join(
    PROJECT_ROOT, "data", "processed", "cleaned_citibike", "citibike_2023_top3.parquet"
)
# Destination CSV for the scored rows.
OUTPUT_CSV       = os.path.join(PROJECT_ROOT, "pipelines", "output", "predictions.csv")

# ── 2) LOAD & COMPOSE ──────────────────────────────────────────────────
def load_pipeline(pipeline_path: str, model_path: str) -> SklearnPipeline:
    """Load the saved preprocessor and model and chain them into one pipeline.

    Parameters
    ----------
    pipeline_path : str
        Path of the joblib file holding the preprocessing object.
    model_path : str
        Path of the joblib file holding the estimator.

    Returns
    -------
    SklearnPipeline
        Two-step pipeline: ``"preprocessing"`` followed by ``"estimator"``.
        The loaded objects are used as-is; nothing is re-fitted here.

    Notes
    -----
    ``joblib.load`` unpickles arbitrary objects, so only load files you trust.
    NOTE(review): if the saved preprocessor wraps module-level helper
    functions, those presumably must be importable under the same module path
    in the loading process — confirm when running outside the training session.
    """
    # The preprocessor is loaded first, then the model, matching step order.
    steps = [
        ("preprocessing", joblib.load(pipeline_path)),
        ("estimator", joblib.load(model_path)),
    ]
    return SklearnPipeline(steps)

# ── 3) BATCH INFERENCE ─────────────────────────────────────────────────
def run_batch_inference(pipeline: SklearnPipeline, input_path: str, output_path: str):
    """Score a Parquet file of rides and write the rows plus predictions to CSV.

    Parameters
    ----------
    pipeline : SklearnPipeline
        Object exposing ``predict``; it receives a frame with the columns
        ``start_station_id`` and ``hour_bucket``.
    input_path : str
        Parquet file with at least ``started_at`` and ``start_station_id``.
    output_path : str
        Destination CSV. Missing parent directories are created; a bare
        filename (no directory part) is written to the current directory.

    Notes
    -----
    Every input row receives a prediction for its own (station, hour) pair,
    and the full input frame is written out with the added
    ``hour_bucket`` and ``predicted_trips`` columns.
    NOTE(review): if the model was trained on one row per (station, hour),
    scoring ride-level rows repeats the same prediction for every ride in
    that hour — confirm this is the intended output shape.
    """
    df = pd.read_parquet(input_path)
    # Ensure the timestamp is a datetime before using the .dt accessor.
    df["started_at"] = pd.to_datetime(df["started_at"])
    # Bucket by hour. Lower-case "h" replaces the "H" alias deprecated since
    # pandas 2.2; both floor to the same values.
    df["hour_bucket"] = df["started_at"].dt.floor("h")

    # Feature frame: only the two columns handed to the pipeline.
    X = df[["start_station_id", "hour_bucket"]]
    df["predicted_trips"] = pipeline.predict(X)

    # os.path.dirname returns "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when there actually is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"✅ Predictions saved to {output_path}")

# ── 4) MAIN EXECUTION ──────────────────────────────────────────────────
def main():
    """Check that the required files exist, then load the pipeline and run inference.

    Raises
    ------
    FileNotFoundError
        For the first of FEATURE_PIPELINE, MODEL_PATH, INPUT_PARQUET (checked in
        that order) that does not exist.
    """
    required = (FEATURE_PIPELINE, MODEL_PATH, INPUT_PARQUET)
    # Stop at the first path that is absent so the error names it.
    missing = next((p for p in required if not os.path.exists(p)), None)
    if missing is not None:
        raise FileNotFoundError(f"Required file not found: {missing}")

    # Compose preprocessor + model, score the input, and write the CSV.
    run_batch_inference(
        load_pipeline(FEATURE_PIPELINE, MODEL_PATH),
        INPUT_PARQUET,
        OUTPUT_CSV,
    )
    print("🚀 Inference pipeline created and executed successfully!")


# Run only when executed as the main program (script or notebook cell).
if __name__ == "__main__":
    main()




✅ Predictions saved to /Users/kaushalshivaprakash/Desktop/project3/pipelines/output/predictions.csv
🚀 Inference pipeline created and executed successfully!
