In [1]:
#!/usr/bin/env python3
"""
Ice Tea Mode — Full Modeling & Feature Importance Display
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import joblib

# -------------------------------
# PATHS
# -------------------------------
# All paths are relative, so they resolve against the current working directory.
# Input directory holding the merged, model-ready production CSV read in main().
MERGE_DIR = Path("../all_dataset/merge_dataset")
# Input directory holding the feature-engineered truck-to-ship CSV read in main().
FE_DIR = Path("../all_dataset/feature_dataset")
# Output directory the training functions write their joblib model files into.
MODEL_DIR = Path("../all_dataset/models")
# Created at import time (with any missing parents) so the later joblib.dump
# calls have a directory to write into; exist_ok makes re-runs harmless.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# -------------------------------
# HELPER FUNCTIONS
# -------------------------------
def train_production(df):
    """Fit the production random forest, score it on a hold-out, and persist a full refit.

    Every column of ``df`` except the identifier columns (``week_start``,
    ``pit_id_num``, ``stockpile_num``) and the target ``realized_ton`` is used
    as a feature, in the order the columns appear in ``df``.

    Args:
        df: DataFrame containing the target column and the feature columns.

    Returns:
        A 3-tuple ``(model, metrics, importances)``:
        * ``model`` - the RandomForestRegressor after being refit on ALL rows.
        * ``metrics`` - ``{"rmse": ..., "r2": ...}`` measured on the random
          20% hold-out, i.e. from the fit made before the full refit.
        * ``importances`` - pandas Series of feature importances of the
          refit model, indexed by feature name, sorted descending.

    Side effects:
        Writes the refit model to ``MODEL_DIR / "rf_prod_full_dataset.joblib"``.
    """
    target = "realized_ton"
    non_features = {"week_start", "pit_id_num", "stockpile_num", target}
    feature_cols = [col for col in df.columns if col not in non_features]

    features, labels = df[feature_cols], df[target]

    # Random (shuffled) 80/20 split with a fixed seed for reproducibility.
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    forest.fit(features_fit, labels_fit)
    holdout_pred = forest.predict(features_holdout)

    metrics = {
        "rmse": np.sqrt(mean_squared_error(labels_holdout, holdout_pred)),
        "r2": r2_score(labels_holdout, holdout_pred),
    }

    # The same estimator object is refit on every row; this is the model that
    # gets saved and whose importances are reported.
    forest.fit(features, labels)
    joblib.dump(forest, MODEL_DIR / "rf_prod_full_dataset.joblib")

    ranked = pd.Series(forest.feature_importances_, index=feature_cols)
    return forest, metrics, ranked.sort_values(ascending=False)

def train_shipping(df):
    """Train the truck-to-ship supply model with forward-chaining cross-validation.

    The model is fit on ``log1p`` of the target ``weekly_truck_supply_ton``;
    fold predictions are mapped back with ``expm1`` so the reported metrics
    are on the original scale. Every column except the identifier columns
    (``truck_id``, ``jetty_id``, ``week_start``), the target, and a
    pre-existing ``y_log`` column is used as a feature. Features are coerced
    to numeric (unparseable values become NaN) and NaN is filled with 0.

    Args:
        df: DataFrame with the target column and the feature columns. It is
            NOT modified; all preprocessing happens on an internal copy.

    Returns:
        A 3-tuple ``(model, metrics, importances)``:
        * ``model`` - RandomForestRegressor fit on all rows (log-target).
        * ``metrics`` - ``{"rmse_cv": [...], "r2_cv": [...]}`` with one entry
          per TimeSeriesSplit fold (5 folds).
        * ``importances`` - pandas Series of feature importances of the
          full-data model, indexed by feature name, sorted descending.

    Side effects:
        Writes the full-data model to
        ``MODEL_DIR / "rf_shipping_model_full.joblib"``.
    """
    id_cols = ["truck_id", "jetty_id", "week_start"]
    target = "weekly_truck_supply_ton"

    # Work on a private copy: the previous implementation coerced columns and
    # added "y_log" on the caller's frame, so a second call on the same frame
    # silently fed the log-target back in as a feature.
    data = df.copy()

    # TimeSeriesSplit only respects time if rows are in chronological order.
    # A stable sort is a no-op for input that is already ordered by week.
    # NOTE(review): assumes week_start sorts chronologically (datetime or ISO
    # strings) - main() parses it as dates.
    if "week_start" in data.columns:
        data = data.sort_values("week_start", kind="mergesort")

    # "y_log" is excluded so a frame that already carries the transformed
    # target (e.g. one mutated by an earlier run) cannot leak it into X.
    excluded = set(id_cols) | {target, "y_log"}
    feature_cols = [c for c in data.columns if c not in excluded]

    X = data[feature_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
    y = np.log1p(data[target])

    rf_params = dict(
        n_estimators=200, max_depth=8, min_samples_split=5, min_samples_leaf=3,
        random_state=42, n_jobs=-1,
    )

    tscv = TimeSeriesSplit(n_splits=5)
    rmse_list, r2_list = [], []

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        fold_model = RandomForestRegressor(**rf_params)
        fold_model.fit(X_train, y_train)

        # Undo the log1p transform so errors are in target units.
        y_pred = np.expm1(fold_model.predict(X_test))
        y_true = np.expm1(y_test)

        rmse_list.append(np.sqrt(mean_squared_error(y_true, y_pred)))
        r2_list.append(r2_score(y_true, y_pred))

    # Final model: a fresh estimator fit on every row, instead of relying on
    # the loop variable surviving the CV loop.
    rf = RandomForestRegressor(**rf_params)
    rf.fit(X, y)
    model_file = MODEL_DIR / "rf_shipping_model_full.joblib"
    joblib.dump(rf, model_file)

    importances = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)

    return rf, {"rmse_cv": rmse_list, "r2_cv": r2_list}, importances

def display_top_features(importances, title="Top-10 Features", top_n=10):
    """Print the leading entries of a feature-importance Series as an aligned table.

    Args:
        importances: pandas Series mapping feature label -> importance value.
            Only the first ``top_n`` entries are shown and no sorting is done
            here, so pass a Series already sorted in descending order.
        title: Heading printed between ``===`` markers above the table.
        top_n: Maximum number of rows to print (default 10).

    Returns:
        None. Output goes to stdout, one ``name : value`` line per feature
        with the value formatted to six decimals.
    """
    print(f"\n=== {title} ===")
    top = importances.head(top_n)
    if top.empty:
        # Nothing to align or print; max() over an empty index would raise.
        return

    # Stringify labels so integer or other non-string column names can be
    # measured and padded.
    names = [str(label) for label in top.index]
    width = max(map(len, names))
    for name, value in zip(names, top.values):
        print(f"{name.ljust(width)} : {value:.6f}")
# -------------------------------
# MAIN
# -------------------------------
def main():
    """Train both models from their CSV inputs and print metrics and top features.

    Reads ``merged_model_ready.csv`` from ``MERGE_DIR`` and
    ``truck_to_ship_fe.csv`` from ``FE_DIR`` (parsing ``week_start`` as a
    date in both), trains the production and truck-to-ship models, and prints
    a one-line metric summary plus the top-10 feature table for each.
    The shipping summary shows the mean of the per-fold CV metrics.
    """
    # Production
    production_frame = pd.read_csv(
        MERGE_DIR / "merged_model_ready.csv", parse_dates=["week_start"]
    )
    _, metrics_prod, imp_prod = train_production(production_frame)
    print(f"[Production] RMSE: {metrics_prod['rmse']:.2f} | R²: {metrics_prod['r2']:.4f}")
    display_top_features(imp_prod, title="Production Top-10 Features")

    # Truck-to-Ship
    shipping_frame = pd.read_csv(
        FE_DIR / "truck_to_ship_fe.csv", parse_dates=["week_start"]
    )
    _, metrics_ship, imp_ship = train_shipping(shipping_frame)
    print(f"\n[Truck-to-Ship] Avg RMSE: {np.mean(metrics_ship['rmse_cv']):.2f} | Avg R²: {np.mean(metrics_ship['r2_cv']):.4f}")
    display_top_features(imp_ship, title="Truck-to-Ship Top-10 Features")

# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[Production] RMSE: 609.66 | R²: 0.9037

=== Production Top-10 Features ===
target_ton           : 0.556003
progress_ratio       : 0.334496
differential         : 0.054442
utilization_pct      : 0.003069
planned_loading_ton  : 0.002690
temp_avg_c           : 0.002638
rain_peak_mm         : 0.002595
humidity_avg_pct     : 0.002425
realized_ton_roll_4w : 0.002078
tonnage_moved_ton    : 0.002031

[Truck-to-Ship] Avg RMSE: 636.11 | Avg R²: 0.8953

=== Truck-to-Ship Top-10 Features ===
weekly_ship_demand_ton        : 0.289152
weekly_trips_total            : 0.253040
supply_alignment_ratio        : 0.177648
truck_to_ship_utilization     : 0.152148
avg_cycle_time_weighted       : 0.127496
jetty_id_num                  : 0.000176
weekly_truck_supply_ton_lag1  : 0.000175
weekly_truck_supply_ton_roll4 : 0.000134
avg_cycle_time_min            : 0.000027
tonnage_moved_ton             : 0.000002
