In [None]:
import pandas as pd
import numpy as np
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# When the kernel was started inside the "notebooks_clean" folder, step up one
# level so the relative "datasets/" and "models/" paths used below resolve.
launch_dir_name = os.path.basename(os.getcwd())
if launch_dir_name == "notebooks_clean":
    os.chdir(os.pardir)
print(f"Current Working Directory: {os.getcwd()}")

# Output folder for the trained models; exist_ok makes re-running this cell a no-op
os.makedirs("models", exist_ok=True)

## **1. Load Data & Define Target**

In [None]:
df = pd.read_csv("datasets/suro_dataset_final.csv")

# Define Congestion Classes (Quantile-based)
# Total traffic per row; its 33rd / 66th percentiles are the class cut points.
# NOTE(review): the cut points are computed over the whole frame, including the
# rows that later become the test split — confirm this is intended.
df['traffic_sum'] = df['inbound'] + df['outbound']
q1 = df['traffic_sum'].quantile(0.33)
q2 = df['traffic_sum'].quantile(0.66)

def congestion_label(x):
    """Map a single traffic total to a congestion class using the q1/q2 cut points.

    Returns 0 (Low) when x <= q1, 1 (Moderate) when q1 < x <= q2, else 2 (High).
    Kept for scalar lookups; the column below is labelled in one vectorized pass.
    """
    if x <= q1: return 0   # Low
    elif x <= q2: return 1 # Moderate
    else: return 2         # High

# Vectorized equivalent of df['traffic_sum'].apply(congestion_label):
# np.select takes the first condition that matches, mirroring the if/elif chain.
traffic = df['traffic_sum']
df['congestion_class'] = np.select(
    [(traffic <= q1).to_numpy(), (traffic <= q2).to_numpy()],
    [0, 1],
    default=2,
).astype('int64')

# A missing inbound/outbound count makes traffic_sum NaN. Every comparison with
# NaN is False, so such rows fall through to the "High" default above without
# having a real label. Keep them out of the congestion training/evaluation data.
has_traffic = traffic.notna()
n_unlabelled = int((~has_traffic).sum())
if n_unlabelled:
    print(f"Excluding {n_unlabelled} rows with missing traffic counts from congestion modeling")

# Features for Modeling
features = [
    "latitude", "longitude", "hour", "day", 
    "temperature_2m", "precipitation", "rain", "wind_speed_10m",
    "incident_flag"
]

X = df.loc[has_traffic, features]
y = df.loc[has_traffic, 'congestion_class']

# Time-based Split (80/20)
# NOTE(review): this is a positional split (first 80% of rows vs. last 20%); it is
# only time-based if the CSV rows are stored in chronological order — confirm.
split_idx = int(0.8 * len(X))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"Training Samples: {len(X_train)}, Test Samples: {len(X_test)}")

## **2. Train Hierarchical Congestion Model**
**Strategy:**
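- **Stage 1:** Binary classifier that separates "Moderate" (class 1) from the rest.
- **Stage 2:** Binary classifier that separates "Low" (class 0) from "High" (class 2), trained only on non-moderate samples.
- **Combination:** A sample is labeled Moderate when Stage 1's predicted probability exceeds the decision threshold; otherwise it takes Stage 2's Low/High prediction.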

In [None]:
# --- Stage 1: Moderate vs. Rest ---
print("Training Stage 1 (Moderate Specialist)...")
# Binary target: 1 where the row is Moderate (class 1), 0 for every other class
y_train_s1 = y_train.eq(1).astype(int)
stage1_params = dict(
    max_depth=20,
    learning_rate=0.15,
    max_iter=400,
    class_weight="balanced",
    random_state=42,
)
clf_stage1 = HistGradientBoostingClassifier(**stage1_params)
clf_stage1.fit(X_train, y_train_s1)

# --- Stage 2: Low vs. High ---
print("Training Stage 2 (Extreme Generalist)...")
# Stage 2 only ever sees the non-Moderate training rows, with their original 0/2 labels
mask_extreme = y_train.ne(1)
X_train_s2, y_train_s2 = X_train[mask_extreme], y_train[mask_extreme]

stage2_params = dict(
    max_depth=25,
    learning_rate=0.2,
    max_iter=450,
    class_weight="balanced",
    random_state=42,
)
clf_stage2 = HistGradientBoostingClassifier(**stage2_params)
clf_stage2.fit(X_train_s2, y_train_s2)

# --- Evaluation ---
print("Evaluating Hierarchical Model...")
# Stage 1 contributes the probability of its positive (Moderate) label;
# Stage 2 contributes a hard Low/High prediction for every test row.
probs_s1 = clf_stage1.predict_proba(X_test)[:, 1]
preds_s2 = clf_stage2.predict(X_test)

# 0.54 cut-off: per the original note, tuned for recall of the Moderate class.
# NOTE(review): the tuning data is not visible here — confirm it was not the test split.
moderate_threshold = 0.54
is_moderate = probs_s1 > moderate_threshold
# Moderate wherever Stage 1 clears the threshold, otherwise Stage 2's answer
final_preds = np.where(is_moderate, 1, preds_s2)

print(classification_report(y_test, final_preds))

# Save Models
for fitted_model, target_path in (
    (clf_stage1, 'models/congestion_stage1.pkl'),
    (clf_stage2, 'models/congestion_stage2.pkl'),
):
    joblib.dump(fitted_model, target_path)
print("Congestion models saved to models/")

## **3. Train Incident Prediction Model**
Predict the likelihood of an incident based on location, time, and weather.

In [None]:
# Target: incident_flag
y_inc = df['incident_flag']
# Candidate features: every column except the target and the traffic columns.
# errors='ignore' keeps the drop tolerant of any listed column being absent.
# NOTE(review): any other column present in the CSV becomes a model input —
# confirm none of them encodes the incident outcome (target leakage).
X_inc = df.drop(
    ['congestion_class', 'incident_flag', 'traffic_sum', 'inbound', 'outbound'],
    axis=1, errors='ignore'
)

# HistGradientBoostingClassifier cannot be fitted on raw text/datetime columns,
# so keep only numeric and boolean features and report anything that is skipped.
numeric_cols = X_inc.select_dtypes(include=['number', 'bool']).columns
skipped_cols = [col for col in X_inc.columns if col not in numeric_cols]
if skipped_cols:
    print(f"Skipping non-numeric columns: {skipped_cols}")
    X_inc = X_inc[numeric_cols]
# Make the (otherwise implicit) input schema of the saved model visible
print(f"Incident features ({X_inc.shape[1]}): {list(X_inc.columns)}")

# Stratified Split (Incidents are rare)
# stratify=y_inc keeps the incident rate the same in the train and test parts.
X_train_inc, X_test_inc, y_train_inc, y_test_inc = train_test_split(
    X_inc, y_inc, test_size=0.2, random_state=42, stratify=y_inc
)

print("Training Incident Model...")
model_inc = HistGradientBoostingClassifier(
    max_depth=20, learning_rate=0.15, max_iter=350, 
    class_weight="balanced", random_state=42
)
model_inc.fit(X_train_inc, y_train_inc)

# Evaluation
y_pred_inc = model_inc.predict(X_test_inc)
print(classification_report(y_test_inc, y_pred_inc))

# Save Model
joblib.dump(model_inc, 'models/incident_model.pkl')
print("Incident model saved to models/incident_model.pkl")