# 02_hybrid_modeling

**Purpose:** Load preprocessed features, split chronologically, train a baseline hybrid pipeline (classifier + regressor), evaluate, and save final artifacts.


In [None]:
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error
from sklearn.calibration import CalibratedClassifierCV

# Config
# --- Modelling constants ---
RANDOM_STATE = 42  # seed handed to every estimator
THRESHOLD = 50.0  # target values strictly above this are labelled as events
SPLIT_DATE = pd.Timestamp("2025-02-15 23:00:00+00:00")  # last timestamp kept in the training set

# --- Input / output locations ---
PROCESSED_PATH = Path("processed/features_ready.csv")
ARTIFACTS_DIR = Path("final_artifacts")
MODELS_DIR = ARTIFACTS_DIR.joinpath("models")
PLOTS_DIR = ARTIFACTS_DIR.joinpath("plots")

# Create the output tree up front so later cells can write into it directly.
ARTIFACTS_DIR.mkdir(exist_ok=True)
for _out_dir in (MODELS_DIR, PLOTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Load the preprocessed feature table; the first CSV column becomes the
# (date-parsed) index.
df = pd.read_csv(PROCESSED_PATH, index_col=0, parse_dates=True)

# Normalise the index to tz-aware UTC so it can be compared with the tz-aware
# SPLIT_DATE. A tz-naive index has to be *localized* (tz_convert raises
# TypeError on naive timestamps); an index that already carries a timezone is
# converted to UTC.
# NOTE(review): naive timestamps are assumed to already be in UTC - confirm
# against the preprocessing notebook.
if df.index.tz is None:
    df.index = df.index.tz_localize("UTC")
else:
    df.index = df.index.tz_convert("UTC")

print("Loaded processed features:", df.shape)
df.head()


In [None]:
# Prepare X,y
# The target is 'Up_next_hour'; it and 'Up' are both kept out of the feature matrix.
exclude = ['Up_next_hour', 'Up']
feature_cols = df.columns[~df.columns.isin(exclude)].tolist()
X = df.loc[:, feature_cols].copy()
y = df.loc[:, 'Up_next_hour'].copy()

# Chronological split: rows stamped at or before SPLIT_DATE form the training
# set, everything later is held out for testing.
train_mask = X.index <= SPLIT_DATE
X_train = X.loc[train_mask]
X_test = X.loc[~train_mask]
y_train = y.loc[train_mask]
y_test = y.loc[~train_mask]
print("Train/Test shapes:", X_train.shape, X_test.shape)

# Event labels: 1 where the target is strictly above THRESHOLD, else 0.
y_train_event = y_train.gt(THRESHOLD).astype(int)
y_test_event = y_test.gt(THRESHOLD).astype(int)
print("Event rate (train/test):", y_train_event.mean(), y_test_event.mean())


## Baseline models
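We will use:
- Classifier: `xgboost.XGBClassifier` (if xgboost is installed), otherwise `RandomForestClassifier`
- Regressor: `xgboost.XGBRegressor` (if xgboost is installed), otherwise `RandomForestRegressor`

The regressor is trained only on event rows (training rows whose target exceeds `THRESHOLD`).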


In [None]:
# Event classifier: prefer XGBoost; if importing or constructing it fails for
# any reason, fall back to a scikit-learn random forest.
try:
    import xgboost as xgb

    clf_model = xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        use_label_encoder=False,
        eval_metric='logloss',
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    print("Using XGBoostClassifier")
except Exception:
    from sklearn.ensemble import RandomForestClassifier

    clf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    print("Using RandomForestClassifier")

# Median imputation runs inside the pipeline, so the saved artifact carries
# its own missing-value handling.
clf_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", clf_model),
]
clf_pipe = Pipeline(clf_steps)
clf_pipe.fit(X_train, y_train_event)

# Persist the fitted pipeline next to the other model artifacts.
joblib.dump(clf_pipe, MODELS_DIR / "classifier_pipe.joblib")
print("Classifier trained and saved.")


In [None]:
# Evaluate classifier on test
# Second predict_proba column is taken as the event probability
# (assumes both classes 0 and 1 were present in the training labels).
y_proba = clf_pipe.predict_proba(X_test)[:, 1]
y_pred_label = (y_proba > 0.5).astype(int)

print("Classification report (threshold=0.5):")
print(classification_report(y_test_event, y_pred_label, digits=4))

# Compute the AUC once and reuse it for both the printout and the plot label.
# The built-in round() is used instead of the NumPy-scalar method .round(),
# which does not exist when the score comes back as a plain Python float.
roc_auc = float(roc_auc_score(y_test_event, y_proba))
print("ROC-AUC:", round(roc_auc, 4))

# ROC plot
fpr, tpr, _ = roc_curve(y_test_event, y_proba)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
plt.plot([0, 1], [0, 1], 'k--', alpha=0.3)  # chance diagonal for reference
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC")
plt.legend()
plt.grid(alpha=0.2)
plt.tight_layout()
plt.savefig(PLOTS_DIR / "roc_classifier.png")
plt.close()


In [None]:
# Magnitude regressor: prefer XGBoost; if importing or constructing it fails
# for any reason, fall back to a scikit-learn random forest.
try:
    import xgboost as xgb

    reg_model = xgb.XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    print("Using XGBoostRegressor")
except Exception:
    from sklearn.ensemble import RandomForestRegressor

    reg_model = RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    print("Using RandomForestRegressor")

# The regressor only ever sees training rows whose target exceeds THRESHOLD.
event_mask = y_train > THRESHOLD
print("Event rows for training regressor:", event_mask.sum())
if not event_mask.any():
    raise RuntimeError("No event rows in train set to train regressor on.")

reg_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("reg", reg_model),
]
reg_pipe = Pipeline(reg_steps)
X_train_events = X_train.loc[event_mask]
y_train_events = y_train.loc[event_mask]
reg_pipe.fit(X_train_events, y_train_events)

# Persist the fitted pipeline next to the classifier artifact.
joblib.dump(reg_pipe, MODELS_DIR / "regressor_pipe.joblib")
print("Regressor trained on event rows and saved.")


In [None]:
# Prob-weighted hybrid
# Final prediction = P(event) * regressor output, floored at zero.
reg_pred_all = reg_pipe.predict(X_test)
final_pred_pw = np.maximum(y_proba * reg_pred_all, 0.0)

# Evaluate
mae_pw = mean_absolute_error(y_test, final_pred_pw)
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases.
rmse_pw = float(np.sqrt(mean_squared_error(y_test, final_pred_pw)))
print("Hybrid (prob-weighted) MAE:", mae_pw, "RMSE:", rmse_pw)

# MAE on event hours
event_mask_test = y_test > THRESHOLD
if event_mask_test.any():
    # final_pred_pw is a plain NumPy array, so index it with a positional
    # boolean ndarray rather than the label-indexed pandas Series.
    event_positions = event_mask_test.to_numpy()
    mae_event = mean_absolute_error(y_test[event_mask_test], final_pred_pw[event_positions])
    print("MAE on true event hours:", mae_event)


In [None]:
# Save predictions
# One row per test timestamp: true value, event probability, hybrid prediction.
preds = pd.DataFrame(
    {
        "y_true": y_test.values,
        "p_event": y_proba,
        "pred_prob_weighted": final_pred_pw,
    },
    index=X_test.index.rename("datetime"),
)
predictions_path = ARTIFACTS_DIR / "hybrid_predictions.csv"
preds.to_csv(predictions_path)
print("Saved hybrid predictions to:", predictions_path)

# Pred vs actual plot
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(preds.index, preds["y_true"], label="Actual", linewidth=1.2)
ax.plot(preds.index, preds["pred_prob_weighted"], label="Predicted (prob-weighted)", alpha=0.9)
ax.set_title("Hybrid Predicted vs Actual (test)")
ax.legend()
ax.grid(alpha=0.2)
fig.tight_layout()
fig.savefig(PLOTS_DIR / "hybrid_pred_vs_actual.png")
plt.close(fig)

# Error histogram (actual minus predicted)
errors = preds["y_true"] - preds["pred_prob_weighted"]
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(errors, bins=50)
ax.set_title("Error distribution")
fig.tight_layout()
fig.savefig(PLOTS_DIR / "hybrid_error_hist.png")
plt.close(fig)


## Next steps
- Use Notebook 3 to benchmark other classifier/regressor combos and optimize pipelines.
- Consider calibrating classifier probabilities and using log1p transform for regressor target.
