# Step 5: Advanced Modeling & Optimization

**Objective:** Train multiple regression models, tune hyperparameters,
compare via cross-validation, and select the best model.

---

## 5.1 Setup

In [None]:
import sys
sys.path.insert(0, "..")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.models import train_model, tune_model, save_model
from src.evaluation import evaluate_model, print_metrics, cross_validate_model, build_comparison_table
from src.visualization import plot_actual_vs_predicted, plot_residuals

# Load the pre-split feature matrices and targets from ../data/processed.
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")
# squeeze("columns") collapses a single-column frame into a Series and, unlike a
# bare squeeze(), never collapses further to a scalar when a file holds one row.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_test  = pd.read_csv("../data/processed/y_test.csv").squeeze("columns")

# Fail fast on misaligned splits instead of deep inside model fitting.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

print(f"Train: {X_train.shape}  |  Test: {X_test.shape}")

## 5.2 Train Multiple Models

In [None]:
# Baseline candidates; each is fitted with whatever defaults train_model applies.
model_names = ["linear_regression", "ridge", "lasso", "decision_tree", "random_forest"]

# Two registries keyed by model name: fitted estimators and their test-set metrics.
# Later cells add more entries to both.
trained_models, results = {}, {}

for name in model_names:
    print(f"\n{'='*40}")
    fitted = train_model(name, X_train, y_train)
    # Metrics are computed on the held-out test split.
    test_metrics = evaluate_model(fitted, X_test, y_test)
    print_metrics(test_metrics)
    # Register only after training, evaluation and reporting all succeeded.
    trained_models[name], results[name] = fitted, test_metrics

In [None]:
# Optional: XGBoost (advanced)
# Only the training call sits inside `try`, so an ImportError raised later
# (during evaluation or reporting) is not misreported as "XGBoost not installed".
# NOTE(review): presumably train_model raises ImportError when the xgboost
# package is missing — confirm in src.models.
try:
    xgb_model = train_model("xgboost", X_train, y_train)
except ImportError:
    print("XGBoost not installed — skipping.")
else:
    xgb_metrics = evaluate_model(xgb_model, X_test, y_test)
    print_metrics(xgb_metrics)
    # Registering under "xgboost" is what enables the XGBoost tuning cell later on.
    trained_models["xgboost"] = xgb_model
    results["xgboost"] = xgb_metrics

## 5.3 Initial Comparison

In [None]:
# Build a comparison table from the per-model test metrics collected in `results`
# so far (baseline models only at this point), then show it as the cell output.
comparison_df = build_comparison_table(results)
comparison_df

## 5.4 Hyperparameter Tuning

Tune the two ensemble models — Random Forest and, if installed, XGBoost — using RandomizedSearchCV (`cv=5`, `n_iter=20`).

In [None]:
# Tune Random Forest
# The search only sees the training split (cv=5, n_iter=20); the test split is
# used afterwards, once, to score the model that tune_model returns.
# NOTE(review): presumably a randomized hyperparameter search — confirm in src.models.
print("Tuning Random Forest...")
best_rf = tune_model("random_forest", X_train, y_train, cv=5, n_iter=20)
rf_tuned_metrics = evaluate_model(best_rf, X_test, y_test)
print("\nTuned Random Forest:")
print_metrics(rf_tuned_metrics)
# Stored under a separate "_tuned" key so the untuned "random_forest" entry is kept.
trained_models["random_forest_tuned"] = best_rf
results["random_forest_tuned"] = rf_tuned_metrics

In [None]:
# Tune XGBoost (if available)
# An "xgboost" entry exists in trained_models only when the optional baseline
# XGBoost cell ran successfully, so this cell is skipped otherwise.
if "xgboost" in trained_models:
    print("Tuning XGBoost...")
    # Same search budget as the Random Forest tuning cell (cv=5, n_iter=20).
    best_xgb = tune_model("xgboost", X_train, y_train, cv=5, n_iter=20)
    xgb_tuned_metrics = evaluate_model(best_xgb, X_test, y_test)
    print("\nTuned XGBoost:")
    print_metrics(xgb_tuned_metrics)
    # Stored under a separate "_tuned" key so the untuned "xgboost" entry is kept.
    trained_models["xgboost_tuned"] = best_xgb
    results["xgboost_tuned"] = xgb_tuned_metrics

## 5.5 Cross-Validation

In [None]:
print("Cross-validation scores (R²):\n")

# One linear model and one tree ensemble, cross-validated on the training split only.
# NOTE(review): the header says R²; the actual scoring metric is chosen inside
# cross_validate_model — confirm in src.evaluation.
cv_targets = ("ridge", "random_forest")
cv_results = {}
for name in cv_targets:
    print(f"{name}:")
    fold_scores = cross_validate_model(trained_models[name], X_train, y_train, cv=5)
    cv_results[name] = fold_scores
    print()

## 5.6 Final Comparison Table

In [None]:
# Rebuild the comparison table now that `results` also holds the tuned models'
# test metrics, then show it as the cell output.
final_comparison = build_comparison_table(results)
print("\n=== Final Model Comparison ===")
final_comparison

## 5.7 Regularization & Bias-Variance Tradeoff

In [None]:
from pathlib import Path

from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Regularization strengths to sweep, spaced one decade apart for the log-scaled x-axis.
alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

ridge_train_scores, ridge_test_scores = [], []
lasso_train_scores, lasso_test_scores = [], []

# Fit one Ridge and one Lasso model per alpha and record R² on both splits.
for alpha in alphas:
    # Ridge
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    ridge_train_scores.append(r2_score(y_train, ridge.predict(X_train)))
    ridge_test_scores.append(r2_score(y_test, ridge.predict(X_test)))
    # Lasso (max_iter raised above the library default to give the solver more iterations)
    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train, y_train)
    lasso_train_scores.append(r2_score(y_train, lasso.predict(X_train)))
    lasso_test_scores.append(r2_score(y_test, lasso.predict(X_test)))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Both panels share the same layout, so draw them from one loop instead of
# duplicating the plotting code per model.
panels = [
    ("Ridge", ridge_train_scores, ridge_test_scores),
    ("Lasso", lasso_train_scores, lasso_test_scores),
]
for ax, (model_label, train_scores, test_scores) in zip(axes, panels):
    ax.semilogx(alphas, train_scores, "o-", label="Train")
    ax.semilogx(alphas, test_scores, "s--", label="Test")
    ax.set_title(f"{model_label}: Bias-Variance Tradeoff")
    ax.set_xlabel("Alpha (regularization strength)")
    ax.set_ylabel("R² Score")
    ax.legend()

fig.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
figure_path = Path("../reports/figures/bias_variance_tradeoff.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150, bbox_inches="tight")
plt.show()

## 5.8 Select & Save Best Model

In [None]:
# Pick the winner from the first row of the final comparison table.
# NOTE(review): assumes build_comparison_table returns rows sorted best-first and
# exposes a "Model" column whose values match the trained_models keys — confirm
# in src.evaluation.
best_name = final_comparison["Model"].iloc[0]
print(f"Best model: {best_name}")

# trained_models holds both the baseline and the "_tuned" estimators, so either
# kind of entry can be resolved here.
best_model = trained_models.get(best_name)
if best_model is None:
    # Fail loudly rather than persisting nothing (or the wrong object).
    raise ValueError(
        f"Model '{best_name}' not found in trained_models. "
        f"Available: {list(trained_models.keys())}"
    )

save_model(best_model, "../models/best_model.joblib")

---

## Step 5 Summary

### Model Comparison (Initial Run — with data leakage)

| Model | R² | MAE (BDT) | RMSE (BDT) |
|---|---|---|---|
| Random Forest | 1.0000 | 47.64 | 471.51 |
| Random Forest (tuned) | 1.0000 | 48.12 | 502.63 |
| Decision Tree | 0.9999 | 74.02 | 718.02 |
| XGBoost | 0.9997 | 394.00 | 1,482.20 |
| XGBoost (tuned) | 0.9996 | 313.53 | 1,554.41 |
| Linear Regression | 0.9969 | 1,703.62 | 4,554.05 |
| Ridge | 0.9969 | 1,705.40 | 4,554.06 |
| Lasso | 0.9969 | 1,702.12 | 4,556.44 |

**Best model selected:** Random Forest (R² = 1.0, MAE = 47.64 BDT).

### Analysis

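- **Data leakage dominates results.** The near-perfect scores across all models confirm that `Base Fare` and `Tax & Surcharge` leak the target variable. Tree-based models reach R² ≈ 1.0 (0.9996–1.0000) because their leaf splits can approximate the additive relationship between these columns and the target almost exactly; the small but non-zero MAE shows the fit is very close rather than exact.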
- **Regularization findings:** Ridge and Lasso bias-variance plots show flat R² across all alpha values (~0.997), indicating the linear relationship is so strong that regularization has no meaningful effect.
- **Tuning impact:** Hyperparameter tuning provides negligible improvement — the tuned Random Forest actually has slightly worse RMSE (502 vs 472 BDT), consistent with overfitting noise when the signal is trivially learnable.

### Concern & Fix Applied

The leaking columns (`Base Fare`, `Tax & Surcharge`) have been dropped from the preprocessing pipeline in `src/pipeline.py`. The table above predates this fix; re-running the pipeline and then this notebook will produce honest metrics reflecting genuine fare prediction difficulty.