# 03 - Model Training & Comparison

This notebook trains and compares multiple ML models:
1. Linear Regression (baseline)
2. Ridge Regression
3. Lasso Regression
4. Decision Tree
5. Random Forest
6. Gradient Boosting

Models are trained on log-transformed salary and evaluated in original dollar scale.
Hyperparameters are tuned with 5-fold cross-validation on the training set; the tuned models are then compared on the held-out validation set.

In [None]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)

## 1. Load Preprocessed Data

In [None]:
# Unscaled feature matrices (for tree-based models)
X_train, X_val, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_val.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)

# Log-scale targets (for training); squeeze() turns a one-column frame into a Series
y_train, y_val, y_test = (
    pd.read_csv('../data/processed/y_train.csv').squeeze(),
    pd.read_csv('../data/processed/y_val.csv').squeeze(),
    pd.read_csv('../data/processed/y_test.csv').squeeze(),
)

# Original-scale targets (for dollar-based evaluation)
y_train_original, y_val_original, y_test_original = (
    pd.read_csv('../data/processed/y_train_original.csv').squeeze(),
    pd.read_csv('../data/processed/y_val_original.csv').squeeze(),
    pd.read_csv('../data/processed/y_test_original.csv').squeeze(),
)

# Scaled feature matrices (for linear models)
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.read_csv('../data/processed/X_train_scaled.csv'),
    pd.read_csv('../data/processed/X_val_scaled.csv'),
    pd.read_csv('../data/processed/X_test_scaled.csv'),
)

# Quick look at what was loaded: split shapes and a few target values
print(
    f"Training set:   {X_train.shape}",
    f"Validation set: {X_val.shape}",
    f"Test set:       {X_test.shape}",
    sep="\n",
)
print(f"\nTarget is log-transformed salary (log1p). Sample y_train values:")
print(y_train.head())

## 2. Define Evaluation Functions

In [None]:
def evaluate_model(model, X_test, y_test_log, y_test_orig, model_name):
    """Score a log-target model on the dollar scale and print a short report.

    Predictions are produced in log space and mapped back with ``np.expm1``
    before MAE, RMSE and R² are computed against the original-scale targets.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test_log: Log-scale targets. Part of the signature, but not read
            by this function.
        y_test_orig: Original-scale targets; its ``.values`` attribute is used
            when present, otherwise the object is passed through unchanged.
        model_name: Label used in the printed report and the returned dict.

    Returns:
        dict with keys 'model', 'MAE', 'RMSE' and 'R2' (dollar-scale metrics).
    """
    # expm1 undoes the log1p transform of the target
    y_pred_dollars = np.expm1(model.predict(X_test))
    y_actual_dollars = getattr(y_test_orig, 'values', y_test_orig)

    mae = mean_absolute_error(y_actual_dollars, y_pred_dollars)
    mse = mean_squared_error(y_actual_dollars, y_pred_dollars)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_actual_dollars, y_pred_dollars)

    report = (
        f"\n{model_name} Results (dollar scale):",
        f"  MAE:  ${mae:,.2f}",
        f"  RMSE: ${rmse:,.2f}",
        f"  R²:   {r2:.4f}",
    )
    print(*report, sep="\n")

    return dict(model=model_name, MAE=mae, RMSE=rmse, R2=r2)


# Accumulates one metrics dict per evaluated model
results = []

## 3. Model 1: Linear Regression (Baseline)

Plain linear regression with no regularization serves as the baseline model that all other models are compared against.

In [None]:
print("=" * 60, "Training Linear Regression (Baseline)...", "=" * 60, sep="\n")

# Unregularized fit on the scaled features against the log-scale target
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

In [None]:
# Score the baseline on the validation split
lr_results = evaluate_model(
    model=lr_model,
    X_test=X_val_scaled,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Linear Regression",
)
results.append(lr_results)

# Keep the baseline metrics around for the later comparison table
baseline_mae, baseline_rmse, baseline_r2 = (
    lr_results[metric] for metric in ('MAE', 'RMSE', 'R2')
)
print(f"\n>>> Baseline MAE: ${baseline_mae:,.2f} — all models compared against this.")

## 4. Model 2: Ridge Regression

In [None]:
print("=" * 60, "Training Ridge Regression...", "=" * 60, sep="\n")

# Candidate L2 penalty strengths
ridge_params = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

# Exhaustive 5-fold search; the MAE score is computed on the log-scale target
ridge_gs = GridSearchCV(
    estimator=Ridge(),
    param_grid=ridge_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=1,
)
ridge_gs.fit(X_train_scaled, y_train)

print(f"Best alpha: {ridge_gs.best_params_['alpha']}")

# Keep the best estimator only; the search object is discarded
ridge_model = ridge_gs.best_estimator_
del ridge_gs

In [None]:
# Score the tuned Ridge model on the validation split
ridge_results = evaluate_model(
    model=ridge_model,
    X_test=X_val_scaled,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Ridge Regression",
)
results.append(ridge_results)

## 5. Model 3: Lasso Regression

In [None]:
print("=" * 60, "Training Lasso Regression...", "=" * 60, sep="\n")

# Candidate L1 penalty strengths
lasso_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}

# Exhaustive 5-fold search; max_iter is set to 10000 on the base estimator
lasso_gs = GridSearchCV(
    estimator=Lasso(max_iter=10000),
    param_grid=lasso_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=1,
)
lasso_gs.fit(X_train_scaled, y_train)

print(f"Best alpha: {lasso_gs.best_params_['alpha']}")

# Keep the best estimator only; the search object is discarded
lasso_model = lasso_gs.best_estimator_
del lasso_gs

In [None]:
# Score the tuned Lasso model on the validation split
lasso_results = evaluate_model(
    model=lasso_model,
    X_test=X_val_scaled,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Lasso Regression",
)
results.append(lasso_results)

## 6. Model 4: Decision Tree

In [None]:
print("=" * 60, "Training Decision Tree...", "=" * 60, sep="\n")

# Depth and node-size limits to search over
dt_params = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
)

# Exhaustive 5-fold search on the unscaled features
dt_gs = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=1,
)
dt_gs.fit(X_train, y_train)

print(f"Best parameters: {dt_gs.best_params_}")

# Keep the best estimator only; the search object is discarded
dt_model = dt_gs.best_estimator_
del dt_gs

In [None]:
# Score the tuned tree on the validation split (unscaled features)
dt_results = evaluate_model(
    model=dt_model,
    X_test=X_val,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Decision Tree",
)
results.append(dt_results)

## 7. Model 5: Random Forest

In [None]:
print("=" * 60, "Training Random Forest...", "=" * 60, sep="\n")

# Forest size and per-tree limits to sample from
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomized 5-fold search: 20 sampled configurations, seeded for repeatability
rf_rs = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=rf_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=1,
    random_state=42,
)
rf_rs.fit(X_train, y_train)

print(f"Best parameters: {rf_rs.best_params_}")

# Keep the best estimator only; the search object is discarded
rf_model = rf_rs.best_estimator_
del rf_rs

In [None]:
# Score the tuned forest on the validation split (unscaled features)
rf_results = evaluate_model(
    model=rf_model,
    X_test=X_val,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Random Forest",
)
results.append(rf_results)

In [None]:
## 8. Model 6: Gradient Boosting

In [None]:
print("=" * 60, "Training Gradient Boosting...", "=" * 60, sep="\n")

# Boosting, subsampling and per-tree settings to sample from
gb_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomized 5-fold search: 20 sampled configurations, seeded for repeatability
gb_rs = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=gb_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=1,
    random_state=42,
)
gb_rs.fit(X_train, y_train)

print(f"Best parameters: {gb_rs.best_params_}")

# Keep the best estimator only; the search object is discarded
gb_model = gb_rs.best_estimator_
del gb_rs

In [None]:
# Score the tuned booster on the validation split (unscaled features)
gb_results = evaluate_model(
    model=gb_model,
    X_test=X_val,
    y_test_log=y_val,
    y_test_orig=y_val_original,
    model_name="Gradient Boosting",
)
results.append(gb_results)

In [None]:
# One row per model, best (lowest) MAE first, plus deltas against the
# linear-regression baseline. Positive MAE_improvement means lower error.
results_df = (
    pd.DataFrame(results)
    .sort_values('MAE')
    .assign(
        MAE_improvement=lambda frame: baseline_mae - frame['MAE'],
        MAE_improvement_pct=lambda frame: (frame['MAE_improvement'] / baseline_mae) * 100,
        R2_improvement=lambda frame: frame['R2'] - baseline_r2,
    )
)

print("\n" + "=" * 70, "MODEL COMPARISON (sorted by MAE, dollar scale)", "=" * 70, sep="\n")
print(results_df[['model', 'MAE', 'RMSE', 'R2']].to_string(index=False))

print("\n" + "=" * 70, "IMPROVEMENT vs LINEAR REGRESSION BASELINE", "=" * 70, sep="\n")
improvement_cols = ['model', 'MAE_improvement', 'MAE_improvement_pct', 'R2_improvement']
print(results_df[improvement_cols].to_string(index=False, float_format='%.2f'))

In [None]:
# Make sure the output directory exists (no error if it is already there).
# `os` is imported in the notebook's top import cell.
os.makedirs('../models', exist_ok=True)

# Fitted estimators keyed by the file-name stem they are saved under
models_dict = {
    'linear_regression': lr_model,
    'ridge': ridge_model,
    'lasso': lasso_model,
    'decision_tree': dt_model,
    'random_forest': rf_model,
    'gradient_boosting': gb_model,
}

for name, model in models_dict.items():
    # Build the path once so the log line always matches the file written
    model_path = f'../models/{name}_model.joblib'
    joblib.dump(model, model_path)
    print(f"Saved: {model_path}")

# Save comparison results
results_df.to_csv('../models/model_comparison.csv', index=False)
print("\nSaved: ../models/model_comparison.csv")

In [None]:
# results_df is sorted by ascending MAE, so its first row is the winner
best_model_name, best_mae, best_r2 = results_df.iloc[0][['model', 'MAE', 'R2']]

print(f"\n{'='*60}")
print(f"BEST MODEL: {best_model_name}")
print(f"  MAE:  ${best_mae:,.2f}")
print(f"  R²:   {best_r2:.4f}")
print(f"{'='*60}")

## Summary

### Models Trained (in order):
1. **Linear Regression** - Baseline model (no regularization)
2. **Ridge Regression** - L2 regularization
3. **Lasso Regression** - L1 regularization (feature selection)
4. **Decision Tree** - Single tree with GridSearchCV tuning
5. **Random Forest** - Ensemble of decision trees
6. **Gradient Boosting** - Sequential boosting (sklearn)

### Key Outputs:
- All models evaluated on **validation set** (dollar scale via `expm1` inverse transform)
- **Baseline comparison table** showing MAE/R² improvement vs Linear Regression
- **5-fold cross-validation** (MAE on the log-salary scale) used within each hyperparameter search; the per-fold scores are not retained after tuning
- All 6 trained models saved to `../models/`

### Next Steps:
- See `04_evaluation.ipynb` for detailed evaluation, feature importance, SHAP, and business logic checks