# 03_Modeling_XGBoost

Train an XGBoost regressor on the log-scale sale price, tune it with GridSearchCV, evaluate it on a hold-out test set and with 5-fold cross-validation, and save the model. Also includes a SHAP analysis.

In [None]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Read the prepared training table, then separate the target column from the features
proc = pd.read_csv('../data/processed_train_prepared.csv')
y = proc['SalePrice_log']
X = proc.drop(columns=['SalePrice_log'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape)

In [None]:
## Simple baseline XGBoost training
# Fit with default hyperparameters to get a reference score before any tuning.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

pred = model.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Baseline RMSE (log-scale):', rmse)

In [None]:
## GridSearchCV for hyperparameter tuning (small grid for demo)
# Two values per hyperparameter -> 8 candidate settings, each scored with 3-fold CV
param_grid = dict(
    n_estimators=[200, 500],
    learning_rate=[0.03, 0.05],
    max_depth=[3, 5],
)

grid = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
)
grid.fit(X_train, y_train)

# The scoring metric is the negated RMSE, so flip the sign back when reporting it
print('Best params:', grid.best_params_)
print('Best CV score:', -grid.best_score_)

In [None]:
## Evaluate best model on test set
# Score the estimator selected by the grid search on the held-out rows.
best = grid.best_estimator_
pred = best.predict(X_test)
# sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4 and
# removed in 1.6. float() yields a plain Python number so json.dump can serialise it.
rmse_test = float(np.sqrt(mean_squared_error(y_test, pred)))
print('Test RMSE (log-scale):', rmse_test)

# Save model
# Create the output directory first so the writes below do not fail on a fresh checkout.
Path('../models').mkdir(parents=True, exist_ok=True)
best.save_model('../models/xgboost_model.json')
with open('../models/metrics.json', 'w') as f:
    json.dump({'rmse_log': rmse_test}, f)
print('Saved model and metrics.')

In [None]:
## Cross-validation on full dataset
# Scores come back as negated RMSE, one per fold; negate the mean to report a positive RMSE
cv_scores = cross_val_score(
    estimator=best,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
print('5-fold CV RMSE (log-scale):', -cv_scores.mean())

In [None]:
## SHAP analysis (summary plot)
import shap

# Rebuild the regressor from the JSON file saved earlier in this notebook
model = xgb.XGBRegressor()
model.load_model('../models/xgboost_model.json')

# Cap the number of rows at 2000 so the explainer stays fast
n_rows = min(2000, len(X))
sample = X.sample(n=n_rows, random_state=42)

# The same sample is handed to the explainer constructor and then explained
explainer = shap.Explainer(model, sample)
shap_values = explainer(sample)

# Summary plot
shap.summary_plot(shap_values, sample)

**Next steps / improvements:**

- Expand hyperparameter search (RandomizedSearchCV or Optuna).
- Use scikit-learn Pipeline + ColumnTransformer to avoid data leakage.
- Add feature interactions and target encoding where appropriate.
