In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [8]:
print("Model 3: XGBoost Regressor")

ml_data = pd.read_csv(r"C:/Users/Elias/Final Project/Cleaned output data files/ml_data.csv")

# 80-20 Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


# Define parameter grid for tuning
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Create and train model with GridSearch
xgb_model = XGBRegressor(random_state=42, n_jobs=-1)
xgb_grid = GridSearchCV(xgb_model, xgb_param_grid,
                       cv=5, scoring='r2',
                       n_jobs=-1, verbose=1)

print("Hyperparameter tuning in progress...")
xgb_grid.fit(X_train, y_train)

# Get best model
xgb_best = xgb_grid.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)

Model 3: XGBoost Regressor
Hyperparameter tuning in progress...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [9]:
# Evaluation
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
cv_xgb = cross_val_score(xgb_best, X_train, y_train, cv=5, scoring='r2').mean()

print("\nXGBoost Model Trained")
print(f"   Best Parameters: {xgb_grid.best_params_}")
print(f"   R² Score: {r2_xgb:.4f}")
print(f"   MAE: {mae_xgb:.4f} percentage points")
print(f"   RMSE: {rmse_xgb:.4f} percentage points")
print(f"   5-Fold CV R²: {cv_xgb:.4f}")
print(f"   Improvement over baseline: +{r2_xgb - r2_baseline:.4f} R²")

# Feature Importance (XGBoost)
xgb_feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_best.feature_importances_,
    'Percent': (xgb_best.feature_importances_ * 100).round(2)
}).sort_values('Importance', ascending=False)

print("\nXGBoost Feature Importance:")
print(xgb_feature_importance.to_string(index=False))


XGBoost Model Trained
   Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
   R² Score: 0.7016
   MAE: 0.9002 percentage points
   RMSE: 1.1819 percentage points
   5-Fold CV R²: 0.6618


NameError: name 'r2_baseline' is not defined