In [None]:
# Step 0: Random Forest Regressor
# Purpose: Capture non-linear relationships and assess feature importance
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter -- it also hides deprecation and
# convergence warnings raised by the modelling libraries.
warnings.simplefilter('ignore')

# Plot appearance. Order matters: the style sheet is applied first, then the
# seaborn palette is set on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette="husl")

In [None]:
# Load the cleaned modelling dataset into `df`.
# The location can be overridden with the ML_DATA_PATH environment variable so
# the notebook is not tied to one machine; when the variable is unset the
# original hard-coded path is used unchanged.
DATA_PATH = os.environ.get(
    "ML_DATA_PATH",
    r"C:/Users/Elias/Final Project/Cleaned output data files/ml_data.csv",
)
df = pd.read_csv(DATA_PATH)

# Print the shape as text and leave the preview as the cell's last expression
# so Jupyter renders it as a table rather than as part of a tuple repr.
print(f"Shape: {df.shape}")
df.head()

In [None]:
# Model 2: Random Forest Regressor.
# Tunes hyperparameters with a 5-fold grid search, evaluates the best model on
# the hold-out split, and reports the forest's feature importances.
#
# Inputs expected from earlier cells (none are defined in this cell):
#   X_train, y_train, X_test, y_test -- train / hold-out split
#   r2_baseline -- presumably the R² of an earlier baseline model; TODO confirm
#   X           -- only used as a fallback source of feature names

# Fail fast on a fresh kernel: without this check a missing `r2_baseline`
# would only surface after the whole grid search has finished.
_required_names = ("X_train", "y_train", "X_test", "y_test", "r2_baseline")
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the cells that define these variables first: "
        + ", ".join(_missing_names)
    )

print("Model 2: Random Forest Regressor")

print("Training Random Forest Model...")

# Define parameter grid for tuning
# 3 * 4 * 3 * 3 = 108 candidates, each fitted on 5 folds (540 fits + 1 refit).
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create and train model with GridSearch
# NOTE(review): both the forest and the search request all cores
# (n_jobs=-1); consider n_jobs=1 on the forest if the machine oversubscribes.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_grid = GridSearchCV(rf_model, rf_param_grid,
                      cv=5, scoring='r2',
                      n_jobs=-1, verbose=1)

print("Hyperparameter tuning in progress...")
rf_grid.fit(X_train, y_train)

# Get best model (already refit on the full training split by GridSearchCV)
rf_best = rf_grid.best_estimator_
y_pred_rf = rf_best.predict(X_test)

# Evaluation on the hold-out split
r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
# The grid search already computed the mean 5-fold R² of the winning
# parameters (same cv=5 splits, same scorer, same random_state), so reuse it
# instead of refitting five more forests with cross_val_score.
cv_rf = rf_grid.best_score_

print("\n Random Forest Model Trained")
print(f"   Best Parameters: {rf_grid.best_params_}")
print(f"   R² Score: {r2_rf:.4f}")
print(f"   MAE: {mae_rf:.4f} percentage points")
print(f"   RMSE: {rmse_rf:.4f} percentage points")
print(f"   5-Fold CV R²: {cv_rf:.4f}")
# The '+' format flag prints the correct sign; a literal "+" prefix would
# render a negative difference as "+-0.0123".
print(f"   Improvement over baseline: {r2_rf - r2_baseline:+.4f} R²")

# Feature Importance
# Prefer the names recorded by the fitted model so labels are guaranteed to
# line up with `feature_importances_`; fall back to X.columns when the model
# was fitted on data without column names.
rf_feature_names = getattr(rf_best, "feature_names_in_", None)
if rf_feature_names is None:
    rf_feature_names = X.columns

rf_feature_importance = pd.DataFrame({
    'Feature': rf_feature_names,
    'Importance': rf_best.feature_importances_,
    'Percent': (rf_best.feature_importances_ * 100).round(2)
}).sort_values('Importance', ascending=False)

print("\n📊 Random Forest Feature Importance:")
print(rf_feature_importance.to_string(index=False))