### 3. Polynomial Features with Random Forest and Randomised Search

#### Setup with Libraries and Utilities

In [None]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import time
from sklearn.pipeline import make_pipeline

from utils.metrics import kpi_ML
from utils.normalisation import normalise_data

import warnings
warnings.filterwarnings("ignore")

#### Loading Training and Test Data Splits

In [None]:
# Feature matrices are kept as DataFrames so their column names stay available.
X_train = pd.read_csv('../data_splits/X_train.csv')
X_test = pd.read_csv('../data_splits/X_test.csv')

# Targets are read as single-file frames and flattened to 1-D arrays, the
# shape scikit-learn estimators expect for `y`.
y_train = pd.read_csv('../data_splits/y_train.csv').to_numpy().ravel()
y_test = pd.read_csv('../data_splits/y_test.csv').to_numpy().ravel()

#### Normalise the Data

In [None]:
# normalise_data is a project helper (utils.normalisation) that returns the
# scaled train and test feature sets as a pair.
# NOTE(review): presumably the scaler is fitted on the training split only —
# confirm in utils/normalisation.py.
scaled_splits = normalise_data(X_train, X_test)
X_train_scaled, X_test_scaled = scaled_splits

#### Randomised Search for Optimising Polynomial Random Forest Regression Parameters

In [None]:
# Search space for the random-forest step of the pipeline. Keys follow the
# "<step name>__<parameter>" convention; make_pipeline names each step after
# the lower-cased class name of the estimator.
rf_param_grid_poly = {
    'randomforestregressor__n_estimators': [100, 200, 300],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; on current
    # versions every sampled candidate using it fails to fit and is scored as
    # NaN. For a regressor 'auto' meant "use all features", which is exactly
    # what the float 1.0 selects (and 1.0 is also accepted by older releases).
    'randomforestregressor__max_features': [1.0, 'sqrt'],
    'randomforestregressor__max_depth': [10, 20, None],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__bootstrap': [True, False]
}

# Degree-2 interaction terms (no squared terms) feed the random forest.
rf_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, interaction_only=True),
    RandomForestRegressor(random_state=42),
)

# 50 sampled candidates x 3 CV folds, evaluated in parallel on all cores.
rf_random_search_poly = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_grid_poly,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# perf_counter is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments during a long search.
start_time_rf3 = time.perf_counter()
rf_random_search_poly.fit(X_train_scaled, y_train)
training_time_rf3 = (time.perf_counter() - start_time_rf3) / 60  # minutes

best_rf_poly = rf_random_search_poly.best_estimator_
print("Best parameters found: ", rf_random_search_poly.best_params_)
best_params_path = '../model_metrics/statistical_models/best_params_poly_rf.json'
best_params_tuned_rf_poly = rf_random_search_poly.best_params_

# Persisting the parameters is best-effort: a failure is reported but must not
# discard the fitted search. The output directory is created first, otherwise
# the write fails on a fresh checkout where the folder does not exist yet.
try:
    os.makedirs(os.path.dirname(best_params_path), exist_ok=True)
    with open(best_params_path, 'w', encoding='utf-8') as file:
        json.dump(best_params_tuned_rf_poly, file)
    print(f"Best parameters saved to {best_params_path}")
except (OSError, TypeError, ValueError) as e:
    # OSError: filesystem problems; TypeError/ValueError: JSON serialisation.
    print(f"Error saving best parameters: {e}")

#### Model Evaluation with Metrics

In [None]:
# Predict on both splits with the best pipeline found by the search; the
# pipeline applies the polynomial expansion itself before the forest.
Y_train_pred_poly = best_rf_poly.predict(X_train_scaled)
Y_test_pred_poly = best_rf_poly.predict(X_test_scaled)

# kpi_ML is a project helper (utils.metrics); its result is used below as a
# table that supports column assignment and `to_csv`.
metrics_tuned_rf_poly = kpi_ML(
    y_train,
    Y_train_pred_poly,
    y_test,
    Y_test_pred_poly,
    "Tuned Random Forest Regressor with Polynomial Features",
)
# Record the search duration (in minutes, one decimal) next to the metrics.
metrics_tuned_rf_poly['Training Time rf3 (min)'] = round(training_time_rf3, 1)

print(metrics_tuned_rf_poly)

# Write the metrics table to disk, creating the results folder if needed.
results_dir_poly = '../model_metrics/statistical_models'
os.makedirs(results_dir_poly, exist_ok=True)
results_path_poly = os.path.join(results_dir_poly, 'results_tuned_rf_poly.csv')
metrics_tuned_rf_poly.to_csv(results_path_poly, index=False)
print("Tuned Random Forest results with Polynomial Features saved.")

#### Extracting and Plotting Feature Importances
We'll extract and visualise the feature importances of the tuned model, so that the original features can be compared with the degree-2 interaction features generated by `PolynomialFeatures`.

In [None]:
# Pull both fitted steps out of the best pipeline: the forest supplies the
# importances, the polynomial step supplies the matching feature names.
fitted_steps = best_rf_poly.named_steps
feature_importances_poly_rf = fitted_steps['randomforestregressor'].feature_importances_
poly_features = fitted_steps['polynomialfeatures'].get_feature_names_out(X_train.columns)

# One row per expanded feature, most important first.
importance_df_poly = pd.DataFrame(
    {'Feature': poly_features, 'Importance': feature_importances_poly_rf}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(importance_df_poly)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = importance_df_poly.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 20 Feature Importances from Tuned Random Forest with Polynomial Features')
ax.invert_yaxis()  # barh draws bottom-up; flip so the top feature is on top
fig.savefig('../model_metrics/statistical_models/feature_importances_poly_rf.png', bbox_inches='tight')
plt.show()