### 4. Random Forest with PCA and Polynomial Features

#### Setup with Libraries and Utilities

In [None]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import matplotlib.pyplot as plt
import time

from utils.metrics import validate_regression_model, print_regression_performance
from utils.normalisation import normalise_data

import warnings
# NOTE(review): with no category given this ignores every warning from this point on,
# including deprecation and fit-failure warnings raised by the libraries imported above.
# Narrow or remove the filter when debugging unexpected results.
warnings.filterwarnings("ignore")

#### Loading Training and Test Data Splits

In [None]:
# Feature matrices are kept as DataFrames so their column names stay available later.
X_train = pd.read_csv('../data_splits/X_train.csv')
X_test = pd.read_csv('../data_splits/X_test.csv')

# Targets are read as DataFrames and flattened to 1D arrays.
y_train = pd.read_csv('../data_splits/y_train.csv').to_numpy().ravel()
y_test = pd.read_csv('../data_splits/y_test.csv').to_numpy().ravel()

#### Normalise the Data

In [None]:
# Scale both feature matrices with the project helper from utils.normalisation.
# NOTE(review): presumably the scaler is fitted on X_train only and then applied to
# X_test - confirm in utils/normalisation.py.
# The search pipeline built later in this notebook also contains a StandardScaler step,
# so the data is standardised again there after polynomial expansion.
X_train_scaled, X_test_scaled = normalise_data(X_train, X_test)

#### Randomised Search for Optimising Random Forest Regression Parameters with PCA and Polynomial Features

In [None]:
# Search space for the forest step. Keys carry the 'randomforestregressor__' prefix
# because make_pipeline names every step after its lowercased class name.
rf_param_grid = {
    'randomforestregressor__n_estimators': [100, 200, 300, 400, 500],
    # 1.0 (use all features) replaces the former 'auto' option. For regressors 'auto'
    # meant exactly that, but it was deprecated in scikit-learn 1.1 and removed in 1.3.
    # On recent versions every sampled candidate using 'auto' fails to fit and is scored
    # NaN, and the blanket warnings filter at the top of this notebook hides the failure.
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [10, 20, 30, None],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__bootstrap': [True, False]
}

# Pipeline: pairwise interaction terms -> standardise -> keep the principal components
# explaining 95% of the variance -> random forest on those components.
rf_pipeline_pca = make_pipeline(
    PolynomialFeatures(degree=2, interaction_only=True),
    StandardScaler(),
    PCA(n_components=0.95),
    RandomForestRegressor(random_state=42)
)

# 100 sampled configurations x 5 folds, run on all available cores.
# random_state makes the sampled configurations reproducible.
rf_random_search_pca = RandomizedSearchCV(estimator=rf_pipeline_pca, param_distributions=rf_param_grid,
                                          n_iter=100, cv=5, n_jobs=-1, verbose=2, random_state=42)

# Wall-clock duration of the whole search (including the final refit), in minutes.
start_time_rf_pca = time.time()
rf_random_search_pca.fit(X_train_scaled, y_train)
training_time_rf_pca = (time.time() - start_time_rf_pca) / 60

#### Metrics for Random Forest Regressor with PCA

In [None]:
# Test-set predictions from the fitted search object.
y_pred_pca = rf_random_search_pca.predict(X_test_scaled)

# Evaluate the best pipeline found by the search with the project metrics helper,
# then attach the search duration measured earlier.
metrics_poly_rf_pca = validate_regression_model(
    rf_random_search_pca.best_estimator_,
    X_test_scaled,
    y_test,
    "Random Forest Regressor with PCA",
)
metrics_poly_rf_pca['Training Time rf_pca (min)'] = training_time_rf_pca

print(metrics_poly_rf_pca)

#### Saving Poly Random Forest Results for PCA Pipeline

In [None]:
# Output location for this model's metrics table.
results_dir_pca = '../model_metrics/statistical_models'
results_path_pca = os.path.join(results_dir_pca, 'metrics_poly_rf_pca.csv')

# Create the directory on first use; a no-op when it already exists.
os.makedirs(results_dir_pca, exist_ok=True)

# Written without the index column.
metrics_poly_rf_pca.to_csv(results_path_pca, index=False)

print("Poly Random Forest results for PCA pipeline saved.")

#### Saving Best Parameters for PCA Pipeline

In [None]:
# Persist the winning hyper-parameter combination as JSON next to the metrics files.
best_params_rf_pca = rf_random_search_pca.best_params_
best_params_path_pca = '../model_metrics/statistical_models/best_params_rf_pca.json'

# Best-effort write: a failure is reported but does not stop the notebook.
try:
    with open(best_params_path_pca, 'w') as file:
        json.dump(best_params_rf_pca, file)
except Exception as e:
    print(f"Error saving best parameters for PCA pipeline: {e}")
else:
    print(f"Best parameters for PCA pipeline saved to {best_params_path_pca}")

#### Metrics for All Models

In [None]:
# Collect the metrics tables written by the individual random-forest experiments.
metrics_dir = '../model_metrics/statistical_models'

metric_files = [
    'results_simple_rf.csv',
    'results_tuned_rf.csv',
    'results_poly_rf.csv',
    'metrics_poly_rf_pca.csv'
]

# Build each path once; files that have not been produced yet are skipped.
candidate_paths = [os.path.join(metrics_dir, name) for name in metric_files]
dfs = [pd.read_csv(path).T for path in candidate_paths if os.path.exists(path)]

# Stack the transposed tables and renumber the rows from zero.
all_metrics_df = pd.concat(dfs, ignore_index=True)

print(all_metrics_df)

#### Extracting and Plotting Feature Importances
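We'll extract the feature importances of the best random forest and visualise the top 20. Note that the forest is trained on the principal components produced by the PCA step, so its importances refer to those components rather than directly to the original or polynomial features.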

In [None]:
# Extract and visualise feature importances for the best PCA pipeline.
#
# The forest is fitted on the PCA output, so feature_importances_ holds one value per
# retained principal component, not one per polynomial feature. Pairing it directly
# with the polynomial feature names raises a length-mismatch ValueError whenever PCA
# drops a component, and mislabels the bars otherwise. The component importances are
# therefore projected back onto the polynomial features through the PCA loadings.
best_pipeline_pca = rf_random_search_pca.best_estimator_
rf_best_pca = best_pipeline_pca.named_steps['randomforestregressor']
pca_best = best_pipeline_pca.named_steps['pca']

feature_importances_pca = rf_best_pca.feature_importances_  # shape: (n_components,)
poly_features = best_pipeline_pca.named_steps['polynomialfeatures'].get_feature_names_out(X_train.columns)

# components_ has shape (n_components, n_polynomial_features). Weighting the absolute
# loadings by each component's importance gives an approximate per-feature score.
# This is a heuristic attribution, not an exact importance of the input features.
poly_feature_scores = np.abs(pca_best.components_).T @ feature_importances_pca
score_total = poly_feature_scores.sum()
if score_total > 0:
    # Rescale so the scores sum to 1, like the forest's own importances.
    poly_feature_scores = poly_feature_scores / score_total

importance_df_pca = pd.DataFrame({'Feature': poly_features, 'Importance': poly_feature_scores})
importance_df_pca = importance_df_pca.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
# Select top 20 features
top_features = importance_df_pca.head(20)

plt.barh(top_features['Feature'], top_features['Importance'], align='center', color='skyblue')
plt.xlabel('Feature Importance')
plt.title('Top 20 Feature Importances for Random Forest Regressor with Polynomial Features and PCA')
plt.gca().invert_yaxis()  # most important feature at the top

plt.savefig('../model_metrics/statistical_models/feature_importances_poly_rf_pca.png', bbox_inches='tight')
plt.show()