In [1]:
import os
import pickle
import pandas as pd
from multiprocessing import Pool
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, make_scorer

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train_pca.csv"))
y_train_org = pd.read_csv(os.path.join(data_path, "y_train_pca.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test_pca.csv"))
y_test_org = pd.read_csv(os.path.join(data_path, "y_test_pca.csv"))

In [3]:
traits = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Emotional Stability', 'Openness']
random_state=27

In [4]:
# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "rf_regression")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [5]:
# Define the parameter grid you want to search over
param_grid = {
    'n_estimators': [100,150,200,250,300],  # Number of trees in the forest
    'max_depth': [None, 10,15,20,25,30],  # Maximum depth of the trees
    'min_samples_split': [2,3,4,5,6,8,10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 3, 4, 5]    # Minimum number of samples required to be at a leaf node
}
r2_scorer = make_scorer(r2_score)

# Create a Random Forest Regressor model
rf = RandomForestRegressor(random_state=42)

# Create a grid search object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring=r2_scorer, cv=3, n_jobs=5)

for trait in traits:
    print(f"Processing {trait}")
    y_train = y_train_org[trait]
    y_test = y_test_org[trait]
    # Fit the grid search to your data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", -grid_search.best_score_)

    # Get the best model from the grid search
    best_rf = grid_search.best_estimator_

    # Now, you can use the best_rf model for predictions on your test data
    y_pred = best_rf.predict(X_test)
    y_true = y_test
    # Compute metrics
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)
    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_pred)
    # Calculate Root Mean Squared Error (RMSE)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    # Calculate R-squared (R2) Score
    r2 = r2_score(y_true, y_pred)
    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    # Calculate Mean Percentage Error (MPE)
    mpe = (1/len(y_true)) * sum(((y_true[i] - y_pred[i]) / y_true[i]) * 100 for i in range(len(y_true)))

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2) Score:", r2)
    print("Mean Absolute Percentage Error (MAPE):", mape)
    print("Mean Percentage Error (MPE):", mpe)

    # Add metrics to dict
    metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "mape": mape, "best_hyperparameters": grid_search.best_params_, "best_score": grid_search.best_score_}

    # Save model and metrics 
    curr_result_path = os.path.join(specific_results_path, trait)
    os.makedirs(curr_result_path, exist_ok=True)
    with open(os.path.join(curr_result_path, f'rf_model_tuned.pkl'), 'wb') as file:
        pickle.dump(best_rf, file)
    with open(os.path.join(curr_result_path, f'perf_metrics_tuned.pkl'), 'wb') as file:
        pickle.dump(metrics, file)

Processing Extraversion
Best Hyperparameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Score:  -0.13510211611961942
Mean Absolute Error (MAE): 0.864659854186218
Mean Squared Error (MSE): 1.0170736214210732
Root Mean Squared Error (RMSE): 1.0085006799308929
R-squared (R2) Score: 0.12704592318190455
Mean Absolute Percentage Error (MAPE): 0.2185456155611552
Mean Percentage Error (MPE): -5.479071690591802
Processing Agreeableness
Best Hyperparameters:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}
Best Score:  -0.14048277371311313
Mean Absolute Error (MAE): 0.5763961904154744
Mean Squared Error (MSE): 0.5159499705078129
Root Mean Squared Error (RMSE): 0.718296575592431
R-squared (R2) Score: 0.19840919287935554
Mean Absolute Percentage Error (MAPE): 0.11524638171190167
Mean Percentage Error (MPE): -0.5012972703435495
Processing Conscientiousness
Best Hyperparameters:  {'max_depth': 25, 'min_samples_le