In [1]:
import os
import pickle
import pandas as pd
from multiprocessing import Pool
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error, make_scorer

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train.csv"))
y_train_org = pd.read_csv(os.path.join(data_path, "y_train.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test.csv"))
y_test_org = pd.read_csv(os.path.join(data_path, "y_test.csv"))

In [3]:
traits = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Emotional Stability', 'Openness']
random_state=27

In [4]:
# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "rf_regression_non_pca")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [5]:
# Define the parameter grid you want to search over
param_grid = {
    'n_estimators': [100,150,200,250,300],  # Number of trees in the forest
    'max_depth': [None, 10,15,20,25,30],  # Maximum depth of the trees
    'min_samples_split': [2,3,4,5,6,8,10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 3, 4, 5]    # Minimum number of samples required to be at a leaf node
}
r2_scorer = make_scorer(r2_score)

# Create a Random Forest Regressor model
rf = RandomForestRegressor(random_state=42)

# Create a grid search object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring=r2_scorer, cv=3, n_jobs=5)

for trait in traits:
    print(f"Processing {trait}")
    y_train = y_train_org[trait]
    y_test = y_test_org[trait]
    # Fit the grid search to your data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", -grid_search.best_score_)

    # Get the best model from the grid search
    best_rf = grid_search.best_estimator_

    # Now, you can use the best_rf model for predictions on your test data
    y_pred = best_rf.predict(X_test)
    y_true = y_test
    # Compute metrics
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)
    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_pred)
    # Calculate Root Mean Squared Error (RMSE)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    # Calculate R-squared (R2) Score
    r2 = r2_score(y_true, y_pred)
    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    # Calculate Mean Percentage Error (MPE)
    mpe = (1/len(y_true)) * sum(((y_true[i] - y_pred[i]) / y_true[i]) * 100 for i in range(len(y_true)))

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2) Score:", r2)
    print("Mean Absolute Percentage Error (MAPE):", mape)
    print("Mean Percentage Error (MPE):", mpe)

    # Add metrics to dict
    metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "mape": mape, "best_hyperparameters": grid_search.best_params_, "best_score": grid_search.best_score_}

    # Save model and metrics 
    curr_result_path = os.path.join(specific_results_path, trait)
    os.makedirs(curr_result_path, exist_ok=True)
    with open(os.path.join(curr_result_path, f'rf_model_tuned.pkl'), 'wb') as file:
        pickle.dump(best_rf, file)
    with open(os.path.join(curr_result_path, f'perf_metrics_tuned.pkl'), 'wb') as file:
        pickle.dump(metrics, file)

Processing Extraversion
Best Hyperparameters:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250}
Best Score:  -0.25271002160997397
Mean Absolute Error (MAE): 0.7315044008875738
Mean Squared Error (MSE): 0.760768773784635
Root Mean Squared Error (RMSE): 0.8722205992663984
R-squared (R2) Score: 0.3358107107452639
Mean Absolute Percentage Error (MAPE): 0.18903247495150272
Mean Percentage Error (MPE): -6.155051088355879
Processing Agreeableness
Best Hyperparameters:  {'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Score:  -0.28472002619100223
Mean Absolute Error (MAE): 0.5044836524751882
Mean Squared Error (MSE): 0.430544917555407
Root Mean Squared Error (RMSE): 0.6561592166200266
R-squared (R2) Score: 0.3101524849589097
Mean Absolute Percentage Error (MAPE): 0.10326593546421446
Mean Percentage Error (MPE): -1.0410627979927032
Processing Conscientiousness
Best Hyperparameters:  {'max_depth': 25, 'min_samples_lea

In [6]:
# Define the parameter grid you want to search over for Random Forest
param_grid = {
    'random_forest__n_estimators': [100, 150, 200, 250, 300],
    'random_forest__max_depth': [10, 15, 20, 25, 30],
    'random_forest__min_samples_split': [2, 3, 4, 5, 6, 8, 10],
    'random_forest__min_samples_leaf': [1, 2, 3, 4, 5]
}

# Create a Random Forest Classifier model
rf = RandomForestRegressor(random_state=42)
r2_scorer = make_scorer(r2_score)
# Define the parameter grid for feature selection with a scoring function
param_grid_fs = {
    'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8],
}

# Create a pipeline with feature selection and the model
pipeline = Pipeline([
    ('feature_selection', SelectKBest()),
    ('random_forest', rf)
])

# Create a grid search object with feature selection and model parameter grids
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid={'feature_selection__k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], **param_grid},
                           scoring=r2_scorer, cv=3, n_jobs=5)

In [7]:
for trait in traits:
    print(f"Processing {trait}")
    y_train = y_train_org[trait]
    y_test = y_test_org[trait]
    # Fit the grid search to your data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

    # Get the best model from the grid search
    best_rf = grid_search.best_estimator_

    # Get the best feature selection method and model from the grid search
    best_feature_selection = grid_search.best_estimator_.named_steps['feature_selection']
    best_rf = grid_search.best_estimator_.named_steps['random_forest']

    # Now, you can use the best_feature_selection and best_rf for predictions on your test data
    X_train_selected = best_feature_selection.transform(X_train)
    X_test_selected = best_feature_selection.transform(X_test)
    best_rf.fit(X_train_selected, y_train)
    y_pred = best_rf.predict(X_test_selected)
    y_true = y_test

    # Compute metrics
    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_true, y_pred)
    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_true, y_pred)
    # Calculate Root Mean Squared Error (RMSE)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    # Calculate R-squared (R2) Score
    r2 = r2_score(y_true, y_pred)
    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    # Calculate Mean Percentage Error (MPE)
    mpe = (1/len(y_true)) * sum(((y_true[i] - y_pred[i]) / y_true[i]) * 100 for i in range(len(y_true)))

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R-squared (R2) Score:", r2)
    print("Mean Absolute Percentage Error (MAPE):", mape)
    print("Mean Percentage Error (MPE):", mpe)
    print("\n\n")

    # Add metrics to dict
    metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2, "mape": mape, "best_hyperparameters": grid_search.best_params_, "best_score": grid_search.best_score_, "best_features": best_feature_selection}

    # Save model and metrics 
    curr_result_path = os.path.join(specific_results_path, trait)
    os.makedirs(curr_result_path, exist_ok=True)
    with open(os.path.join(curr_result_path, f'rf_model_tuned_features.pkl'), 'wb') as file:
        pickle.dump(best_rf, file)
    with open(os.path.join(curr_result_path, f'perf_metrics_tuned_features.pkl'), 'wb') as file:
        pickle.dump(metrics, file)

Processing Extraversion
Best Hyperparameters:  {'feature_selection__k': 10, 'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 3, 'random_forest__n_estimators': 300}
Best Score:  0.19837168436777233
Mean Absolute Error (MAE): 0.7796923661986157
Mean Squared Error (MSE): 0.8730854510821929
Root Mean Squared Error (RMSE): 0.9343904168398737
R-squared (R2) Score: 0.23775261919846646
Mean Absolute Percentage Error (MAPE): 0.20102810016252387
Mean Percentage Error (MPE): -6.489763667745556



Processing Agreeableness
Best Hyperparameters:  {'feature_selection__k': 9, 'random_forest__max_depth': 30, 'random_forest__min_samples_leaf': 1, 'random_forest__min_samples_split': 3, 'random_forest__n_estimators': 300}
Best Score:  0.23039324351791388
Mean Absolute Error (MAE): 0.5238036493589029
Mean Squared Error (MSE): 0.4738507267776924
Root Mean Squared Error (RMSE): 0.6883681622342018
R-squared (R2) Score: 0.24076505600385167
Mean Absolute 