In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from multiprocessing import Pool
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Load the train/test feature and label CSVs (the *_pca.csv files) from data_path.
# The files are read in the order listed and unpacked onto the four frame names.
data_path = "../../data"
X_train, y_train_org, X_test, y_test_org = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        "X_train_pca.csv",
        "y_train_pca.csv",
        "X_test_pca.csv",
        "y_test_pca.csv",
    )
)

In [3]:
# Traits to model; the training loop looks up a "<trait>_bin" column
# in the label frames for each of these names.
traits = [
    "Extraversion",
    "Agreeableness",
    "Conscientiousness",
    "Emotional Stability",
    "Openness",
]

# NOTE(review): this seed is not referenced in the visible cells -- the
# classifier is constructed with random_state=42. Confirm which is intended.
random_state = 27

In [4]:
# Make sure the output directories exist before anything is written to them.
results_path = "../../results"
specific_results_path = os.path.join(results_path, "rf_classification")
for directory in (results_path, specific_results_path):
    # exist_ok=True keeps re-running this cell harmless
    os.makedirs(directory, exist_ok=True)

In [5]:
def calc_roc_auc(y_true, y_pred):
    """Compute a one-vs-rest ROC AUC for every class label found in ``y_true``.

    Both inputs are converted to NumPy arrays. For each distinct label in
    ``y_true`` the true and predicted labels are binarised (1 where equal to
    the label, 0 otherwise) and passed to ``roc_auc_score``.

    Note that ``y_pred`` is compared against the label with ``==``, i.e. it is
    treated as hard class predictions, not as probabilities or scores.

    Args:
        y_true: Sequence of ground-truth class labels.
        y_pred: Sequence of predicted class labels, aligned with ``y_true``.

    Returns:
        list: One ROC AUC value per distinct label of ``y_true``, in the
        order returned by ``np.unique`` (sorted labels).
    """
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    return [
        # current class vs. all other classes
        roc_auc_score((truth == cls).astype(int), (predicted == cls).astype(int))
        for cls in np.unique(truth)
    ]

In [6]:
# Hyperparameter grid searched exhaustively for every trait
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],        # Number of trees in the forest
    'max_depth': [None, 10, 15, 20, 25, 30],          # Maximum depth of the trees
    'min_samples_split': [2, 3, 4, 5, 6, 8, 10],      # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 3, 4, 5],              # Minimum number of samples required to be at a leaf node
}

# Base Random Forest classifier whose hyperparameters are tuned.
# NOTE(review): the seed is hard-coded to 42 here; the notebook-level
# `random_state` variable is not used -- confirm which one is intended.
rf = RandomForestClassifier(random_state=42)

# Grid search object: 3-fold cross-validation, selected by accuracy, 5 parallel jobs.
# The same object is re-fitted for each trait below.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring="accuracy", cv=3, n_jobs=5)

# Integer encoding of the string class labels. It does not depend on the
# trait, so it is built once instead of on every loop iteration.
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

for trait in traits:
    print(f"Processing {trait}")

    # Encode the "<trait>_bin" label column of both splits as integers.
    # A label missing from label_mapping raises KeyError here.
    trait_bin = trait + "_bin"
    y_train = [label_mapping[label] for label in y_train_org[trait_bin]]
    y_test = [label_mapping[label] for label in y_test_org[trait_bin]]

    # Run the grid search on the training split
    grid_search.fit(X_train, y_train)

    # Report the best hyperparameters and their cross-validated score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

    # Best model found by the grid search
    best_rf = grid_search.best_estimator_

    # Predict class labels for the held-out test split
    y_pred = best_rf.predict(X_test)
    y_true = y_test

    # Compute metrics; precision/recall/F1 are support-weighted averages over classes
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")
    # NOTE(review): calc_roc_auc receives hard class predictions, so these are
    # per-class one-vs-rest AUCs of 0/1 labels, not of predicted probabilities.
    roc_auc = calc_roc_auc(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')
    print(f'ROC AUC: {roc_auc}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print("\n\n")
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "roc_auc": roc_auc,
        "conf_matrix": conf_matrix,
        "best_hyperparameters": grid_search.best_params_,
        "best_score": grid_search.best_score_,
    }

    # Save the tuned model and its metrics in a per-trait directory
    curr_result_path = os.path.join(specific_results_path, trait)
    os.makedirs(curr_result_path, exist_ok=True)
    with open(os.path.join(curr_result_path, 'rf_model_tuned.pkl'), 'wb') as file:
        pickle.dump(best_rf, file)
    with open(os.path.join(curr_result_path, 'perf_metrics_tuned.pkl'), 'wb') as file:
        pickle.dump(metrics, file)

Processing Extraversion
Best Hyperparameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300}
Best Score:  0.6035313001605136
Accuracy: 0.6009615384615384
Precision: 0.6090088564403632
Recall: 0.6009615384615384
F1-Score: 0.5684478239414923
ROC AUC: [0.5848126232741617, 0.6564007421150277, 0.6669579520921148]
Confusion Matrix:
[[ 8 24  7]
 [ 4 85  9]
 [ 2 37 32]]



Processing Agreeableness
Best Hyperparameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}
Best Score:  0.6698769395398609
Accuracy: 0.6778846153846154
Precision: 0.671624529316837
Recall: 0.6778846153846154
F1-Score: 0.6688080915891788
ROC AUC: [0.647273425499232, 0.647273425499232]
Confusion Matrix:
[[ 41  43]
 [ 24 100]]



Processing Conscientiousness
Best Hyperparameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
Best Score:  0.6987693953986089
Accuracy: 0.6442307692307693
Precision: 0.63537