In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from multiprocessing import Pool
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Load data
data_path = "../../data"
X_train = pd.read_csv(os.path.join(data_path, "X_train_pca.csv"))
y_train_org = pd.read_csv(os.path.join(data_path, "y_train_pca.csv"))
X_test = pd.read_csv(os.path.join(data_path, "X_test_pca.csv"))
y_test_org = pd.read_csv(os.path.join(data_path, "y_test_pca.csv"))

In [3]:
traits = ['Extraversion', 'Agreeableness', 'Conscientiousness', 'Emotional Stability', 'Openness']
random_state=27

In [4]:
# Create results directory
results_path = "../../results"
specific_results_path = os.path.join("../../results", "svm_classification")
os.makedirs(results_path, exist_ok=True)
os.makedirs(specific_results_path, exist_ok=True)

In [5]:
def calc_roc_auc(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n_classes = np.unique(y_true)
    roc_auc_scores = []
    for label in n_classes:
        # Create binary labels for the current class vs. all other classes
        y_true_class = (y_true == label).astype(int)
        y_pred_class = (y_pred == label).astype(int)
        
        # Calculate ROC AUC for the current class
        roc_auc = roc_auc_score(y_true_class, y_pred_class)
        roc_auc_scores.append(roc_auc)
    return roc_auc_scores

In [6]:
# Define the parameter grid you want to search over
param_grid = {
    'C': [0.1, 0.5, 1, 5, 10, 50, 100],        # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel type
    'degree': [2, 3, 4, 5, 6],                 # Degree of the polynomial kernel (only for 'poly' kernel)
    'gamma': ['scale', 'auto', 0.1, 0.5, 1.0],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'coef0': [0.0, 0.1, 0.5, 1.0]             # Independent term in the kernel function (only for 'poly' and 'sigmoid')
}

# Create an SVM classifier
svm = SVC(random_state=27)

# Create a grid search object
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, scoring="accuracy", cv=3, n_jobs=5)

for trait in traits:
    print(f"Processing {trait}")
    trait_bin = trait + "_bin"
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    y_train = [label_mapping[label] for label in y_train_org[trait_bin]]
    y_test = [label_mapping[label] for label in y_test_org[trait_bin]]
    # Fit the grid search to your data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    print("Best Hyperparameters: ", grid_search.best_params_)
    print("Best Score: ", grid_search.best_score_)

    # Get the best model from the grid search
    best_svm = grid_search.best_estimator_

    # Now, you can use the best_rf model for predictions on your test data
    y_pred = best_svm.predict(X_test)
    y_true = y_test

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    f1 = f1_score(y_true, y_pred, average="weighted")
    roc_auc = calc_roc_auc(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')
    print(f'ROC AUC: {roc_auc}')
    print(f'Confusion Matrix:\n{conf_matrix}')
    print("\n\n")
    metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1, "roc_auc": roc_auc, "conf_matrix": conf_matrix, "best_hyperparameters": grid_search.best_params_, "best_score": grid_search.best_score_}

    # Save model and metrics 
    curr_result_path = os.path.join(specific_results_path, trait)
    os.makedirs(curr_result_path, exist_ok=True)
    with open(os.path.join(curr_result_path, f'svm_model_tuned.pkl'), 'wb') as file:
        pickle.dump(best_svm, file)
    with open(os.path.join(curr_result_path, f'perf_metrics_tuned.pkl'), 'wb') as file:
        pickle.dump(metrics, file)

Processing Extraversion
Best Hyperparameters:  {'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score:  0.5821294810058855
Accuracy: 0.5625
Precision: 0.552702982403717
Recall: 0.5625
F1-Score: 0.5369410200098144
ROC AUC: [0.6015779092702169, 0.6439703153988868, 0.6059422226791404]
Confusion Matrix:
[[10 19 10]
 [ 4 79 15]
 [ 5 38 28]]



Processing Agreeableness
Best Hyperparameters:  {'C': 100, 'coef0': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best Score:  0.6773675762439808
Accuracy: 0.6394230769230769
Precision: 0.6350958202103241
Recall: 0.6394230769230769
F1-Score: 0.6365335431831595
ROC AUC: [0.6188556067588324, 0.6188556067588326]
Confusion Matrix:
[[43 41]
 [34 90]]



Processing Conscientiousness
Best Hyperparameters:  {'C': 50, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score:  0.6730872124130552
Accuracy: 0.6634615384615384
Precision: 0.6567374932759549
Recall: 0.6634615384615384
F1-Score: 0.6536774628879892
R