In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

In [None]:
# Read the labelled dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    '../data/data_labeled.csv',
    index_col=0,
)

In [None]:
# Collapse the category codes of the three *_cat columns into a binary label:
# code 1 -> 0 and codes 2, 3, 4 -> 1; any other value is left unchanged.
# (Presumably the codes follow the `severity` mapping defined further down,
# i.e. 0 = Normal ... 4 = Extremely severe -- confirm against the labelling step.)
#
# The result is assigned back instead of calling
# `data[col].replace(..., inplace=True)`: that form is chained assignment, which
# pandas warns about and which leaves `data` untouched once Copy-on-Write is on.
#
# NOTE: the mapping is not idempotent -- applying it again to already binarised
# columns turns every 1 into 0, so reload `data` before re-running this cell.
cat_cols = ['Stress_cat', 'Anxiety_cat', 'Depression_cat']
data[cat_cols] = data[cat_cols].replace({1: 0, 2: 1, 3: 1, 4: 1})


In [None]:
# Load the parsed codebook JSON into `code`.
# The encoding is given explicitly so the file is decoded the same way on every
# platform; without it `open` falls back to the locale's preferred encoding.
with open('../data/codebook_dict.json', 'r', encoding='utf-8') as f:
    code = json.load(f)

In [None]:
# Integer lists per scale (presumably the questionnaire item numbers that
# belong to each DASS sub-scale -- confirm against the codebook).
DASS_keys = dict(
    Depression=[3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    Anxiety=[2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    Stress=[1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
)

# Per-scale list of (lower, upper) pairs, built by pairing each cut-point with
# the next one. (Presumably score ranges of successive severity levels --
# whether the bounds are inclusive depends on the code that consumes them.)
DASS_bins = {
    scale: list(zip(edges[:-1], edges[1:]))
    for scale, edges in {
        'Depression': (0, 10, 14, 21, 28),
        'Anxiety': (0, 8, 10, 15, 20),
        'Stress': (0, 15, 19, 26, 34),
    }.items()
}
             

In [None]:
# Human-readable label for each integer severity code, 0 through 4.
severity = dict(enumerate(
    ['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely severe']
))

In [None]:
# Names of the columns to discard: those containing `Q` + one or two digits +
# `I` or `E`, and those containing `VCL`.
# The pattern is a raw string so `\d` reaches the regex engine verbatim; in a
# plain string literal `\d` is an invalid escape sequence, which Python reports
# with a warning and plans to reject outright.
to_drop = data.filter(regex=r'Q\d{1,2}[IE]|VCL.*', axis=1).columns.to_list()

In [None]:
# Remove the regex-selected columns first, then the listed metadata columns.
df1 = (
    data
    .drop(columns=to_drop)
    .drop(columns=[
        'introelapse', 'testelapse', 'surveyelapse',
        'uniquenetworklocation', 'screensize', 'hand', 'country', 'source',
    ])
)

In [None]:
# The three binary category columns.
df_cor = df1.loc[:, ['Depression_cat', 'Anxiety_cat', 'Stress_cat']]

# Columns TIPI1 through TIPI10 (label slice, both ends included) followed by
# the three category columns, aligned on the row index.
df_tipi = pd.concat(
    [df1.loc[:, 'TIPI1':'TIPI10'], df_cor],
    axis='columns',
)

In [None]:
# Features: every column from the first one up to and including TIPI10.
X = df_tipi.loc[:, slice(None, 'TIPI10')]
# Target: the Depression category column.
y = df_tipi['Depression_cat']

In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both parts
# keep the same class proportions. A fixed `random_state` makes the split --
# and every metric computed from it -- identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

In [None]:
# Plot styling for the confusion matrix drawn at the end of the cell.
sns.set_theme(style="white", palette=None)

# RBF-kernel SVM with class-balanced weights, fitted on the training split
# (`fit` returns the estimator itself).
svm = SVC(kernel='rbf', C=0.1, class_weight='balanced').fit(X_train, y_train)

# Predictions on the held-out split and the resulting confusion matrix.
y_pred_svm = svm.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_svm)

In [None]:
# Text report of the SVM predictions against the held-out labels.
print(classification_report(y_true=y_test, y_pred=y_pred_svm))

In [None]:
# Balanced accuracy of the SVM on the held-out split.
metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_svm)

In [None]:
# Random forest of 1000 trees with class-balanced weights. The fixed
# `random_state` pins the bootstrap samples and feature draws so that the
# fitted model and its scores are reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=42)
rf.fit(X_train, y_train)

# Predictions on the held-out split and the resulting confusion matrix.
y_pred_rf = rf.predict(X_test)
sns.set_theme(style="white", palette=None)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf)

In [None]:
# Text report of the random-forest predictions against the held-out labels.
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

In [None]:
# Balanced accuracy of the random forest on the held-out split.
metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_rf)

In [None]:
# One-step pipeline; the `clf` placeholder is replaced by each candidate
# estimator listed in the parameter grids below.
pipe = Pipeline([
    ('clf', 'passthrough'),
])

# One grid per candidate estimator; `clf__*` keys set that estimator's
# hyper-parameters.
parameters = [
    {
        'clf': (SVC(),),
        'clf__C': [0.01, 0.1],
        'clf__kernel': ('rbf',),
        'clf__class_weight': ['balanced'],
    },
    {
        # Seeded so that the cross-validation scores of the forest do not
        # change from one run to the next.
        'clf': (RandomForestClassifier(random_state=42),),
        'clf__n_estimators': [1000],
        'clf__class_weight': ['balanced'],
        'clf__criterion': ['entropy'],
    },
]

# 3-fold search scored on balanced accuracy and weighted F1, using all cores;
# the best candidate by balanced accuracy is refitted on the full training data.
grid_search = GridSearchCV(pipe, parameters, cv=3, scoring=['balanced_accuracy', 'f1_weighted'], verbose=4,
                           n_jobs=-1, refit='balanced_accuracy')

In [None]:
# Run the grid search on the training split (`fit` returns the fitted search
# object) and tabulate its per-candidate cross-validation results.
result = grid_search.fit(X_train, y=y_train)
report = pd.DataFrame(data=result.cv_results_)


In [None]:
# Rank candidates by mean balanced accuracy (best first) and keep only the
# first, i.e. best-scoring, row for each distinct `param_clf` value.
(
    report
    .sort_values('mean_test_balanced_accuracy', ascending=False)
    .drop_duplicates(subset='param_clf')
)