In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

In [None]:
# Load the labeled survey responses; the first CSV column becomes the index.
data = pd.read_csv(
    filepath_or_buffer='../data/data_labeled.csv',
    index_col=0,
)

In [None]:
# Load the codebook dictionary. The encoding is given explicitly so the read
# does not depend on the platform's default text encoding.
with open('../data/codebook_dict.json', 'r', encoding='utf-8') as f:
    code = json.load(f)

In [None]:
# Question numbers that belong to each DASS subscale (14 items per subscale).
DASS_keys = {
    'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
}

# (low, high) score pairs per subscale, four pairs each.
# NOTE(review): presumably the score ranges of the first four severity bands;
# how the boundaries are applied is not shown in this notebook section.
DASS_bins = {
    'Depression': [(0, 10), (10, 14), (14, 21), (21, 28)],
    'Anxiety': [(0, 8), (8, 10), (10, 15), (15, 20)],
    'Stress': [(0, 15), (15, 19), (19, 26), (26, 34)],
}
             

In [None]:
# Map each integer category code (0-4) to its severity label.
severity = dict(enumerate(['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely severe']))

In [None]:
# Columns to discard: any name containing 'Q' + 1-2 digits + 'I' or 'E', or
# containing 'VCL'.
# NOTE(review): presumably the per-item auxiliary columns and the vocabulary
# checklist; confirm against the codebook.
# The pattern is a raw string so that '\d' reaches the regex engine unchanged
# instead of being treated as an (invalid) Python string escape.
to_drop = data.filter(regex=r'Q\d{1,2}[IE]|VCL.*', axis=1).columns.to_list()

In [None]:
# Working frame: remove the regex-selected columns first, then the timing and
# respondent-metadata columns listed by name.
df1 = (
    data
    .drop(columns=to_drop)
    .drop(columns=['introelapse', 'testelapse', 'surveyelapse',
                   'uniquenetworklocation', 'screensize', 'hand', 'country', 'source'])
)

### Depression

In [None]:
# Target: the depression category label.
y = df1['Depression_cat']
# Features: every column up to and including 'Q42A' (label slices are inclusive).
X = df1.loc[:, slice(None, 'Q42A')]

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits (matching the Anxiety split), and the
# fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [None]:
# Plain white plot theme for the confusion matrix drawn at the end of the cell.
sns.set_theme(style="white", palette=None)
# Fit a linear-kernel SVM on the training split (fit returns the estimator itself),
# then predict the held-out rows.
svm = SVC(kernel='linear', probability=True).fit(X_train, y_train)
y_pred = svm.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

In [None]:
# Rank the features by the absolute weight in the first row of the fitted
# coefficient matrix and show the top 14.
# NOTE(review): only coef_[0] is used; if the target has more than two classes
# the remaining rows of coef_ are ignored here - confirm this is intended.
svm_imp = (
    pd.Series(svm.coef_[0], index=X.columns)
    .abs()
    .sort_values(ascending=False)
)
svm_imp.head(14)

### Anxiety

In [None]:
# Target: the anxiety category label.
y = df1['Anxiety_cat']
# Features: every column up to and including 'Q42A' (label slices are inclusive).
X = df1.loc[:, slice(None, 'Q42A')]

In [None]:
# Stratified 80/20 split; the fixed seed makes the split (and every number
# derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [None]:
# Plain white plot theme for the confusion matrix drawn at the end of the cell.
sns.set_theme(style="white", palette=None)
# Fit a linear-kernel SVM on the training split (fit returns the estimator itself),
# then predict the held-out rows.
svm = SVC(kernel='linear', probability=True).fit(X_train, y_train)
y_pred = svm.predict(X_test)
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

In [None]:
# Rank the features by the absolute weight in the first row of the fitted
# coefficient matrix and show the top 14.
# NOTE(review): only coef_[0] is used; if the target has more than two classes
# the remaining rows of coef_ are ignored here - confirm this is intended.
svm_imp = (
    pd.Series(svm.coef_[0], index=X.columns)
    .abs()
    .sort_values(ascending=False)
)
svm_imp.head(14)

### Stress

In [None]:
# Target: the stress category label.
y = df1['Stress_cat']
# Features: every column up to and including 'Q42A' (label slices are inclusive).
X = df1.loc[:, slice(None, 'Q42A')]

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits (matching the Anxiety split), and the
# fixed seed makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [None]:
# Fit a linear-kernel SVM on the Stress training split and predict the
# held-out rows.
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
sns.set_theme(style="white", palette=None)
# Compare true vs. predicted labels, same call as in the Depression and
# Anxiety sections (the arguments here were previously garbled and the cell
# raised a NameError).
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [None]:
# Rank the features by the absolute weight in the first row of the fitted
# coefficient matrix and show the top 14.
# NOTE(review): only coef_[0] is used; if the target has more than two classes
# the remaining rows of coef_ are ignored here - confirm this is intended.
svm_imp = (
    pd.Series(svm.coef_[0], index=X.columns)
    .abs()
    .sort_values(ascending=False)
)
svm_imp.head(14)

### Recursive Feature Elimination

Since a linear SVM can predict the labels perfectly, we can use recursive feature elimination to see whether the list of questions for each condition can be shortened.

In [None]:
# Question numbers that belong to each DASS subscale (14 items per subscale).
DASS_keys = {
    'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
}

In [None]:
# Build a regex alternation of the Depression item numbers and keep only the
# matching 'Q<n>A' answer columns; the target is the depression category.
to_str = '|'.join(map(str, DASS_keys['Depression']))
X_dep = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_dep = df1['Depression_cat']

In [None]:
# Build a regex alternation of the Anxiety item numbers and keep only the
# matching 'Q<n>A' answer columns; the target is the anxiety category.
to_str = '|'.join(map(str, DASS_keys['Anxiety']))
X_anx = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_anx = df1['Anxiety_cat']

In [None]:
# Build a regex alternation of the Stress item numbers and keep only the
# matching 'Q<n>A' answer columns; the target is the stress category.
to_str = '|'.join(map(str, DASS_keys['Stress']))
X_str = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_str = df1['Stress_cat']

In [None]:
# Run cross-validated recursive feature elimination (RFECV) once per subscale
# and collect the fitted selectors in `results`
# (order: Depression, Anxiety, Stress).
# Note: the loop rebinds the notebook-level names X and y on every iteration.
results = []
min_features_to_select = 7  # Minimum number of features to consider
for X, y in zip([X_dep, X_anx, X_str], [y_dep, y_anx, y_str]):

    # Linear SVM with class weighting; RFECV removes one feature per step and
    # scores every feature subset with 3-fold stratified CV.
    svc = SVC(kernel="linear", C=0.1, class_weight='balanced')
    # "balanced_accuracy" scoring is the recall averaged over the classes.
    rfecv = RFECV(
        estimator=svc,
        step=1,
        cv=StratifiedKFold(3),
        scoring="balanced_accuracy",
        min_features_to_select=min_features_to_select,
        n_jobs=-1,
    )
    rfecv.fit(X, y)
    results.append(rfecv)
    print('loop finished')
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Mean CV score per candidate feature count. `cv_results_` replaces the
    # `grid_scores_` attribute, which was deprecated in scikit-learn 1.0 and
    # removed in 1.2.
    mean_scores = rfecv.cv_results_["mean_test_score"]

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (balanced accuracy)")
    plt.plot(
        range(min_features_to_select, len(mean_scores) + min_features_to_select),
        mean_scores,
    )
    plt.show()

### Bin the label columns more coarsely

In [None]:
# Work on an independent (deep) copy so re-binning the labels leaves df1 untouched.
data = df1.copy(deep=True)

In [None]:
# Collapse the five category codes into three: 1 is merged into 0 and 3 is
# merged into 4, leaving the codes 0, 2 and 4.
# The result is assigned back to the frame instead of calling
# `data[col].replace(..., inplace=True)`: that chained in-place form operates
# on a temporary Series and does not update `data` under pandas copy-on-write.
label_cols = ['Stress_cat', 'Anxiety_cat', 'Depression_cat']
data[label_cols] = data[label_cols].replace({1: 0, 3: 4})

In [None]:
# Labels for the three remaining category codes after re-binning.
severity = dict(zip((0, 2, 4), ('Normal-Mild', 'Moderate', 'Severe')))

In [None]:
# Count the respondents per depression category (non-null 'Q1A' answers),
# attach the readable severity label, and express each count as a percentage
# of the total.
group_dep = (
    data.groupby('Depression_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'counts'})
    .assign(
        Depression_Severity=lambda d: d['Depression_cat'].replace(severity),
        Percentage=lambda d: d['counts'] / d['counts'].sum() * 100,
    )
)

In [None]:
# Bar chart of the percentage of respondents per depression severity group,
# saved as a PNG for the presentation.
plt.figure(figsize=(10, 5))
sns.set_theme(font_scale=1.5)
sns.barplot(
    x='Depression_Severity',
    y='Percentage',
    data=group_dep,
    palette='Oranges',
)
plt.xlabel('Depression')
plt.savefig('../presentation/depression_bars.png', dpi=200)

In [None]:
# Features from the re-binned frame: every column up to and including 'Q42A'.
X = data.loc[:, slice(None, 'Q42A')]

In [None]:
# Depression items only, paired with the re-binned depression label.
to_str = '|'.join(map(str, DASS_keys['Depression']))
X_dep = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_dep = data['Depression_cat']

In [None]:
# Anxiety items only, paired with the re-binned anxiety label.
to_str = '|'.join(map(str, DASS_keys['Anxiety']))
X_anx = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_anx = data['Anxiety_cat']

In [None]:
# Stress items only, paired with the re-binned stress label.
to_str = '|'.join(map(str, DASS_keys['Stress']))
X_str = X.filter(regex=f'^Q({to_str})A', axis='columns')
y_str = data['Stress_cat']

In [None]:
# Repeat the cross-validated recursive feature elimination on the re-binned
# (three-level) labels and collect the fitted selectors in `results_reduced`
# (order: Depression, Anxiety, Stress).
# Note: the loop rebinds the notebook-level names X and y on every iteration.
results_reduced = []
min_features_to_select = 7  # Minimum number of features to consider
for X, y in zip([X_dep, X_anx, X_str], [y_dep, y_anx, y_str]):

    # Linear SVM with class weighting; RFECV removes one feature per step and
    # scores every feature subset with 3-fold stratified CV.
    svc = SVC(kernel="linear", C=0.1, class_weight='balanced')
    # "balanced_accuracy" scoring is the recall averaged over the classes.
    rfecv = RFECV(
        estimator=svc,
        step=1,
        cv=StratifiedKFold(3),
        scoring="balanced_accuracy",
        min_features_to_select=min_features_to_select,
        n_jobs=-1,  # the keyword is `n_jobs`; `njobs` raises a TypeError
    )
    rfecv.fit(X, y)
    # Store in this section's own list rather than the `results` list of the
    # earlier five-level run.
    results_reduced.append(rfecv)
    print('loop finished')
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Mean CV score per candidate feature count. `cv_results_` replaces the
    # `grid_scores_` attribute, which was deprecated in scikit-learn 1.0 and
    # removed in 1.2.
    mean_scores = rfecv.cv_results_["mean_test_score"]

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (balanced accuracy)")
    plt.plot(
        range(min_features_to_select, len(mean_scores) + min_features_to_select),
        mean_scores,
    )
    plt.show()