In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

In [None]:
# Load the labelled survey responses; the first CSV column is used as the row index.
data = pd.read_csv(
    '../data/data_labeled.csv',
    index_col=0,
)

In [None]:
# Load the codebook dictionary from JSON into `code`.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which differs on Windows).
with open('../data/codebook_dict.json', 'r', encoding='utf-8') as f:
    code = json.load(f)

In [None]:
# Question numbers (1-42) assigned to each DASS subscale; every number appears
# in exactly one list. Presumably these correspond to the Q<n>A columns of the
# data set -- confirm against the labelling code.
DASS_keys = dict(
    Depression=[3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    Anxiety=[2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    Stress=[1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
)

# Per-subscale (lower, upper) score boundaries, four pairs per subscale with each
# upper bound equal to the next lower bound.
# NOTE(review): looks like half-open ranges for the first four severity levels,
# with anything above the last bound falling into the fifth -- confirm.
DASS_bins = dict(
    Depression=[(0, 10), (10, 14), (14, 21), (21, 28)],
    Anxiety=[(0, 8), (8, 10), (10, 15), (15, 20)],
    Stress=[(0, 15), (15, 19), (19, 26), (26, 34)],
)
             

In [None]:
# Integer category code -> human-readable severity label (codes 0 through 4).
severity = dict(enumerate(['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely severe']))

In [None]:
# Names of the columns to discard: any column whose name contains
# "Q<1-2 digits>I" or "Q<1-2 digits>E", or contains "VCL".
# The pattern is a raw string so that `\d` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
to_drop = data.filter(regex=r'Q\d{1,2}[IE]|VCL.*', axis=1).columns.to_list()

In [None]:
# Working frame: first remove the regex-selected columns collected in `to_drop`,
# then the timing / network / device / demographic metadata columns listed below.
df1 = (
    data
    .drop(columns=to_drop)
    .drop(columns=[
        'introelapse',
        'testelapse',
        'surveyelapse',
        'uniquenetworklocation',
        'screensize',
        'hand',
        'country',
        'source',
    ])
)

### Barplots

In [None]:
# Respondents per depression category: count the non-null Q1A answers in each
# `Depression_cat` group, then attach the severity label and the share in percent.
group_dep = (
    df1.groupby('Depression_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'counts'})
)
group_dep['Depression_Severity'] = group_dep['Depression_cat'].replace(severity)
group_dep['Percentage'] = group_dep['counts'] / group_dep['counts'].sum() * 100

In [None]:
# Bar chart of the percentage of respondents per depression severity label,
# written to the presentation folder.
fig = plt.figure(figsize=(10, 5))
sns.set(font_scale=1.5)  # enlarge fonts before barplot creates the axes
ax = sns.barplot(
    x='Depression_Severity',
    y='Percentage',
    data=group_dep,
    palette='Oranges',
)
ax.set_xlabel('Depression')
fig.savefig('../presentation/depression_bars.png', dpi=200)

In [None]:
# Respondents per anxiety category: count the non-null Q1A answers in each
# `Anxiety_cat` group, then attach the severity label and the share in percent.
# (The variable name `group_dep` is reused; it now holds the anxiety summary.)
group_dep = (
    df1.groupby('Anxiety_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'counts'})
)
group_dep['Anxiety_Severity'] = group_dep['Anxiety_cat'].replace(severity)
group_dep['Percentage'] = group_dep['counts'] / group_dep['counts'].sum() * 100

In [None]:
# Bar chart of the percentage of respondents per anxiety severity label,
# written to the presentation folder.
fig = plt.figure(figsize=(10, 5))
sns.set(font_scale=1.5)  # enlarge fonts before barplot creates the axes
ax = sns.barplot(
    x='Anxiety_Severity',
    y='Percentage',
    data=group_dep,
    palette='Oranges',
)
ax.set_xlabel('Anxiety')
fig.savefig('../presentation/anxiety_bars.png', dpi=200)

In [None]:
# Respondents per stress category: count the non-null Q1A answers in each
# `Stress_cat` group, then attach the severity label and the share in percent.
# (The variable name `group_dep` is reused; it now holds the stress summary.)
group_dep = (
    df1.groupby('Stress_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'counts'})
)
group_dep['Stress_Severity'] = group_dep['Stress_cat'].replace(severity)
group_dep['Percentage'] = group_dep['counts'] / group_dep['counts'].sum() * 100

In [None]:
# Bar chart of the percentage of respondents per stress severity label,
# written to the presentation folder.
fig = plt.figure(figsize=(10, 5))
sns.set(font_scale=1.5)  # enlarge fonts before barplot creates the axes
ax = sns.barplot(
    x='Stress_Severity',
    y='Percentage',
    data=group_dep,
    palette='Oranges',
)
ax.set_xlabel('Stress')
fig.savefig('../presentation/stress_bars.png', dpi=200)

## Correlation Map between Depression, Stress and Anxiety

In [None]:
# Keep only the three category columns for the correlation analysis.
df_cor = df1.loc[:, ['Depression_cat', 'Anxiety_cat', 'Stress_cat']]

In [None]:
# Annotated heatmap of the pairwise correlations between the three category
# columns, with the colour scale fixed to [-1, 1]; saved for the presentation.
plt.figure()
heatmap = sns.heatmap(
    df_cor.corr(),
    annot=True,
    vmin=-1,
    vmax=1,
)
plt.savefig('../presentation/DAS_cmap.png', dpi=200)

Maybe we can try to predict, e.g., anxiety from the depression questions.

## Correlation Map between Depression, Stress, Anxiety and TIPI questions

In [None]:
# TIPI item columns (label slice TIPI1..TIPI10) placed side by side with the
# three category columns held in `df_cor`.
df_tipi = pd.concat(
    [df1.loc[:, 'TIPI1':'TIPI10'], df_cor],
    axis=1,
)

In [None]:
# Preview the first two rows of the combined TIPI / category frame.
df_tipi.head(n=2)

In [None]:
# Annotated heatmap of the pairwise correlations between the TIPI items and the
# three category columns, with the colour scale fixed to [-1, 1]; saved for the
# presentation.
plt.figure(figsize=(15, 12))
heatmap = sns.heatmap(
    df_tipi.corr(),
    annot=True,
    cmap='Oranges',
    vmin=-1,
    vmax=1,
)
plt.savefig('../presentation/TIPI_cmap.png', dpi=200)

### Average reply score by question

In [None]:
# Mean answer per question column (label slice Q1A..Q42A), sorted ascending,
# as a two-column frame: `question` (column name) and `mean_answer`.
qu_mean = (
    df1.loc[:, 'Q1A':'Q42A']
    .mean()
    .sort_values()
    .reset_index()
    .rename(columns={'index': 'question', 0: 'mean_answer'})
)

In [None]:
# Preview the two questions with the lowest mean answer.
qu_mean.head(n=2)

In [None]:
# Bar chart of the mean answer per question, in the ascending order of `qu_mean`,
# with a horizontal line at the overall mean of the per-question means.
plt.figure(figsize=(12, 8))
# One colour per bar: green for the 6 lowest-scoring questions, red for the
# 2 highest, grey for everything in between. The grey count is derived from the
# frame so the palette always matches the number of bars (34 for 42 questions).
first = ['g'] * 6 + ['grey'] * (len(qu_mean) - 6 - 2) + ['r'] * 2
ax = sns.barplot(data=qu_mean, x='question', y='mean_answer', palette=first)
# Average the numeric column directly: `qu_mean.mean()` would also try to
# average the string `question` column (a TypeError on pandas >= 2), and `[0]`
# relied on deprecated positional indexing of a label-indexed Series.
ax.axhline(y=qu_mean['mean_answer'].mean())
plt.xticks(rotation=60);  # trailing ';' hides the (locs, labels) repr in the cell output

In [None]:
# Overall mean of the per-question mean answers (the value marked by the
# horizontal line in the bar chart). The previous expression ended in a
# truncated attribute access (`.va`) and raised AttributeError.
qu_mean['mean_answer'].mean()

In [None]:
# Scratch cell: list-concatenation check (evaluates to [2, 3, 2]); its result is
# not assigned to anything. Candidate for deletion.
[2,3] + [2]