In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the mounted input directory and print the full path of
# every file found, one per line.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the tab-delimited DASS data file. The separator is written as the
# two-character pattern r'\t', which pandas interprets as a regular expression;
# regex separators are only supported by the Python parsing engine, hence
# engine='python'.
df = pd.read_csv('../input/depression-anxiety-stress-scales/DASS_data_21.02.19/data.csv', sep=r'\t', engine='python')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Scoring key: the question numbers (1-42) that belong to each subscale,
# 14 questions per subscale.
DASS_keys = {
    'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
}

# Score ranges per subscale as (lower, upper) pairs, one pair per severity
# band. `append` treats each pair as lower-inclusive / upper-exclusive and
# assigns a fifth band to anything at or above the last upper bound.
DASS_bins = {
    'Depression': [(0, 10), (10, 14), (14, 21), (21, 28)],
    'Anxiety': [(0, 8), (8, 10), (10, 15), (15, 20)],
    'Stress': [(0, 15), (15, 19), (19, 26), (26, 34)],
}

In [None]:
# Keep only the columns whose name contains 'Q', one or two digits, then 'A'
# (the answer columns). The pattern is a raw string so that `\d` reaches the
# regex engine unchanged; in a normal string literal `\d` is an invalid escape
# sequence that Python warns about.
only_q = df.filter(regex=r'Q\d{1,2}A')

In [None]:
import seaborn as sn
def draw_freq_plot(df):
    """Draw a bar chart of how often each answer value occurs in ``df``.

    Every cell of ``df`` is inspected; the values 1, 2, 3 and 4 are counted
    over the whole frame and shown as four bars labelled 0, 1, 2 and 3
    (i.e. the answer value minus one). Nothing is returned.
    """
    # One count per answer value, taken over all rows and columns.
    level_counts = [(df == answer).sum().sum() for answer in range(1, 5)]
    axes = sn.barplot(x=[0, 1, 2, 3], y=level_counts)
    axes.set_xlabel('Level')
    axes.set_ylabel('Frequency')

In [None]:
def sub(df):
    """Return a new frame equal to ``df`` with 1 subtracted from every cell.

    ``df`` itself is left unchanged. The notebook uses this to shift the
    answer columns down by one.
    """
    return df.sub(1, axis='columns')

In [None]:
# Answer-column names ('Q<number>A') for each subscale, in the order the
# question numbers are listed in DASS_keys.
dep = ['Q' + str(num) + 'A' for num in DASS_keys["Depression"]]
stress = ['Q' + str(num) + 'A' for num in DASS_keys["Stress"]]
anx = ['Q' + str(num) + 'A' for num in DASS_keys["Anxiety"]]

In [None]:
# Split the answer table into one frame per subscale, selecting columns by
# exact name.
depression_q = only_q.filter(items=dep)
stress_q = only_q.filter(items=stress)
anxiety_q = only_q.filter(items=anx)

In [None]:
# Answer-value frequencies over all stress questions.
draw_freq_plot(stress_q)

In [None]:
# Answer-value frequencies over all depression questions.
draw_freq_plot(depression_q)

In [None]:
# Answer-value frequencies over all anxiety questions.
draw_freq_plot(anxiety_q)

In [None]:
# Shift every answer down by one. Each frame is rebuilt from `only_q` rather
# than from its own previous value, so re-running this cell always yields the
# same result instead of subtracting one more each time.
depression_q = sub(only_q.filter(dep))
stress_q = sub(only_q.filter(stress))
anxiety_q = sub(only_q.filter(anx))

In [None]:
# Column dtypes and non-null counts of the shifted depression answers.
depression_q.info()

In [None]:
# Number of missing values in each depression answer column.
depression_q.isnull().sum()

In [None]:
# Preview the shifted depression answers.
depression_q.head()

In [None]:
def scores(df):
    """Add a ``Scores`` column with each row's total and return ``df``.

    The total is the row-wise sum of every column except ``Scores`` itself,
    so calling the function again on an already-scored frame recomputes the
    same totals instead of adding the previous total into the sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summed per row. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``Scores``.
    """
    item_cols = [name for name in df.columns if name != "Scores"]
    df["Scores"] = df[item_cols].sum(axis=1)
    return df

In [None]:
# Add the per-row total to each subscale frame. scores() writes the 'Scores'
# column into the frame it receives and returns that same object, so every
# train_* name below refers to the same DataFrame as its *_q source.
train_dep = scores(depression_q)
train_str = scores(stress_q)
train_anx = scores(anxiety_q)

In [None]:
# Preview the depression frame with its new 'Scores' column.
train_dep.head()

In [None]:
def append(df, string, bins=None):
    """Add a ``Category`` severity label derived from ``df['Scores']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Scores`` column. It is modified in place.
    string : str
        Subscale name used to look up the cut-offs in ``DASS_bins``
        (ignored for the lookup when ``bins`` is given).
    bins : sequence of (lower, upper) pairs, optional
        Four score ranges, lowest band first. Defaults to
        ``DASS_bins[string]``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying ``Category``.

    A score falls in a band when ``lower <= score < upper``; scores at or
    above the last upper bound are 'Extremely Severe'. Scores matching no
    band (negative or missing) are labelled 'Unknown'.
    """
    if bins is None:
        bins = DASS_bins[string]

    values = ['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely Severe']
    total = df['Scores']

    # One boolean mask per bounded band, plus an open-ended top band.
    conditions = [(total >= lower) & (total < upper) for lower, upper in bins]
    conditions.append(total >= bins[-1][1])

    # An explicit string default is required: np.select otherwise falls back
    # to the integer 0, which recent NumPy versions refuse to combine with
    # string choices (TypeError) and older ones silently turned into '0'.
    df['Category'] = np.select(conditions, values, default='Unknown')
    return df
    
# Label every row of the depression frame with its severity band. append()
# adds the 'Category' column to the frame it is given and returns that frame.
train_dep = append(train_dep, 'Depression')
train_dep.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Detach the target: pop() returns the 'Category' column and removes it from
# train_dep in place, so this cell cannot be run twice in a row.
# NOTE(review): 'Scores' stays in train_dep and is therefore used as a
# feature, although 'Category' is computed directly from it. Confirm that
# this is intended.
cat = train_dep.pop('Category')

In [None]:
# Use 75% of the rows for training and hold out the rest, with a fixed seed
# so the split is repeatable; then report the shape of each part.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_dep,
    cat,
    train_size=0.75,
    random_state=2,
)
print(*(part.shape for part in (Xtrain, ytrain, Xtest, ytest)))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed to ``confusion_matrix``.
    classes : list
        Label values; they fix the row/column order of the matrix and are
        used as tick labels on both axes.
    normalize : bool, default False
        When True, every row is divided by its total so each cell shows the
        fraction of that true class, annotated with two decimals. When
        False, raw integer counts are shown.
    title : str, optional
        Plot title. Defaults to 'Confusion Matrix'.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap for the heat map.

    Returns
    -------
    None
        A new 5x5-inch figure is created and drawn on.
    """
    # Only fall back to the generic title when the caller gave none.
    if title is None:
        title = 'Confusion Matrix'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        # Row-wise fractions; rows with no samples stay at 0 instead of NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots(figsize=(5, 5))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Keep the x tick labels horizontal.
    plt.setp(ax.get_xticklabels(), rotation=0)

    # Annotate every cell; use white text on the darker half of the scale.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    # Switch grid lines off explicitly. The previous `plt.grid(b=None)` only
    # toggled them, and the `b` keyword no longer exists in current Matplotlib.
    ax.grid(False)

In [None]:
from sklearn.metrics import f1_score

try:
    # scikit-learn's estimator-first plotting helper. Importing it replaces
    # the hand-written plot_confusion_matrix defined earlier in the notebook.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # The helper was removed in scikit-learn 1.2. Provide a drop-in with the
    # same call shape so this and the following cells keep working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted ``estimator`` on ``X`` / ``y_true``."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Multinomial naive Bayes with very light smoothing and uniform class priors.
model_d = MultinomialNB(alpha=0.0001, fit_prior=False)
model_d.fit(Xtrain, ytrain)
plot_confusion_matrix(model_d, Xtest, ytest)
predictions = model_d.predict(Xtest)
# Micro-averaged F1 on the held-out rows (displayed as the cell result).
f1_score(ytest, predictions, average='micro')

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest (fixed seed) on the same split. plot_confusion_matrix is
# called estimator-first here, i.e. it relies on the name bound by the
# preceding modelling cell's import.
model1 = RandomForestClassifier(random_state=0).fit(Xtrain, ytrain)
plot_confusion_matrix(model1, Xtest, ytest)
predictions = model1.predict(Xtest)
# Micro-averaged F1 on the held-out rows (displayed as the cell result).
f1_score(y_true=ytest, y_pred=predictions, average='micro')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier on the same split. plot_confusion_matrix is
# called estimator-first here, i.e. it relies on the name bound by an earlier
# modelling cell's import.
model2 = KNeighborsClassifier(n_neighbors=3).fit(Xtrain, ytrain)
plot_confusion_matrix(model2, Xtest, ytest)
predictions = model2.predict(Xtest)
# Micro-averaged F1 on the held-out rows (displayed as the cell result).
f1_score(y_true=ytest, y_pred=predictions, average='micro')