In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import missingno as msno
from collections import Counter
from itertools import chain, combinations
import sklearn as sk
from sklearn.metrics import accuracy_score, f1_score

# Performance comparison

In this notebook we compare the performance of multiple models on different subsets of our data.
#### The models are:
+ Logistic Regression
+ SVM
+ KNN
+ Neural Network
+ XGBoost

#### The datasets:
+ Mean/Mode imputed
+ KNN imputed
+ MICE imputed
+ no imputations

#### The targets:
+ has_dep_diag
+ a binary combination of all the target variables

#### In combinations of:
+ trained on balanced, tested on balanced
+ trained on balanced, tested on imbalanced
+ trained on imbalanced, tested on balanced
+ trained on imbalanced, tested on imbalanced

#### We employ a range of visualisation methods:
+ ROC curves
+ bar plots
+ learning curves

The imputed datasets:
+ 0 - unedited
+ 1 - Mean/Mode
+ 2/3? - drop NaN
+ 4 - K-Means
+ 5 - MICE

In [2]:
# Load the unedited dataset plus the five imputed variants.
# Every imputed file carries an 'Unnamed: 0' column that is dropped on load
# (presumably the index written out when the file was saved -- confirm).
data0 = pd.read_csv("newdata3.csv", engine="python")
data1 = pd.read_csv("imputed_dataset_1.csv", engine="python").drop(columns=["Unnamed: 0"])
data2 = pd.read_csv("imputed_dataset_2.csv", engine="python").drop(columns=["Unnamed: 0"])
data3 = pd.read_csv("imputed_dataset_3.csv", engine="python").drop(columns=["Unnamed: 0"])
data4 = pd.read_csv("imputed_dataset_4.csv", engine="python").drop(columns=["Unnamed: 0"])
data5 = pd.read_csv("imputed_dataset_5.csv", engine="python").drop(columns=["Unnamed: 0"])

### dataset 5 - combined variable (dep)

In [3]:
# Work on a copy so `data` keeps the original diagnosis codes.
data = pd.read_csv("imputed_dataset_5.csv", engine="python").drop(columns=["Unnamed: 0"])
dep_data = data.copy()

# Codes mapped to 1; every other code in 0..12 is mapped to 0.
# (Presumably the depression-related diagnosis codes -- confirm against the codebook.)
dep_num = np.array([12, 11, 10, 4])
no_dep_num = np.setdiff1d(np.arange(13), dep_num)

# Order matters: 0 and 1 are themselves members of no_dep_num, so the
# "-> 0" replacement has to run before the "-> 1" replacement; otherwise the
# freshly written 1s would be turned back into 0s.
_diag_cols = ['prim_diag', 'secd_diag']
_diag_flags = dep_data[_diag_cols].replace(list(no_dep_num), 0)
dep_data[_diag_cols] = _diag_flags
_diag_flags = dep_data[_diag_cols].replace(list(dep_num), 1)
dep_data[_diag_cols] = _diag_flags

In [4]:
# Target-related columns: any column whose name contains 'dep', 'diag' or
# 'panic'. Taken from `data`, so the combined 'dep' label added to `dep_data`
# below is not part of this list.
dep = [x for x in data.columns if 'dep' in x or 'diag' in x or 'panic' in x]

# Combined binary label: 1 when either diagnosis flag is set, else 0.
# Assumes prim_diag / secd_diag already hold 0/1 flags, so their sum is 0..2;
# sums of 2 or 3 are folded back to 1.
# has_dep_diag enters with weight 0: it adds nothing for finite values, but a
# NaN in that column still makes the label NaN.
# NOTE(review): confirm whether has_dep_diag was meant to count towards the label.
dep_data['dep'] = (
    dep_data['secd_diag'] + dep_data['prim_diag'] + 0 * dep_data['has_dep_diag']
).replace(range(2, 4), 1)

# Complete cases only (rows with any NaN are dropped).
full = dep_data.dropna(axis=0, how='any')

In [25]:
# Split the complete cases by the combined label. `size` is 80% of the smaller
# class, i.e. the per-class count a balanced draw would use:
#   pd.concat([has_dep.sample(size), no_dep.sample(size)])
has_dep = full.query('dep == 1')
no_dep = full.query('dep == 0')
size = int(np.round(0.8 * min(len(has_dep), len(no_dep))))

# Working sample drawn from all complete cases (original class ratio).
# The draw is seeded so a re-run selects the same rows, and capped at
# len(full) because sampling more rows than exist (without replacement) raises.
SAMPLE_SIZE = 8000
SAMPLE_SEED = 0
sample = (
    full.sample(n=min(SAMPLE_SIZE, len(full)), random_state=SAMPLE_SEED)
    .sort_index()
    .reset_index(drop=True)
)

# Features: everything except the target-related columns in `dep` and the
# combined 'dep' label itself.
X_comb = sample.drop(columns=dep).drop(columns='dep')
Y_comb = sample['dep']

### dataset 5 - has_dep_diag

In [26]:
# Features for the has_dep_diag target: drop the target itself and the other
# diagnosis / depression / panic columns in a single call.
X_hasdep = data5.drop(
    columns=['has_dep_diag', 'secd_diag', 'prim_diag', 'dep_score', 'dep_thoughts', 'panic_score']
)
# Report the shape of the frame just built (not of an unrelated `X`).
print(X_hasdep.shape)
Y_hasdep = np.array(data5['has_dep_diag'])
print(Y_hasdep.shape)

(13734, 36)
(13734,)


### Over/undersampling to obtain imbalanced and balanced datasets

In [11]:
import imblearn
from imblearn.over_sampling import SMOTE

# One resampler, fixed seed, shared by the variants below.
smote = SMOTE(random_state = 0)

# Variant 1 -- labelled "combined, imbalanced (original ratio), undersampling".
# Inputs are the features of `sample` (target-related columns and the combined
# 'dep' label removed) and the 'dep' label.
# NOTE(review): SMOTE is imported from imblearn.over_sampling, so this looks
# like an oversampling step rather than the undersampling the label and the
# variable names suggest -- confirm the intended resampler.
X_combined_imb_und, y_combained_imb_und = smote.fit_resample(
    sample.drop(columns=dep).drop(columns='dep'),
    sample['dep'],
)

# TODO: remaining variants, not implemented yet:
#   - combined, imbalanced (original ratio), oversampling
#   - combined, balanced, undersampling
#   - combined, balanced, oversampling
#   - has_dep_diag, imbalanced (original ratio), undersampling
#   - has_dep_diag, imbalanced (original ratio), oversampling
#   - has_dep_diag, balanced, undersampling
#   - has_dep_diag, balanced, oversampling

### Train/test split function

In [23]:
from random import shuffle
from sklearn.model_selection import train_test_split

def _take_rows(values, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    if hasattr(values, "iloc"):
        return values.iloc[positions]
    return np.asarray(values)[positions]


def shuffle_dataset(N, X, y, X_shuffled=None, y_shuffled=None, seed=None):
    """Return the first N rows of X and y in one common random order.

    Parameters
    ----------
    N : int
        Number of rows to permute (positions 0..N-1).
    X, y : pandas object or array-like
        Features and labels; both are reordered with the same permutation,
        so row i of the shuffled X still belongs to row i of the shuffled y.
    X_shuffled, y_shuffled : ignored
        Kept only so existing five-argument calls keep working. Rebinding a
        parameter inside a function never reaches the caller, so these can
        not be used as output slots; use the return value instead.
    seed : int or None
        When given, the permutation is reproducible; when None the global
        `random.shuffle` is used.

    Returns
    -------
    (X_shuffled, y_shuffled)
    """
    ind_list = list(range(N))
    if seed is None:
        shuffle(ind_list)
    else:
        from random import Random
        Random(seed).shuffle(ind_list)
    return _take_rows(X, ind_list), _take_rows(y, ind_list)


def split_dataset(split, N, X, y, seed=40):
    """Shuffle the first N rows of X and y, then split them into train and test parts.

    Parameters
    ----------
    split : float
        Passed to `train_test_split` as `test_size`.
    N : int
        Number of rows to use (see `shuffle_dataset`).
    X, y : pandas object or array-like
        Features and labels. The function's own `y` argument is used, not a
        notebook-level variable.
    seed : int
        Seeds both the shuffle and `train_test_split` (default 40).

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    X_shuffled, y_shuffled = shuffle_dataset(N, X, y, seed=seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X_shuffled, y_shuffled, test_size=split, random_state=seed
    )
    return X_train, X_test, y_train, y_test

In [30]:
# 80/20 train/test split of the combined-target sample, using every row.
X_train_a, X_test_a, y_train_a, y_test_a = split_dataset(
    0.2, Y_comb.size, X_comb, Y_comb
)
# Shapes, one per line: features (train, test), then labels (train, test).
print(X_train_a.shape, X_test_a.shape, sep="\n")
print(y_train_a.shape, y_test_a.shape, sep="\n")

(6400, 35)
(1600, 35)
(6400,)
(1600,)


## Models

### Logistic Regression

### KNN Classifier

### SVM

### FNN

### XGBoost