In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import missingno as msno
from collections import Counter
from itertools import chain, combinations
import sklearn as sk
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# Performance comparison

In this notebook we compare the performance of multiple models on different subsets of our data.
#### The models are:
+ Logistic Regression
+ SVM
+ KNN
+ Neural Network
+ XGBoost

#### The datasets:
+ Mean/Mode imputed
+ KNN imputed
+ MICE imputed
+ no imputations

#### The targets:
+ has_dep_diag
+ a binary combination of all the target variables

#### In combinations of:
+ trained on balanced, tested on balanced
+ trained on balanced, tested on imbalanced
+ trained on imbalanced, tested on balanced
+ trained on imbalanced, tested on imbalanced

#### We employ  range of visualisation methods:
+ ROC curves
+ bar plots
+ learning curves

The iputed datasets:
+ 0 - unedited
+ 1 - Mean/Mode
+ 2/3? - drop NaN
+ 4 - K-Means
+ 5 - MICE

In [25]:
data0 = pd.read_csv("newdata3.csv", engine='python')
data1 = pd.read_csv("imputed_dataset_1.csv", engine='python').drop(['Unnamed: 0'], axis = 1)
data2 = pd.read_csv("imputed_dataset_2.csv", engine='python').drop(['Unnamed: 0'], axis = 1)
data3 = pd.read_csv("imputed_dataset_3.csv", engine='python').drop(['Unnamed: 0'], axis = 1)
data4 = pd.read_csv("imputed_dataset_4.csv", engine='python').drop(['Unnamed: 0'], axis = 1)
data5 = pd.read_csv("imputed_dataset_5.csv", engine='python').drop(['Unnamed: 0'], axis = 1)

### dataset 5 - combined variable (dep)

In [26]:
data = pd.read_csv("imputed_dataset_5.csv", engine='python').drop(['Unnamed: 0'], axis = 1)
dep_data = data.copy()

dep_num = np.array([12, 11, 10, 4])
no_dep_num = np.setdiff1d(range(13), dep_num)

dep_data[['prim_diag', 'secd_diag']] = dep_data[['prim_diag', 'secd_diag']].replace(list(no_dep_num), 0)
dep_data[['prim_diag', 'secd_diag']] = dep_data[['prim_diag', 'secd_diag']].replace(list(dep_num), 1)

In [27]:
full = data.dropna(axis = 0, how = 'any')
dep = [x for x in data.columns if 'dep' in x or 'diag' in x or 'panic' in x]
dep_data['dep'] = dep_data['secd_diag'] + dep_data['prim_diag'] + 0 * dep_data['has_dep_diag']
dep_data['dep'] = dep_data['dep'].replace(range(2, 4), 1)
full = dep_data.dropna(axis = 0, how = 'any')
dep = [x for x in data.columns if 'dep' in x or 'diag' in x or 'panic' in x]

In [28]:
has_dep = full.query('dep == 1')
no_dep = full.query('dep == 0')
size = int(np.round(0.8 * min(len(has_dep), len(no_dep))))
#sample = pd.concat([has_dep.sample(size), no_dep.sample(size)])
sample = full.sample(8000)

sample = sample.sort_index()
sample = sample.reset_index(drop = True)


X_comb = sample.drop(dep, axis = 1).drop('dep', axis = 1)
Y_comb = sample['dep']
print('dataset shape %s' % Counter(Y_comb))

#sample = full.sample(8000)
#smote = SMOTE(random_state = 0)
#X, y = smote.fit_resample(sample.drop(dep, axis = 1).drop('dep', axis = 1), sample['dep'])

dataset shape Counter({0.0: 7723, 1.0: 277})


### dataset 5 - has_dep_diag

In [29]:
X_hasdep = data5.drop(['has_dep_diag'],axis=1).drop(['secd_diag'],axis=1).drop(['prim_diag'],axis=1).drop(['dep_score'],axis=1).drop(['dep_thoughts'],axis=1).drop(['panic_score'], axis=1)
print(X_hasdep.shape)
Y_hasdep = np.array(data5['has_dep_diag'])
print(Y_hasdep.shape)
print('dataset shape %s' % Counter(Y_hasdep))

(13734, 36)
(13734,)
dataset shape Counter({0.0: 13344, 1.0: 390})


### Over/undersampling to obtain imbalanced and balanced datasets

In [30]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN 
from imblearn import under_sampling
from imblearn.ensemble import RUSBoostClassifier

nb = 1000
nl = 1880
ns = 120

rusboost = RUSBoostClassifier(sampling_strategy={1.0: 400, 0.0: 400}, random_state=42)

#combined, imbalanced (original ratio), undersampling
smoteenn = SMOTEENN(random_state = 42, sampling_strategy=1.0)
rus = under_sampling.RandomUnderSampler(sampling_strategy={1.0: nl, 0.0: ns}, random_state=42)
X_combined_imb, Y_combined_imb = smoteenn.fit_resample(X_comb, Y_comb)
X_combined_imb, Y_combined_imb = rus.fit_resample(X_combined_imb, Y_combined_imb)
print(X_combined_imb.shape)
print(Y_combined_imb.shape)
print('Resampled dataset shape %s' % Counter(Y_combined_imb))

#combined, imbalanced (original ratio), oversampling

#combined, balanced, undersampling
smoteenn = SMOTEENN(random_state = 42, sampling_strategy=1.0)
rus = under_sampling.RandomUnderSampler(sampling_strategy={1.0: nb, 0.0: nb}, random_state=42)
X_combined_b, Y_combined_b = smoteenn.fit_resample(X_comb, Y_comb)
X_combined_b, Y_combined_b = rus.fit_resample(X_combined_b, Y_combined_b)
#rusboost.fit(X_combined_b, Y_combined_b)
#Y_combined_b = rusboost.predict(X_combined_b)
print(X_combined_b.shape)
print(Y_combined_b.shape)
print('Resampled dataset shape %s' % Counter(Y_combined_b))

#combined, balanced, oversampling

#has_dep_diag, imbalanced (original ratio), undersampling
smoteenn = SMOTEENN(random_state = 42, sampling_strategy=1.0)
rus = under_sampling.RandomUnderSampler(sampling_strategy={1.0: nl, 0.0: ns}, random_state=42)
X_hasdep_imb, Y_hasdep_imb = smoteenn.fit_resample(X_hasdep, Y_hasdep)
X_hasdep_imb, Y_hasdep_imb = rus.fit_resample(X_hasdep_imb, Y_hasdep_imb)
print(X_hasdep_imb.shape)
print(Y_hasdep_imb.shape)
print('Resampled dataset shape %s' % Counter(Y_hasdep_imb))

#has_dep_diag, imbalanced (original ratio), oversampling

#has_dep_diag, balanced, undersampling
smoteenn = SMOTEENN(random_state = 42, sampling_strategy=1.0)
rus = under_sampling.RandomUnderSampler(sampling_strategy={1.0: nb, 0.0: nb}, random_state=42)
X_hasdep_b, Y_hasdep_b = smoteenn.fit_resample(X_hasdep, Y_hasdep)
X_hasdep_b, Y_hasdep_b = rus.fit_resample(X_hasdep_b, Y_hasdep_b)
print(X_hasdep_b.shape)
print(Y_hasdep_b.shape)
print('Resampled dataset shape %s' % Counter(Y_hasdep_b))

#has_dep_diag, balanced, oversampling

(2000, 35)
(2000,)
Resampled dataset shape Counter({1.0: 1880, 0.0: 120})
(2000, 35)
(2000,)
Resampled dataset shape Counter({0.0: 1000, 1.0: 1000})
(2000, 36)
(2000,)
Resampled dataset shape Counter({1.0: 1880, 0.0: 120})
(2000, 36)
(2000,)
Resampled dataset shape Counter({0.0: 1000, 1.0: 1000})


### Train/test split function

In [31]:
from random import shuffle
from sklearn.model_selection import train_test_split

def shuffle_dataset(N, X, y, X_shuffled, y_shuffled):
    ind_list = [i for i in range(N)]
    shuffle(ind_list)
    X_shuffled  = X.iloc[ind_list]
    y_shuffled = y.iloc[ind_list]
    
def split_dataset(split, N, X, y):
    X_shuffled = X
    y_shuffled = y
    shuffle_dataset(N, pd. DataFrame(X), pd. DataFrame(y), pd. DataFrame(X_shuffled), pd. DataFrame(y_shuffled))
    X_train, X_test, y_train, y_test = train_test_split(X_shuffled, y_shuffled,test_size=split, random_state=40)
    return X_train, X_test, y_train, y_test

In [32]:
#combined, imbalanced (original ratio), undersampling
X_train_comb_imb, X_test_comb_imb, y_train_comb_imb, y_test_comb_imb = split_dataset(0.2, Y_combined_imb.size, X_combined_imb, Y_combined_imb)
print(X_train_comb_imb.shape); print(X_test_comb_imb.shape)
print(y_train_comb_imb.shape); print(y_test_comb_imb.shape)

#combined, balanced, undersampling
X_train_comb_b, X_test_comb_b, y_train_comb_b, y_test_comb_b = split_dataset(0.2, Y_combined_b.size, X_combined_b, Y_combined_b)
print(X_train_comb_b.shape); print(X_test_comb_b.shape)
print(y_train_comb_b.shape); print(y_test_comb_b.shape)

#has_dep_diag, imbalanced (original ratio), undersampling
X_train_hasdep_imb, X_test_hasdep_imb, y_train_hasdep_imb, y_test_hasdep_imb = split_dataset(0.2, Y_hasdep_imb.size, X_hasdep_imb, Y_hasdep_imb)
print(X_train_hasdep_imb.shape); print(X_test_hasdep_imb.shape)
print(y_train_hasdep_imb.shape); print(y_test_hasdep_imb.shape)

#has_dep_diag, balanced, undersampling
X_train_hasdep_b, X_test_hasdep_b, y_train_hasdep_b, y_test_hasdep_b = split_dataset(0.2, Y_hasdep_b.size, X_hasdep_b, Y_hasdep_b)
print(X_train_hasdep_b.shape); print(X_test_hasdep_b.shape)
print(y_train_hasdep_b.shape); print(y_test_hasdep_b.shape)

(1600, 35)
(400, 35)
(1600,)
(400,)
(1600, 35)
(400, 35)
(1600,)
(400,)
(1600, 36)
(400, 36)
(1600,)
(400,)
(1600, 36)
(400, 36)
(1600,)
(400,)


## Models

### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty = 'l2', max_iter = 1000, C = 1e-07, solver = 'newton-cg')

#combined variable

#train imbalanced, test imbalanced
clf.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = clf.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train imbalanced, test balanced
clf.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = clf.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))

#train balanced, test imbalanced
clf.fit(X_train_comb_b, y_train_comb_b)
y_pred = clf.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train balanced, test balanced
clf.fit(X_train_comb_b, y_train_comb_b)
y_pred = clf.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))


#has_dep_diag

#train imbalanced, test imbalanced
clf.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = clf.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train imbalanced, test balanced
clf.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = clf.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

#train balanced, test imbalanced
clf.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = clf.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train balanced, test balanced
clf.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = clf.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

0.4865211810012837
0.3464052287581699
0.0498812351543943
0.31972789115646255
0.4865211810012837
0.3464052287581699
0.0498812351543943
0.31972789115646255


### KNN Classifier

In [34]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5) #, metric='euclidean')

#combined variable

#train imbalanced, test imbalanced
knn.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = knn.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train imbalanced, test balanced
knn.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = knn.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))

#train balanced, test imbalanced
knn.fit(X_train_comb_b, y_train_comb_b)
y_pred = knn.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train balanced, test balanced
knn.fit(X_train_comb_b, y_train_comb_b)
y_pred = knn.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))


#has_dep_diag

#train imbalanced, test imbalanced
knn.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = knn.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train imbalanced, test balanced
knn.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = knn.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

#train balanced, test imbalanced
knn.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = knn.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train balanced, test balanced
knn.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = knn.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

0.912739965095986
0.8262014483212639
0.9008182494421026
0.8815775924209659
0.9447437491366211
0.8031862549535893
0.9113475177304965
0.8876375801220664


### SVM

In [35]:
from sklearn import svm
from sklearn.svm import SVC

svm = SVC(C=5, gamma='auto', kernel='rbf')

#combined variable

#train imbalanced, test imbalanced
svm.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = svm.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train imbalanced, test balanced
svm.fit(X_train_comb_imb, y_train_comb_imb)
y_pred = svm.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))

#train balanced, test imbalanced
svm.fit(X_train_comb_b, y_train_comb_b)
y_pred = svm.predict(X_test_comb_imb)
print(f1_score(y_pred, y_test_comb_imb, average = 'macro'))

#train balanced, test balanced
svm.fit(X_train_comb_b, y_train_comb_b)
y_pred = svm.predict(X_test_comb_b)
print(f1_score(y_pred, y_test_comb_b, average = 'macro'))


#has_dep_diag

#train imbalanced, test imbalanced
svm.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = svm.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train imbalanced, test balanced
svm.fit(X_train_hasdep_imb, y_train_hasdep_imb)
y_pred = svm.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

#train balanced, test imbalanced
svm.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = svm.predict(X_test_hasdep_imb)
print(f1_score(y_pred, y_test_hasdep_imb, average = 'macro'))

#train balanced, test balanced
svm.fit(X_train_hasdep_b, y_train_hasdep_b)
y_pred = svm.predict(X_test_hasdep_b)
print(f1_score(y_pred, y_test_hasdep_b, average = 'macro'))

0.912739965095986
0.8089983022071308
0.9357305826022686
0.9289772727272727
0.9447437491366211
0.8060973099322004
0.9335106382978724
0.8960650480699152


### FNN

In [36]:
from FFNeuralNetwork import NN_models

neurons = [36, 36]
fnn = NN_models.NN_Model_Dropout(neuron = neurons, activation = 'relu', in_shape = (36,), classes = 2, output_activation = 'sigmoid', optimizer = 'Adam', loss = 'sparse_categorical_crossentropy', dr = 0.4)
final_history = fnn.fit(X_train_comb_imb, y_train_comb_imb, epochs = 100, validation_data = (X_test_comb_imb, y_test_comb_imb))
score = fnn.evaluate(X_test_comb_imb, y_test_comb_imb, verbose = 0, batch_size = 32)
print(score)


Epoch 1/100


ValueError: in user code:

    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/karolina/anaconda3/lib/python3.8/site-packages/keras/engine/input_spec.py", line 263, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 36), found shape=(32, 35)


### XGBoost

In [39]:
from sklearn.metrics import mean_absolute_error
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn import metrics 
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
#from xgboost import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from pandas import MultiIndex, Int64Index
import warnings
warnings.filterwarnings('ignore')


def modelfit(alg, dtrain_X, dtrain_Y, dtest_X, dtest_Y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):


    xgb_param = alg.get_xgb_params()
        
    xgtrain = xgb.DMatrix(dtrain_X.values, dtrain_Y.values)
    xgtest_X = xgb.DMatrix(dtest_X)
    xgtest_Y = xgb.DMatrix(dtest_Y)
        
    cvresult = xgb.cv(xgb_param, 
                     xgtrain, 
                     num_boost_round=alg.get_params()['n_estimators'], 
                     nfold=cv_folds, 
                      metrics='auc',
                    early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cvresult.shape[0])
    
    alg.fit(dtrain_X, dtrain_Y,eval_metric=['logloss','auc','error'])

    # pred
    dtrain_predictions = alg.predict(dtest_X)
    dtrain_predprob = alg.predict_proba(dtest_X)[:,1]

    print ("f1 : %.4g" % metrics.f1_score(dtest_Y, dtrain_predictions, average = 'macro'))




xgbc = XGBClassifier(n_estimators=1000,
                         learning_rate=0.05, 
                         n_jobs=4,
                         max_depth = 16,
                         min_child_weight = 1,
                         gamma = 0.2,
                         subsample=0.97, 
                         colsample_bytree=0.73,
                         reg_alpha = 3.64,
                         seed=1024,
                         use_label_encoder=False)


#combined variable

#train imbalanced, test imbalanced
modelfit(xgbc, X_train_comb_imb, y_train_comb_imb, X_test_comb_imb, y_test_comb_imb)

#train imbalanced, test balanced
modelfit(xgbc, X_train_comb_imb, y_train_comb_imb, X_test_comb_b, y_test_comb_b)

#train balanced, test imbalanced
modelfit(xgbc, X_train_comb_b, y_train_comb_b, X_test_comb_imb, y_test_comb_imb)

#train balanced, test balanced
modelfit(xgbc, X_train_comb_b, y_train_comb_b, X_test_comb_b, y_test_comb_b)


#has_dep_diag

#train imbalanced, test imbalanced
modelfit(xgbc, X_train_hasdep_imb, y_train_hasdep_imb, X_test_hasdep_imb, y_test_hasdep_imb)

#train imbalanced, test balanced
modelfit(xgbc, X_train_hasdep_imb, y_train_hasdep_imb, X_test_hasdep_b, y_test_hasdep_b)

#train balanced, test imbalanced
modelfit(xgbc, X_train_hasdep_b, y_train_hasdep_b, X_test_hasdep_imb, y_test_hasdep_imb)

#train balanced, test balanced
modelfit(xgbc, X_train_hasdep_b, y_train_hasdep_b, X_test_hasdep_b, y_test_hasdep_b)

f1 : 0.9127
f1 : 0.8148
f1 : 0.9058
f1 : 0.9699


AttributeError: 'numpy.ndarray' object has no attribute 'values'