<br>  

## <span style='color:blue'>Section 1: Import</span>  

In [1]:
import pandas as pd
import numpy as np
import copy

from sklearn.model_selection import StratifiedKFold

from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from collections import Counter

from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

<br>  

## <span style='color:blue'>Section 2: Read, drop columns, form X_train and y_train</span>  

In [2]:
# ---------- read ----------

df_1 = pd.read_csv('../data/code_3_train.csv')

# ---------- drop ----------

print('Before drop :', df_1.shape)
df_1.drop(columns=['index',
                   'pco2', 'ph', 'basophils', 'lactic_acid', 'bmi',
                   'creatine_kinase', 'lymphocyte', 'neutrophils'], inplace=True)
print('After drop:', df_1.shape)
print('')

# ---------- form X ----------

X_train = df_1.drop(columns=['outcome'])
print('X_train :', X_train.shape)

# ---------- form y ----------

y_train = df_1['outcome']
print('y_train :', y_train.shape)
print('')
print('y_train :', np.unique(y_train, return_counts=True))
print('y_train :', Counter(y_train))
print(y_train.value_counts(normalize=True))

Before drop : (882, 51)
After drop: (882, 42)

X_train : (882, 41)
y_train : (882,)

y_train : (array([0., 1.]), array([763, 119], dtype=int64))
y_train : Counter({0.0: 763, 1.0: 119})
0.0    0.865079
1.0    0.134921
Name: outcome, dtype: float64


<br>  

## <span style='color:blue'>Section 3: Set up 15-fold cross validation</span>  

In [3]:
kfold_cv = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

<br>  

## <span style='color:blue'>Section 4: Explore Decision Tree Classifier</span>  

In [4]:
# ---------- make pipeline ----------

pipe_line_dt = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                         ('oversample_SMOTE', SMOTE(random_state=42)),
                         ('ss_scale', StandardScaler()),
                         ('dt_class', DecisionTreeClassifier(random_state=42))])

# ---------- tuning of hyperparameters ----------

pipe_line_params_dt = {'dt_class__max_depth': [2,3,4,5,6,7],
                       'dt_class__max_leaf_nodes': [2,3,4,5,6,7]}

# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_dt = GridSearchCV(pipe_line_dt,
                         param_grid=pipe_line_params_dt,
                         cv=kfold_cv,
                         return_train_score=True,
                         scoring='recall')

search_dt.fit(X_train, y_train)

# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_dt.cv_results_['mean_train_score'])
print('Validation :', search_dt.cv_results_['mean_test_score'])
print('Overfit %:', (search_dt.cv_results_['mean_test_score']-search_dt.cv_results_['mean_train_score'])/search_dt.cv_results_['mean_train_score']*100)
print('')

# ---------- details of recall train score and test score ----------

temp_df_dt = []
for j in range(len(search_dt.cv_results_['params'])):
    for i in range(kfold_cv.n_splits):
        over_fit = (search_dt.cv_results_['split'+str(i)+'_test_score'][j]-search_dt.cv_results_['split'+str(i)+'_train_score'][j])/search_dt.cv_results_['split'+str(i)+'_train_score'][j]*100
        temp_df_dt.append([search_dt.cv_results_['params'][j],
                           search_dt.cv_results_['params'][j]['dt_class__max_depth'],
                           search_dt.cv_results_['params'][j]['dt_class__max_leaf_nodes'],
                           search_dt.cv_results_['split'+str(i)+'_train_score'][j],
                           search_dt.cv_results_['split'+str(i)+'_test_score'][j],
                           over_fit])
temp_df_dt = pd.DataFrame(temp_df_dt, columns=['parameters', 'max_depth', 'max_leaf_nodes', 'training', 'validation', 'overfit_%'])
print(temp_df_dt)
temp_df_dt.to_csv('../data/code_5a_dt_train_validate_recall.csv', na_rep='NaN', index_label='index')

Training : [0.62437259 0.1957958  0.31144895 0.31144895 0.31144895 0.31144895
 0.62437259 0.1957958  0.48682432 0.57431896 0.64650901 0.66152402
 0.62437259 0.1957958  0.48682432 0.52386851 0.54911519 0.55103496
 0.62437259 0.1957958  0.48682432 0.52386851 0.5587248  0.54082475
 0.62437259 0.1957958  0.48682432 0.52386851 0.5587248  0.54623016
 0.62437259 0.1957958  0.48682432 0.52386851 0.5587248  0.54623016]
Validation : [0.54404762 0.15833333 0.27142857 0.27142857 0.27142857 0.27142857
 0.54404762 0.15833333 0.4297619  0.46309524 0.52738095 0.53571429
 0.54404762 0.15833333 0.4297619  0.39642857 0.3797619  0.40238095
 0.54404762 0.15833333 0.4297619  0.39642857 0.4047619  0.40238095
 0.54404762 0.15833333 0.4297619  0.39642857 0.4047619  0.39404762
 0.54404762 0.15833333 0.4297619  0.39642857 0.4047619  0.39404762]
Overfit %: [-12.86490943 -19.13343558 -12.84973915 -12.84973915 -12.84973915
 -12.84973915 -12.86490943 -19.13343558 -11.72135752 -19.3661939
 -18.42635678 -19.0181662  -

  over_fit = (search_dt.cv_results_['split'+str(i)+'_test_score'][j]-search_dt.cv_results_['split'+str(i)+'_train_score'][j])/search_dt.cv_results_['split'+str(i)+'_train_score'][j]*100


### <span style='color:green'>Severe overfitting with Decision Tree Classifier.</span>  
<br>  
<br>  


<br>  

## <span style='color:blue'>Section 5: Explore Random Forest Classifier</span>  

In [5]:
# ---------- make pipeline ----------

pipe_line_rf = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                         ('oversample_SMOTE', SMOTE(random_state=42)),
                         ('ss_scale', StandardScaler()),
                         ('rf_class', RandomForestClassifier(random_state=42))])

# ---------- tuning of hyperparameters ----------

pipe_line_params_rf = {'rf_class__max_depth': [2,3,4,5,6,7],
                       'rf_class__n_estimators': [2,3,4,5,6,7]}

# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_rf = GridSearchCV(pipe_line_rf,
                         param_grid=pipe_line_params_rf,
                         cv=kfold_cv,
                         return_train_score=True,
                         scoring='recall')

search_rf.fit(X_train, y_train)

# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_rf.cv_results_['mean_train_score'])
print('Validation :', search_rf.cv_results_['mean_test_score'])
print('Overfit %:', (search_rf.cv_results_['mean_test_score']-search_rf.cv_results_['mean_train_score'])/search_rf.cv_results_['mean_train_score']*100)
print('')

# ---------- details of recall train score and test score ----------

temp_df_rf = []
for j in range(len(search_rf.cv_results_['params'])):
    for i in range(kfold_cv.n_splits):
        over_fit = (search_rf.cv_results_['split'+str(i)+'_test_score'][j]-search_rf.cv_results_['split'+str(i)+'_train_score'][j])/search_rf.cv_results_['split'+str(i)+'_train_score'][j]*100
        temp_df_rf.append([search_rf.cv_results_['params'][j],
                           search_rf.cv_results_['params'][j]['rf_class__max_depth'],
                           search_rf.cv_results_['params'][j]['rf_class__n_estimators'],
                           search_rf.cv_results_['split'+str(i)+'_train_score'][j],
                           search_rf.cv_results_['split'+str(i)+'_test_score'][j],
                           over_fit])
temp_df_rf = pd.DataFrame(temp_df_rf, columns=['parameters', 'max_depth', 'n_estimators', 'training', 'validation', 'overfit_%'])
print(temp_df_rf)
temp_df_rf.to_csv('../data/code_5a_rf_train_validate_recall.csv', na_rep='NaN', index_label='index')

Training : [0.53907121 0.62669455 0.62429215 0.63987559 0.64228335 0.64407979
 0.59603711 0.62483912 0.64762441 0.66984663 0.68786465 0.71127735
 0.62065101 0.69384384 0.68724796 0.69804269 0.70109395 0.70949163
 0.6524882  0.7365133  0.74910446 0.749142   0.75994745 0.77915058
 0.72453882 0.78396075 0.80496032 0.82056521 0.83495817 0.8517589
 0.76594273 0.8361701  0.86680609 0.87638889 0.88540862 0.9021879 ]
Validation : [0.48809524 0.54642857 0.56309524 0.5797619  0.5547619  0.56309524
 0.50595238 0.5547619  0.48928571 0.54761905 0.55595238 0.52261905
 0.46071429 0.52261905 0.4952381  0.47142857 0.4797619  0.4797619
 0.41428571 0.48214286 0.47380952 0.45714286 0.51428571 0.47261905
 0.41904762 0.44404762 0.3952381  0.43690476 0.3952381  0.42142857
 0.30119048 0.40119048 0.35952381 0.39285714 0.37738095 0.37619048]
Overfit %: [ -9.45626007 -12.80783119  -9.80260784  -9.39458952 -13.62661034
 -12.57368368 -15.11394614 -11.21524202 -24.449155   -18.24709998
 -19.17706766 -26.52387307 -2

### <span style='color:green'>Severe overfitting with Random Forest Classifier.</span>  
<br>  
<br>  


<br>  

## <span style='color:blue'>Section 6: Explore Multinomial Naive Bayes</span>  

In [6]:
# ---------- make pipeline ----------

pipe_line_nb = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                         ('oversample_SMOTE', SMOTE(random_state=42)),
#                         ('ss_scale', StandardScaler()),
                         ('nb_class', MultinomialNB())])

# ---------- tuning of hyperparameters ----------

pipe_line_params_nb = {'nb_class__alpha': [100,500,1_000,5_000,10_000,50_000,100_000,500_000]}

# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_nb = GridSearchCV(pipe_line_nb,
                         param_grid=pipe_line_params_nb,
                         cv=kfold_cv,
                         return_train_score=True,
                         scoring='recall')

search_nb.fit(X_train, y_train)

# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_nb.cv_results_['mean_train_score'])
print('Validation :', search_nb.cv_results_['mean_test_score'])
print('Overfit %:', (search_nb.cv_results_['mean_test_score']-search_nb.cv_results_['mean_train_score'])/search_nb.cv_results_['mean_train_score']*100)
print('')

# ---------- details of recall train score and test score ----------

temp_df_nb = []
for j in range(len(search_nb.cv_results_['params'])):
    for i in range(kfold_cv.n_splits):
        over_fit = (search_nb.cv_results_['split'+str(i)+'_test_score'][j]-search_nb.cv_results_['split'+str(i)+'_train_score'][j])/search_nb.cv_results_['split'+str(i)+'_train_score'][j]*100
        temp_df_nb.append([search_nb.cv_results_['params'][j],
                           search_nb.cv_results_['params'][j]['nb_class__alpha'],
                           search_nb.cv_results_['split'+str(i)+'_train_score'][j],
                           search_nb.cv_results_['split'+str(i)+'_test_score'][j],
                           over_fit])
temp_df_nb = pd.DataFrame(temp_df_nb, columns=['parameters', 'alpha', 'training', 'validation', 'overfit_%'])
print(temp_df_nb)
temp_df_nb.to_csv('../data/code_5a_nb_train_validate_recall.csv', na_rep='NaN', index_label='index')

Training : [0.50960961 0.51201201 0.51320785 0.5222115  0.53541399 0.57202917
 0.6140444  0.77370764]
Validation : [0.51309524 0.51309524 0.52142857 0.52142857 0.5297619  0.57142857
 0.61309524 0.78214286]
Overfit %: [ 0.68398013  0.21156263  1.60183066 -0.14992504 -1.05564681 -0.10499475
 -0.1545757   1.09023364]

                      parameters   alpha  training  validation  overfit_%
0       {'nb_class__alpha': 100}     100  0.495495    0.875000  76.590909
1       {'nb_class__alpha': 100}     100  0.513514    0.500000  -2.631579
2       {'nb_class__alpha': 100}     100  0.531532    0.375000 -29.449153
3       {'nb_class__alpha': 100}     100  0.504505    0.500000  -0.892857
4       {'nb_class__alpha': 100}     100  0.513514    0.250000 -51.315789
..                           ...     ...       ...         ...        ...
115  {'nb_class__alpha': 500000}  500000  0.765766    0.875000  14.264706
116  {'nb_class__alpha': 500000}  500000  0.747748    0.750000   0.301205
117  {'nb_class__

### <span style='color:green'>Very small overfitting with Multinomial Naive Bayes.</span>  
### <span style='color:green'>However, recall performance very low.</span>  
<br>  
<br>  


<br>  

## <span style='color:blue'>Section 7: Explore K Nearest Neighbours Classifier</span>  

In [7]:
# ---------- make pipeline ----------

pipe_line_kn = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                         ('oversample_SMOTE', SMOTE(random_state=42)),
                         ('ss_scale', StandardScaler()),
                         ('kn_class', KNeighborsClassifier())])

# ---------- tuning of hyperparameters ----------

pipe_line_params_kn = {'kn_class__n_neighbors': [10,20,30,40,50,60,70,80]}

# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_kn = GridSearchCV(pipe_line_kn,
                         param_grid=pipe_line_params_kn,
                         cv=kfold_cv,
                         return_train_score=True,
                         scoring='recall')

search_kn.fit(X_train, y_train)

# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_kn.cv_results_['mean_train_score'])
print('Validation :', search_kn.cv_results_['mean_test_score'])
print('Overfit %:', (search_kn.cv_results_['mean_test_score']-search_kn.cv_results_['mean_train_score'])/search_kn.cv_results_['mean_train_score']*100)
print('')

# ---------- details of recall train score and test score ----------

temp_df_kn = []
for j in range(len(search_kn.cv_results_['params'])):
    for i in range(kfold_cv.n_splits):
        over_fit = (search_kn.cv_results_['split'+str(i)+'_test_score'][j]-search_kn.cv_results_['split'+str(i)+'_train_score'][j])/search_kn.cv_results_['split'+str(i)+'_train_score'][j]*100
        temp_df_kn.append([search_kn.cv_results_['params'][j],
                           search_kn.cv_results_['params'][j]['kn_class__n_neighbors'],
                           search_kn.cv_results_['split'+str(i)+'_train_score'][j],
                           search_kn.cv_results_['split'+str(i)+'_test_score'][j],
                           over_fit])
temp_df_kn = pd.DataFrame(temp_df_kn, columns=['parameters', 'n_neighbours', 'training', 'validation', 'overfit_%'])
print(temp_df_kn)
temp_df_kn.to_csv('../data/code_5a_kn_train_validate_recall.csv', na_rep='NaN', index_label='index')

Training : [0.96458065 0.93456135 0.93037323 0.92436722 0.92917739 0.94057808
 0.93937151 0.93576255]
Validation : [0.81547619 0.81547619 0.85       0.85833333 0.88333333 0.89166667
 0.9        0.9       ]
Overfit %: [-15.45795691 -12.74235843  -8.63881588  -7.1436859   -4.93383274
  -5.20014367  -4.19126126  -3.8217546 ]

                        parameters  n_neighbours  training  validation  \
0    {'kn_class__n_neighbors': 10}            10  0.954955       0.875   
1    {'kn_class__n_neighbors': 10}            10  0.963964       0.875   
2    {'kn_class__n_neighbors': 10}            10  0.963964       1.000   
3    {'kn_class__n_neighbors': 10}            10  0.954955       0.750   
4    {'kn_class__n_neighbors': 10}            10  0.972973       0.750   
..                             ...           ...       ...         ...   
115  {'kn_class__n_neighbors': 80}            80  0.927928       0.875   
116  {'kn_class__n_neighbors': 80}            80  0.936937       0.875   
117  {'kn

### <span style='color:green'>Acceptable overfitting at some hyperparameters with K Nearest Neighbours Classifier.</span>  
### <span style='color:green'>Recall performance higher than Multinomial Naive Bayes.</span>  
<br>  
<br>  


<br>  

## <span style='color:blue'>Section 8: Explore Support Vector Classifier</span>  

In [8]:
# ---------- make pipeline ----------

pipe_line_scv = Pipeline([('knn_impute', KNNImputer(n_neighbors=5)),
                          ('oversample_SMOTE', SMOTE(random_state=42)),
                          ('ss_scale', StandardScaler()),
                          ('svc_class', SVC(random_state=42, kernel='poly'))])

# ---------- tuning of hyperparameters ----------

pipe_line_params_scv = {'svc_class__C': [0.01,0.1,1,10,100,1_000],
                        'svc_class__degree': [2,3,4,5,6,7]}

# ---------- instantiate gridsearchcv for recall in accordance to aim of modelling ----------

search_scv = GridSearchCV(pipe_line_scv,
                          param_grid=pipe_line_params_scv,
                          cv=kfold_cv,
                          return_train_score=True,
                          scoring='recall')

search_scv.fit(X_train, y_train)

# ---------- quick overview of recall train score, test score and overfit ----------

print('Training :', search_scv.cv_results_['mean_train_score'])
print('Validation :', search_scv.cv_results_['mean_test_score'])
print('Overfit %:', (search_scv.cv_results_['mean_test_score']-search_scv.cv_results_['mean_train_score'])/search_scv.cv_results_['mean_train_score']*100)
print('')

# ---------- details of recall train score and test score ----------

temp_df_scv = []
for j in range(len(search_scv.cv_results_['params'])):
    for i in range(kfold_cv.n_splits):
        over_fit = (search_scv.cv_results_['split'+str(i)+'_test_score'][j]-search_scv.cv_results_['split'+str(i)+'_train_score'][j])/search_scv.cv_results_['split'+str(i)+'_train_score'][j]*100
        temp_df_scv.append([search_scv.cv_results_['params'][j],
                            search_scv.cv_results_['params'][j]['svc_class__C'],
                            search_scv.cv_results_['params'][j]['svc_class__degree'],
                            search_scv.cv_results_['split'+str(i)+'_train_score'][j],
                            search_scv.cv_results_['split'+str(i)+'_test_score'][j],
                            over_fit])
temp_df_scv = pd.DataFrame(temp_df_scv, columns=['parameters', 'C', 'degree', 'training', 'validation', 'overfit_%'])
print(temp_df_scv)
temp_df_scv.to_csv('../data/code_5a_scv_train_validate_recall.csv', na_rep='NaN', index_label='index')

Training : [0.05493887 0.11882239 0.0882293  0.14645538 0.18667417 0.21968576
 0.92676963 0.81212462 0.6770592  0.59545796 0.60144788 0.63868511
 0.95379665 0.97899507 0.99279816 0.99039575 1.         1.
 0.98800408 1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.        ]
Validation : [0.02857143 0.07738095 0.01785714 0.01785714 0.01785714 0.01785714
 0.55595238 0.53928571 0.07619048 0.15238095 0.05119048 0.11071429
 0.33571429 0.61309524 0.32738095 0.62261905 0.53690476 0.88214286
 0.29285714 0.60595238 0.30119048 0.74761905 0.58690476 0.94166667
 0.2952381  0.61428571 0.29285714 0.84047619 0.60357143 0.925
 0.2952381  0.61428571 0.29285714 0.84880952 0.5952381  0.91666667]
Overfit %: [-47.99414348 -34.87679393 -79.76052999 -87.80711069 -90.4340582
 -91.87150633 -40.01180392 -33.59569481 -88.74685168 -74.40945236
 -91.4887926  -82.6652785  -64.80232087 -37.37504314 -67.

### <span style='color:green'>Severe overfitting with Support Vector Classifier.</span>  

### <span style='color:green'>Will perform parameters and hyperparameters search on K Nearest Neighbours to finalise predictive model because it has good balance between overfit and recall.</span>  

3 Jan 2022