In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [167]:
data = pd.read_csv('og6K.csv') # loading dataset

#drop rows with empty values and unnecessary columns
data = data.dropna(axis=0,how='any')
data.drop(['Residence_type'],axis=1,inplace=True)
data.drop(['chest pain type'],axis=1,inplace=True)
data.drop(['Unnamed: 0'],axis=1,inplace=True)

data.rename(columns={
    'skin_thickness':'skin thickness', 
    'hypertension':'hyper tension',
    'heart_disease':'heart disease',
    'smoking_status':'smoking status',
    'diabetes_pedigree':'diabetes pedigree'}, inplace=True)

data.loc[data['triage'] == "red", 'triage'] = "immediate"
data.loc[data['triage'] == "orange", 'triage'] = "immediate"
for key in ['smoking status']:
    data[key] = pd.Categorical(data[key])
    data[key] = data[key].cat.codes

data.sample(n=10)

Unnamed: 0,age,gender,blood pressure,cholesterol,max heart rate,exercise angina,plasma glucose,skin thickness,insulin,bmi,diabetes pedigree,hyper tension,heart disease,smoking status,triage
307,68,1.0,145,182,148,0,97.0,36,111,19.0,0.467386,0,0,2,yellow
1653,66,1.0,64,152,154,0,84.0,23,115,29.9,0.471,0,0,2,yellow
738,51,0.0,130,256,149,0,123.0,34,120,23.0,0.467386,0,0,2,yellow
709,54,1.0,125,294,152,0,89.0,43,132,18.0,0.467386,0,0,2,yellow
2920,82,0.0,129,195,164,0,96.63,98,138,26.5,0.467386,0,0,0,yellow
5269,80,0.0,107,178,143,0,103.6,81,81,23.7,0.467386,1,0,2,yellow
2138,58,0.0,82,193,185,0,94.09,34,98,30.9,0.467386,0,0,2,yellow
3860,60,1.0,93,167,138,0,57.02,67,128,20.7,0.467386,0,0,0,yellow
255,59,1.0,130,231,165,0,93.0,60,106,23.0,0.467386,0,0,2,yellow
5174,58,0.0,113,171,169,0,83.93,72,122,25.6,0.467386,0,0,1,yellow


In [168]:
from sklearn.model_selection import train_test_split

# We split the data up into a test set and a training set, 30 - 70 %
# train_test_split
# First argument: x data is all data without class column
# Second argument:  this is the class label column
# Random state = 0: ensures the train and test splitting is deterministic. 
#Otherwise every student would get a different train test split.
X_train, X_test, y_train, y_test = train_test_split(data.drop('triage',1), 
                                                    data['triage'],
                                                    test_size=0.3,
                                                    random_state=0)
print(X_train)
print(y_train.unique())
print(y_test.unique())

      age  gender  blood pressure  cholesterol  max heart rate  \
2585   49     1.0             117          193             150   
3998   49     1.0             125          184             185   
2079   74     1.0              94          165             182   
2920   82     0.0             129          195             164   
1804   45     1.0              78          192             174   
...   ...     ...             ...          ...             ...   
4931   49     1.0             106          188             175   
3264   82     1.0              82          181             186   
1653   66     1.0              64          152             154   
2607   64     0.0             139          185             142   
2732   58     1.0             118          187             158   

      exercise angina  plasma glucose  skin thickness  insulin   bmi  \
2585                0           70.78              77      104  20.3   
3998                0          108.33              75      110 

In [169]:
from sklearn.preprocessing import MinMaxScaler

# define a new scaler: 
x_scaler = MinMaxScaler()

# fit the normalization on the training set: 
x_scaler.fit(X_train)

# then create new and normalized training/test sets: 
X_train_norm = x_scaler.transform(X_train)
X_test_norm = x_scaler.transform(X_test)

In [170]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_on_training_set(y_test, y_pred):
  # print out recall and precision
  labels = ["immediate","yellow","green","white"]
  print(classification_report(y_test, y_pred,labels=labels))
  
  # print out confusion matrix
  print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred,labels=labels))

## Decision Trees



In [228]:
# from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

model = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf=1) 

model.fit(X_train, y_train)# Training model

DecisionTreeClassifier(max_depth=10)

In [229]:
y_pred = model.predict(X_test) # Predicting labels for our test set using model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using new function

              precision    recall  f1-score   support

   immediate       0.81      0.93      0.87       126
      yellow       0.99      0.98      0.99      1695
       green       1.00      0.99      1.00       137
       white       0.96      0.96      0.96       131

    accuracy                           0.98      2089
   macro avg       0.94      0.97      0.95      2089
weighted avg       0.98      0.98      0.98      2089

Confusion Matrix: 
 [[ 117    7    0    2]
 [  24 1668    0    3]
 [   0    1  136    0]
 [   3    2    0  126]]


## K Nearest Neighbors

In [181]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5) # Define the model with parameters

In [182]:
model.fit(X_train_norm, y_train) # Training the model

# Evaluate the model: 
y_pred = model.predict(X_test_norm) # Predicting labels for our test set using trained model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using newly defined function

              precision    recall  f1-score   support

   immediate       0.48      0.36      0.41       126
      yellow       0.89      0.96      0.92      1695
       green       0.97      0.74      0.84       137
       white       0.48      0.29      0.36       131

    accuracy                           0.86      2089
   macro avg       0.71      0.59      0.63      2089
weighted avg       0.85      0.86      0.85      2089

Confusion Matrix: 
 [[  45   76    0    5]
 [  36 1622    3   34]
 [   0   34  101    2]
 [  12   81    0   38]]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## Logistic Regression
Let's move on to a very powerful, yet fast model: Logistic Regression. Again, we start by loading the library and defining the model together with its parameters. 

In this case, multi_class auto will detect the number of classes automatically, C is our regularisation parameter, and solver is the optimization algorithm used to fit the model: 
* For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
* For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
* ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty
* ‘liblinear’ and ‘saga’ also handle L1 penalty
* ‘saga’ also supports ‘elasticnet’ penalty
* ‘liblinear’ does not handle no penalty

In [234]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=2, multi_class='auto', solver='sag') 

In [235]:
model.fit(X_train_norm, y_train) # Training the model

y_pred = model.predict(X_test_norm) # Predicting labels for our test set using trained model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using newly defined function

              precision    recall  f1-score   support

   immediate       0.63      0.47      0.54       126
      yellow       0.93      0.98      0.95      1695
       green       0.97      0.90      0.93       137
       white       0.72      0.48      0.58       131

    accuracy                           0.91      2089
   macro avg       0.81      0.71      0.75      2089
weighted avg       0.90      0.91      0.90      2089

Confusion Matrix: 
 [[  59   54    0   13]
 [  25 1656    2   12]
 [   0   14  123    0]
 [   9   57    2   63]]


## Gaussian Naive Bayes

In [203]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB() # Define the model with parameters

In [204]:
model.fit(X_train_norm, y_train) # Training the model

y_pred = model.predict(X_test_norm) # Predicting labels for our test set using trained model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using newly defined function

              precision    recall  f1-score   support

   immediate       0.15      1.00      0.26       126
      yellow       1.00      0.55      0.71      1695
       green       0.44      1.00      0.61       137
       white       0.13      0.02      0.03       131

    accuracy                           0.57      2089
   macro avg       0.43      0.64      0.40      2089
weighted avg       0.86      0.57      0.63      2089

Confusion Matrix: 
 [[126   0   0   0]
 [574 936 172  13]
 [  0   0 137   0]
 [125   3   1   2]]


## SVM
The kernel can be: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. If none is given, ‘rbf’ will be used.

In [238]:
from sklearn.svm import SVC
model = SVC(C=10, gamma='auto', kernel='poly')
model.fit(X_train_norm, y_train)

SVC(C=10, gamma='auto', kernel='poly')

In [239]:
model.fit(X_train_norm, y_train) # Training SVM

y_pred = model.predict(X_test_norm) # Predicting labels for our test set using trained model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using newly defined function

              precision    recall  f1-score   support

   immediate       0.00      0.00      0.00       126
      yellow       0.85      1.00      0.92      1695
       green       0.99      0.66      0.79       137
       white       0.00      0.00      0.00       131

    accuracy                           0.85      2089
   macro avg       0.46      0.41      0.43      2089
weighted avg       0.75      0.85      0.80      2089

Confusion Matrix: 
 [[   0  126    0    0]
 [   0 1694    1    0]
 [   0   47   90    0]
 [   0  131    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))


### gridsearch
* with kernel rbf, try varying gamma (which is a coefficient in the rbf kernel) to be 1e-3 or 1e-4; and vary C to be 1, 10, or 100.
* with linear kernel (there is no gamma here), try varying C to be 1, 10, and 100.

In [209]:
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'kernel': ['rbf'], 
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'],
                     'C': [1, 10, 100]}]

In [212]:
# we define the grid search model for SVM:  cv=2 is 2-fold CV
clf = GridSearchCV(SVC(), tuned_parameters, cv=3,
                   scoring='accuracy', verbose=1, n_jobs=4)

# train the model on the training set: 
clf.fit(X_train_norm, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  27 | elapsed:    0.7s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    0.9s finished


GridSearchCV(cv=3, estimator=SVC(), n_jobs=4,
             param_grid=[{'C': [1, 10, 100], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100], 'kernel': ['linear']}],
             scoring='accuracy', verbose=1)

In [213]:
# Show best parameters: 
print("Best parameter set found on development set:")
print(clf.best_params_, '\n')

y_pred = clf.predict(X_test_norm) #create predictions
evaluate_on_training_set(y_test, y_pred) # evaluate like we always do

Best parameter set found on development set:
{'C': 100, 'kernel': 'linear'} 

              precision    recall  f1-score   support

   immediate       0.62      0.57      0.59       126
      yellow       0.95      0.97      0.96      1695
       green       0.97      0.98      0.97       137
       white       0.66      0.54      0.60       131

    accuracy                           0.92      2089
   macro avg       0.80      0.76      0.78      2089
weighted avg       0.91      0.92      0.91      2089

Confusion Matrix: 
 [[  72   39    0   15]
 [  34 1636    4   21]
 [   0    3  134    0]
 [  11   49    0   71]]


## Ada Boost

In [240]:
from sklearn.ensemble import AdaBoostClassifier
#learning rate can vary from 0 to 1; 
# and n_estimators is the number of times a model is built.
model = AdaBoostClassifier(n_estimators=1500, learning_rate=0.06) # Define the model with parameters

In [241]:
model.fit(X_train_norm, y_train) # Training the model

y_pred = model.predict(X_test_norm) # Predicting labels for our test set using trained model
evaluate_on_training_set(y_test, y_pred) #evaluate our model using newly defined function

              precision    recall  f1-score   support

   immediate       0.76      0.81      0.78       126
      yellow       1.00      0.98      0.99      1695
       green       1.00      0.99      1.00       137
       white       0.82      0.95      0.88       131

    accuracy                           0.97      2089
   macro avg       0.89      0.93      0.91      2089
weighted avg       0.97      0.97      0.97      2089

Confusion Matrix: 
 [[ 102    3    0   21]
 [  30 1658    0    7]
 [   0    1  136    0]
 [   3    4    0  124]]


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators is how many trees there are in the model
model = RandomForestClassifier(n_estimators = 200) # Define the model

In [None]:
model.fit(X_train_norm, y_train)

y_pred = model.predict(X_test_norm)
evaluate_on_training_set(y_test, y_pred)