In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

## Column Description
 
* HAEMATOCRIT - Patient laboratory test result of haematocrit
* HAEMOGLOBINS- Patient laboratory test result of haemoglobins
* ERYTHROCYTE - Patient laboratory test result of erythrocyte
* LEUCOCYTE- Patient laboratory test result of leucocyte
* THROMBOCYTE - Patient laboratory test result of thrombocyte
* MCH - Patient laboratory test result of MCH
* MCHC - Patient laboratory test result of MCHC
* MCV - Patient laboratory test result of MCV
* AGE - Patient age
* SEX - Patient gender
* SOURCE{in,out} - The class target in.= in care patient, out = out care patient

In [2]:
df = pd.read_csv('./Datasets/patients_data.csv')
df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


In [3]:
# Checking for null values
df.isnull().sum()

HAEMATOCRIT     0
HAEMOGLOBINS    0
ERYTHROCYTE     0
LEUCOCYTE       0
THROMBOCYTE     0
MCH             0
MCHC            0
MCV             0
AGE             0
SEX             0
SOURCE          0
dtype: int64

In [4]:
df['SEX'].value_counts()

M    2290
F    2122
Name: SEX, dtype: int64

In [5]:
df['SEX'] = df['SEX'].replace({'M':0,'F':1}) # Male: 0, Female:1

In [6]:
df['SEX'].value_counts()

0    2290
1    2122
Name: SEX, dtype: int64

In [7]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
print(x.shape)
print(y.shape)

(4412, 10)
(4412,)


In [8]:
x.head(3)

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,1
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,1
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,1


In [9]:
y.head(3)

0    out
1    out
2    out
Name: SOURCE, dtype: object

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3088, 10)
(1324, 10)
(3088,)
(1324,)


### AdaBoost Classifier

In [11]:
ada1 = AdaBoostClassifier() # n_estimators=50, DecisionTreeClassifier(max_depth=1)
ada1.fit(x_train,y_train)

AdaBoostClassifier()

In [12]:
print('Training Acc',ada1.score(x_train,y_train))
print('Testing Acc',ada1.score(x_test,y_test))

Training Acc 0.7606865284974094
Testing Acc 0.7439577039274925


In [13]:
ypred_ada1 = ada1.predict(x_test)

In [14]:
cm1 = confusion_matrix(y_test,ypred_ada1)
print(cm1)
print(classification_report(y_test,ypred_ada1))

[[323 225]
 [114 662]]
              precision    recall  f1-score   support

          in       0.74      0.59      0.66       548
         out       0.75      0.85      0.80       776

    accuracy                           0.74      1324
   macro avg       0.74      0.72      0.73      1324
weighted avg       0.74      0.74      0.74      1324



### AdaBoost with modifications

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
dt1 = DecisionTreeClassifier(max_depth=2)
ada2 = AdaBoostClassifier(base_estimator=dt1, n_estimators=35) 
ada2.fit(x_train,y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=35)

In [17]:
print('Training Acc',ada2.score(x_train,y_train))
print('Testing Acc',ada2.score(x_test,y_test))

Training Acc 0.7856217616580311
Testing Acc 0.7311178247734139


In [18]:
ypred_ada2 = ada2.predict(x_test)

In [19]:
cm2 = confusion_matrix(y_test,ypred_ada2)
print(cm2)
print(classification_report(y_test,ypred_ada2))

[[332 216]
 [140 636]]
              precision    recall  f1-score   support

          in       0.70      0.61      0.65       548
         out       0.75      0.82      0.78       776

    accuracy                           0.73      1324
   macro avg       0.72      0.71      0.72      1324
weighted avg       0.73      0.73      0.73      1324

