**Liver Patient prediction using Boosting - "Adaboost" ensemble learning algorithm.**

In [0]:
#Loading the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

**Data Preprocessing**

In [0]:
class Preprocess(object):
  def __init__(self):
    self.url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv"
    self.cols = ['age','gender','TB','DB','alkphos','sgpt','sgot','TP','albumin','AG_ratio','Is_liverPatient']
    self.dataset = None
  
  def load_data(self):
    self.dataset = pd.read_csv(self.url,names=self.cols)
    print('shape of df :',self.dataset.shape)
    return self.dataset
  
  def preprocess(self,data):
    #preprocessing the data
    # 1. Removing the duplicate rows
    self.dataset = data.drop_duplicates()
    print('Shape after removing the duplicates :',self.dataset.shape)
    # checking for the Null values, #display no of null values by column
    print(self.dataset.isnull().sum())
    #dropping the 4 rows here
    self.dataset = self.dataset[pd.notnull(self.dataset['AG_ratio'])]
    return None
  
  def encode_label(self):
    #changing the gender attribute to categorical type
    # 0 is for 'Female' and 1 for 'Male
    self.dataset['gender'] = self.dataset['gender'].apply(lambda x: 0 if(x == 'Female') else 1)
    #self.dataset.gender = pd.factorize(self.dataset.gender)[0] + 0.0
    #Encoding the label for proper implementation in network (1 neuron in the output layer)
    # Label 1 means "a liver patient" so encoding as 1
    # Label 2 means "not a liver patient" so encoding as 0
    self.dataset['Is_liverPatient'] = self.dataset['Is_liverPatient'].apply(lambda x: 0 if(x == 2) else x)
    #self.dataset.loc[(self.dataset.Is_liverPatient == 2), 'Is_liverPatient'] = 0
    return None
  
  def scaleData(self):
    #Scaling the attribute values to be between [0,1]
    scaler = MinMaxScaler()
    cols_to_scale = ['age', 'TB', 'DB', 'alkphos', 'sgpt', 'sgot', 'TP', 'albumin', 'AG_ratio']
    Dataset = pd.DataFrame(data=self.dataset)
    self.dataset[cols_to_scale] = scaler.fit_transform(Dataset[cols_to_scale])
    return None
  
  def get_dataset(self):
    return self.dataset

In [3]:
prep = Preprocess()
dataset = prep.load_data()
prep.preprocess(dataset)
prep.encode_label()
dataset = prep.get_dataset()

shape of df : (583, 11)
Shape after removing the duplicates : (570, 11)
age                0
gender             0
TB                 0
DB                 0
alkphos            0
sgpt               0
sgot               0
TP                 0
albumin            0
AG_ratio           4
Is_liverPatient    0
dtype: int64


In [4]:
dataset

Unnamed: 0,age,gender,TB,DB,alkphos,sgpt,sgot,TP,albumin,AG_ratio,Is_liverPatient
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [0]:
#Splitting the dataset into train and test splits
# Taking the label out of the original preprocessed dataset
X = dataset[['age', 'gender', 'TB', 'DB', 'alkphos', 'sgpt', 'sgot', 'TP', 'albumin', 'AG_ratio']]
y = dataset['Is_liverPatient']

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.70)  #test_size = 1-0.7 = 0.3

**Building the Adaboost classifier.**

a) Using Decision tree classifier as the base learner.

In [0]:
abc_DT = AdaBoostClassifier(n_estimators=50,learning_rate=1,base_estimator=DecisionTreeClassifier()) #base_estimator=DecisionTreeClassifier()

In [0]:
model = abc_DT.fit(X_train,y_train)

In [20]:
y_pred = model.predict(X_test)
print('Accuracy :',accuracy_score(y_test,y_pred))

Accuracy : 0.6470588235294118


b) Using SVC as the base learner.

In [0]:
abc_SVC = AdaBoostClassifier(n_estimators=50,learning_rate=1,base_estimator=SVC(probability=True,kernel='linear')) #base_estimator=DecisionTreeClassifier()

In [0]:
model_SVC = abc_SVC.fit(X_train,y_train)

In [23]:
y_pred_SVC = model_SVC.predict(X_test)
print('Accuracy :',accuracy_score(y_test, y_pred_SVC))

Accuracy : 0.7352941176470589


c) No base estimator set as base learner.

In [0]:
a = AdaBoostClassifier(n_estimators=50,learning_rate=0.1)

In [37]:
model_a = a.fit(X_train,y_train)
y_pred_a = model_a.predict(X_test)
print('Accuracy :',accuracy_score(y_test,y_pred_a))

Accuracy : 0.7588235294117647


**RESULT**

The accuracy when the base learner used is Decision tree is around 65% not that good. But with SVC as the base learner gives around 73.54%. With no base learner set, gives the accuracy of around 74.11%.

Optimizing the Hyperparameters for the Adaboost algorithm.

In [0]:
# The data is = x,y
# the estimator:
estimator = AdaBoostClassifier()
# List the parameters
params = {
 'n_estimators': [50, 100, 500],
 'learning_rate' : [0.01, 0.05, 0.1, 0.5]
 }

In [0]:
#building the GridSearchCV 
gscv = GridSearchCV(estimator=estimator,param_grid=params,cv=5,verbose=1,return_train_score=True)

In [33]:
#fitting the data to the GridSearchCV model
gscv.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   21.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.5],
                         'n_estimators': [50, 100, 500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=1)

In [35]:
print(gscv.best_params_)
print(gscv.best_score_)

{'learning_rate': 0.1, 'n_estimators': 50}
0.7208352740257725


In [38]:
best_estimator = gscv.best_estimator_
print(best_estimator)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=50, random_state=None)


In [39]:
best_estimator.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=50, random_state=None)

In [42]:
 y_pred = best_estimator.predict(X_test)
 accuracy_score(y_test,y_pred)

0.7588235294117647

The best achievable **accuracy is 76%** after tuning the hyperparameters.