**Liver Patient prediction using the XgBoost Ensemble learning technique.**

In [0]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np


**Data Preparation:**

In [0]:
class Preprocess(object):
  def __init__(self):
    self.url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00225/Indian%20Liver%20Patient%20Dataset%20(ILPD).csv"
    self.cols = ['age','gender','TB','DB','alkphos','sgpt','sgot','TP','albumin','AG_ratio','Is_liverPatient']
    self.dataset = None
  
  def load_data(self):
    self.dataset = pd.read_csv(self.url,names=self.cols)
    print('shape of df :',self.dataset.shape)
    return self.dataset
  
  def preprocess(self,data):
    #preprocessing the data
    # 1. Removing the duplicate rows
    self.dataset = data.drop_duplicates()
    print('Shape after removing the duplicates :',self.dataset.shape)
    # checking for the Null values, #display no of null values by column
    print(self.dataset.isnull().sum())
    #dropping the 4 rows here
    self.dataset = self.dataset[pd.notnull(self.dataset['AG_ratio'])]
    return None
  
  def encode_label(self):
    #changing the gender attribute to categorical type
    # 0 is for 'Female' and 1 for 'Male
    self.dataset['gender'] = self.dataset['gender'].apply(lambda x: 0 if(x == 'Female') else 1)
    #self.dataset.gender = pd.factorize(self.dataset.gender)[0] + 0.0
    #Encoding the label for proper implementation in network (1 neuron in the output layer)
    # Label 1 means "a liver patient" so encoding as 1
    # Label 2 means "not a liver patient" so encoding as 0
    self.dataset['Is_liverPatient'] = self.dataset['Is_liverPatient'].apply(lambda x: 0 if(x == 2) else x)
    #self.dataset.loc[(self.dataset.Is_liverPatient == 2), 'Is_liverPatient'] = 0
    return None
  
  def scaleData(self):
    #Scaling the attribute values to be between [0,1]
    scaler = MinMaxScaler()
    cols_to_scale = ['age', 'TB', 'DB', 'alkphos', 'sgpt', 'sgot', 'TP', 'albumin', 'AG_ratio']
    Dataset = pd.DataFrame(data=self.dataset)
    self.dataset[cols_to_scale] = scaler.fit_transform(Dataset[cols_to_scale])
    return None
  
  def get_dataset(self):
    return self.dataset

In [3]:
prep = Preprocess()
dataset = prep.load_data()
prep.preprocess(dataset)
prep.encode_label()
#prep.scaleData()
dataset = prep.get_dataset()

shape of df : (583, 11)
Shape after removing the duplicates : (570, 11)
age                0
gender             0
TB                 0
DB                 0
alkphos            0
sgpt               0
sgot               0
TP                 0
albumin            0
AG_ratio           4
Is_liverPatient    0
dtype: int64


In [4]:
dataset

Unnamed: 0,age,gender,TB,DB,alkphos,sgpt,sgot,TP,albumin,AG_ratio,Is_liverPatient
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [0]:
# Taking the label out of the original preprocessed dataset
X = dataset[['age', 'gender', 'TB', 'DB', 'alkphos', 'sgpt', 'sgot', 'TP', 'albumin', 'AG_ratio']]
y = dataset['Is_liverPatient']

***Building the XgBoost model without cross-validation***

In [0]:
# splitting the dataset into train-test splits
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30)

In [0]:
#defining the XGBClassifer
xgb_cl = xgb.XGBClassifier(max_depth=5,learning_rate=1,n_estimators=50,objective='binary:logistic')

In [71]:
xgb_cl.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=None,
              n_estimators=50, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [72]:
y_pred = xgb_cl.predict(X_test)
print('Accuracy :',accuracy_score(y_test,y_pred))
print('Accuracy mannual calculation :',float(np.sum(y_pred == y_test))/y_test.shape[0])

Accuracy : 0.6705882352941176
Accuracy mannual calculation : 0.6705882352941176


***Model Evaluation using XgBoost API with cross-validation***

In [0]:
#creating the DMatrix from x and y 
dmatrix = xgb.DMatrix(data=X, label=y)
#Creating the params dictionary with required parameters
params = {'objective' : 'binary:logistic', 'max_depth' : 4}

In [0]:
#Starting the cross validation process
cv_results = xgb.cv(params=params,dtrain=dmatrix,num_boost_round=10,nfold=3,metrics='error',as_pandas=True)

In [54]:
print('Accuracy : %f' %((1-cv_results['test-error-mean']).iloc[-1]))

Accuracy : 0.687287


In [0]:
cv_results_auc = xgb.cv(params=params,dtrain=dmatrix,num_boost_round=10,nfold=3,metrics='auc',as_pandas=True)

In [56]:
print('Accuracy : %.4f' %((1-cv_results_auc['test-auc-mean']).iloc[-1]))

Accuracy : 0.2615


**Tuning the parameters using GridSearchCV**

In [0]:
# The data is = x,y
# the estimator:
estimator = xgb.XGBClassifier()
# List the parameters
params = {
 'n_estimators': [50, 100, 500, 10],
 'learning_rate' : [0.01, 0.05, 1, 0.5],
 'max_depth' : [3,4,5,10]
 }

In [0]:
#building the GridSearchCV 
gscv = GridSearchCV(estimator=estimator,param_grid=params,cv=5,verbose=1,return_train_score=True)

In [61]:
#fitting the data to the GridSearchCV model
gscv.fit(X,y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed:   27.6s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.01, 0.05, 1, 0.5],
                         'max_depth': [3, 4, 5, 10],
                         'n_estimators': [50, 100, 500, 10]}

In [62]:
print(gscv.best_params_)
print(gscv.best_score_)

{'learning_rate': 1, 'max_depth': 3, 'n_estimators': 50}
0.6996584381307251


In [63]:
best_estimator = gscv.best_estimator_
print(best_estimator)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
              n_estimators=50, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)


In [64]:
best_estimator.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=1,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
              n_estimators=50, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [65]:
 y_pred = best_estimator.predict(X_test)
 accuracy_score(y_test,y_pred)

0.6647058823529411

The max achievable accuracy using XGBoost is **70%**