<a href="https://colab.research.google.com/github/karimqasim/MachineLearning/blob/master/EnsembleLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns # data visualization library  
import matplotlib.pyplot as plt

In [5]:
# Upload the dataset interactively when running on Google Colab.
# Outside Colab the google.colab package is not installed, so instead of
# failing on the import we fall back to expecting the CSV file to already be
# present in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = None  # not on Colab: nothing to upload
else:
    uploaded = files.upload()  # prompts for the file and saves it locally

Saving Indian Liver Patient Dataset (ILPD).csv to Indian Liver Patient Dataset (ILPD).csv


In [0]:
# Load the uploaded ILPD file into a DataFrame
DATA_FILE = 'Indian Liver Patient Dataset (ILPD).csv'
df = pd.read_csv(DATA_FILE)

In [7]:
# Preview the first five rows to sanity-check the parsed columns
df.head()

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [8]:
# Column dtypes and non-null counts (reveals which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
age                 583 non-null int64
gender              583 non-null object
tot_bilirubin       583 non-null float64
direct_bilirubin    583 non-null float64
tot_proteins        583 non-null int64
albumin             583 non-null int64
ag_ratio            583 non-null int64
sgpt                583 non-null float64
sgot                583 non-null float64
alkphos             579 non-null float64
is_patient          583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


Note that `alkphos` (alkaline phosphatase) contains 4 null values: only 579 of its 583 entries are non-null.

In [9]:
# Count the missing values in each column.
# isnull() is an alias of isna(), and a cell only displays its last
# expression, so a single call is all that is needed here.
df.isna().sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             4
is_patient          0
dtype: int64

In [0]:
# Impute missing values with the mean of their column (only alkphos has any).
# numeric_only=True restricts the mean to numeric columns: the string-valued
# `gender` column has no mean, and pandas >= 2.0 raises a TypeError for it
# instead of silently skipping the column.
df_final = df.fillna(df.mean(numeric_only=True))

In [11]:
# Summary statistics of alkphos after imputation (count should now be 583)
df_final['alkphos'].describe()

count    583.000000
mean       0.947064
std        0.318492
min        0.300000
25%        0.700000
50%        0.947064
75%        1.100000
max        2.800000
Name: alkphos, dtype: float64

Gender is a categorical variable:

In [12]:
# Frequency of each gender category
df_final['gender'].value_counts()

Male      441
Female    142
Name: gender, dtype: int64

In [0]:
# Encode gender as a binary indicator column: 1 for 'Male', 0 for 'Female'
gender_codes = {'Male': 1, 'Female': 0}
df_final['Is_male'] = df_final['gender'].map(gender_codes)


In [14]:
# Confirm that the new Is_male column agrees with gender
df_final.head()

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient,Is_male
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1,0
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1,1


In [0]:
# Feature matrix: everything except the raw gender text (replaced by Is_male)
# and the is_patient target
X = df_final.drop(columns=['gender', 'is_patient'])

In [16]:
# Preview the feature matrix
X.head()

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,Is_male
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9,0
1,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1


We will standardize all the variables except the binary `Is_male` indicator.

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
# Scaler that centers each feature and rescales it to unit variance
scaler = StandardScaler()

In [19]:
# Learn the scaling statistics for every feature except the binary Is_male flag.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so statistics of the test rows leak into the scaling.
scaler.fit(X.drop(columns='Is_male'))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [0]:
# Apply the fitted centering and scaling to the same columns the scaler was fitted on
scaled_features = scaler.transform(X.drop(columns='Is_male'))

In [21]:
# Inspect the standardized values (transform returns a plain NumPy array)
scaled_features

array([[ 1.25209764, -0.41887783, -0.49396398, ...,  0.29211961,
         0.19896867, -0.14789798],
       [ 1.06663704,  1.22517135,  1.43042334, ...,  0.93756634,
         0.07315659, -0.65069686],
       [ 1.06663704,  0.6449187 ,  0.93150811, ...,  0.47653296,
         0.19896867, -0.17932291],
       ...,
       [ 0.44843504, -0.4027597 , -0.45832717, ..., -0.0767071 ,
         0.07315659,  0.16635131],
       [-0.84978917, -0.32216906, -0.35141677, ...,  0.29211961,
         0.32478075,  0.16635131],
       [-0.41704777, -0.37052344, -0.42269037, ...,  0.75315299,
         1.58290153,  1.73759779]])

In [0]:
# Rebuild a labelled DataFrame from the scaled array.
# The column labels are obtained by dropping 'Is_male' by name, mirroring what
# was passed to the scaler, instead of assuming it is the last column of X.
# X's index is reused so that columns copied over from df_final align row for row.
df_X = pd.DataFrame(
    scaled_features,
    columns=X.columns.drop('Is_male'),
    index=X.index,
)

In [23]:
# Preview the standardized features
df_X.head()

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos
0,1.252098,-0.418878,-0.493964,-0.426715,-0.354665,-0.318393,0.29212,0.198969,-0.147898
1,1.066637,1.225171,1.430423,1.682629,-0.091599,-0.034333,0.937566,0.073157,-0.650697
2,1.066637,0.644919,0.931508,0.821588,-0.113522,-0.145186,0.476533,0.198969,-0.179323
3,0.819356,-0.370523,-0.387054,-0.447314,-0.365626,-0.311465,0.29212,0.324781,0.166351
4,1.684839,0.096902,0.183135,-0.393756,-0.294379,-0.176363,0.753153,-0.93334,-1.719144


In [0]:
# Re-attach the unscaled binary gender indicator (assignment aligns on the index)
df_X['Is_male'] = df_final['Is_male']

In [25]:
# Final feature matrix: standardized features plus Is_male
df_X.head()

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,Is_male
0,1.252098,-0.418878,-0.493964,-0.426715,-0.354665,-0.318393,0.29212,0.198969,-0.147898,0
1,1.066637,1.225171,1.430423,1.682629,-0.091599,-0.034333,0.937566,0.073157,-0.650697,1
2,1.066637,0.644919,0.931508,0.821588,-0.113522,-0.145186,0.476533,0.198969,-0.179323,1
3,0.819356,-0.370523,-0.387054,-0.447314,-0.365626,-0.311465,0.29212,0.324781,0.166351,1
4,1.684839,0.096902,0.183135,-0.393756,-0.294379,-0.176363,0.753153,-0.93334,-1.719144,1


In [0]:
# Target vector: the is_patient label column
y = df_final['is_patient']

In [0]:
# One seed shared by the data split and the stochastic models, for reproducibility
SEED = 1

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier

# Three different kinds of base learner
lr = LogisticRegression(random_state=SEED)
knn = KNN(n_neighbors=27)
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

# (display name, estimator) pairs, in the format VotingClassifier expects
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

In [0]:
# Hold out 30% of the rows for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    y,
    test_size=0.3,
    random_state=SEED,
)

In [29]:
# Train every base learner on the training set and report its test-set accuracy
for clf_name, clf in classifiers:
    # fit returns the fitted estimator, so training and prediction can be chained
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # Share of test rows whose label was predicted correctly
    accuracy = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

Logistic Regression : 0.731
K Nearest Neighbours : 0.714
Classification Tree : 0.726




Of the three individual classifiers, Logistic Regression achieved the highest test-set accuracy: 73.1%.

Finally, we'll evaluate the performance of a voting classifier that takes the outputs of the models defined in the list classifiers and assigns labels by majority voting.

In [30]:
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Combine the three base learners; the default voting scheme is hard (majority) voting
vc = VotingClassifier(estimators=classifiers)

# Train the ensemble and label the held-out rows (fit returns the fitted estimator)
y_pred = vc.fit(X_train, y_train).predict(X_test)

# Report the ensemble's test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

Voting Classifier: 0.749




Notice that the voting classifier achieves a test-set accuracy of 74.9%, which is higher than the 73.1% achieved by the best individual model, Logistic Regression.

## Bagging Classification

In [0]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Base learner: a tree with no depth or leaf-size constraint
dt = DecisionTreeClassifier(random_state=SEED)

# Bagging ensemble of 50 trees.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was later removed), whereas the first positional argument is the base
# learner in both old and new releases.
bc = BaggingClassifier(dt, n_estimators=50, random_state=SEED)

In [32]:
# Fit the single tree dt to the training set (baseline to compare bagging against)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [33]:

# Use the single tree dt to predict the test set labels
y_pred_dt = dt.predict(X_test)

# Compute the test-set accuracy of dt
accuracy_dt = accuracy_score(y_test,y_pred_dt)

# Print accuracy_dt (unrounded)
print('Accuracy achieved by using dt: ', accuracy_dt)


Accuracy achieved by using dt:  0.6171428571428571


In [34]:
# Train the bagged ensemble and label the held-out rows in one chained call
# (fit returns the fitted estimator)
y_pred = bc.fit(X_train, y_train).predict(X_test)

# Share of test rows classified correctly by the ensemble
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))

Test set accuracy of bc: 0.69


The single tree dt achieved an accuracy of 62%, which is 7 percentage points lower than bc's accuracy of 69%!

## AdaBoost Classifier

As a first step, we'll instantiate an AdaBoost classifier.

In [0]:
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Weak learner: a shallow tree limited to depth 2
dt = DecisionTreeClassifier(max_depth=2, random_state=SEED)

# AdaBoost ensemble of 180 boosting rounds.
# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
# was later removed), whereas the first positional argument is the base
# learner in both old and new releases.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=SEED)

Next we will train ada and evaluate the probability of obtaining the positive class in the test set.

In [0]:
# Train the boosted ensemble, then keep the second probability column, i.e. the
# predicted probability of the class stored at ada.classes_[1].
# NOTE(review): confirm that this is the class intended as "positive".
y_pred_proba = ada.fit(X_train, y_train).predict_proba(X_test)[:, 1]

In [37]:
# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed on the held-out rows from the predicted probabilities
ada_roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))

ROC AUC score: 0.70


This untuned AdaBoost classifier achieved a ROC AUC score of 0.70!