In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report

In [2]:
# Load data
dset = pd.read_csv('liver_disease_.csv')
features = dset.iloc[::, :-1]

In [3]:
# encode 'Gender' attribute
le = LE()
le.fit(['Male', 'Female'])
features.Gender = le.transform(features.Gender)

In [4]:
# handle NaN data
# replace missing-values with column mean
for col in features.columns :
    #if col  == 'Gender' : continue
    mean = features[col].mean()
    features[col] = features[col].fillna(mean)
target = dset.iloc[::, -1]

le.fit(['Yes', 'No'])
target = le.transform(target)

In [5]:
# FEATURE SCALING
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

accuracy_all = {}
f1_all = {}

In [6]:
def getModelReport(modelName, yTest, yPredict) :
    accuracy = accuracy_score(yTest, yPredict)
    precision = precision_score(yTest, yPredict)
    recall = recall_score(yTest, yPredict)
    f1 = f1_score(yTest, yPredict)
    conf_matrix = confusion_matrix(yTest, yPredict)
    clf_report = classification_report(yTest, yPredict)
    accuracy_all[modelName] = accuracy
    f1_all[modelName] = f1

    print('\n\n*************************************************************************************')
    print('FOR {0} :'.format(modelName))
    print('ACCURACY ->', accuracy)
    print('CONFUSION MATRIX ->')
    print(conf_matrix)
    print('CLASSIFICATION REPORT ->')
    print(clf_report)
    print('*************************************************************************************')

In [7]:
# a) Implement AdaBoost ensemble Classifier with 3 different base classifiers (Logistic Regression, Naïve Bayes and Polynomial SVM)

# Implement AdaBoost with Logistic Regression as base classifier
base_clf = LogisticRegression(solver='liblinear')
model = AdaBoostClassifier(base_estimator=base_clf, n_estimators=30)
model.fit(X_train, y_train)
log_y_predict = model.predict(X_test)

In [8]:
# Implement AdaBoost with Naive Bayes as base classifier
base_clf = GaussianNB()
model = AdaBoostClassifier(base_estimator=base_clf, n_estimators=50, algorithm='SAMME')
model.fit(X_train, y_train)
nb_y_predict = model.predict(X_test)

In [9]:
# Implement AdaBoost with Polynomial SVM as base classifier
#base_clf = SVC(kernel='poly', degree=3, class_weight='balanced', gamma='scale', probability=True)
#base_clf = SVC(kernel='poly', degree=3, gamma='scale', probability=True)
base_clf = SVC(kernel='poly', degree=8, gamma='scale')

#model = AdaBoostClassifier(base_estimator=base_clf, n_estimators=10)#, algorithm='SAMME')
model = AdaBoostClassifier(base_estimator=base_clf, algorithm='SAMME')
model.fit(X_train, y_train)
poly_y_predict = model.predict(X_test)
#print(poly_y_predict[:100])

In [10]:
# b) Implement Random Forest (with Grid Search CV)
parameters = {
    'max_features' : ['auto', 2,3],
    'criterion' : ['entropy', 'gini'],
    'min_samples_split' : [2,3],
    'min_samples_leaf' : [2,3],
    'max_leaf_nodes' : [None,2,3],
    'n_estimators' : [10, 50, 100]
    }
clf = RandomForestClassifier()

grid_cv_forest = GridSearchCV(clf, param_grid=parameters, cv=5, iid=True)
grid_cv_forest.fit(X_train, y_train)
#print(grid_cv_forest.best_params_)
best_params = grid_cv_forest.best_params_
model = RandomForestClassifier(n_estimators = best_params['n_estimators'],
                                max_features = best_params['max_features'],
                                criterion = best_params['criterion'],
                                min_samples_split = best_params['min_samples_split'],
                                min_samples_leaf = best_params['min_samples_leaf'],
                                max_leaf_nodes = best_params['max_leaf_nodes']
                                )
model.fit(X_train, y_train)
forest_y_predict = model.predict(X_test)

In [11]:
# c) Implement XG Boost
model = XGBClassifier(max_depth=2)
model.fit(X_train, y_train)
xgboost_y_predict = model.predict(X_test)

In [12]:
# d) Compare accuracy measures (Precision/Recall/F1/CM)
getModelReport('ADABOOST WITH LOGISTIC REGRESSION AS BASE CLASSIFIER', y_test, log_y_predict)
getModelReport('ADABOOST WITH NAIVE BAYES AS BASE CLASSIFIER', y_test, nb_y_predict)
getModelReport('ADABOOST WITH POLYNOMIAL SVM AS BASE CLASSIFIER', y_test, poly_y_predict)
getModelReport('RANDOM FOREST (with Grid Search CV)', y_test, forest_y_predict)
getModelReport('XGBOOST', y_test, xgboost_y_predict)



*************************************************************************************
FOR ADABOOST WITH LOGISTIC REGRESSION AS BASE CLASSIFIER :
ACCURACY -> 0.6971428571428572
CONFUSION MATRIX ->
[[  9  44]
 [  9 113]]
CLASSIFICATION REPORT ->
              precision    recall  f1-score   support

           0       0.50      0.17      0.25        53
           1       0.72      0.93      0.81       122

    accuracy                           0.70       175
   macro avg       0.61      0.55      0.53       175
weighted avg       0.65      0.70      0.64       175

*************************************************************************************


*************************************************************************************
FOR ADABOOST WITH NAIVE BAYES AS BASE CLASSIFIER :
ACCURACY -> 0.5714285714285714
CONFUSION MATRIX ->
[[53  0]
 [75 47]]
CLASSIFICATION REPORT ->
              precision    recall  f1-score   support

           0       0.41      1.00      0.59        5

In [13]:
# find model with best accuracy and best f1
acc_best = 0
model_acc_best = None
f1_best = 0

for model in accuracy_all :
    if accuracy_all[model] > acc_best :
        model_acc_best = model
        acc_best = accuracy_all[model]

for model in f1_all :
    if f1_all[model] > f1_best :
        model_f1_best = model
        f1_best = f1_all[model]

print('\n\n*************************************************************************************')
print('{0} has the best accuracy score with value : {1}'.format(model_acc_best, accuracy_all[model_acc_best]))
print('{0} has the best f1 score with value : {1}'.format(model_f1_best, f1_all[model_f1_best]))



*************************************************************************************
ADABOOST WITH LOGISTIC REGRESSION AS BASE CLASSIFIER has the best accuracy score with value : 0.6971428571428572
ADABOOST WITH POLYNOMIAL SVM AS BASE CLASSIFIER has the best f1 score with value : 0.8175675675675675
