# Gradient Boosting Libraries
- conda install -c conda-forge lightgbm xgboost catboost

## Setting up

- Wine data
- Binary classification (filter class 0 and 1)
- 2 features ('alcohol','malic_acid')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#Load data
dataObj = load_wine()
X = dataObj.data
y = dataObj.target

# Create DataFrame with features
dfori = pd.DataFrame(X)
dfori.columns = dataObj.feature_names

# Add class column
dfori.insert(loc=0, column="Class", value=y)

#Filter class 0 and 1
filt = (dfori['Class'] == 0) | (dfori['Class'] == 1)
df = dfori.loc[filt]

# Extract two features 
X = df[['alcohol','malic_acid']].values
y = df['Class'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1, stratify=y)

In [3]:
# Make scorer
scorer_ACC = make_scorer(accuracy_score)
scorer_PRE = make_scorer(precision_score, zero_division=0, pos_label=1, average='weighted')
scorer_REC = make_scorer(recall_score, zero_division=0, pos_label=1, average='weighted')
scorer_F1 = make_scorer(f1_score, pos_label=1, average='weighted')
scorer_ROCAUC = make_scorer(roc_auc_score)

scorings = {'accuracy':scorer_ACC,
            'precision':scorer_PRE,
            'recall': scorer_REC,
            'f1': scorer_F1,
            'rocauc': scorer_ROCAUC}

In [4]:
def calculate_scores(clf, name, data):
        dataArr = []
        X_train, X_test, y_train, y_test = data
        sc = cross_validate(estimator=clf,
                                X=X_train,
                                y=y_train,
                                cv=10,
                                scoring=scorings,
                                n_jobs=-1)
                                
        ACC = sc['test_accuracy']
        PRE = sc['test_precision']
        REC = sc['test_recall']
        F1 = sc['test_f1']
        ROCAUC = sc['test_rocauc']

        data = {'clf': f"{name} (Train)",
                'ACC': f"{ACC.mean():6.3f}",
                'PRE': f"{PRE.mean():6.3f}",
                'REC': f"{REC.mean():6.3f} ",
                'F1': f"{F1.mean():6.3f}",
                'ROCAUC': f"{ROCAUC.mean():6.3f}",
                'Type': 'Train'}

        dataArr.append(data)

        # Testing scores
        clf.fit(X_train, y_train)
        ACC = scorings['accuracy'](clf, X_test, y_test)
        PRE = scorings['precision'](clf, X_test, y_test)
        REC = scorings['recall'](clf, X_test, y_test)
        F1 = scorings['f1'](clf, X_test, y_test)
        y_proba = clf.predict_proba(X_test)
        ROCAUC = roc_auc_score(y_true=y_test, y_score=y_proba[:,1])

        data = {'clf': f"{name} (Test)", 'ACC': ACC, 'PRE': PRE, 'REC': REC, 'F1': F1, 'ROCAUC': ROCAUC, 'Type': 'Test'}
        dataArr.append(data)
        return dataArr

In [5]:
# Classifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)

base = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=1,
                              random_state=1)

ada = AdaBoostClassifier(base_estimator=base,
                         n_estimators=500, 
                         learning_rate=0.1,
                         random_state=1)


# Gradient Boosting Machine
gbm = GradientBoostingClassifier(loss='deviance',
                                learning_rate=0.1,
                                n_estimators=100,
                                min_samples_split=2, #The minimum number of samples required to split an internal node
                                min_samples_leaf=1, #The minimum number of samples required to be at a leaf node. 
                                max_depth=3 #The maximum depth of the individual regression estimators.
                                )

# XGBoost                    
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, n_estimators=100)

# LightGBM
lgb = LGBMClassifier(n_estimators=100)

# CatBoost
cat = CatBoostClassifier(verbose=0, n_estimators=100)

In [13]:
import time

clfs = {'Tree': tree, 'AdaBoost': ada, 'GBM-SKL': gbm, 'XGBoost': xgb, 'LightGBM': lgb, 'CatBoost': cat }
data =  (X_train, X_test, y_train, y_test)

dataArrAll = []
for name, clf in clfs.items():
    tic = time.perf_counter()
    dataArr = calculate_scores(clf, name, data)
    toc = time.perf_counter()
    ep_time = toc - tic
    print(f'Calculating {name}, Elapse Time: {ep_time:0.4f} seconds')
    dataArrAll = [*dataArrAll, *dataArr]

dft = pd.DataFrame.from_records(dataArrAll).set_index(['clf'])
display(dft)

Calculating Tree, Elapse Time: 0.0185 seconds
Calculating AdaBoost, Elapse Time: 1.2826 seconds
Calculating GBM-SKL, Elapse Time: 0.0937 seconds
Calculating XGBoost, Elapse Time: 0.2421 seconds
Calculating LightGBM, Elapse Time: 0.8248 seconds
Calculating CatBoost, Elapse Time: 0.8189 seconds


Unnamed: 0_level_0,ACC,PRE,REC,F1,ROCAUC,Type
clf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tree (Train),0.821,0.855,0.821,0.813,0.822,Train
Tree (Test),0.865385,0.869195,0.865385,0.86432,0.860119,Test
AdaBoost (Train),0.846,0.872,0.846,0.844,0.847,Train
AdaBoost (Test),0.923077,0.925175,0.923077,0.922731,0.990327,Test
GBM-SKL (Train),0.861,0.878,0.861,0.859,0.863,Train
GBM-SKL (Test),0.923077,0.926036,0.923077,0.923191,0.968006,Test
XGBoost (Train),0.911,0.92,0.911,0.911,0.915,Train
XGBoost (Test),0.903846,0.910085,0.903846,0.903953,0.979911,Test
LightGBM (Train),0.898,0.918,0.898,0.896,0.903,Train
LightGBM (Test),0.903846,0.904729,0.903846,0.903953,0.987351,Test


In [None]:
display(dft[dft['Type']=='Train'])

Unnamed: 0_level_0,ACC,PRE,REC,F1,ROCAUC,Type
clf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tree (Train),0.821,0.834,0.885,0.846,0.822,Train
AdaBoost (Train),0.846,0.862,0.885,0.862,0.847,Train
GBM-SKL (Train),0.861,0.877,0.885,0.873,0.863,Train
XGBoost (Train),0.911,0.93,0.905,0.913,0.915,Train
LightGBM (Train),0.898,0.922,0.905,0.904,0.903,Train
CatBoost (Train),0.911,0.95,0.885,0.913,0.918,Train


In [None]:
display(dft[dft['Type']=='Test'])

Unnamed: 0_level_0,ACC,PRE,REC,F1,ROCAUC,Type
clf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tree (Test),0.865385,0.83871,0.928571,0.881356,0.860119,Test
AdaBoost (Test),0.923077,0.9,0.964286,0.931034,0.990327,Test
GBM-SKL (Test),0.942308,1.0,0.892857,0.943396,0.968006,Test
XGBoost (Test),0.903846,0.96,0.857143,0.90566,0.979911,Test
LightGBM (Test),0.903846,0.925926,0.892857,0.909091,0.987351,Test
CatBoost (Test),0.923077,1.0,0.857143,0.923077,1.0,Test
