# Introduction

In [None]:
"""
What? Building non-correlated ensembles
"""

# Import modules

In [4]:
import datetime as dt
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from category_encoders.target_encoder import TargetEncoder
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
# Silence warnings
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [None]:
"""
This dataset is scikit-learn's built-in breast cancer dataset, a binary classification problem (malignant vs. benign tumours).

"""

In [2]:
# return_X_y=True yields the feature matrix and the target vector as a pair.
X, y = load_breast_cancer(return_X_y=True)

In [3]:
# Stratified 5-fold splitter shared by the cross_val_score calls in this notebook.
kfold = StratifiedKFold(n_splits=5)

In [5]:
def classification_model(model):
    """Return the mean cross-validation score of ``model``.

    The estimator is scored with ``cross_val_score`` on the notebook-level
    ``X`` and ``y``, using the shared ``kfold`` splitter for the folds.
    """
    return cross_val_score(model, X, y, cv=kfold).mean()

In [6]:
# Baseline: XGBoost classifier with its default parameters.
classification_model(XGBClassifier())

0.9771619313771154

In [7]:
# XGBoost classifier with the gblinear booster.
classification_model(XGBClassifier(booster='gblinear'))

0.5357397919577706

In [8]:
# XGBoost classifier with the dart booster and one_drop enabled.
classification_model(XGBClassifier(booster='dart', one_drop=True))

0.9736376339077782

In [9]:
# Random forest with a fixed random_state so the score is reproducible.
classification_model(RandomForestClassifier(random_state=2))

0.9666356155876418

In [10]:
# Logistic regression allowed up to 10000 solver iterations.
classification_model(LogisticRegression(max_iter=10000))

0.9525694767893184

In [11]:
# XGBoost classifier with explicitly chosen n_estimators, max_depth and colsample_bylevel.
classification_model(XGBClassifier(n_estimators=800, max_depth=4, colsample_bylevel=0.8))

0.9771464058376027

In [None]:
"""
Most models perform respectably, with the XGBoost classifier obtaining the highest score. The gblinear base learner 
did not perform particularly well, however, so we will not use it going forward.
"""

# Correlation

In [None]:
"""
The purpose of this section is not to select all models for the ensemble, but rather to select the 
non-correlated models. Correlation is a statistical measure between -1 and 1 that indicates the 
strength of the linear relationship between two sets of points. A correlation of 1 is a perfectly 
straight line, while a correlation of 0 shows no linear relationship whatsoever.

A high correlation between machine learning models is undesirable in an ensemble. But why? Consider the
case of two classifiers with 1,000 predictions each. If these classifiers all make the same predictions, 
no new information is gained from the second classifier, making it superfluous. Using a majority rules 
implementation, a prediction is only wrong if the majority of classifiers get it wrong. It's desirable, 
therefore, to have a diversity of models that score well but give different predictions. If most models
give the same predictions, the correlation is high, and there is little value in adding the new model to
the ensemble. Finding differences in predictions where a strong model may be wrong gives the ensemble the
chance to produce better results. Predictions will be different when the models are non-correlated.
"""

In [14]:
# Hold out a test split; random_state=2 keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [15]:
def y_pred(model):
    """Fit ``model`` on the training split and return its test-split predictions.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    asked to predict ``X_test``. The accuracy of those predictions against
    ``y_test`` is printed as a side effect.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    The value returned by ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)
    # Distinct local name: the original reused ``y_pred`` and shadowed the
    # function's own name inside its body.
    predictions = model.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred); pass the arguments in
    # that order. Accuracy is symmetric, so the printed value is unchanged.
    score = accuracy_score(y_test, predictions)
    print(score)
    return predictions

In [None]:
"""
To compute correlations between machine learning models, we first need data points to compare. The different 
data points that machine learning models produce are their predictions. After obtaining predictions, we 
concatenate them into a DataFrame, and then apply the .corr method to obtain all correlations at once.
"""

In [16]:
# Test-split predictions of the default XGBoost classifier.
y_pred_gbtree = y_pred(XGBClassifier())

0.951048951048951


In [17]:
# Test-split predictions of XGBoost with the dart booster and one_drop enabled.
y_pred_dart = y_pred(XGBClassifier(booster='dart', one_drop=True))

0.951048951048951


In [18]:
# Test-split predictions of the random forest (fixed random_state).
y_pred_forest = y_pred(RandomForestClassifier(random_state=2))

0.9370629370629371


In [19]:
# Test-split predictions of logistic regression with max_iter=10000.
y_pred_logistic = y_pred(LogisticRegression(max_iter=10000))

0.9370629370629371


In [20]:
# Test-split predictions of XGBoost with explicit max_depth, n_estimators and learning_rate.
y_pred_xgb = y_pred(XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1))

0.965034965034965


In [21]:
# One column per model, one row per test sample, so that .corr() compares the
# models' predictions pairwise.
df_pred = pd.DataFrame(
    np.column_stack(
        (y_pred_gbtree, y_pred_dart, y_pred_forest, y_pred_logistic, y_pred_xgb)
    ),
    columns=['gbtree', 'dart', 'forest', 'logistic', 'xgb'],
)

In [22]:
# Pairwise correlation between the prediction columns of df_pred.
df_pred.corr()

Unnamed: 0,gbtree,dart,forest,logistic,xgb
gbtree,1.0,0.971146,0.884584,0.914111,0.971146
dart,0.971146,1.0,0.913438,0.914111,0.971146
forest,0.884584,0.913438,1.0,0.943308,0.913438
logistic,0.914111,0.914111,0.943308,1.0,0.914111
xgb,0.971146,0.971146,0.913438,0.914111,1.0


In [None]:
"""
There is no clear cut-off to obtain a non-correlated threshold. It ultimately depends on the values of correlation 
and the number of models to choose from. For this example, we could pick the next two least correlated models with
our best model, xgb, which are the random forest and logistic regression. Now we
will combine them into a single ensemble using the VotingClassifier ensemble, introduced next.
"""

In [25]:
# Members of the voting ensemble: logistic regression, a tuned XGBoost
# classifier and a random forest.
logistic_model = LogisticRegression(max_iter=10000)
xgb_model = XGBClassifier(max_depth=2, n_estimators=500, learning_rate=0.1)
rf_model = RandomForestClassifier(random_state=2)

# VotingClassifier takes a list of (name, estimator) pairs.
estimators = [
    ('logistic', logistic_model),
    ('xgb', xgb_model),
    ('rf', rf_model),
]

# Cross-validate the combined classifier with the shared splitter and report
# the mean score.
ensemble = VotingClassifier(estimators)
scores = cross_val_score(ensemble, X, y, cv=kfold)
print(scores.mean())

0.9771619313771154


# Stacking models

In [None]:
"""
It is important to note that the stacking classifier builds a model on top of the base models (or estimators), thereby
increasing the fit to the dataset. This also increases the tendency of the stacking classifier to overfit, especially
in the case of multi-layer stacking.
"""

In [24]:
# Base learners of the stack, given as (name, estimator) pairs.
# NOTE(review): LogisticRegression is used here with its default max_iter,
# unlike the earlier cells that pass max_iter=10000 - confirm this is intended.
base_models = [
    ('lr', LogisticRegression()),
    ('xgb', XGBClassifier()),
    ('rf', RandomForestClassifier(random_state=2)),
]

# Meta learner that combines the base models' outputs. The voting classifier
# has no such final estimator.
meta_model = LogisticRegression()

# Build the stacking ensemble and report its mean cross-validated score.
clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)
scores = cross_val_score(clf, X, y, cv=kfold)
print(scores.mean())

0.9789318428815401


# References
<hr style="border:2px solid black"> </hr>


- Corey Wade. “Hands-On Gradient Boosting with XGBoost and scikit-learn”
- https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn
    
