# Bagging, Boosting, and Stacking Ensemble Methods

Use an ensemble only when it is actually needed: ensemble methods train many models, so they are generally resource-intensive.

## Bagging

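Bagging (bootstrap aggregating) trains each base model on a bootstrap sample — rows drawn at random *with replacement* from the training data — and then combines the models' predictions (majority vote for classification). No synthetic data is created; each model simply sees a slightly different resampling of the same data. This diversity among the models reduces variance, which combats overfitting.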

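`KFold` defines the cross-validation scheme: the data is split into `n_splits` folds, and each fold is used once as the validation set while the model is trained on the remaining folds.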

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import load_breast_cancer

In [2]:
# Workaround for HTTPS certificate errors (presumably hit when downloading the dataset).
# WARNING: this replaces ssl's private default-HTTPS-context factory with the
# unverified one, so HTTPS requests that rely on the default context no longer
# verify the server certificate for the rest of this kernel session.
# NOTE(review): prefer fixing the local CA certificates and deleting this cell.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
# Pima Indians diabetes dataset as a raw CSV on GitHub.
# Fetched over HTTPS rather than plain HTTP so the download is not sent in clear text.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# The CSV has no header row, so the column names are supplied explicitly;
# 'class' is the target column.
col_headers = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

df_diab = pd.read_csv(url, names=col_headers)
df_diab.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Split X, y
X = df_diab.drop(columns='class', axis=1)
y = df_diab['class']

In [6]:
# Set up cross validation
k_fold = KFold(n_splits=10)

# Build the model
DTC = DecisionTreeClassifier()

In [7]:
# Bagging ensemble built from 80 copies of the DTC base learner, with a fixed seed
BC = BaggingClassifier(
    estimator=DTC,
    n_estimators=80,
    random_state=12,
)

In [8]:
# Cross-validate the bagging ensemble on (X, y) using the k_fold splitter
results = cross_val_score(estimator=BC, X=X, y=y, cv=k_fold)

In [9]:
# Show the individual cross-validation scores (one per fold)
results

array([0.64935065, 0.83116883, 0.72727273, 0.66233766, 0.77922078,
       0.83116883, 0.83116883, 0.85714286, 0.71052632, 0.78947368])

In [10]:
# Average of the per-fold scores: a single summary number for the bagging model
results.mean()

0.7668831168831168

## Boosting

### Gradient Boosting / XGBoost

In [13]:
# Load scikit-learn's bundled breast cancer dataset and wrap the feature
# matrix in a DataFrame labelled with the dataset's feature names.
# NOTE(review): `df_titties` is an unprofessional name — rename it (e.g.
# `df_cancer`) together with every cell that references it.
bc = load_breast_cancer()
df_titties = pd.DataFrame(data=bc.data, columns=bc.feature_names)

In [14]:
# Features and target for the boosting section.
# NOTE(review): this rebinds the X / y used by the bagging cells above, so
# re-running those cells after this one would silently score on this dataset.
X, y = bc.data, bc.target

In [16]:
# Shuffled 8-fold splitter with a fixed seed, so the fold assignment is repeatable.
# NOTE(review): rename `k_folds_titties` (e.g. `k_folds_cancer`) together with
# the cell that passes it to cross_val_score.
k_folds_titties = KFold(
    n_splits=8,
    shuffle=True,
    random_state=5,
)

In [17]:
# Gradient boosting classifier with every hyperparameter left at its default.
# NOTE(review): no random_state is passed, unlike the bagging model and the
# KFold splitter in this section — consider seeding it for repeatable scores.
GB = GradientBoostingClassifier()

In [18]:
# List the model's hyperparameters and their current values. GB was built
# with no arguments, so these are the library defaults.
GB.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [19]:
# Cross-validate the gradient boosting model on (X, y) with the shuffled 8-fold splitter
results_titties = cross_val_score(estimator=GB, X=X, y=y, cv=k_folds_titties)

In [20]:
# Show the individual cross-validation scores (one per fold) for the boosting model
results_titties

array([0.97222222, 0.98591549, 1.        , 0.97183099, 0.95774648,
       0.97183099, 0.98591549, 0.88732394])