# Introduction
<hr style="border:2px solid black">

In [None]:
"""
What? Improve performance with ensembles

The three most popular methods for combining the predictions from different models are:
[1] BAGGING: [1.1] Bagged decision trees, [1.2] Random forest, [1.3] Extra trees
[2] BOOSTING: [2.1] AdaBoost, [2.2] Stochastic gradient boosting
[3] VOTING: 
"""

# Import python modules

In [1]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from IPython.display import Markdown, display
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier

# Importing dataset

In [3]:
# Read the diabetes CSV, assigning the column labels below on load.
filename = '../DATASETS/pima-indians-diabetes.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(filename, names=names)

# Work on the underlying array: the first eight columns are the inputs,
# the ninth column ('class') is the label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]
num_folds = 10

print("Input size: ", X.shape)
print("Labels size: ", Y.shape)

Input size:  (768, 8)
Labels size:  (768,)


# BAGGING algorithms: Bagged Decision Trees

In [None]:
"""
Bagging performs best with algorithms that have HIGH VARIANCE.
A popular example is decision trees, which are often constructed
without pruning.
"""

In [9]:
# Evaluate an ensemble of 100 bagged decision trees with shuffled 10-fold
# cross-validation.
seed = 7
num_trees = 100
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form is the one that runs on both old and
# new releases.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")

Mean: 0.7578, standard deviation: 0.0386


# BAGGING algorithms: Random forest

In [None]:
"""
Random Forests is an extension of bagged decision trees. 
Samples of the training dataset are taken with replacement, 
but the trees are constructed in a way that reduces the 
correlation between individual classifiers. Specifically, 
rather than greedily choosing the best split point among all features
when constructing each tree, only a random subset of features is
considered for each split.
"""

In [10]:
# Evaluate a 100-tree random forest with shuffled 10-fold cross-validation.
seed = 7
num_trees = 100
max_features = 3  # number of features considered when looking for each split
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the forest as well as the folds: without random_state the model is
# re-randomised on every run, so the reported score could not be reproduced.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")

Mean: 0.7734, standard deviation: 0.0518


# BAGGING algorithms: Extra trees

In [None]:
"""
Extra Trees are another modification of bagging where 
random trees are constructed from samples of the training dataset
"""

In [11]:
# Evaluate a 100-tree Extra Trees ensemble with shuffled 10-fold
# cross-validation.
seed = 7
num_trees = 100
max_features = 7  # number of features considered when looking for each split
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
# Seed the ensemble as well as the folds so the score is reproducible
# from one run to the next.
model = ExtraTreesClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)
results = cross_val_score(model, X, Y, cv=kfold)
# Report the scores like every other section does; the original cell computed
# `results` but never printed them.
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")


Mean: 0.7578, standard deviation: 0.0483


# BOOSTING algorithms: AdaBoost

In [None]:
"""
AdaBoost was perhaps the first successful boosting ensemble algorithm. 
It generally works by weighting instances in the dataset by how easy 
or difficult they are to classify, allowing the algorithm to pay more or
less attention to them in the construction of subsequent models.
"""

In [12]:
# AdaBoost capped at 30 estimators, scored with shuffled 10-fold
# cross-validation; the same seed drives both the folds and the model.
seed = 7
num_trees = 30
model = AdaBoostClassifier(random_state=seed, n_estimators=num_trees)
kfold = KFold(random_state=seed, shuffle=True, n_splits=10)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")

Mean: 0.7553, standard deviation: 0.0371


# BOOSTING algorithms: Stochastic Gradient Boosting

In [None]:
"""
Stochastic gradient boosting is proving to be perhaps one of the
best techniques available for improving performance via ensembles.
"""

In [14]:
# Gradient boosting with 100 estimators, scored with shuffled 10-fold
# cross-validation; the same seed drives both the folds and the model.
num_trees = 100
seed = 7
model = GradientBoostingClassifier(random_state=seed, n_estimators=num_trees)
kfold = KFold(random_state=seed, shuffle=True, n_splits=10)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")

Mean: 0.7605, standard deviation: 0.0537


# Voting ensemble

In [None]:
"""
Voting is one of the simplest ways of combining the predictions from multiple 
machine learning algorithms. It works by first creating two or more standalone 
models from your training dataset. A Voting Classifier can then be used to wrap
your models and combine the predictions of the sub-models (by majority vote, or by
averaging predicted probabilities) when asked to make predictions for new data.
"""

In [15]:
# Voting ensemble over three different classifiers, scored with shuffled
# 10-fold cross-validation.
seed = 7
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Create the sub-models.
model1 = LogisticRegression(max_iter=250)
# The tree is seeded so the ensemble score is reproducible between runs;
# an unseeded tree can differ from one fit to the next.
model2 = DecisionTreeClassifier(random_state=seed)
model3 = SVC()
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# Create the ensemble model. VotingClassifier is left in its default voting
# mode (majority / "hard" voting according to the scikit-learn documentation).
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(f"Mean: {results.mean():.4f}, standard deviation: {results.std():.4f}")

Mean: 0.7696, standard deviation: 0.0508
