In [5]:
import sys
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import datasets, neighbors, tree, ensemble, linear_model, naive_bayes, svm

In [6]:
# check versions
print(sys.version)
!python --version
print("sklearn: ", sklearn.__version__)

3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]
Python 3.11.7
sklearn:  1.4.0


In [38]:
# initialize a random state to be used for all models
rand_state = 45

## Create random data sets with sklearn datasets class

In [56]:
# classifier data
# using sklearn to create a data set. n_samples is number or rows, n_features is number of columns and n_classes is number of categories to classify
xclassifier, yclassifier = datasets.make_classification(n_samples=10000, n_features=50, n_classes=2)

xclassifier_train, xclassifier_test, yclassifier_train, yclassifier_test = train_test_split(xclassifier, yclassifier, stratify=yclassifier, shuffle=True)
print(xclassifier_train.shape, xclassifier_test.shape, yclassifier_train.shape, yclassifier_test.shape)

(7500, 50) (2500, 50) (7500,) (2500,)


In [57]:
# regression data
xreg, yreg = datasets.make_regression(n_samples=10000, n_features=50, n_targets=1)

xreg_train, xreg_test, yreg_train, yreg_test = train_test_split(xreg, yreg, shuffle=True)
print(xreg_train.shape, xreg_test.shape, yreg_train.shape, yreg_test.shape)

(7500, 50) (2500, 50) (7500,) (2500,)


## Classification models

In [39]:
# create a list of classifier models to be used
model_list_clf = [
    (ensemble.BaggingClassifier(estimator=linear_model.RidgeClassifier(random_state=rand_state), n_estimators=20), 'BaggingClassifier'),
    (ensemble.RandomForestClassifier(random_state=rand_state), 'RandomForestClassifier'),
    (ensemble.ExtraTreesClassifier(random_state=rand_state), 'ExtraTreesClassifier'),
    (ensemble.AdaBoostClassifier(algorithm='SAMME', random_state=rand_state), 'AdaBoostClassifier'),
    (ensemble.GradientBoostingClassifier(random_state=rand_state), 'GradientBoostingClassifier'),
    (ensemble.HistGradientBoostingClassifier(random_state=rand_state), 'HistGradientBoostingClassifier')
]

In [40]:
def fit_model(clf):
    clf.fit(xclassifier_train, yclassifier_train)
    yclassifier_pred = clf.predict(xclassifier_test)

    accuracy = metrics.accuracy_score(yclassifier_test, yclassifier_pred)
    print("Accuracy: ", accuracy)

### fit the data to each model in the model_list_clf list to see which perform best on the data

In [41]:
for clf, name in model_list_clf:
    print("-"*50)
    print(name)
    fit_model(clf)

--------------------------------------------------
BaggingClassifier
Accuracy:  0.8868
--------------------------------------------------
RandomForestClassifier
Accuracy:  0.8944
--------------------------------------------------
ExtraTreesClassifier
Accuracy:  0.8872
--------------------------------------------------
AdaBoostClassifier
Accuracy:  0.8852
--------------------------------------------------
GradientBoostingClassifier
Accuracy:  0.8936
--------------------------------------------------
HistGradientBoostingClassifier
Accuracy:  0.8888


### Voting classifier

In [58]:
clf1 = ensemble.BaggingClassifier(estimator=linear_model.RidgeClassifier(random_state=rand_state))
clf2 = naive_bayes.GaussianNB()
clf3 = ensemble.HistGradientBoostingClassifier(random_state=rand_state)

estimators = [
    ('br', clf1),
    ('lr', clf2),
    ('hgb', clf3),
]

ensemble_classifier = ensemble.VotingClassifier(estimators=estimators)
ensemble_classifier.fit(xclassifier_train, yclassifier_train)

yclassifier_pred = ensemble_classifier.predict(xclassifier_test)

accuracy = metrics.accuracy_score(yclassifier_test, yclassifier_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9284


Results with ensemble methods:
- BaggingClassifier - Accuracy:  0.8868
- RandomForestClassifier - Accuracy:  0.8944
- ExtraTreesClassifier - Accuracy:  0.8872
- AdaBoostClassifier - Accuracy:  0.8852
- GradientBoostingClassifier - Accuracy:  0.8936
- HistGradientBoostingClassifier - Accuracy:  0.8888

Results of ensemble methods used with voting classifier:
- Accuracy:  0.9284

## Regression models

In [45]:
# create a list of regression models to be used
model_list_reg = [
    (ensemble.BaggingRegressor(estimator=linear_model.Ridge(random_state=rand_state), n_estimators=20), 'BaggingRegressor'),
    (ensemble.RandomForestRegressor(random_state=rand_state), 'RandomForestRegressor'),
    (ensemble.ExtraTreesRegressor(random_state=rand_state), 'ExtraTreesRegressor'),
    (ensemble.AdaBoostRegressor(random_state=rand_state), 'AdaBoostRegressor'),
    (ensemble.GradientBoostingRegressor(random_state=rand_state), 'GradientBoostingRegressor'),
    (ensemble.HistGradientBoostingRegressor(random_state=rand_state), 'HistGradientBoostingRegressor')
]

In [43]:
def fit_model_reg(reg):
    reg.fit(xreg_train, yreg_train)
    yreg_pred = reg.predict(xreg_test)

    mse = metrics.mean_squared_error(yreg_test, yreg_pred)
    print("MSE: ", mse)

In [46]:
for reg, name in model_list_reg:
    print("-"*50)
    print(name)
    fit_model_reg(reg)

--------------------------------------------------
BaggingRegressor
MSE:  0.0008066532351918132
--------------------------------------------------
RandomForestRegressor
MSE:  8161.828531608716
--------------------------------------------------
ExtraTreesRegressor
MSE:  6681.386187683136
--------------------------------------------------
AdaBoostRegressor
MSE:  9816.239323477608
--------------------------------------------------
GradientBoostingRegressor
MSE:  2902.5682633281417
--------------------------------------------------
HistGradientBoostingRegressor
MSE:  1451.1854870253912


### Stack

In [49]:
estimators = [
    ('ridge', linear_model.Ridge(random_state=rand_state)),
    ('linear', linear_model.LinearRegression()),
    ('knr', neighbors.KNeighborsRegressor()),
]

final_estimator = ensemble.GradientBoostingRegressor(random_state=rand_state)

reg = ensemble.StackingRegressor(estimators=estimators, final_estimator=final_estimator)

reg.fit(xreg_train, yreg_train)
yreg_pred = reg.predict(xreg_test)

mse = metrics.mean_squared_error(yreg_test, yreg_pred)
print("MSE: ", mse)

MSE:  4.406927237514981


### Stack of stacks

In [51]:
estimators_2 = [
    ('gbr', ensemble.GradientBoostingRegressor(random_state=rand_state)),
    ('rfg', ensemble.RandomForestRegressor(random_state=rand_state)),
    ('abr', ensemble.AdaBoostRegressor(random_state=rand_state)),
]

final_estimator = ensemble.StackingRegressor(estimators=estimators_2, final_estimator=linear_model.Ridge(random_state=rand_state))

# run the previous estimators also
estimators = [
    ('ridge', linear_model.Ridge(random_state=rand_state)),
    ('linear', linear_model.LinearRegression()),
    ('knr', neighbors.KNeighborsRegressor()),
]

reg = ensemble.StackingRegressor(estimators=estimators, final_estimator=final_estimator)

reg.fit(xreg_train, yreg_train)
yreg_pred = reg.predict(xreg_test)

mse = metrics.mean_squared_error(yreg_test, yreg_pred)
print("MSE: ", mse)

MSE:  0.13733201894820282


### Voting Regressor

In [55]:
reg1 = ensemble.BaggingRegressor(estimator=linear_model.Ridge(random_state=rand_state))
reg2 = linear_model.LinearRegression()
reg3 = ensemble.HistGradientBoostingRegressor(random_state=rand_state)

estimators = [
    ('br', reg1),
    ('lr', reg2),
    ('hgb', reg3),
]

ensemble_regressor = ensemble.VotingRegressor(estimators=estimators)
ensemble_regressor.fit(xreg_train, yreg_train)

yreg_pred = ensemble_regressor.predict(xreg_test)

mse = metrics.mean_squared_error(yreg_test, yreg_pred)
print("MSE: ", mse)

MSE:  161.36963070130594


##### Ensemble method results
- BaggingRegressor - MSE: 0.0008066532351918132
- RandomForestRegressor - MSE: 8161.828531608716
- ExtraTreesRegressor - MSE: 6681.386187683136
- AdaBoostRegressor - MSE: 9816.239323477608
- GradientBoostingRegressor - MSE: 2902.5682633281417
- HistGradientBoostingRegressor - MSE: 1451.1854870253912
##### Stack results
- MSE: 4.406927237514981
##### Stack of stacks results
- MSE: 0.13733201894820282
##### Voting Regressor results
- MSE: 161.36963070130594