In [1]:
%matplotlib inline

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import \
    BaggingClassifier, VotingClassifier, \
    AdaBoostClassifier, StackingClassifier, \
    RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

import xgboost as xgb

# Comparison between Ensemble Algorithms

## Adult

Prediction task is to determine whether a person makes over 50K a year.

Data from [here](https://archive.ics.uci.edu/ml/datasets/adult).

### Process the data

In [55]:
income_data = pd.read_csv("adult/adult.data", header=None)
income_data.columns = [
    "age",
    "workclass",
    "final_weight",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_class",
]

income_target = income_data.income_class
income_target = income_target.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_attributes = pd.get_dummies(income_attributes, drop_first=True)
scaler = MinMaxScaler()
income_attributes = scaler.fit_transform(income_attributes)

income_attributes_train, income_attributes_test, \
income_target_train, income_target_test, \
= train_test_split(income_attributes, income_target, train_size=0.8, test_size=0.2)

print("Shape of the train and test sets for the features and target:")
for group in [income_attributes_train, income_attributes_test, income_target_train, income_target_test]:
    print(group.shape)

Shape of the train and test sets for the features and target:
(26048, 100)
(6513, 100)
(26048,)
(6513,)


In [67]:
print("Proportion of target classses:")
print(
    "Training set:\t",
    income_target_train.value_counts(normalize=True).values.round(2)
)

print(
    "Test set:\t",
    income_target_test.value_counts(normalize=True).values.round(2)
)

Proportion of target classses:
Training set:	 [0.76 0.24]
Test set:	 [0.75 0.25]


### Classifier models

In [32]:
X_train, y_train = income_attributes_train, income_target_train
X_test, y_test = income_attributes_test, income_target_test

In [92]:
def print_model_score(model, on_training=False):
    print("On test data:")
    print(f"Default estimator score: {model.score(X_test, y_test):.2f}")
    print(f"Accuracy:\t\t {accuracy_score(y_test, model.predict(X_test)):.2f}")
    
    if on_training:
        print("On training data:")
        print(f"Default estimator score: {model.score(X_train, y_train):.2f}")
        print(f"Accuracy:\t\t {accuracy_score(y_train, model.predict(X_train)):.2f}")

#### Decision tree

In [95]:
print("Decision tree scores")

print("Depth 1")
tree_depth1 = DecisionTreeClassifier(max_depth=1)
tree_depth1.fit(X_train, y_train)
print_model_score(tree_depth1)

print("\nDepth 2")
tree_depth2 = DecisionTreeClassifier(max_depth=2)
tree_depth2.fit(X_train, y_train)
print_model_score(tree_depth2)

print("\nDepth 4")
tree_depth4 = DecisionTreeClassifier(max_depth=4)
tree_depth4.fit(X_train, y_train)
print_model_score(tree_depth4)

print("\nDepth 8")
tree_depth8 = DecisionTreeClassifier(max_depth=8)
tree_depth8.fit(X_train, y_train)
print_model_score(tree_depth8)

Decision tree scores
Depth 1
On test data:
Default estimator score: 0.76
Accuracy:		 0.76

Depth 2
On test data:
Default estimator score: 0.83
Accuracy:		 0.83

Depth 4
On test data:
Default estimator score: 0.85
Accuracy:		 0.85

Depth 8
On test data:
Default estimator score: 0.86
Accuracy:		 0.86


In [90]:
all_trees = [tree_depth1, tree_depth2, tree_depth4, tree_depth8]
titles = [f"Tree, max depth={x}" for x in list("1248")]

for a_tree, title in zip(all_trees, titles):
    report = classification_report(
        y_test, 
        a_tree.predict(X_test),
        zero_division=False
    )
    print(title, "\n", report, "\n")

Tree, max depth=1 
               precision    recall  f1-score   support

       <=50K       0.76      1.00      0.86      4957
        >50K       0.00      0.00      0.00      1556

    accuracy                           0.76      6513
   macro avg       0.38      0.50      0.43      6513
weighted avg       0.58      0.76      0.66      6513
 

Tree, max depth=2 
               precision    recall  f1-score   support

       <=50K       0.85      0.95      0.89      4957
        >50K       0.73      0.45      0.56      1556

    accuracy                           0.83      6513
   macro avg       0.79      0.70      0.73      6513
weighted avg       0.82      0.83      0.81      6513
 

Tree, max depth=4 
               precision    recall  f1-score   support

       <=50K       0.86      0.95      0.90      4957
        >50K       0.76      0.53      0.62      1556

    accuracy                           0.85      6513
   macro avg       0.81      0.74      0.76      6513
weighted a

#### Random forest

In [103]:
forest_5 = RandomForestClassifier(n_estimators=5)
forest_5.fit(X_train, y_train)

forest_10 = RandomForestClassifier(n_estimators=10)
forest_10.fit(X_train, y_train)

forest_30 = RandomForestClassifier(n_estimators=30)
forest_30.fit(X_train, y_train)

The random tree score is the mean accuracy on the given test data and labels.

In [115]:
all_forests = [forest_5, forest_10, forest_30, ]
titles = [f"Random forest, number of estimators {x}: " for x in [5, 10, 30]]

print("Score = mean accuracy\n")
for a_forest, title in zip(all_forests, titles):
    score = a_forest.score(X_test, y_test)
    print(f"{title}{score:.3f}")

Score = mean accuracy

Random forest, number of estimators 5: 0.838
Random forest, number of estimators 10: 0.846
Random forest, number of estimators 30: 0.853


### Comparisons

In [122]:
tree_depth2 = DecisionTreeClassifier(max_depth=2)
tree_depth4 = DecisionTreeClassifier(max_depth=4)
tree_depth8 = DecisionTreeClassifier(max_depth=8)

#### Bagging vs Voting

In [118]:
bag_2 = BaggingClassifier(tree_depth2, n_estimators=10)
bag_2.fit(X_train, y_train)

bag_4 = BaggingClassifier(tree_depth4, n_estimators=10)
bag_4.fit(X_train, y_train)

bag_8 = BaggingClassifier(tree_depth8, n_estimators=10)
bag_8.fit(X_train, y_train)

In [123]:
# vote_2 = VotingClassifier(tree_depth2)
# vote_2.fit(X_train, y_train)

# vote_4 = VotingClassifier(tree_depth4)
# vote_4.fit(X_train, y_train)

# vote_8 = VotingClassifier(tree_depth8)
# vote_8.fit(X_train, y_train)

#### AdaBoost vs StackingClassifier

In [None]:
ada_2 = AdaBoostClassifier(base_estimator=tree_depth2)
ada_2 = StackingClassifier(base_estimator=tree_depth2)