# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
# Display configuration: pandas number formatting, table size limits and a wider notebook container.
import pandas as pd
# `display` is imported from the public `IPython.display` module; importing it
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
pd.options.display.float_format = '{:,.4f}'.format

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.expand_frame_repr', False)
# Inject CSS so the notebook container uses 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, recall_score
import numpy as np
from sklearn.utils._testing import ignore_warnings
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import GridSearchCV
from tqdm.auto import tqdm
from itertools import product
from collections import defaultdict
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [3]:
# Load the dataset; the `numTrials`, `hour` and `dayofweek` columns are used below.
df = pd.read_csv('../data/dayofweek-not-scaled.csv')

In [4]:
# Scaler applied to the `numTrials` and `hour` columns when the feature matrix is built.
scaler = StandardScaler()

In [5]:
# Feature matrix: the two scaled numeric columns followed by all remaining
# columns unchanged; `dayofweek` is the target and is left out of the features.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaled features.
X = np.hstack([
    scaler.fit_transform(df[['numTrials', 'hour']]),
    df.drop(columns=['numTrials', 'hour', 'dayofweek']),
])
y = df['dayofweek'].to_numpy()

In [6]:
# Hold out 20% of the samples as the test set, stratified on `y`.
# NOTE(review): the task text also asks for a validation split carved out of the
# training part; it is not created here, so the metrics below use the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [7]:
# Fit the SVM (parameters taken from exercise 01 per the task text) and report
# accuracy plus weighted precision/recall.
svc_params = {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}
# probability=True makes `predict_proba` available on the fitted SVC.
svc = SVC(random_state=21, probability=True, **svc_params)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
# Five decimals, matching the output format the task statement asks for.
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.8964497041420119
precision is 0.8987502097411376
recall is 0.8964497041420119


In [8]:
# Fit the decision tree (parameters taken from exercise 01 per the task text)
# and report accuracy plus weighted precision/recall.
tree_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 25}
tree = DecisionTreeClassifier(random_state=21, **tree_params)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
# Five decimals, matching the output format the task statement asks for.
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.8994082840236687
precision is 0.9010905154871189
recall is 0.8994082840236687


In [9]:
# Fit the random forest (parameters taken from exercise 01 per the task text)
# and report accuracy plus weighted precision/recall.
forest_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'n_estimators': 50}
forest = RandomForestClassifier(random_state=21, **forest_params)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
# Five decimals, matching the output format the task statement asks for.
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.9349112426035503
precision is 0.9360699474016719
recall is 0.9349112426035503


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [10]:
# Hard-voting ensemble of the three classifiers above; the tree and the forest
# each get twice the vote weight of the SVM.
voting = VotingClassifier(
    estimators=[('svc', svc), ('tree', tree), ('forest', forest)],
    voting='hard',
    weights=[1, 2, 2],
)

In [11]:
# Fit the voting ensemble and score it on the held-out split.
# `fit` returns the estimator itself, so fitting and predicting can be chained.
y_pred = voting.fit(X_train, y_train).predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred)}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted')}")

accuracy is 0.9349112426035503
precision is 0.9356752950033232
recall is 0.9349112426035503


In [12]:
def magic(estimators, params, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit each estimator with its parameter overrides and collect its metrics.

    Parameters
    ----------
    estimators : sequence of estimators
        Each one is reconfigured in place via ``set_params`` and then refitted,
        so the caller's objects are modified.
    params : sequence of dict
        ``params[i]`` is passed to ``estimators[i].set_params``. Must have the
        same length as ``estimators``.
    X_fit, y_fit : array-like, optional
        Data used for fitting. Default to the notebook-level ``X_train`` and
        ``y_train``.
    X_eval, y_eval : array-like, optional
        Data used for scoring. Default to the notebook-level ``X_test`` and
        ``y_test``; pass a validation split here to keep the test set untouched
        during model selection.

    Returns
    -------
    dict
        ``{'accuracy': {name: value}, 'precision': {...}, 'recall': {...}}``
        where ``name`` is the estimator's class name. Precision and recall are
        weighted averages. Estimators that share a class name overwrite each
        other's entries. Empty input yields an empty dict.

    Raises
    ------
    AssertionError
        If ``estimators`` and ``params`` differ in length.
    """
    assert len(estimators) == len(params)

    # Fall back to the notebook-level split when no data is passed explicitly.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    result_metrics = defaultdict(dict)
    for estimator, model_params in zip(estimators, params):
        # set_params returns the same object, so the input estimator is updated and refitted.
        estimator = estimator.set_params(**model_params).fit(X_fit, y_fit)
        y_pred = estimator.predict(X_eval)
        estimator_name = type(estimator).__name__
        result_metrics['accuracy'][estimator_name] = accuracy_score(y_eval, y_pred)
        result_metrics['precision'][estimator_name] = precision_score(y_eval, y_pred, average='weighted')
        result_metrics['recall'][estimator_name] = recall_score(y_eval, y_pred, average='weighted')
    return dict(result_metrics)

In [13]:
# Refit and score the voting ensemble for every combination of integer
# weights 1..5 across the three voters (same order as nested loops).
all_weigth = list(product(range(1, 6), repeat=3))
results = []
for weights in tqdm(all_weigth, total=len(all_weigth)):
    results.append(magic([voting], [{'weights': weights}]))

  0%|          | 0/125 [00:00<?, ?it/s]

In [14]:
# Flatten each run from {metric: {'VotingClassifier': value}} to {metric: value}.
# Not idempotent: re-running this cell on the already flattened list fails.
results = [
    {metric: by_model['VotingClassifier'] for metric, by_model in run.items()}
    for run in results
]

In [15]:
# One row per weight combination, one column per metric.
results = pd.DataFrame(results)

In [16]:
# Keep the rows with the highest accuracy, break ties by precision and then by
# recall, and print the metrics of the first remaining row.
# The weights themselves are not stored in `results`; a row's label equals its
# position in `all_weigth`.
print(
    results
    .query('accuracy == accuracy.max()')
    .query('precision == precision.max()')
    .query('recall == recall.max()')
    .iloc[0]
    .to_dict()
)

{'accuracy': 0.9349112426035503, 'precision': 0.9360699474016719, 'recall': 0.9349112426035503}


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [17]:
# Bagging ensemble that uses the SVM configured above as its base model.
bagging = BaggingClassifier(svc, random_state=21)

In [18]:
# Fit the bagging ensemble with its default settings and score it on the held-out split.
# `fit` returns the estimator itself, so fitting and predicting can be chained.
y_pred = bagging.fit(X_train, y_train).predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred)}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted')}")

accuracy is 0.8905325443786982
precision is 0.8935279699697187
recall is 0.8905325443786982


In [19]:
# 5-fold grid search over the number of base models in the bagging ensemble.
grid_search = GridSearchCV(
    estimator=bagging,
    param_grid={'n_estimators': [5, 10, 50, 100]},
    cv=5,
    verbose=2,
)

In [20]:
# Run the search on the training split; the fitted search object's repr is the cell output.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .....................................n_estimators=5; total time=   0.8s
[CV] END .....................................n_estimators=5; total time=   1.0s
[CV] END .....................................n_estimators=5; total time=   1.0s
[CV] END .....................................n_estimators=5; total time=   1.0s
[CV] END .....................................n_estimators=5; total time=   0.8s
[CV] END ....................................n_estimators=10; total time=   1.6s
[CV] END ....................................n_estimators=10; total time=   1.6s
[CV] END ....................................n_estimators=10; total time=   1.9s
[CV] END ....................................n_estimators=10; total time=   1.6s
[CV] END ....................................n_estimators=10; total time=   1.6s
[CV] END ....................................n_estimators=50; total time=   8.8s
[CV] END ....................................n_es

GridSearchCV(cv=5,
             estimator=BaggingClassifier(base_estimator=SVC(C=10,
                                                            probability=True,
                                                            random_state=21),
                                         random_state=21),
             param_grid={'n_estimators': [5, 10, 50, 100]}, verbose=2)

In [21]:
# Score the best bagging configuration found by the search on the held-out split.
# With the default refit=True, the search object's `predict` delegates to `best_estimator_`.
y_pred = grid_search.predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred)}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted')}")

accuracy is 0.8994082840236687
precision is 0.9014451020625085
recall is 0.8994082840236687


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [22]:
# Compare the stacking ensemble's accuracy for 2..7 cross-validation folds.
# NOTE(review): the task text also asks to vary `passthrough` and to use
# `final_estimator=LogisticRegression(solver='liblinear')`; neither is set here,
# so the library defaults apply.
for n in tqdm(range(2, 8)):
    print(n)
    k_fold = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)
    stacking = StackingClassifier(
        estimators=[('svc', svc), ('tree', tree), ('forest', forest)],
        cv=k_fold,
    )
    y_pred = stacking.fit(X_train, y_train).predict(X_test)
    print(f"accuracy is {accuracy_score(y_test, y_pred)}")

  0%|          | 0/6 [00:00<?, ?it/s]

2
accuracy is 0.9349112426035503
3
accuracy is 0.9467455621301775
4
accuracy is 0.9319526627218935
5
accuracy is 0.9378698224852071
6
accuracy is 0.9319526627218935
7
accuracy is 0.9437869822485208


In [23]:
# Refit the stacking ensemble with the fold count that scored best in the loop
# above (3) and report all three metrics.
k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=21)
stacking = StackingClassifier(
    estimators=[('svc', svc), ('tree', tree), ('forest', forest)],
    cv=k_fold,
)
y_pred = stacking.fit(X_train, y_train).predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred)}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted')}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted')}")

accuracy is 0.9467455621301775
precision is 0.9476169593900625
recall is 0.9467455621301775


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

The stacking classifier (with `n_splits=3`) is the best model in terms of accuracy: 0.9467 on the test set.

In [24]:
# Persist the fitted stacking ensemble to disk.
with open('../data/ex03_model.pkl', 'wb') as model_file:
    pickle.dump(stacking, model_file)