# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from itertools import product
import pandas as pd
import joblib

In [33]:
import warnings
warnings.filterwarnings('ignore')

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../../datasets/day-of-week-not-scaled.csv')
df_scaled = pd.read_csv('../../datasets/dayofweek.csv')
df['dayofweek'] = df_scaled['dayofweek']
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [4]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [17]:
svc_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
svc_model = SVC(**svc_params, random_state=21, probability=True)

svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)

print(f'accuracy is {accuracy_score(y_test, y_pred)}')
print(f'precision is {precision_score(y_test, y_pred, average='weighted')}')
print(f'recall is {recall_score(y_test, y_pred, average='weighted')}')

accuracy is 0.8994082840236687
precision is 0.9031327775918397
recall is 0.8994082840236687


In [14]:
tree_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 23}
tree_model = DecisionTreeClassifier(**tree_params, random_state=21)

tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average='weighted')}')
print(f'Recall: {recall_score(y_test, y_pred, average='weighted')}')

Accuracy: 0.9023668639053254
Precision: 0.9050248872120567
Recall: 0.9023668639053254


In [15]:
forest_params = {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}
forest_model = RandomForestClassifier(**forest_params, random_state=21)

forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average='weighted')}')
print(f'Recall: {recall_score(y_test, y_pred, average='weighted')}')

Accuracy: 0.9378698224852071
Precision: 0.941458739514193
Recall: 0.9378698224852071


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [None]:
voting_clf = VotingClassifier(estimators=[('rf', forest_model), ('dt', tree_model), ('svm', svc_model)], voting='soft')
voting_clf.fit(X_train, y_train)
y_val_pred = voting_clf.predict(X_valid)

accuracy = accuracy_score(y_valid, y_val_pred)
precision = precision_score(y_valid, y_val_pred, average='weighted')
recall = recall_score(y_valid, y_val_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

[Validation] Accuracy: 0.8925925925925926
Precision: 0.8970935033371042
Recall: 0.8925925925925926


In [25]:
weights_list = [
    [1, 1, 1],
    [2, 1, 1],
    [2, 1, 1],   
    [1, 2, 1],  
    [1, 1, 2],   
    [3, 1, 2],   
]

results = []

for weights in weights_list:
    clf = VotingClassifier(
        estimators=[
            ('rf', forest_model),
            ('dt', tree_model),
            ('svc', svc_model)
        ],
        voting='soft',
        weights=weights
    )
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_valid)

    acc = accuracy_score(y_valid, y_val_pred)
    prec = precision_score(y_valid, y_val_pred, average='weighted')
    rec = recall_score(y_valid, y_val_pred, average='weighted')

    results.append({
        'weights': weights,
        'accuracy': acc,
        'precision': prec,
        'recall': rec
    })

results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by=['accuracy', 'precision'], ascending=False)

print("\nTop validation results:")
print(results_df_sorted.head())


Top validation results:
     weights  accuracy  precision    recall
5  [3, 1, 2]  0.922222   0.925960  0.922222
1  [2, 1, 1]  0.922222   0.925397  0.922222
2  [2, 1, 1]  0.922222   0.925397  0.922222
4  [1, 1, 2]  0.900000   0.904132  0.900000
0  [1, 1, 1]  0.892593   0.897094  0.892593


In [26]:
best_weights = results_df_sorted.iloc[0]['weights']
final_voting_clf = VotingClassifier(
    estimators=[
        ('rf', forest_model),
        ('dt', tree_model),
        ('svc', svc_model)
    ],
    voting='soft',
    weights=best_weights
)

final_voting_clf.fit(X_train, y_train)
y_test_pred = final_voting_clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')

print(f'Accuracy: {test_accuracy:.5f}')
print(f'Precision: {test_precision:.5f}')
print(f'Recall: {test_recall:.5f}')

Accuracy: 0.92308
Precision: 0.92444
Recall: 0.92308


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [29]:
best_svc_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

n_estimators_list = [5, 10, 20, 30, 50]
results = []

for n in n_estimators_list:
    model = BaggingClassifier(
        estimator=SVC(**best_svc_params, probability=True),
        n_estimators=n,
        random_state=21
    )
    
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_valid)

    acc = accuracy_score(y_valid, y_val_pred)
    prec = precision_score(y_valid, y_val_pred, average='weighted')
    rec = recall_score(y_valid, y_val_pred, average='weighted')

    results.append({
        'n_estimators': n,
        'accuracy': acc,
        'precision': prec,
        'recall': rec
    })

results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values(by=['accuracy', 'precision'], ascending=False)
print(results_df_sorted)

   n_estimators  accuracy  precision    recall
3            30  0.870370   0.876313  0.870370
4            50  0.870370   0.876313  0.870370
2            20  0.862963   0.870199  0.862963
1            10  0.848148   0.857371  0.848148
0             5  0.818519   0.828006  0.818519


In [34]:
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 1.0]
}

grid = list(product(param_grid['n_estimators'], param_grid['max_samples'], param_grid['max_features']))
extended_results = []

for n, ms, mf in grid:
    model = BaggingClassifier(
        estimator=SVC(**best_svc_params, probability=True),
        n_estimators=n,
        max_samples=ms,
        max_features=mf,
        random_state=21
    )

    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_valid)

    acc = accuracy_score(y_valid, y_val_pred)
    prec = precision_score(y_valid, y_val_pred, average='weighted')
    rec = recall_score(y_valid, y_val_pred, average='weighted')

    extended_results.append({
        'n_estimators': n,
        'max_samples': ms,
        'max_features': mf,
        'accuracy': acc,
        'precision': prec,
        'recall': rec
    })

extended_df = pd.DataFrame(extended_results)
extended_df_sorted = extended_df.sort_values(by=['accuracy', 'precision'], ascending=False)
print(extended_df_sorted.head())

    n_estimators  max_samples  max_features  accuracy  precision    recall
17            30          1.0           1.0  0.870370   0.876313  0.870370
11            20          1.0           1.0  0.862963   0.870199  0.862963
5             10          1.0           1.0  0.848148   0.857371  0.848148
15            30          0.7           1.0  0.822222   0.833257  0.822222
9             20          0.7           1.0  0.822222   0.832476  0.822222


In [36]:
best_row = extended_df_sorted.iloc[0]

best_model = BaggingClassifier(
    estimator=SVC(**best_svc_params, probability=True),
    n_estimators=int(best_row['n_estimators']),
    max_samples=best_row['max_samples'],
    max_features=best_row['max_features'],
    random_state=21
)

best_model.fit(X_train, y_train)
y_test_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred, average='weighted')
test_rec = recall_score(y_test, y_test_pred, average='weighted')

print(f"\n[TEST] Accuracy: {test_acc:.5f}")
print(f"[TEST] Precision: {test_prec:.5f}")
print(f"[TEST] Recall: {test_rec:.5f}")
print(f"[BEST PARAMS] n_estimators={int(best_row['n_estimators'])}, max_samples={best_row['max_samples']}, max_features={best_row['max_features']}")


[TEST] Accuracy: 0.89941
[TEST] Precision: 0.90120
[TEST] Recall: 0.89941
[BEST PARAMS] n_estimators=30, max_samples=1.0, max_features=1.0


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [42]:
final_est = LogisticRegression(solver='liblinear')

n_splits_list = [2, 3, 4, 5, 6, 7]
passthrough_options = [True, False]

results = []

for n in n_splits_list:
    for passthrough in passthrough_options:
        skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)

        stack_model = StackingClassifier(
            estimators=[
                ('rf', forest_model),
                ('dt', tree_model),
                ('svc', svc_model)
            ],
            final_estimator=final_est,
            cv=skf,
            passthrough=passthrough,
            n_jobs=-1
        )

        stack_model.fit(X_valid, y_valid)
        y_val_pred = stack_model.predict(X_valid)

        acc = accuracy_score(y_valid, y_val_pred)
        prec = precision_score(y_valid, y_val_pred, average='weighted')
        rec = recall_score(y_valid, y_val_pred, average='weighted')

        results.append({
            'n_splits': n,
            'passthrough': passthrough,
            'accuracy': acc,
            'precision': prec,
            'recall': rec
        })

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by=['accuracy', 'precision'], ascending=False)
print(results_df)

    n_splits  passthrough  accuracy  precision    recall
1          2        False  1.000000   1.000000  1.000000
3          3        False  1.000000   1.000000  1.000000
5          4        False  1.000000   1.000000  1.000000
7          5        False  1.000000   1.000000  1.000000
9          6        False  1.000000   1.000000  1.000000
10         7         True  1.000000   1.000000  1.000000
11         7        False  1.000000   1.000000  1.000000
4          4         True  0.996296   0.996380  0.996296
6          5         True  0.996296   0.996380  0.996296
8          6         True  0.996296   0.996380  0.996296
2          3         True  0.996296   0.996354  0.996296
0          2         True  0.985185   0.985836  0.985185


In [43]:
best_params = results_df.iloc[0]
print(f"\nBest params: n_splits = {best_params['n_splits']}, passthrough = {best_params['passthrough']}")

best_skf = StratifiedKFold(n_splits=int(best_params['n_splits']), shuffle=True, random_state=21)
best_stack = StackingClassifier(
    estimators=[
        ('rf', forest_model),
        ('dt', tree_model),
        ('svc', svc_model)
    ],
    final_estimator=final_est,
    cv=best_skf,
    passthrough=bool(best_params['passthrough']),
    n_jobs=-1
)

best_stack.fit(X_train, y_train)
y_test_pred = best_stack.predict(X_test)

test_acc = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred, average='weighted')
test_rec = recall_score(y_test, y_test_pred, average='weighted')

print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall: {test_rec:.4f}")


Best params: n_splits = 2, passthrough = False

Test Accuracy: 0.9379
Test Precision: 0.9415
Test Recall: 0.9379


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [44]:
dayofweek_test = df.loc[X_test.index, 'dayofweek']
error_df = pd.DataFrame({
    'y_true': y_test,
    'y_testy_test_pred':y_test_pred,
    'is_error':y_test_pred != y_test,
    'dayofweek': dayofweek_test
})


errors_by_day = error_df.groupby('dayofweek')['is_error'].sum()

total_by_day = df['dayofweek'].value_counts().sort_index()

day_stats = pd.DataFrame({
    'errors': errors_by_day,
    'total_samples': total_by_day
})

day_stats['error_percent'] = (day_stats['errors'] / day_stats['total_samples']) * 100

worst_day = day_stats['error_percent'].idxmax()
worst_error = day_stats.loc[worst_day, 'error_percent']

print(day_stats)
print(f"\nMost error-prone weekday: {worst_day} with {worst_error:.2f}% errors")

           errors  total_samples  error_percent
dayofweek                                      
0               3            136       2.205882
1               5            274       1.824818
2               4            149       2.684564
3               4            396       1.010101
4               2            104       1.923077
5               0            271       0.000000
6               3            356       0.842697

Most error-prone weekday: 2 with 2.68% errors


In [45]:
labname_cols = [col for col in df.columns if col.startswith('labname_')]

max_error_percent = 0
max_error_percent_col = None

for col in labname_cols:
    col_values_test = df.loc[X_test.index, col]

    error_df = pd.DataFrame({
        'y_true': y_test,
        'y_test_pred': y_test_pred,
        'is_error': y_test_pred != y_test,
        col: col_values_test
    })

    errors = error_df[error_df[col] == 1]['is_error'].sum()
    total = df[df[col] == 1].shape[0]

    error_percent = (errors / total) * 100 if total > 0 else 0

    if error_percent > max_error_percent:
        max_error_percent = error_percent
        max_error_percent_col = col

    print(f"{col}: {error_percent:.2f}% errors ({errors}/{total})")
    
print(f"Max error percent: {max_error_percent:.2f}% for {max_error_percent_col}")

labname_code_rvw: 1.22% errors (1/82)
labname_lab02: 0.00% errors (0/2)
labname_lab03: 0.00% errors (0/1)
labname_lab03s: 0.00% errors (0/1)
labname_lab05s: 2.78% errors (1/36)
labname_laba04: 2.25% errors (4/178)
labname_laba04s: 0.00% errors (0/104)
labname_laba05: 0.90% errors (2/222)
labname_laba06: 6.25% errors (3/48)
labname_laba06s: 1.64% errors (1/61)
labname_project1: 0.95% errors (9/951)
Max error percent: 6.25% for labname_laba06


In [47]:
user_cols = [col for col in df.columns if col.startswith('uid_user_')]

max_percent = 0
max_col = None

for col in user_cols:
    col_values_test = df.loc[X_test.index, col]

    error_df = pd.DataFrame({
        'y_true': y_test,
        'y_test_pred': y_test_pred,
        'is_error': y_test_pred != y_test,
        col: col_values_test
    })

    errors = error_df[error_df[col] == 1]['is_error'].sum()
    total = df[df[col] == 1].shape[0]

    error_percent = (errors / total) * 100 if total > 0 else 0

    if error_percent > max_percent:
        max_percent = error_percent
        max_col = col

    print(f"{col}: {error_percent:.2f}% errors ({errors}/{total})")

print(f'Max error percent: {max_percent:.2f}% ({max_col})')

uid_user_0: 0.00% errors (0/2)
uid_user_1: 0.00% errors (0/46)
uid_user_10: 0.00% errors (0/71)
uid_user_11: 0.00% errors (0/5)
uid_user_12: 0.00% errors (0/49)
uid_user_13: 1.67% errors (1/60)
uid_user_14: 1.52% errors (2/132)
uid_user_15: 0.00% errors (0/17)
uid_user_16: 3.12% errors (1/32)
uid_user_17: 0.00% errors (0/34)
uid_user_18: 0.00% errors (0/35)
uid_user_19: 1.10% errors (1/91)
uid_user_2: 0.83% errors (1/121)
uid_user_20: 0.00% errors (0/86)
uid_user_21: 2.27% errors (1/44)
uid_user_22: 0.00% errors (0/7)
uid_user_23: 0.00% errors (0/4)
uid_user_24: 3.57% errors (2/56)
uid_user_25: 0.00% errors (0/120)
uid_user_26: 1.11% errors (1/90)
uid_user_27: 8.70% errors (2/23)
uid_user_28: 0.00% errors (0/60)
uid_user_29: 1.56% errors (1/64)
uid_user_3: 0.00% errors (0/71)
uid_user_30: 5.13% errors (2/39)
uid_user_31: 1.33% errors (1/75)
uid_user_4: 1.60% errors (3/188)
uid_user_6: 16.67% errors (2/12)
uid_user_7: 0.00% errors (0/5)
uid_user_8: 0.00% errors (0/47)
Max error percent:

In [48]:
joblib.dump(model, 'ex03model.pkl')

['ex03model.pkl']