# Day 09. Exercise 00
# Regularization

## 0. Imports

In [48]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
import warnings
import joblib
warnings.filterwarnings("ignore")

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [9]:
df = pd.read_csv('../../datasets/dayofweek.csv')
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [30]:
%%time
model = LogisticRegression(random_state=21, fit_intercept=False)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
print(f'Std is {cross_val_acc.std():.5f}')


train - 0.63546  |  test - 0.65089
train - 0.65326  |  test - 0.60947
train - 0.63942  |  test - 0.63314
train - 0.63283  |  test - 0.57988
train - 0.65590  |  test - 0.57988
train - 0.64535  |  test - 0.62130
train - 0.63834  |  test - 0.60714
train - 0.63702  |  test - 0.59524
train - 0.64295  |  test - 0.68452
train - 0.63900  |  test - 0.56548
Average accuracy on crossval is 0.60932
Std is 0.03367
CPU times: user 148 ms, sys: 6.99 ms, total: 155 ms
Wall time: 155 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [23]:
configs = [
    {'penalty': None, 'solver': 'lbfgs'},
    {'penalty': 'l2', 'solver': 'lbfgs'},
    {'penalty': 'l1', 'solver': 'liblinear'},
    {'penalty': 'l1', 'solver': 'saga'},
    {'penalty': 'l2', 'solver': 'saga'}
]

for config in configs:
    model = LogisticRegression(**config, random_state=21, fit_intercept=False, max_iter=1000)
    print(f'config: {config}')
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

    cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
    print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
    print(f'Std is {cross_val_acc.std():.5f}')
    print('\n')

config: {'penalty': None, 'solver': 'lbfgs'}
train - 0.66645  |  test - 0.68639
train - 0.65985  |  test - 0.65089
train - 0.66249  |  test - 0.68639
train - 0.67172  |  test - 0.60355
train - 0.67436  |  test - 0.58580
train - 0.66051  |  test - 0.63314
train - 0.66930  |  test - 0.61905
train - 0.65679  |  test - 0.61905
train - 0.65679  |  test - 0.68452
train - 0.66469  |  test - 0.58929
Average accuracy on crossval is 0.63369
Std is 0.03438


config: {'penalty': 'l2', 'solver': 'lbfgs'}
train - 0.63546  |  test - 0.65089
train - 0.65326  |  test - 0.60947
train - 0.63942  |  test - 0.63314
train - 0.63283  |  test - 0.57988
train - 0.65590  |  test - 0.57988
train - 0.64535  |  test - 0.62130
train - 0.63834  |  test - 0.60714
train - 0.63702  |  test - 0.59524
train - 0.64295  |  test - 0.68452
train - 0.63900  |  test - 0.56548
Average accuracy on crossval is 0.60932
Std is 0.03367


config: {'penalty': 'l1', 'solver': 'liblinear'}
train - 0.62887  |  test - 0.63314
train - 0.63

## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [31]:
%%time
model = SVC(probability=True, kernel='linear', random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
print(f'Std is {cross_val_acc.std():.5f}')

train - 0.70138  |  test - 0.71598
train - 0.69677  |  test - 0.68639
train - 0.70402  |  test - 0.71006
train - 0.69941  |  test - 0.63905
train - 0.71127  |  test - 0.62130
train - 0.70336  |  test - 0.69822
train - 0.69038  |  test - 0.67857
train - 0.70487  |  test - 0.69048
train - 0.69895  |  test - 0.71429
train - 0.70026  |  test - 0.61905
Average accuracy on crossval is 0.67721
Std is 0.03168
CPU times: user 2.97 s, sys: 15 ms, total: 2.98 s
Wall time: 3.01 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [33]:
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for C_val in C_values:
    model = SVC(C=C_val, probability=True, kernel='linear', random_state=21)
    print(f'C: {C_val}')
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

    cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
    print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
    print(f'Std is {cross_val_acc.std():.5f}')
    print('\n')

C: 0.001
train - 0.23467  |  test - 0.23669
train - 0.23467  |  test - 0.23669
train - 0.23467  |  test - 0.23669
train - 0.23467  |  test - 0.23669
train - 0.23467  |  test - 0.23669
train - 0.23467  |  test - 0.23669
train - 0.23518  |  test - 0.23214
train - 0.23518  |  test - 0.23214
train - 0.23518  |  test - 0.23214
train - 0.23518  |  test - 0.23214
Average accuracy on crossval is 0.23517
Std is 0.00258


C: 0.01
train - 0.40804  |  test - 0.41420
train - 0.41859  |  test - 0.38462
train - 0.43771  |  test - 0.41420
train - 0.44034  |  test - 0.44970
train - 0.39684  |  test - 0.36095
train - 0.43705  |  test - 0.48521
train - 0.44137  |  test - 0.43452
train - 0.39789  |  test - 0.40476
train - 0.44137  |  test - 0.42857
train - 0.43412  |  test - 0.45238
Average accuracy on crossval is 0.39196
Std is 0.02365


C: 0.1
train - 0.58075  |  test - 0.59763
train - 0.57877  |  test - 0.54438
train - 0.57284  |  test - 0.57396
train - 0.58603  |  test - 0.61538
train - 0.59328  |  te

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [36]:
%%time
model = DecisionTreeClassifier(max_depth=10, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
print(f'Std is {cross_val_acc.std():.5f}')    

train - 0.82004  |  test - 0.79290
train - 0.82663  |  test - 0.69822
train - 0.82927  |  test - 0.76331
train - 0.81806  |  test - 0.71598
train - 0.82268  |  test - 0.74556
train - 0.80554  |  test - 0.77515
train - 0.83333  |  test - 0.75595
train - 0.81555  |  test - 0.76786
train - 0.81225  |  test - 0.77381
train - 0.81752  |  test - 0.69048
Average accuracy on crossval is 0.74772
Std is 0.02253
CPU times: user 74.8 ms, sys: 4.31 ms, total: 79.1 ms
Wall time: 77.9 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [37]:
max_depth_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
min_samples_split_values = [2, 5, 10, 15, 20]
max_leaf_nodes_values = [2, 5, 10, 15, 20]

for max_depth_value in max_depth_values:
    for min_samples_split_value in min_samples_split_values:
        for max_leaf_nodes_value in max_leaf_nodes_values:
            model = DecisionTreeClassifier(max_depth=max_depth_value, min_samples_split=min_samples_split_value, max_leaf_nodes=max_leaf_nodes_value, random_state=21)
            print(f'max_depth={max_depth_value}, min_samples_split={min_samples_split_value}, max_leaf_nodes={max_leaf_nodes_value}')

            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                model.fit(X_train, y_train)
                print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')
        
            cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
            print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
            print(f'Std is {cross_val_acc.std():.5f}')
            print('\n')
            

max_depth=1, min_samples_split=2, max_leaf_nodes=2
train - 0.35662  |  test - 0.36686
train - 0.35992  |  test - 0.33728
train - 0.35860  |  test - 0.34911
train - 0.35662  |  test - 0.36686
train - 0.36058  |  test - 0.33136
train - 0.35333  |  test - 0.39645
train - 0.35837  |  test - 0.35119
train - 0.35837  |  test - 0.35119
train - 0.35705  |  test - 0.36310
train - 0.35705  |  test - 0.36310
Average accuracy on crossval is 0.35702
Std is 0.01983


max_depth=1, min_samples_split=2, max_leaf_nodes=5
train - 0.35662  |  test - 0.36686
train - 0.35992  |  test - 0.33728
train - 0.35860  |  test - 0.34911
train - 0.35662  |  test - 0.36686
train - 0.36058  |  test - 0.33136
train - 0.35333  |  test - 0.39645
train - 0.35837  |  test - 0.35119
train - 0.35837  |  test - 0.35119
train - 0.35705  |  test - 0.36310
train - 0.35705  |  test - 0.36310
Average accuracy on crossval is 0.35702
Std is 0.01983


max_depth=1, min_samples_split=2, max_leaf_nodes=10
train - 0.35662  |  test - 0.366

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [39]:
%%time
model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
print(f'Std is {cross_val_acc.std():.5f}') 

train - 0.97034  |  test - 0.90533
train - 0.96704  |  test - 0.87574
train - 0.96902  |  test - 0.91124
train - 0.97429  |  test - 0.89349
train - 0.96243  |  test - 0.86982
train - 0.96638  |  test - 0.94083
train - 0.97036  |  test - 0.92262
train - 0.97036  |  test - 0.91667
train - 0.96838  |  test - 0.89881
train - 0.97563  |  test - 0.88690
Average accuracy on crossval is 0.89525
Std is 0.02315
CPU times: user 738 ms, sys: 8.02 ms, total: 746 ms
Wall time: 764 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [41]:
max_depth_values = [5, 10, 15, 20, 25]
n_estimators_values = [100, 200, 300, 400, 500]

for max_depth_value in max_depth_values:
    for n_estimators_value in n_estimators_values:
        model = RandomForestClassifier(max_depth=max_depth_value, n_estimators=n_estimators_value)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)

            print(f'train - {accuracy_score(y_train, model.predict(X_train)):.5f}  |  test - {accuracy_score(y_test, model.predict(X_test)):.5f}')

        cross_val_acc = cross_val_score(model, X_train, y_train, cv=skf)
        print(f'Average accuracy on crossval is {cross_val_acc.mean():.5f}')
        print(f'Std is {cross_val_acc.std():.5f}')
        print('\n')
            

train - 0.58471  |  test - 0.58580
train - 0.59525  |  test - 0.53846
train - 0.60382  |  test - 0.58580
train - 0.61437  |  test - 0.61538
train - 0.60646  |  test - 0.53254
train - 0.60910  |  test - 0.57396
train - 0.56192  |  test - 0.54762
train - 0.58037  |  test - 0.59524
train - 0.59223  |  test - 0.58333
train - 0.59947  |  test - 0.54167
Average accuracy on crossval is 0.59352
Std is 0.03469


train - 0.58141  |  test - 0.57988
train - 0.59262  |  test - 0.56213
train - 0.58603  |  test - 0.57988
train - 0.60250  |  test - 0.58580
train - 0.61239  |  test - 0.53846
train - 0.62096  |  test - 0.57988
train - 0.58432  |  test - 0.58333
train - 0.61528  |  test - 0.60119
train - 0.57312  |  test - 0.57738
train - 0.61528  |  test - 0.56548
Average accuracy on crossval is 0.58363
Std is 0.02971


train - 0.59525  |  test - 0.60355
train - 0.60382  |  test - 0.56805
train - 0.57746  |  test - 0.55621
train - 0.58734  |  test - 0.57396
train - 0.60053  |  test - 0.52663
train - 0.6

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [46]:
model = RandomForestClassifier(max_depth=20, random_state=21, n_estimators=300)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Cross_val_score with cv=7: {cross_val_score(model, X_train, y_train, cv=7).mean()}')

Accuracy: 0.9166666666666666
Cross_val_score with cv=7: 0.5795692829102966


In [None]:
dayofweek_test = df.loc[X_test.index, 'dayofweek']
error_df = pd.DataFrame({
    'y_true': y_test,
    'y_pred': y_pred,
    'is_error': y_pred != y_test,
    'dayofweek': dayofweek_test
})

day_stats = error_df.groupby('dayofweek').agg(
    total_samples=('is_error', 'count'),
    errors=('is_error', 'sum')
)

day_stats['error_percent'] = (day_stats['errors'] / day_stats['total_samples']) * 100

worst_day = day_stats['error_percent'].idxmax()
worst_error = day_stats.loc[worst_day, 'error_percent']

print(day_stats)

print(f"\nMost error-prone weekday: {worst_day} with {worst_error:.2f}% errors")


Error stats by weekday:
           total_samples  errors  error_percent
dayofweek                                      
0                     13       3      23.076923
1                     28       4      14.285714
2                     15       2      13.333333
3                     39       1       2.564103
4                     10       0       0.000000
5                     27       0       0.000000
6                     36       4      11.111111

Most error-prone weekday: 0 with 23.08% errors


In [49]:
joblib.dump(model, 'model.pkl')

['model.pkl']