# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [4]:
# Display configuration: number formatting, wider pandas output, wider notebook container.
import pandas as pd
# `display` and `HTML` live in the public IPython.display module; importing them
# from IPython.core.display is a deprecated re-export.
from IPython.display import display, HTML

# Render floats with thousands separators and four decimals.
pd.options.display.float_format = '{:,.4f}'.format

# Let wide/long frames (e.g. grid-search result tables) render without truncation or wrapping.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.expand_frame_repr', False)

# Inject CSS that stretches elements with the `.container` class to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))



In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.utils._testing import ignore_warnings
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import GridSearchCV
from tqdm.auto import tqdm
from itertools import product

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1LjhbMTPnZsZpa1Uj75bCOaFpcfORXF3m/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [6]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/dayofweek-not-scaled.csv')

In [7]:
scaler = StandardScaler()

In [8]:
X = np.hstack([scaler.fit_transform(df[['numTrials', 'hour']]), df.drop(columns=['numTrials', 'hour', 'dayofweek'])])
y = df['dayofweek'].values

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=21, test_size=0.2)

## 2. SVM gridsearch

1. Using `GridSearchCV`, try different values of `kernel` (`linear`, `rbf`, `sigmoid`), `C` (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), `gamma` (`scale`, `auto`) and `class_weight` (`balanced`, `None`). Use `random_state=21` and `probability=True`, and find the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [7]:
# Exhaustive search over SVC hyperparameters (6 * 2 * 2 * 3 = 72 combinations).
grid_search_svc = GridSearchCV(
    estimator=SVC(probability=True, random_state=21),
    param_grid=dict(
        C=(0.01, 0.1, 1, 1.5, 5, 10),
        gamma=('scale', 'auto'),
        class_weight=('balanced', None),
        kernel=('linear', 'rbf', 'sigmoid'),
    ),
    n_jobs=-1,  # use every available core
)

In [8]:
%%time
# Run the SVC grid search on the training split; %%time reports how long it took.
grid_search_svc.fit(X_train, y_train)

CPU times: user 1.13 s, sys: 276 ms, total: 1.41 s
Wall time: 26.7 s


GridSearchCV(estimator=SVC(probability=True, random_state=21), n_jobs=-1,
             param_grid={'C': (0.01, 0.1, 1, 1.5, 5, 10),
                         'class_weight': ('balanced', None),
                         'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf', 'sigmoid')})

In [9]:
# Tabulate the cross-validation results, best-ranked combination first.
results_df_svc = (
    pd.DataFrame(grid_search_svc.cv_results_)
    .sort_values(by='rank_test_score')
)
results_df_svc.head(n=2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
67,0.761,0.0331,0.0706,0.0093,10,,scale,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf'}",0.8963,0.8407,0.8963,0.8699,0.8625,0.8731,0.0212,1
61,0.6952,0.0243,0.0847,0.012,10,balanced,scale,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}",0.8815,0.8333,0.8556,0.8773,0.8216,0.8539,0.0236,2


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [10]:
%%time
grid_search_tree.fit(X_train, y_train)

NameError: name 'grid_search_tree' is not defined

In [11]:
results_df_tree = pd.DataFrame(grid_search_tree.cv_results_).sort_values('rank_test_score')
results_df_tree.head(2)

NameError: name 'grid_search_tree' is not defined

In [None]:
grid_search_tree = GridSearchCV(DecisionTreeClassifier(random_state=21), {'max_depth': range(1, 50), 'class_weight': ('balanced', None), 'criterion': ('entropy', 'gini')},
                          n_jobs=-1)

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it in ascending order by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [10]:
# Random-forest search space. Key order matters: the manual grid search later
# iterates over `forest_param_grid.values()` in insertion order.
forest_param_grid = {
    'max_depth': range(1, 50),
    'class_weight': ('balanced', None),
    'criterion': ('entropy', 'gini'),
    'n_estimators': (5, 10, 50, 100),
}

In [11]:
# Exhaustive search over the random-forest grid defined in `forest_param_grid`.
grid_search_forest = GridSearchCV(
    estimator=RandomForestClassifier(random_state=21),
    param_grid=forest_param_grid,
    n_jobs=-1,  # use every available core
)

In [12]:
%%time
# Run the random-forest grid search on the training split; %%time reports how long it took.
grid_search_forest.fit(X_train, y_train)

CPU times: user 4.31 s, sys: 788 ms, total: 5.1 s
Wall time: 56.2 s


GridSearchCV(estimator=RandomForestClassifier(random_state=21), n_jobs=-1,
             param_grid={'class_weight': ('balanced', None),
                         'criterion': ('entropy', 'gini'),
                         'max_depth': range(1, 50),
                         'n_estimators': (5, 10, 50, 100)})

In [13]:
# Tabulate the cross-validation results, best-ranked combination first.
results_df_forest = (
    pd.DataFrame(grid_search_forest.cv_results_)
    .sort_values(by='rank_test_score')
)
results_df_forest.head(n=2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
314,0.1913,0.018,0.0148,0.0017,balanced,gini,30,50,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'n_estimators': 50}",0.9222,0.9037,0.9,0.9071,0.8848,0.9035,0.0121,1
711,0.3611,0.0082,0.0254,0.0017,,gini,31,100,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 31, 'n_estimators': 100}",0.9185,0.9111,0.8963,0.9108,0.8773,0.9028,0.0146,2


In [14]:
results_df_forest = pd.DataFrame(val_scores).sort_values('mean_accuracy', ascending=False)
results_df_forest.head(2)

NameError: name 'val_scores' is not defined

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
%%time
val_scores = []
for values in tqdm(product(*forest_param_grid.values()), total=np.prod(list(map(len, forest_param_grid.values())))):
    params = dict(zip(forest_param_grid, values))
    forest = RandomForestClassifier(random_state=21, **dict(zip(forest_param_grid, values)), n_jobs=-1)
    scores = cross_val_score(forest, X_train, y_train, cv=5)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    val_scores.append({'mean_accuracy': mean_score, 'std_accuracy': std_score, **params})

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [None]:
# Pick the best-scoring combination straight from the manual grid-search records.
# Reading the hyperparameters back out of a DataFrame is fragile: depending on
# the pandas version, a column holding 'balanced'/None can come back with NaN
# instead of None, which is not a valid `class_weight`. The raw records keep
# the original Python values. On ties, the first combination in grid order wins.
best_record = max(val_scores, key=lambda record: record['mean_accuracy'])
best_params = {
    name: value
    for name, value in best_record.items()
    if name not in ('mean_accuracy', 'std_accuracy')  # drop the score columns
}
forest = RandomForestClassifier(random_state=21, **best_params)

In [None]:
# Train the selected forest on the training split.
forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = forest.predict(X_test)

In [None]:
# Final accuracy on the test split; the bare expression is the cell's output.
accuracy_score(y_test, y_pred)