# Hyperparameter tuning and gridsearch

## Introduction

This is a continuation of the notebook MLPclassifierwithdropout.ipynb. In this notebook, we will use gridsearch to tune the hyperparameters for our regularization techniques. By hyperparameters, I mean the following:
- The stopping time for training.
- The dropout rate `dr` when implementing dropout.
- The learning rate `lr` used in training our models.
- The momentum parameter `m` used by the optimizer.
- The weight decay parameter `wd` in the L2 regularization.

## Imports and installs

In [8]:
import functools
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    log_loss,
    recall_score,
)
from sklearn.model_selection import (
    cross_val_predict,
    StratifiedKFold,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler, LabelEncoder
)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

import torchaudio
from tqdm.notebook import tqdm

import itertools as it

def to_prob(metric):
    """Adapt a label-based metric so it accepts per-class score arrays.

    The returned callable has the signature
    ``(y_actual, y_pred, sample_weight=None)``.  It reduces ``y_pred`` to hard
    labels with ``y_pred.argmax(1)`` and forwards them, together with
    ``y_actual`` and ``sample_weight``, to ``metric``.
    """

    def metric_that_takes_prob(y_actual, y_pred, sample_weight=None):
        # Index of the largest entry along axis 1 is taken as the predicted label.
        hard_labels = y_pred.argmax(1)
        return metric(y_actual, hard_labels, sample_weight=sample_weight)

    # Equivalent to decorating the inner function with @functools.wraps(metric).
    return functools.wraps(metric)(metric_that_takes_prob)


# Metrics that compare hard labels are wrapped with to_prob so that every
# entry in the table can be called with the same (y_actual, y_pred) arguments.
metrics = {
    label: to_prob(scorer)
    for label, scorer in (
        ("accuracy", accuracy_score),
        ("balanced_accuracy", balanced_accuracy_score),
        ("unweighted_f1", functools.partial(f1_score, average="macro")),
        ("UAR", functools.partial(recall_score, average="macro")),
    )
}
# log_loss is stored unwrapped: it is handed y_pred as-is, without the argmax.
metrics["logloss"] = log_loss

In [9]:
from scripts.featureDataSets import *
from scripts.utility_functions import *
from scripts.models import *

## Implementing gridsearch

Unfortunately, since we want to carry out early stopping, we want to measure cross-validation in a manner akin to the `cross_validate` function imported from `scripts.utility_functions`. This means our current way of training and viewing results over CV is incompatible with the usual libraries. While it may be possible to use `skorch` to make it all work, it's easier for me to just write a naive gridsearch by hand.

First, a useful function.

In [10]:
def range(hps, hp):
    """Return log-spaced values for hyperparameter ``hp``.

    ``hps[hp]`` must be indexable as ``(start, stop, num)``.  The result is a
    NumPy array of ``num`` values spaced evenly on a log10 scale from ``start``
    to ``stop`` (both included when ``num > 1``; only ``start`` when
    ``num == 1``).

    NOTE: this function shadows the builtin ``range`` for the rest of the
    notebook; the name is kept because the grid-search cells call it.

    Raises
    ------
    ValueError
        If ``start`` or ``stop`` is not strictly positive.  The logarithm is
        undefined there and would otherwise silently put NaN values in the grid.
    """
    spec = hps[hp]
    start, stop, num = spec[0], spec[1], spec[2]
    if start <= 0 or stop <= 0:
        raise ValueError(
            f"range bounds for {hp!r} must be strictly positive, got ({start}, {stop})"
        )
    log_range = np.linspace(np.log10(start), np.log10(stop), num, dtype=float)
    return 10**log_range


Here is our implementation of gridsearch.

In [25]:
def twolayergridsearch(hps, df=P05_t):
    """Grid-search hyperparameters for the two-layer dropout classifier.

    Parameters
    ----------
    hps : dict
        Must have the keys ``'dr'``, ``'lr'``, ``'m'``, ``'wd'`` in exactly
        that order (grid points are unpacked positionally).  Each value is a
        3-tuple ``(start, stop, num)`` giving the range to search and the
        number of points to check in it.
    df : DataFrame
        Data passed to ``crossvalid_nobar``; the number of distinct values in
        its ``Label`` column sets the model's ``num_labels``.

    Returns
    -------
    pandas.DataFrame
        One row per grid point, indexed by the ``(dr, lr, m, wd)`` tuple, with
        columns ``best_f1_val`` / ``best_logloss_val`` (best value of the
        validation curve averaged over the three splits) and ``f1_val_stop`` /
        ``logloss_val_stop`` (position along the curve where that best value
        occurs -- presumably the epoch index; confirm against
        ``crossvalid_nobar``).
    """
    # NOTE: `range` is this notebook's log-spaced helper, not the builtin.
    ranges_list = [range(hps, hp) for hp in hps.keys()]
    grid = list(it.product(*ranges_list))
    # Size the output layer from the data actually being searched over.
    num_labels = len(df.Label.unique())
    rows = []
    for pt in tqdm(grid):
        model_pt = two_layer_classifier_do(num_labels=num_labels, dr=pt[0])
        output_pt = crossvalid_nobar(model=model_pt, df=df, k_fold=3, lr=pt[1], m=pt[2], wd=pt[3])
        df_f1 = output_pt['unweighted_f1']
        df_logloss = output_pt['logloss']
        # Sum the three validation curves once; dividing the extremum by 3
        # gives the best split-averaged value.
        f1_sum = df_f1['split_0_val'] + df_f1['split_1_val'] + df_f1['split_2_val']
        logloss_sum = df_logloss['split_0_val'] + df_logloss['split_1_val'] + df_logloss['split_2_val']
        rows.append(
            {
                "best_f1_val": f1_sum.max() / 3,
                "f1_val_stop": f1_sum.argmax(),
                "best_logloss_val": logloss_sum.min() / 3,
                "logloss_val_stop": logloss_sum.argmin(),
            }
        )
    selector = pd.DataFrame(
        rows,
        columns=["best_f1_val", "f1_val_stop", "best_logloss_val", "logloss_val_stop"],
    )
    selector.index = grid
    # Name the index only after it is replaced, otherwise the name is discarded.
    selector.index.name = "('dr','lr','m','wd')"
    return selector


            

In [26]:
# Search space for the two-layer model.  Each entry is (start, stop, num);
# 5 * 6 * 2 * 3 = 180 grid points in total.
hps = dict(
    dr=(1e-1, 5e-1, 5),
    lr=(1e-5, 1e-2, 6),
    m=(8e-1, 9e-1, 2),
    wd=(1e-5, 1e-3, 3),
)
selector = twolayergridsearch(hps)
selector

  0%|          | 0/180 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Show the five grid points with the largest best_f1_val.
selector['best_f1_val'].nlargest(n=5)

In [None]:
# Show the five grid points with the smallest best_logloss_val.
selector['best_logloss_val'].nsmallest(n=5)

(0.5, 0.001, 0.9, 0.001)                                      0.856862
(0.08891397050194613, 0.00015848931924611142, 0.9, 0.001)     0.877056
(0.2811706625951745, 0.001, 0.8, 0.001)                       0.881455
(0.5, 0.001, 0.8, 0.001)                                      0.884058
(0.049999999999999996, 0.00015848931924611142, 0.9, 0.001)    0.884820
Name: best_f1_val, dtype: float64

We can repeat the same thing for three layer networks.

In [16]:
def threelayergridsearch(hps, df=P05_t):
    """Grid-search hyperparameters for the three-layer dropout classifier.

    Parameters
    ----------
    hps : dict
        Must have the keys ``'dr'``, ``'lr'``, ``'m'``, ``'wd'`` in exactly
        that order (grid points are unpacked positionally).  Each value is a
        3-tuple ``(start, stop, num)`` giving the range to search and the
        number of points to check in it.
    df : DataFrame
        Data passed to ``crossvalid_nobar``; the number of distinct values in
        its ``Label`` column sets the model's ``num_labels``.

    Returns
    -------
    pandas.DataFrame
        One row per grid point, indexed by the ``(dr, lr, m, wd)`` tuple, with
        columns ``best_f1_val`` / ``best_logloss_val`` (best value of the
        validation curve averaged over the three splits) and ``f1_val_stop`` /
        ``logloss_val_stop`` (position along the curve where that best value
        occurs -- presumably the epoch index; confirm against
        ``crossvalid_nobar``).
    """
    # NOTE: `range` is this notebook's log-spaced helper, not the builtin.
    ranges_list = [range(hps, hp) for hp in hps.keys()]
    grid = list(it.product(*ranges_list))
    # Size the output layer from the data actually being searched over.
    num_labels = len(df.Label.unique())
    rows = []
    for pt in tqdm(grid):
        model_pt = three_layer_classifier_do(num_labels=num_labels, dr=pt[0])
        output_pt = crossvalid_nobar(model=model_pt, df=df, k_fold=3, lr=pt[1], m=pt[2], wd=pt[3])
        df_f1 = output_pt['unweighted_f1']
        df_logloss = output_pt['logloss']
        # Sum the three validation curves once; dividing the extremum by 3
        # gives the best split-averaged value.
        f1_sum = df_f1['split_0_val'] + df_f1['split_1_val'] + df_f1['split_2_val']
        logloss_sum = df_logloss['split_0_val'] + df_logloss['split_1_val'] + df_logloss['split_2_val']
        rows.append(
            {
                "best_f1_val": f1_sum.max() / 3,
                "f1_val_stop": f1_sum.argmax(),
                "best_logloss_val": logloss_sum.min() / 3,
                "logloss_val_stop": logloss_sum.argmin(),
            }
        )
    selector = pd.DataFrame(
        rows,
        columns=["best_f1_val", "f1_val_stop", "best_logloss_val", "logloss_val_stop"],
    )
    selector.index = grid
    # Name the index only after it is replaced, otherwise the name is discarded.
    selector.index.name = "('dr','lr','m','wd')"
    return selector


            

In [14]:
# Momentum has been having little effect in earlier runs of this cell, so it is
# fixed at m=.9: with num=1 only the start of the range is used.
hps3={'dr': (1e-1,5e-1,5), 'lr': (1e-5, 1e-2, 6), 'm': (9e-1,10e-1,1), 'wd':(1e-5,1e-3,3)}
# Run the three-layer search; this cell previously re-ran the two-layer one.
selector3=threelayergridsearch(hps3)
selector3

  0%|          | 0/120 [00:00<?, ?it/s]

KeyboardInterrupt: 

### With AdamW as the optimizer

We should also check to see what this tuning spits out when we run with AdamW as the optimizer. Note that `optim.AdamW` does not take a parameter `m` for momentum. Instead, it uses `betas` which is a $2$-tuple used for computing running averages of the gradient and its square. We won't gridsearch in these dimensions - AdamW takes long enough as is.

Our input dictionaries are a little shorter, since there is no momentum to care about.

In [None]:
def twolayergridsearch_AdamW(hps, df=P05_t):
    """Grid-search hyperparameters for the two-layer dropout classifier with AdamW.

    Parameters
    ----------
    hps : dict
        Must have the keys ``'dr'``, ``'lr'``, ``'wd'`` in exactly that order
        (grid points are unpacked positionally).  Each value is a 3-tuple
        ``(start, stop, num)`` giving the range to search and the number of
        points to check in it.
    df : DataFrame
        Data passed to ``crossvalid_AdamW_nobar``; the number of distinct
        values in its ``Label`` column sets the model's ``num_labels``.

    Returns
    -------
    pandas.DataFrame
        One row per grid point, indexed by the ``(dr, lr, wd)`` tuple, with
        columns ``best_f1_val`` / ``best_logloss_val`` (best value of the
        validation curve averaged over the three splits) and ``f1_val_stop`` /
        ``logloss_val_stop`` (position along the curve where that best value
        occurs -- presumably the epoch index; confirm against
        ``crossvalid_AdamW_nobar``).
    """
    # NOTE: `range` is this notebook's log-spaced helper, not the builtin.
    ranges_list = [range(hps, hp) for hp in hps.keys()]
    grid = list(it.product(*ranges_list))
    # Size the output layer from the data actually being searched over.
    num_labels = len(df.Label.unique())
    rows = []
    for pt in tqdm(grid):
        model_pt = two_layer_classifier_do(num_labels=num_labels, dr=pt[0])
        output_pt = crossvalid_AdamW_nobar(model=model_pt, df=df, k_fold=3, lr=pt[1], wd=pt[2])
        df_f1 = output_pt['unweighted_f1']
        df_logloss = output_pt['logloss']
        # Sum the three validation curves once; dividing the extremum by 3
        # gives the best split-averaged value.
        f1_sum = df_f1['split_0_val'] + df_f1['split_1_val'] + df_f1['split_2_val']
        logloss_sum = df_logloss['split_0_val'] + df_logloss['split_1_val'] + df_logloss['split_2_val']
        rows.append(
            {
                "best_f1_val": f1_sum.max() / 3,
                "f1_val_stop": f1_sum.argmax(),
                # Log-loss results get their own columns so they do not
                # overwrite the F1 results.
                "best_logloss_val": logloss_sum.min() / 3,
                "logloss_val_stop": logloss_sum.argmin(),
            }
        )
    selector = pd.DataFrame(
        rows,
        columns=["best_f1_val", "f1_val_stop", "best_logloss_val", "logloss_val_stop"],
    )
    selector.index = grid
    # Name the index only after it is replaced, otherwise the name is discarded.
    selector.index.name = "('dr','lr','wd')"
    return selector

In [None]:
# AdamW search space for the two-layer model.  Each entry is
# (start, stop, num); there is no momentum entry.
hps2_AdamW = dict(
    dr=(5e-2, 5e-1, 5),
    lr=(1e-5, 1e-3, 6),
    wd=(1e-3, 1e-1, 2),
)
selector2A = twolayergridsearch_AdamW(hps2_AdamW)
selector2A

In [None]:
def threelayergridsearch_AdamW(hps, df=P05_t):
    """Grid-search hyperparameters for the three-layer dropout classifier with AdamW.

    Parameters
    ----------
    hps : dict
        Must have the keys ``'dr'``, ``'lr'``, ``'wd'`` in exactly that order
        (grid points are unpacked positionally).  Each value is a 3-tuple
        ``(start, stop, num)`` giving the range to search and the number of
        points to check in it.
    df : DataFrame
        Data passed to ``crossvalid_AdamW_nobar``; the number of distinct
        values in its ``Label`` column sets the model's ``num_labels``.

    Returns
    -------
    pandas.DataFrame
        One row per grid point, indexed by the ``(dr, lr, wd)`` tuple, with
        columns ``best_f1_val`` / ``best_logloss_val`` (best value of the
        validation curve averaged over the three splits) and ``f1_val_stop`` /
        ``logloss_val_stop`` (position along the curve where that best value
        occurs -- presumably the epoch index; confirm against
        ``crossvalid_AdamW_nobar``).
    """
    # NOTE: `range` is this notebook's log-spaced helper, not the builtin.
    ranges_list = [range(hps, hp) for hp in hps.keys()]
    grid = list(it.product(*ranges_list))
    # Size the output layer from the data actually being searched over.
    num_labels = len(df.Label.unique())
    rows = []
    for pt in tqdm(grid):
        model_pt = three_layer_classifier_do(num_labels=num_labels, dr=pt[0])
        output_pt = crossvalid_AdamW_nobar(model=model_pt, df=df, k_fold=3, lr=pt[1], wd=pt[2])
        df_f1 = output_pt['unweighted_f1']
        df_logloss = output_pt['logloss']
        # Sum the three validation curves once; dividing the extremum by 3
        # gives the best split-averaged value.
        f1_sum = df_f1['split_0_val'] + df_f1['split_1_val'] + df_f1['split_2_val']
        logloss_sum = df_logloss['split_0_val'] + df_logloss['split_1_val'] + df_logloss['split_2_val']
        rows.append(
            {
                "best_f1_val": f1_sum.max() / 3,
                "f1_val_stop": f1_sum.argmax(),
                # Log-loss results get their own columns so they do not
                # overwrite the F1 results.
                "best_logloss_val": logloss_sum.min() / 3,
                "logloss_val_stop": logloss_sum.argmin(),
            }
        )
    selector = pd.DataFrame(
        rows,
        columns=["best_f1_val", "f1_val_stop", "best_logloss_val", "logloss_val_stop"],
    )
    selector.index = grid
    # Name the index only after it is replaced, otherwise the name is discarded.
    selector.index.name = "('dr','lr','wd')"
    return selector

In [None]:
# AdamW search space for the three-layer model; each entry is (start, stop, num).
hps3_AdamW={'dr': (5e-2,5e-1,5), 'lr': (1e-5, 1e-3, 6), 'wd':(1e-3,1e-1,2)}
# Run the three-layer search; this cell previously re-ran the two-layer one.
selector3A=threelayergridsearch_AdamW(hps3_AdamW)
selector3A