# Unleash the Ray - Grid Search

Let's revisit our grid search example but now with Ray

A lot of this code is going to be familiar as we already had our pipeline wraped in a function

In [None]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_context('talk')
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette("bright", 8))

from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

In [None]:
import ray
from ray import tune

### Let's start Ray

In [None]:
ray.shutdown()
ray.init(num_cpus=10, num_gpus=0, include_dashboard=True)

After initialisation the [Ray Dashboard](https://docs.ray.io/en/master/ray-dashboard.html) is available on the **webui_url** port

## Setup some raytune compatible training code

Very similar to before except now we have an end-to-end function

In [None]:

# differences from what we've seen before, this is an end to end training function
# where we are loading the dataset running our complete train and test loop whilst
# 
def e2e_simple_training(config):
    
    #threadsafe
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    
    # chose your CV strategy
    splitter = StratifiedKFold(n_splits=5)
    
    # run k fold training and testing
    f1_scores = [] # keep hold of all individual scores
    for train_ind, test_ind in splitter.split(X, y):
        pipeline = make_pipeline(RobustScaler(),
                                  RandomForestClassifier(random_state=42))

        pipeline.set_params(**config)
        pipeline.fit(X[train_ind], y[train_ind])
        
        y_pred = pipeline.predict(X[test_ind])
        
        #f1_scores.append(f1_score(y_pred, y[test_ind]))

        tune.report(f1_score=f1_score(y_pred, y[test_ind]))
    # use tunes reporter
    #tune.report(f1_score = f1_scores)
  #  reporter = tune.JupyterNotebookReporter(overwrite=True,
 #                                            metric_columns={
 #                                                'mean_f1_score': np.array(f1_scores).mean(),
 #                                                'std_f1_score': np.array(f1_scores).std()
 #                                })
                # and we can actually add any metrics we like)

Previously we had a param grid like this

```
param_grid = {
    'randomforestclassifier__n_estimators': [1,5,15,50,100],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__bootstrap': [True, False]
}
```

### TODO convert this to a set of ray search spaces

The Ray config object is freeform, we imprint our own structure.

However, tunable parameters need to be represented by tune distribution object >> [read the docs](https://docs.ray.io/en/latest/tune/api_docs/grid_random.html?highlight=tune.grid#random-distributions-api)

In [None]:
ray_tuning_config = {
    'randomforestclassifier__n_estimators': tune.grid_search([1,5,15])
}

In [77]:
analysis = tune.run(
                e2e_simple_training,
                metric='f1_score',
                mode='max',
            #    progress_reporter=tune.JupyterNotebookReporter(overwrite=True,metric_columns='f1_scores'),
                config=ray_tuning_config,
                resources_per_trial=dict(cpu=1, gpu=0),
                local_dir="~/ray_results/grid_search")

Trial name,status,loc,randomforestclassifier__n_estimators
e2e_simple_training_ad6f1_00000,RUNNING,,1
e2e_simple_training_ad6f1_00001,PENDING,,5
e2e_simple_training_ad6f1_00002,PENDING,,15


Result for e2e_simple_training_ad6f1_00000:
  date: 2020-11-09_12-28-25
  done: false
  experiment_id: 144c6dfa37944f2a9f7aa9bc57138e8d
  experiment_tag: 0_randomforestclassifier__n_estimators=1
  f1_score: 0.9
  hostname: Schlepptop
  iterations_since_restore: 1
  node_ip: 192.168.123.68
  pid: 17808
  time_since_restore: 0.036040306091308594
  time_this_iter_s: 0.036040306091308594
  time_total_s: 0.036040306091308594
  timestamp: 1604921305
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: ad6f1_00000
  
Result for e2e_simple_training_ad6f1_00001:
  date: 2020-11-09_12-28-25
  done: false
  experiment_id: 75999b5faeb84f0f82fa84e95bb5730c
  experiment_tag: 1_randomforestclassifier__n_estimators=5
  f1_score: 0.9142857142857144
  hostname: Schlepptop
  iterations_since_restore: 1
  node_ip: 192.168.123.68
  pid: 17810
  time_since_restore: 0.051476240158081055
  time_this_iter_s: 0.051476240158081055
  time_total_s: 0.051476240158081055
  timestamp: 1604921305
  timeste

Trial name,status,loc,randomforestclassifier__n_estimators,iter,total time (s),f1_score
e2e_simple_training_ad6f1_00000,TERMINATED,,1,5,0.117696,0.942857
e2e_simple_training_ad6f1_00001,TERMINATED,,5,5,0.182361,0.992908
e2e_simple_training_ad6f1_00002,TERMINATED,,15,5,0.280174,0.978723


In [78]:
df = analysis.dataframe()
print(df.columns)
df.head()

Index(['f1_score', 'time_this_iter_s', 'done', 'timesteps_total',
       'episodes_total', 'training_iteration', 'experiment_id', 'date',
       'timestamp', 'time_total_s', 'pid', 'hostname', 'node_ip',
       'time_since_restore', 'timesteps_since_restore',
       'iterations_since_restore', 'trial_id', 'experiment_tag',
       'config/randomforestclassifier__n_estimators', 'logdir'],
      dtype='object')


Unnamed: 0,f1_score,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,trial_id,experiment_tag,config/randomforestclassifier__n_estimators,logdir
0,0.957746,0.015585,False,,,3,144c6dfa37944f2a9f7aa9bc57138e8d,2020-11-09_12-28-25,1604921305,0.077695,17808,Schlepptop,192.168.123.68,0.077695,0,3,ad6f1_00000,0_randomforestclassifier__n_estimators=1,1,/home/lena/ray_results/grid_search/e2e_simple_...
1,0.992908,0.025794,False,,,5,75999b5faeb84f0f82fa84e95bb5730c,2020-11-09_12-28-25,1604921305,0.182361,17810,Schlepptop,192.168.123.68,0.182361,0,5,ad6f1_00001,1_randomforestclassifier__n_estimators=5,5,/home/lena/ray_results/grid_search/e2e_simple_...
2,0.986111,0.055378,False,,,3,91cc6fb91f584d9988e64cfce597574f,2020-11-09_12-28-25,1604921305,0.175307,17806,Schlepptop,192.168.123.68,0.175307,0,3,ad6f1_00002,2_randomforestclassifier__n_estimators=15,15,/home/lena/ray_results/grid_search/e2e_simple_...


In [80]:
print("Best config: ", analysis.get_best_config(metric="f1_score"))

Best config:  {'randomforestclassifier__n_estimators': 5}


In [None]:
from scipy.stats import norm

def plot_some_tune_results(df):
    fig, ax = plt.subplots(1, 1, figsize=(16,6))
    x = np.linspace(0.85, 1.0, 100)

    n_estimators = df['config/randomforestclassifier__n_estimators'].values.tolist()

    lines = []
    for mu, sigma in zip(df['mean_f1_score'], df['std_f1_score']):
        pdf = norm.pdf(x, mu, sigma)
        line, = ax.plot(x, pdf, alpha=0.6)
        ax.axvline(mu, color=line.get_color())
        ax.text(mu, pdf.max(), f"{mu:.3f}", color=line.get_color(), fontsize=14)
        lines.append(line)

    plt.legend(handles=lines, labels=n_estimators, title="n estimators")
    ax.set_title(f"Average F1 Scores")
    
plot_some_tune_results(df)

## Really increase the size of the search space

In [None]:
#
# 6D search space - 960 combinations - 4800 calls to fit
#

ray_tuning_config = {
    'randomforestclassifier__n_estimators': tune.grid_search([1,5,15,50,100]),
    'randomforestclassifier__criterion': tune.grid_search(['gini', 'entropy']),
    'randomforestclassifier__max_features': tune.grid_search(['auto', 'sqrt', 'log2']),
#     'randomforestclassifier__bootstrap': tune.grid_search([True, False]),
#     'randomforestclassifier__min_samples_leaf': tune.grid_search([1,2,3,4]),
    'randomforestclassifier__min_samples_split': tune.grid_search([3,4,5,6])
}

In [67]:
analysis = tune.run(
                e2e_simple_training,
                mode='max',
                config=ray_tuning_config,
                resources_per_trial=dict(cpu=1, gpu=0)
                )

Trial name,status,loc,randomforestclassifier__n_estimators
e2e_simple_training_474f9_00000,RUNNING,,1
e2e_simple_training_474f9_00001,PENDING,,5
e2e_simple_training_474f9_00002,PENDING,,15


Result for e2e_simple_training_474f9_00000:
  date: 2020-11-09_12-25-33
  done: false
  experiment_id: 8c436635c4524858b82e2979777d6942
  experiment_tag: 0_randomforestclassifier__n_estimators=1
  f1_score: 0.9
  hostname: Schlepptop
  iterations_since_restore: 1
  node_ip: 192.168.123.68
  pid: 17235
  time_since_restore: 0.033599853515625
  time_this_iter_s: 0.033599853515625
  time_total_s: 0.033599853515625
  timestamp: 1604921133
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 474f9_00000
  
Result for e2e_simple_training_474f9_00001:
  date: 2020-11-09_12-25-33
  done: false
  experiment_id: a3e12976f12d4532b58afe7150f1022e
  experiment_tag: 1_randomforestclassifier__n_estimators=5
  f1_score: 0.9142857142857144
  hostname: Schlepptop
  iterations_since_restore: 1
  node_ip: 192.168.123.68
  pid: 17249
  time_since_restore: 0.04265308380126953
  time_this_iter_s: 0.04265308380126953
  time_total_s: 0.04265308380126953
  timestamp: 1604921133
  timesteps_since_res

Trial name,status,loc,randomforestclassifier__n_estimators,iter,total time (s),f1_score
e2e_simple_training_474f9_00000,TERMINATED,,1,5,0.108161,0.942857
e2e_simple_training_474f9_00001,TERMINATED,,5,5,0.160606,0.992908
e2e_simple_training_474f9_00002,TERMINATED,,15,5,0.258089,0.978723


In [None]:
from pprint import pprint
print("Best config: ")
pprint(analysis.get_best_config(metric="mean_f1_score"))

In [None]:
df = analysis.dataframe()
top_n_df = df.nlargest(10, "mean_f1_score")

In [None]:
plot_some_tune_results(top_n_df)

In [None]:
%load_ext tensorboard

In [None]:
from tensorboard import notebook
%tensorboard --logdir "~/ray_results/grid_search"
notebook.display(height=1000) 

### Once you are all done, shutdown Ray

In [None]:
ray.shutdown()