# Hyperparameter tuning grid search example

In this notebook, hyperparameter tuning using grid search algorithm is demonstrated.We have a dataset consisting
of amazon product reviews and a sklearn classifier to classiy these reviews. We take advantage of cloud functions
to tune this classifier's hyperparameters and show how Lithops can be used for this kind of computations.

## Installing Dependencies

In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import joblib

from pprint import pprint
from time import time
import click
import bz2

## Downloading the Dataset

The dataset should be downloaded and extracted from zip file before this step. 
`load_data` function seperates the data as X and Y arrays to prepare them for classifier.

In [2]:
def load_data(mib):
    # Download the dataset at
    # https://www.kaggle.com/bittlingmayer/amazonreviews

    print("Loading Amazon reviews dataset:")
    compressed = bz2.BZ2File('./train.ft.txt.bz2')

    X = []
    y = []
    total_size = 0
    for _ in range(3_600_000):
        line = compressed.readline().decode('utf-8')
        X.append(line[11:])
        y.append(int(line[9]) - 1)  # __label__1, __label__2

        total_size += len(line[11:])
        if (total_size / 2 ** 20) > mib:
            break

    print("\t%d reviews" % len(X))
    print("\t%0.2f MiB of data" % (total_size / 2 ** 20))
    return X, y

Here running options are defined and shell arguments are declared.

In [None]:
@click.command()
@click.option('--backend', default='loky', help='Joblib backend to perform grid search '
                                                '(loky | lithops | dask | ray | tune)')
@click.option('--address', default=None, help='Scheduler address (dask) or head node address '
                                              '(ray, ray[tune])')
@click.option('--mib', default=10, type=int, help='Load X MiB from the dataset')
@click.option('--refit', default=False, is_flag=True, help='Fit the final model with the best '
                                                           'configuration and print score')
@click.option('--jobs', default=-1, help='Number of jobs to execute the search. -1 means all processors.')

## Execution

In the main function, grid search is performed using GridSearchCV from sklearn library with different parameters depending on the backend chosen. 

In [3]:
def main(backend, address, mib, refit, jobs):

    X, y = load_data(mib)

    n_features = 2 ** 18
    pipeline = Pipeline([
        ('vect', HashingVectorizer(n_features=n_features, alternate_sign=False)),
        ('clf', SGDClassifier()),
    ])

    parameters = {
        'vect__norm': ('l1', 'l2'),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5),
        'clf__max_iter': (20, 60, 100, 160),
        'clf__penalty': ('l2', 'l1', 'elasticnet')
    }

    if backend == 'lithops':
        from sklearn.model_selection import GridSearchCV
        from lithops.util.joblib import register_lithops
        register_lithops()
        grid_search = GridSearchCV(pipeline, parameters,
                                   error_score='raise',
                                   refit=refit, cv=5, n_jobs=jobs)

    elif backend == 'ray':
        from sklearn.model_selection import GridSearchCV
        import ray
        from ray.util.joblib import register_ray
        address = 'auto' if address is None else address
        ray.init(address, redis_password='5241590000000000')
        register_ray()
        grid_search = GridSearchCV(pipeline, parameters,
                                   error_score='raise',
                                   refit=refit, cv=5, n_jobs=jobs)

    elif backend == 'tune':
        from tune_sklearn import TuneGridSearchCV
        import ray
        address = 'auto' if address is None else address
        ray.init(address, log_to_driver=False, redis_password='5241590000000000')
        grid_search = TuneGridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)
        backend = 'loky' # not used

    elif backend == 'dask':
        from dask_ml.model_selection import GridSearchCV
        from dask_ml.feature_extraction.text import HashingVectorizer as DaskHashingVectorizer
        from distributed import Client
        if address is None:
            print('Error: must specify a scheduler address for dask distributed')
            exit(1)
        Client(address=address)
        pipeline = Pipeline([
            ('vect', DaskHashingVectorizer(n_features=n_features, alternate_sign=False)),
            ('clf', SGDClassifier()),
        ])
        grid_search = GridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)

    else:   # loky
        from sklearn.model_selection import GridSearchCV
        grid_search = GridSearchCV(pipeline, parameters,
            error_score='raise', refit=refit, cv=5, n_jobs=jobs)

    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters: ", end='')
    pprint(parameters)

    with joblib.parallel_backend(backend):
        print("Performing grid search...")
        t0 = time()
        grid_search.fit(X, y)
        total_time = time() - t0
        print("Done in %0.3fs\n" % total_time) 

    if refit:
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))


if __name__ == "__main__":
    main()

Usage: ipykernel_launcher.py [OPTIONS]
Try "ipykernel_launcher.py --help" for help.

Error: no such option: -f
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/click/parser.py", line 415, in _process_opts
    self._match_long_opt(norm_long_opt, explicit_value, state)
  File "/usr/lib/python3/dist-packages/click/parser.py", line 325, in _match_long_opt
    raise NoSuchOption(opt, possibilities=possibilities, ctx=self.ctx)
click.exceptions.NoSuchOption: no such option: -f

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/click/core.py", line 716, in main
    with self.make_context(prog_name, args, **extra) as ctx:
  File "/usr/lib/python3/dist-packages/click/core.py", line 641, in make_context
    self.parse_args(ctx, args)
  File "/usr/lib/python3/dist-packages/click/core.py", line 936, in parse_args
    opts, args, param_order = parser.parse_args(args=args)
  File "/usr/lib/python3/dist-packages/click/parser.py", line 268, in parse_args
    self._process_args_for_options(state)

TypeError: object of type 'NoneType' has no len()