# Sources:

[Scikit Learn Hyperparameter Tuning(Youtube Video)](https://www.youtube.com/watch?v=Q-X1ukbxIs0)

[Github code](https://github.com/knathanieltucker/bit-of-data-science-and-scikit-learn/blob/master/notebooks/HyperparamTuning.ipynb)

## GridSearch

The grid search provided by GridSearchCV exhaustively generates candidates from a grid of parameter values specified with the param_grid parameter. For instance, the following param_grid:

In [1]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

specifies that two grids should be explored: one with a linear kernel and C values in [1, 10, 100, 1000], and the second one with an RBF kernel, and the cross-product of C values ranging in [1, 10, 100, 1000] and gamma values in [0.001, 0.0001].

The GridSearchCV instance implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.

In [5]:
from sklearn.model_selection import GridSearchCV
%matplotlib inline

# Import Libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
GridSearchCV?

In [6]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [7]:
df = pd.read_csv('Bike_train_test.csv')
df.head()

Unnamed: 0,online_order,order_status,list_price,standard_cost,gross_profit,margin_percentage,markup_percentage,product_class_label_1,product_class_label_2,product_class_label_3,product_size_label_1,product_size_label_2,product_size_label_3,high_profit_product
0,0,1,71.49,53.62,17.87,25.0,33.33,0,1,0,0,1,0,0
1,1,1,2091.47,388.92,1702.55,81.4,437.76,0,1,0,0,0,1,1
2,0,1,1793.43,248.82,1544.61,86.13,620.77,1,0,0,0,1,0,1
3,0,1,1198.46,381.1,817.36,68.2,214.47,0,1,0,0,1,0,1
4,1,1,1765.3,709.48,1055.82,59.81,148.82,0,1,0,0,0,1,1


In [8]:
X = df.drop(['high_profit_product'], axis=1)
y = df['high_profit_product']
n_samples = len(y)

In [12]:
#X = digits.images.reshape((n_samples, -1))
#y = digits.target

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]



clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='f1_macro',return_train_score=True)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_macro', verbose=0)

In [13]:
clf.best_params_

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}

In [14]:
clf.cv_results_

{'mean_fit_time': array([0.25564823, 0.14111595, 0.26124225, 0.16190557, 0.31941147,
        0.13431273, 0.2446558 , 0.13392525, 0.08514895, 0.08414927,
        0.0845489 , 0.08415012]),
 'std_fit_time': array([0.01124232, 0.01312699, 0.0180218 , 0.02226818, 0.08764085,
        0.00272162, 0.00318586, 0.00199499, 0.002785  , 0.00193803,
        0.00257546, 0.00292517]),
 'mean_score_time': array([0.05217881, 0.02279491, 0.05337715, 0.02838311, 0.0617661 ,
        0.02319026, 0.05197349, 0.02278633, 0.00519791, 0.00519795,
        0.00499792, 0.00539699]),
 'std_score_time': array([1.46746841e-03, 7.50030888e-04, 2.41128435e-03, 5.46060189e-03,
        8.90381463e-03, 4.07578217e-04, 6.38157363e-04, 3.99754759e-04,
        4.00114699e-04, 3.99972145e-04, 3.16297988e-07, 4.89959997e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 100, 100, 1000, 1000, 1, 10, 100, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False,

In [16]:
y_true, y_pred = y_test, clf.predict(X_test)
print (classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6430
           1       1.00      1.00      1.00      3472

   micro avg       1.00      1.00      1.00      9902
   macro avg       1.00      1.00      1.00      9902
weighted avg       1.00      1.00      1.00      9902



In [17]:
clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_gamma', 'param_kernel', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [19]:
for param, score in zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']):
    print (param, score)

{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'} 1.0
{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'} 1.0
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'} 1.0
{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'} 1.0
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'} 1.0
{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'} 1.0
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'} 1.0
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'} 1.0
{'C': 1, 'kernel': 'linear'} 1.0
{'C': 10, 'kernel': 'linear'} 1.0
{'C': 100, 'kernel': 'linear'} 1.0
{'C': 1000, 'kernel': 'linear'} 1.0


## Randomized Search

While using a grid of parameter settings is currently the most widely used method for parameter optimization, other search methods have more favourable properties. RandomizedSearchCV implements a randomized search over parameters, where each setting is sampled from a distribution over possible parameter values. This has two main benefits over an exhaustive search:

* A budget can be chosen independent of the number of parameters and possible values.
* Adding parameters that do not influence the performance does not decrease efficiency.

Specifying how parameters should be sampled is done using a dictionary, very similar to specifying parameters for GridSearchCV. Additionally, a computation budget, being the number of sampled candidates or sampling iterations, is specified using the n_iter parameter. For each parameter, either a distribution over possible values or a list of discrete choices (which will be sampled uniformly) can be specified:

In [20]:
import scipy

params = {'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),
  'kernel': ['rbf'], 'class_weight':['balanced', None]}

This example uses the scipy.stats module, which contains many useful distributions for sampling parameters, such as expon, gamma, uniform or randint. In principle, any function can be passed that provides a rvs (random variate sample) method to sample a value. A call to the rvs function should provide independent random samples from possible parameter values on consecutive calls.

For continuous parameters, such as C above, it is important to specify a continuous distribution to take full advantage of the randomization. This way, increasing n_iter will always lead to a finer search.

In [21]:
from sklearn.model_selection import RandomizedSearchCV

RandomizedSearchCV?

In [26]:
clf = RandomizedSearchCV(SVC(), params, cv=5,
                       scoring='f1_macro',return_train_score=True)
clf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FC97879630>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FC977C9A58>, 'kernel': ['rbf'], 'class_weight': ['balanced', None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='f1_macro', verbose=0)

In [27]:
clf.best_params_


{'C': 29.521455201021624,
 'class_weight': 'balanced',
 'gamma': 0.005844893556272121,
 'kernel': 'rbf'}

In [28]:
clf.cv_results_

{'mean_fit_time': array([0.31720514, 0.35598521, 0.39676585, 0.37197323, 0.37218461,
        0.35359502, 0.33680501, 0.39616671, 0.3733839 , 0.28703012]),
 'std_fit_time': array([0.03595438, 0.0245397 , 0.04478803, 0.0556237 , 0.03396507,
        0.02771974, 0.00739791, 0.04063448, 0.01904867, 0.00420516]),
 'mean_score_time': array([0.06436687, 0.07355895, 0.06976357, 0.07156196, 0.07235742,
        0.06995568, 0.06915684, 0.0747602 , 0.08514671, 0.06076837]),
 'std_score_time': array([0.00349311, 0.01161485, 0.00074917, 0.0093423 , 0.00631006,
        0.00088996, 0.00074953, 0.01062052, 0.01607958, 0.00075367]),
 'param_C': masked_array(data=[29.521455201021624, 133.69177605949338,
                    137.72484944787288, 177.96181269929065,
                    39.295522513261616, 1.9982749496399932,
                    19.21677672200269, 31.989824859131183,
                    125.14873050617314, 22.773690758662],
              mask=[False, False, False, False, False, False, False, F

In [29]:
y_true, y_pred = y_test, clf.predict(X_test)
print (classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6430
           1       1.00      1.00      1.00      3472

   micro avg       1.00      1.00      1.00      9902
   macro avg       1.00      1.00      1.00      9902
weighted avg       1.00      1.00      1.00      9902



In [30]:
for param, score in zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']):
    print (param, score)

{'C': 29.521455201021624, 'class_weight': 'balanced', 'gamma': 0.005844893556272121, 'kernel': 'rbf'} 1.0
{'C': 133.69177605949338, 'class_weight': None, 'gamma': 0.008809671436791733, 'kernel': 'rbf'} 1.0
{'C': 137.72484944787288, 'class_weight': 'balanced', 'gamma': 0.12111412390951858, 'kernel': 'rbf'} 1.0
{'C': 177.96181269929065, 'class_weight': None, 'gamma': 0.00940237556424927, 'kernel': 'rbf'} 1.0
{'C': 39.295522513261616, 'class_weight': 'balanced', 'gamma': 0.038300332404798715, 'kernel': 'rbf'} 1.0
{'C': 1.9982749496399932, 'class_weight': None, 'gamma': 0.04132502350585016, 'kernel': 'rbf'} 1.0
{'C': 19.21677672200269, 'class_weight': 'balanced', 'gamma': 0.038088275127955686, 'kernel': 'rbf'} 1.0
{'C': 31.989824859131183, 'class_weight': 'balanced', 'gamma': 0.09108964749731219, 'kernel': 'rbf'} 1.0
{'C': 125.14873050617314, 'class_weight': None, 'gamma': 0.4286223834811363, 'kernel': 'rbf'} 1.0
{'C': 22.773690758662, 'class_weight': 'balanced', 'gamma': 0.004680555707119

Don't forget the old _CV classes that are faster than gridsearch! And also don't forget about OOB error that can be a great proxy

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19803 entries, 0 to 19802
Data columns (total 14 columns):
online_order             19803 non-null int64
order_status             19803 non-null int64
list_price               19803 non-null float64
standard_cost            19803 non-null float64
gross_profit             19803 non-null float64
margin_percentage        19803 non-null float64
markup_percentage        19803 non-null float64
product_class_label_1    19803 non-null int64
product_class_label_2    19803 non-null int64
product_class_label_3    19803 non-null int64
product_size_label_1     19803 non-null int64
product_size_label_2     19803 non-null int64
product_size_label_3     19803 non-null int64
high_profit_product      19803 non-null int64
dtypes: float64(5), int64(9)
memory usage: 2.1 MB
