This tutorial has been taken from [Tara Boyle's](https://towardsdatascience.com/@terrah27) [Towards Data Science blogpost](https://towardsdatascience.com/hyperparameter-tuning-c5619e7e6624). <br>
The original Kaggle notebook by the author can be found [here](https://www.kaggle.com/tboyle10/hyperparameter-tuning). <br>
The training and testing datasets used in this notebook can be found [here](https://www.kaggle.com/c/dont-overfit-ii/data?select=train.csv).

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings

In [2]:
# warnings.filterwarnings('ignore')
np.random.seed(27)

In [4]:
# Setting up default plotting parameters
%matplotlib inline

plt.rcParams['figure.figsize'] = [20.0, 7.0]
plt.rcParams.update({'font.size': 22,})

sns.set_palette('viridis')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print('Train Shape: ', train.shape)
print('Test Shape: ', test.shape)

train.head()

Train Shape:  (250, 302)
Test Shape:  (19750, 301)


Unnamed: 0,id,target,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,0,1.0,-1.067,-1.114,-0.616,0.376,1.09,0.467,-0.422,0.46,...,0.22,-0.339,0.254,-0.179,0.352,0.125,0.347,0.436,0.958,-0.824
1,1,0.0,-0.831,0.271,1.716,1.096,1.731,-0.197,1.904,-0.265,...,-0.765,-0.735,-1.158,2.554,0.856,-1.506,0.462,-0.029,-1.932,-0.343
2,2,0.0,0.099,1.39,-0.732,-1.065,0.005,-0.081,-1.45,0.317,...,-1.311,0.799,-1.001,1.544,0.575,-0.309,-0.339,-0.148,-0.646,0.725
3,3,1.0,-0.989,-0.916,-1.343,0.145,0.543,0.636,1.127,0.189,...,-1.37,1.093,0.596,-0.589,-0.649,-0.163,-0.958,-1.081,0.805,3.401
4,4,0.0,0.811,-1.509,0.522,-0.36,-0.22,-0.959,0.334,-0.566,...,-0.178,0.718,-1.017,1.249,-0.596,-0.445,1.751,1.442,-0.393,-0.643


In [6]:
# Prepare for modeling
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

X_test = test.drop(['id'], axis=1)

# scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# define models
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
elastic = linear_model.ElasticNet()
lasso_lars = linear_model.LassoLars()
bayesian_ridge = linear_model.BayesianRidge()
logistic = linear_model.LogisticRegression(solver='liblinear')
sgd = linear_model.SGDClassifier()

In [8]:
models = [ridge, lasso, elastic, lasso_lars, bayesian_ridge, logistic, sgd]

In [11]:
# function to get cross validation scores
def get_cv_scores(model):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [12]:
# loop through list of models
for model in models:
    print(model)
    get_cv_scores(model)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
CV Mean:  0.655320621373253
STD:  0.08822973705933819


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
CV Mean:  0.5
STD:  0.0


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
CV Mean:  0.5
STD:  0.0


LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
          fit_path=True, max_iter=500, normalize=True, positive=False,
          precompute='auto', verbose=False)
CV Mean:  0.5
STD:  0.0


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_i

We see that our best performing models are logistic regression and stochastic gradient descent. Let's tune their hyperparameters.

### Logistic Regression and Grid Search
Grid search is an exhaustive search over specified parameter values.

In [16]:
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
class_weight = [{1: 0.5, 0: 0.5}, {1: 0.4, 0: 0.6}, {1: 0.6, 0: 0.4}, {1: 0.7, 0: 0.3}, {1: 0.8, 0: 0.2}]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  class_weight=class_weight,
                  solver=solver)

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    2.1s


Best Score:  0.7327500577500577
Best Params:  {'C': 0.1, 'class_weight': {1: 0.7, 0: 0.3}, 'penalty': 'l1', 'solver': 'saga'}


[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:   11.9s finished


In [17]:
logistic = linear_model.LogisticRegression(C=0.1, class_weight={1: 0.7, 0: 0.3}, penalty='l1', solver='saga')
get_cv_scores(logistic)

CV Mean:  0.7327500577500577
STD:  0.06455101305728991




In [18]:
predictions = logistic.fit(X_train, y_train).predict_proba(X_test)

In [19]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = predictions
# submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.454689
1,251,0.528545
2,252,0.50678
3,253,0.528876
4,254,0.545249


### Stochastic Gradient Descent and Random Search
Random search over specified parameter values

In [27]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
class_weight = [{1: 0.5, 0: 0.5}, {1: 0.4, 0: 0.6}, {1: 0.6, 0: 0.4}, {1: 0.7, 0: 0.3}, {1: 0.8, 0: 0.2}]
eta0 = [1, 10, 100]

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha,
                           learning_rate=learning_rate,
                           class_weight=class_weight,
                           eta0=eta0)

random = RandomizedSearchCV(estimator=sgd, param_distributions=param_distributions, scoring='roc_auc', verbose=1, n_jobs=-1, n_iter=1000)
random_result = random.fit(X_train, y_train)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 2680 tasks      | elapsed:   15.2s


Best Score:  0.7420544170544171
Best Params:  {'penalty': 'l1', 'loss': 'log', 'learning_rate': 'adaptive', 'eta0': 1, 'class_weight': {1: 0.5, 0: 0.5}, 'alpha': 0.01}


[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   28.4s finished


In [28]:
sgd = linear_model.SGDClassifier(alpha=0.01,
                                 class_weight={1: 0.5, 0: 0.5},
                                 eta0=1,
                                 learning_rate='adaptive',
                                 loss='log',
                                 penalty='l1')

In [29]:
get_cv_scores(sgd)

CV Mean:  0.7204534204534203
STD:  0.07564255457563326




In [30]:
predictions = sgd.fit(X_train, y_train).predict_proba(X_test)

In [31]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = predictions
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.40641
1,251,0.797075
2,252,0.76919
3,253,0.591878
4,254,0.351051
