# Hyperparameter Optimization

In [12]:
import seaborn as sns
import pandas as pd

df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [2]:
df['age'].fillna(29.0, inplace=True)

In [3]:
X = df[['age', 'sibsp', 'parch']]
y = df['survived']

In [4]:
X.shape, y.shape

((891, 3), (891,))

### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
Xtrain.shape, Xtest.shape

((712, 3), (179, 3))

In [8]:
ytrain.shape, ytest.shape

((712,), (179,))

### Exploratory Data Analysis

### Feature Engineering

In [13]:
# We cheated a bit by already filling in missing values in the beginning
# Don't do that.

### Create a model
- Logistic Regression
- Decision Tree
- Random Forest

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(Xtrain, ytrain)   # train the model = find the best coefficients
model.score(Xtrain, ytrain)

0.6404494382022472

### Cross-Validation

In [10]:
from sklearn.model_selection import cross_val_score

# cv=number of splits
scores = cross_val_score(model, Xtrain, ytrain, cv=5, scoring="accuracy")
scores

array([0.65734266, 0.61538462, 0.63380282, 0.63380282, 0.61971831])

In [11]:
scores.mean().round(3), scores.std().round(3)

(0.632, 0.015)

### Hyperparameter Optimization

Hyperparameters are **parameters that are set before the learning process starts.**

Which hyperparameters have you seen?

Logistic Regression:
- C (inverse regularization strangth)
- Learning rate 
 - Algorithms behind logistic regression don't solve the problem numerically but optimize iteratively. Numeric computation would be too computationally expensive. Learning rate represents the stepsize for finding the minimum of the loss function. If too small or too big, it might end up in local minima rather than find the globel minimum.

Decision Tree:
- Depth of the tree

Random Forest:
- Nr. of trees
- Depth of the trees

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
model = LogisticRegression(C=10, penalty='12') # <-- Hyperparameters are the parameters that are set when choosing the model

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
model_dt = DecisionTreeClassifier(max_depth=3) # Maximum Depth of a tree is the number of questions that the model asks

## Example: Max Depth of 3
# 1) Male or female?
# 2) Traveling alone?
# 3) Younger than 20 years?

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
model_rf = RandomForestClassifier(n_estimators=50, max_depth=3, min_samples_split=2)
# n_estimates: nr. of trees

# One part of the randomness of the random forest is that at each question of each tree the random forest 
# has only a random number of the features are available.
# The number of feature available is defined by max_features.

# min_sample_split defines the minimum size of samples that can still be splittedb
model_rf.score(Xtrain, ytrain)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

How to optimize hyperparameters? Trial and Error... But that can be automatized to some extent:
### Use Grid Search (or Randomized Search)

In [22]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Perform Grid Search for Random Forrest

In [26]:
# param_grid is a dictionary with parameters as keys and values as values

param_grid = {
    'n_estimators': [1, 3, 10, 20, 50, 100],
    'max_depth': [1, 3, 5, 10, None]
}

In [27]:
gridcv = GridSearchCV(model_rf, param_grid=param_grid)

In [28]:
gridcv.fit(Xtrain, ytrain)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=3,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=50, n_jobs=None,
                                              oob_score=False,
                                              random

In [29]:
gridcv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=3,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
results = pd.DataFrame(gridcv.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009244,0.009973,0.001976,0.001003,1.0,1,"{'max_depth': 1, 'n_estimators': 1}",0.622378,0.629371,0.626761,0.633803,0.626761,0.627814,0.003743,28
1,0.007521,0.001164,0.001697,0.000387,1.0,3,"{'max_depth': 1, 'n_estimators': 3}",0.636364,0.629371,0.626761,0.661972,0.626761,0.636245,0.013335,20
2,0.018447,0.002028,0.002988,0.000911,1.0,10,"{'max_depth': 1, 'n_estimators': 10}",0.643357,0.629371,0.633803,0.65493,0.626761,0.637644,0.010325,17
3,0.034278,0.00199,0.003459,0.000518,1.0,20,"{'max_depth': 1, 'n_estimators': 20}",0.636364,0.629371,0.65493,0.633803,0.626761,0.636245,0.009923,20
4,0.081228,0.005335,0.006574,0.00065,1.0,50,"{'max_depth': 1, 'n_estimators': 50}",0.636364,0.629371,0.676056,0.633803,0.626761,0.640471,0.018104,16
5,0.158481,0.011753,0.014762,0.003962,1.0,100,"{'max_depth': 1, 'n_estimators': 100}",0.643357,0.629371,0.65493,0.633803,0.626761,0.637644,0.010325,17
6,0.003588,0.000393,0.001232,0.000171,3.0,1,"{'max_depth': 3, 'n_estimators': 1}",0.65035,0.65035,0.683099,0.626761,0.626761,0.647464,0.020706,13
7,0.006941,0.000703,0.001828,0.000349,3.0,3,"{'max_depth': 3, 'n_estimators': 3}",0.622378,0.657343,0.676056,0.669014,0.626761,0.65031,0.021895,12
8,0.016762,0.000468,0.002434,0.000488,3.0,10,"{'max_depth': 3, 'n_estimators': 10}",0.643357,0.685315,0.669014,0.669014,0.626761,0.658692,0.020865,5
9,0.031231,0.000897,0.003404,0.000187,3.0,20,"{'max_depth': 3, 'n_estimators': 20}",0.664336,0.671329,0.640845,0.669014,0.65493,0.660091,0.011144,3


In [32]:
# Let's look at the columns:

results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [38]:
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_max_depth', 'param_n_estimators']
results[columns].sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_max_depth,param_n_estimators
13,0.662927,0.029443,0.006569,5.0,3
17,0.66014,0.01446,0.16033,5.0,100
9,0.660091,0.011144,0.031231,3.0,20
10,0.660061,0.019512,0.082392,3.0,50
14,0.658692,0.013373,0.019708,5.0,10
8,0.658692,0.020865,0.016762,3.0,10
15,0.657293,0.031072,0.033868,5.0,20
16,0.657274,0.021032,0.083627,5.0,50
23,0.655885,0.016307,0.163731,10.0,100
22,0.654477,0.026691,0.088373,10.0,50


### Randomized Search

In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
# Where do we get probability distributions from?
from scipy import stats 
# from here different distributions could be obtained. For now, uniform distribution is godd enough.

In [43]:
param_distributions = {
    'n_estimators' : list(range(1, 100)), # it will uniformly sample from that list
    'max_depth' : list(range(1, 15)),
}

In [49]:
randomizedcv = RandomizedSearchCV(model_rf, param_distributions=param_distributions)

In [51]:
randomizedcv.fit(Xtrain, ytrain)

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=3,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [53]:
random_results = pd.DataFrame(randomizedcv.cv_results_)
random_results[columns].sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_max_depth,param_n_estimators
6,0.669969,0.027832,0.113644,8,47
0,0.6601,0.028497,0.028884,5,13
4,0.658731,0.025848,0.107241,10,25
1,0.655905,0.024687,0.10249,12,58
7,0.655905,0.028769,0.163545,11,92
9,0.647424,0.027067,0.158357,12,95
8,0.641869,0.012065,0.136239,1,85
5,0.6405,0.029626,0.008499,5,1
2,0.640451,0.011117,0.105815,1,66
3,0.640431,0.028497,0.168372,14,93


### Optimization finished: what next?

In [None]:
model.fit(Xtrain, ytrain)
print('training score: ', model.score(Xtrain, ytrain).round(3))
print('test score    : ', model.score(Xtest, ytest).round(3))

#### Interpretation

* training and test score are similar: all good
* training >> test score: overfitting
* traing < test score: random fluctuation; probably your dataset is very small or BUG