# Hyperparameter Tuning via Scikit-Learn

## GridSearch

### Loading Libraries

In [102]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

### Loading Data

In [62]:
df = pd.read_csv("/Users/joaquinromero/Desktop/HPTP/data/train.csv", sep=";")

In [63]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [64]:
df['y'] = df['y'].map({'yes':1,'no':0})

In [65]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


## GridSearch

In [66]:
df_train, df_test = train_test_split(df,
                                     test_size=0.1,
                                     random_state=0)

#### Placing & Gathering Numerical Features

In [78]:
X_train_numerical = df_train.select_dtypes(include=np.number).drop(columns=["y"])

y_train = df_train['y']

In [79]:
X_test_numerical = df_test.select_dtypes(include=np.number).drop(columns=["y"])

y_test = df_test['y']

#### Fitting Model on Training Data:

In [80]:
# Placing Model
model = RandomForestClassifier(random_state=0)

# Fitting Model
model.fit(X_train_numerical, y_train)

#### Evaluating on Test Data

In [81]:
y_pred = model.predict(X_test_numerical)

print(f1_score(y_test, y_pred))

0.43013365735115433


#### Hyperparamater Space: Dictionary

In [82]:
hyperparameter_space = {
    "n_estimators": [25, 50, 100, 150, 200],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 10, 15, 20, None],
    "class_weight": ["balanced", "balanced_subsampe"],
    "min_samples_split": [0.01, 0.1, 0.25, 0.5, 0.75, 1.0],
}

In [83]:
# Placing Model
model = RandomForestClassifier(random_state=0)

In [86]:
# GridSearchCV Class
clf = GridSearchCV(model, hyperparameter_space, scoring='f1', cv=5, n_jobs=-1, refit=True)

In [87]:
# Fitting clf
clf.fit(X_train_numerical, y_train)

1800 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
845 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 98, in validate

In [88]:
# Getting Best Hyperparameters
print(clf.best_params_, clf.best_score_)

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 0.01, 'n_estimators': 150} 0.4942473091278655


In [89]:
# Final Model Test Data
print(clf.score(X_test_numerical, y_test))

0.4937833037300178


### Pre-Processing

#### Listing The Numerical & Categorical Features

In [90]:
# Numerical Features
numerical_feats = list(df_train.drop(columns='y').select_dtypes(include=np.number).columns)

# Categorical Features
categorical_feats = list(df_train.drop(columns='y').select_dtypes(exclude=np.number).columns)

#### Placing Pre-Processor

In [91]:
numeric_preprocessor = StandardScaler()

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

#### Pre-Processor Delegation

In [92]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocessor, numerical_feats),
        ("cat", categorical_preprocessor, categorical_feats),
    ])

#### Setting Pre-Processor Pipeline

In [93]:
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("model", RandomForestClassifier(random_state=0))])

In [94]:
X_train_full = df_train.drop(columns=['y'])
y_train = df_train['y']

X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


In [95]:
X_test_full = df_test.drop(columns=['y'])
y_test = df_test['y']

X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


In [96]:
pipe.fit(X_train_full, y_train)

y_pred = pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))

0.5035971223021583


In [97]:
hyperparameter_space = { 
"model__n_estimators": [25,50,100,150,200], 
"model__criterion": ["gini", "entropy"], 
"model__class_weight": ["balanced","balanced_subsample"],
"model__min_samples_split": [0.01,0.1,0.25,0.5,0.75,1.0],
} 

In [98]:
# Initiate the Grid Search Class
clf = GridSearchCV(pipe, hyperparameter_space, 
                   scoring = 'f1', cv=5, 
                   n_jobs=-1, refit = True,
                   verbose=2)

# Run the Grid Search CV
clf.fit(X_train_full, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.01, model__n_estimators=50; total time=   0.8s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.01, model__n_estimators=200; total time=   3.1s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.25, model__n_estimators=25; total time=   0.2s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.25, model__n_estimators=50; total time=   0.3s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.25, model__n_estimators=100; total time=   0.7s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.25, model__n_estimators=200; total time=   1.3s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.75, model__n_estimators=50; t



[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.01, model__n_estimators=25; total time=   0.4s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.01, model__n_estimators=150; total time=   2.3s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.1, model__n_estimators=100; total time=   0.9s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.1, model__n_estimators=200; total time=   1.8s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.5, model__n_estimators=25; total time=   0.2s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.5, model__n_estimators=100; total time=   0.5s
[CV] END model__class_weight=balanced, model__criterion=gini, model__min_samples_split=0.5, model__n_estimators=150; total time=   0.6s
[CV] END model__class_weight=balanced, model__cr

In [99]:
clf.best_params_,clf.best_score_

({'model__class_weight': 'balanced_subsample',
  'model__criterion': 'gini',
  'model__min_samples_split': 0.01,
  'model__n_estimators': 100},
 np.float64(0.54949186142684))

In [100]:
clf.score(X_test_full,y_test)

0.5506072874493927

## Random Search