# Hyperparameter Tuning via Scikit-Learn

## Successive Halving

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# Statistical Distributions (SciPy)
import scipy
from scipy.stats import randint,truncnorm

# Scikit-Learn
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
# Successive Halving
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

### Loading Data

In [3]:
# Resolve the data directory from the HPTP_DATA_DIR environment variable so the
# notebook is not tied to a single machine. The original location is kept as
# the fallback, so behavior is unchanged when the variable is unset.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("HPTP_DATA_DIR", "/Users/joaquinromero/Desktop/HPTP/data"))

# The training file is semicolon-delimited.
df = pd.read_csv(DATA_DIR / "train.csv", sep=";")

In [4]:
# Encode the target as 0/1. The identity entries (1 -> 1, 0 -> 0) make this
# cell safe to re-run: with only the 'yes'/'no' keys, a second execution would
# map the already-encoded integers to NaN and silently destroy the target.
df['y'] = df['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

# Fail fast on unexpected labels instead of passing NaN targets downstream.
assert df['y'].notna().all(), "Unexpected values in target column 'y'"

### Train/Test Split

In [5]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split
# the same on every run.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

#### Placing Numerical Features

In [6]:
# Names of the numeric predictor columns (the target 'y' is dropped first).
numerical_feats = (
    df_train.drop(columns='y')
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)

#### Placing Categorical Features

In [7]:
# Names of the non-numeric predictor columns (the target 'y' is dropped first).
categorical_feats = (
    df_train.drop(columns='y')
    .select_dtypes(exclude=np.number)
    .columns
    .tolist()
)

### Pre-Processor

In [8]:
# Standardization Pre-processing for Numerical Features
# (StandardScaler rescales each feature to zero mean and unit variance)
numeric_preprocessor = StandardScaler()

# One-Hot-Encoding Pre-processing for Categorical Features
# (handle_unknown="ignore": categories not seen during fit are ignored at
# transform time instead of raising an error)
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [9]:
# Route each group of columns to its own transformer: numeric columns go to the
# scaler, categorical columns go to the one-hot encoder.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_preprocessor, numerical_feats),
        ("cat", categorical_preprocessor, categorical_feats),
    ]
)

### Pipeline

In [10]:
# Full modelling pipeline: pre-processing followed by a seeded random forest.
# The step names ("preprocessor", "model") are the prefixes used later when
# addressing hyperparameters, e.g. "model__n_estimators".
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(random_state=0)),
    ]
)

#### Placing All Features for Training Set

In [11]:
# Separate the training frame into target and predictors.
y_train = df_train['y']
X_train_full = df_train.drop(columns='y')

# Summarize column dtypes and non-null counts of the predictors.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [12]:
# Separate the held-out frame into target and predictors.
y_test = df_test['y']
X_test_full = df_test.drop(columns='y')

# Summarize column dtypes and non-null counts of the predictors.
X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculating F1-Score on Test Data without Hyperparameter Tuning

In [13]:
# Baseline: train the untuned pipeline on the training split.
pipe.fit(X_train_full, y_train)

# Predict on the held-out split and report the baseline F1-score.
y_pred = pipe.predict(X_test_full)
print(f1_score(y_true=y_test, y_pred=y_pred))

0.5035971223021583


#### Defining The Hyperparameter Space

In [14]:
# Search space for the random forest inside the pipeline. Keys follow the
# "<step name>__<parameter>" convention, so every entry targets the "model" step.
hyperparameter_space = dict(
    # Integer number of trees, drawn uniformly from 5..199 (upper bound exclusive).
    model__n_estimators=randint(5, 200),
    model__criterion=["gini", "entropy"],
    model__class_weight=["balanced", "balanced_subsample"],
    # truncnorm's a/b are expressed in units of `scale` around `loc`, so samples
    # fall in [0.005 + 0 * 0.01, 0.005 + 0.5 * 0.01] = [0.005, 0.01]; a float
    # min_samples_split is interpreted as a fraction of the samples.
    model__min_samples_split=truncnorm(a=0, b=0.5, loc=0.005, scale=0.01),
)

### Performing Successive Halving with Random Search

In [None]:
# Successive-halving random search over the pipeline's hyperparameter space.
clf = HalvingRandomSearchCV(
    estimator=pipe,
    param_distributions=hyperparameter_space,
    # With factor=3, roughly one third of the candidates survive each iteration.
    factor=3,
    aggressive_elimination=False,
    # Candidates are ranked by 5-fold cross-validated F1.
    cv=5,
    scoring='f1',
    # Refit the best candidate so `clf` can predict/score afterwards.
    refit=True,
    random_state=0,
    n_jobs=-1,
    verbose=2,
)

#### Running `HalvingRandomSearchCV`

In [None]:
# Run the halving search on the training split only; the test split stays
# untouched until the final evaluation.
clf.fit(X_train_full, y_train)

In [None]:
# Best hyperparameter combination found by the search and its cross-validated
# score (the search was configured with scoring='f1').
print(clf.best_params_, clf.best_score_)

#### Test Data Evaluation

In [None]:
# Score the refitted best estimator on the held-out test split. The search
# object scores with its configured `scoring` ('f1'), so this is comparable to
# the untuned baseline F1 computed earlier.
print(clf.score(X_test_full, y_test))

In [None]:
# Tabulate the cross-validation results. A candidate is identified by the
# string form of its parameter dict, so it can be used as a pivot column.
results = pd.DataFrame(clf.cv_results_)
results["params_str"] = results["params"].apply(str)
results = results.drop_duplicates(subset=["params_str", "iter"])

# One row per halving iteration, one column per candidate.
mean_scores = results.pivot(
    index="iter",
    columns="params_str",
    values="mean_test_score",
)

# Draw one line per candidate; a line ends at the iteration where the
# candidate was eliminated.
fig, ax = plt.subplots(figsize=(16, 16))
mean_scores.plot(ax=ax, legend=False, alpha=0.6)

# Tick labels describe each iteration's resource budget and candidate count.
labels = []
for i in range(clf.n_iterations_):
    labels.append(
        f"Iteration {i+1}\nn_samples={clf.n_resources_[i]}\nn_candidates={clf.n_candidates_[i]}"
    )

ax.set_xticks(range(clf.n_iterations_))
ax.set_xticklabels(labels, rotation=0, multialignment="left", size=16)
ax.set_title("F1-Score of Candidates over Iterations", size=20)
ax.set_ylabel("5-Folds Cross Validation F1-Score", fontsize=18)
ax.set_xlabel("")
plt.tight_layout()
plt.show()