# Hyperparameter Tuning via Scikit-Learn

## Coarse-to-Fine Search

### Loading Libraries

In [4]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# SciPy (statistical distributions)
import scipy
from scipy.stats import randint,truncnorm

# Scikit-Learn
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

#### Technical Requirements: `CoarseToFineSearchCV`

In [5]:
class CoarseToFineSearchCV:
    """Coarse-to-fine hyperparameter search driven by repeated random search.

    Every epoch samples ``random_iters`` candidates from the current search
    space, scores each one with ``cross_val_score``, keeps the candidates whose
    score is at or above the ``top_n_percentile`` percentile, and narrows the
    search space to the min/max (or the observed categories) of those
    candidates before the next epoch.

    Parameters
    ----------
    estimator : estimator object
        Cloned and configured through ``set_params`` for every candidate. When
        ``refit`` is true this very object (not a clone) receives the best
        parameters and is fitted on the full data.
    param_distributions : dict
        Maps parameter names to either frozen scipy distributions or indexable
        sequences (list/tuple) of candidate values.
    random_iters : int
        Number of candidates sampled per epoch.
    top_n_percentile : float
        Percentile (0-100) of the retained scores used as the cut-off for the
        "promising" subspace.
    continuous_hyperparams : list of str, optional
        Names of distribution-typed parameters that are re-created as
        ``truncnorm``; every other distribution-typed parameter is re-created
        as ``randint``. Defaults to an empty list.
    worse_score : float, default=0
        Initial value of the best score; a candidate must score at least this
        much to be recorded as the best one.
    n_iter : int, default=10
        Number of coarse-to-fine epochs.
    scoring, n_jobs, cv
        Forwarded unchanged to ``cross_val_score``.
    refit : bool, default=True
        Whether to fit ``estimator`` with the best parameters at the end of
        ``fit``.
    random_state : int, default=0
        Forwarded to ``ParameterSampler`` in every epoch.
    verbose : int, default=0
        ``>= 1`` shows the per-epoch result table, ``>= 2`` also prints the
        search space. The table is shown with ``display``, which is expected to
        be available as a built-in of the IPython/Jupyter session.

    Attributes
    ----------
    best_params_ : dict
        Best parameter combination found (empty before ``fit``).
    best_score_ : float or None
        Mean cross-validation score of ``best_params_`` (``None`` before ``fit``).
    """

    def __init__(self,
                 estimator,
                 param_distributions,
                 random_iters,
                 top_n_percentile,
                 continuous_hyperparams=None,
                 worse_score = 0,
                 n_iter=10,
                 scoring=None,
                 n_jobs=None,
                 refit=True,
                 cv=None,
                 random_state=0,
                 verbose=0
                ):

        self.estimator = estimator
        self.param_distributions = param_distributions
        self.random_iters = random_iters
        self.top_n_percentile = top_n_percentile
        # A fresh list per instance: a literal ``[]`` default would be shared
        # by every instance created without this argument.
        self.continuous_hyperparams = [] if continuous_hyperparams is None else continuous_hyperparams
        self.worse_score = worse_score
        self.n_iter = n_iter
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.cv = cv
        self.random_state = random_state
        self.verbose = verbose

        self.best_params_ = {}
        self.best_score_ = None


    def fit(self,X,y):
        """Run the coarse-to-fine search on ``X``/``y``.

        Populates ``best_params_`` and ``best_score_`` and, when ``refit`` is
        true, fits ``self.estimator`` with the best parameters.

        Raises
        ------
        ValueError
            If ``n_iter`` or ``random_iters`` is smaller than 1, if the sampler
            yields no candidate, or if no candidate ever reaches
            ``worse_score``.
        """
        if self.n_iter < 1 or self.random_iters < 1:
            raise ValueError("n_iter and random_iters must both be >= 1.")

        new_param_distributions = self.param_distributions.copy()
        best_params_dict = {'score':self.worse_score,'params':[]}
        param_names = []

        for epoch in range(self.n_iter):
            if self.verbose >= 2:
                print("Hyperparameter space")
                print(new_param_distributions)

            # List of sampled hyperparameter combinations used for this epoch's random search.
            # NOTE(review): the same random_state is passed in every epoch.
            param_list = list(ParameterSampler(new_param_distributions,
                                               n_iter=self.random_iters,
                                               random_state=self.random_state))
            if not param_list:
                raise ValueError("The hyperparameter space produced no candidate to evaluate.")

            # Every sampled dict is read positionally below, so take the key order once.
            param_names = list(param_list[0].keys())

            # Searching the best parameters with random search
            rs_results_dict = {}
            for strategy_params in param_list:
                estimator = clone(self.estimator).set_params(**strategy_params)

                results = np.mean(cross_val_score(estimator,X, y,
                                                  cv=self.cv, scoring=self.scoring,
                                                  n_jobs=self.n_jobs
                                                  )
                                 )

                rs_results_dict[tuple(strategy_params.values())] = {'score':results}

                if results >= best_params_dict['score']:
                    best_params_dict['score'] = results
                    best_params_dict['params'] = list(strategy_params.values())

            # Save the results in a dataframe, sort by score and keep only the
            # top (n_iter - epoch) rows, so fewer rows survive in later epochs
            df_rs_results = pd.DataFrame(rs_results_dict).T.reset_index()
            df_rs_results.columns = param_names + ['score']
            df_rs_results = df_rs_results.sort_values(['score'],ascending=False).head(self.n_iter-epoch)

            # If the best score of this epoch is worse than the overall best score,
            # append the overall best hyperparameter combination to this epoch's dataframe
            if df_rs_results['score'].iloc[0] < best_params_dict['score'] and best_params_dict['params']:
                new_row_dict = {'score': best_params_dict['score']}
                new_row_dict.update(zip(param_names, best_params_dict['params']))

                # Built from a list of records so each column keeps its own dtype
                # (the score column stays numeric after the concat).
                df_rs_results = pd.concat([df_rs_results,pd.DataFrame([new_row_dict])]).reset_index(drop=True)
                df_rs_results = df_rs_results.sort_values(['score'],ascending=False).head(self.n_iter-epoch)

            if self.verbose >= 1:
                display(df_rs_results)
                print(df_rs_results.head(1).T.to_dict())

            # Column-wise min and max over the promising candidates
            # (rows scoring at or above the requested percentile)
            percentile_threshold = df_rs_results['score'].quantile(self.top_n_percentile/100)
            promising_subspace = df_rs_results[df_rs_results['score']>=percentile_threshold]
            df_rs_results_min = promising_subspace.min(axis=0)
            df_rs_results_max = promising_subspace.max(axis=0)

            # Generate the new hyperparameter space from the min/max of the promising candidates
            for key in new_param_distributions:
                if isinstance(new_param_distributions[key],scipy.stats.distributions.rv_frozen):
                    # Currently only support truncnorm and randint distribution
                    # You can add your own distribution here
                    if key in self.continuous_hyperparams:
                        # NOTE(review): scipy documents truncnorm's a/b as bounds in
                        # standard-normal units ((clip - loc) / scale); here the raw
                        # min/max are passed as a/b. Kept as-is - confirm this is intended.
                        new_param_distributions[key] = truncnorm(a=df_rs_results_min[key],b=df_rs_results_max[key]+1e-6,
                                                                 loc=(0.8*df_rs_results_min[key]+0.2*df_rs_results_max[key]),
                                                                 scale=(0.8*df_rs_results_min[key]+0.2*df_rs_results_max[key])*2)
                    else:
                        new_param_distributions[key] = randint(int(df_rs_results_min[key]), int(df_rs_results_max[key])+1)
                elif isinstance(new_param_distributions[key][0],str) or isinstance(new_param_distributions[key][0],bool):
                    # Categorical: keep only the categories seen among the promising candidates
                    new_param_distributions[key] = tuple(promising_subspace[key].unique())
                elif isinstance(new_param_distributions[key][0],int):
                    new_param_distributions[key] = [i for i in range(int(df_rs_results_min[key]), int(df_rs_results_max[key])+1)]
                elif isinstance(new_param_distributions[key][0],float):
                    # Same number of grid points as the list the user originally supplied
                    new_param_distributions[key] = list(np.linspace(df_rs_results_min[key], df_rs_results_max[key],
                                                                    len(self.param_distributions[key])))
                else:
                    # Unsupported value type: fall back to the original space for this key
                    new_param_distributions[key] = self.param_distributions[key]

            if self.verbose >= 1:
                print("="*100)

        if not best_params_dict['params']:
            raise ValueError(
                "No candidate reached worse_score; pass a lower worse_score "
                "(e.g. when the scoring function returns negative values)."
            )

        self.best_params_ = dict(zip(param_names, best_params_dict['params']))
        self.best_score_ = best_params_dict['score']

        if self.refit:
            self.estimator = self.estimator.set_params(**self.best_params_)
            self.estimator.fit(X,y)

    def predict(self, X):
        """Predict with the refitted estimator.

        Returns the estimator's predictions when ``refit`` is true; otherwise
        prints a message and returns ``None``.
        """
        if self.refit:
            return self.estimator.predict(X)
        else:
            print("Estimator is not refitted.")

### Loading Data

In [6]:
# Semicolon-delimited training data.
# NOTE: absolute, machine-specific location - point this at your local copy of train.csv.
train_csv_path = "/Users/joaquinromero/Desktop/HPTP/data/train.csv"
df = pd.read_csv(train_csv_path, sep=";")

In [7]:
# Encode the target as 1/0. The identity entries (1 -> 1, 0 -> 0) keep this cell
# idempotent: re-running it on the already-encoded column no longer yields NaN.
df['y'] = df['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

### Train/Test Split

In [9]:
# Hold out 10% of the rows as the test set; the fixed random_state keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

#### Placing Numerical Features

In [12]:
# Names of the numeric (np.number) predictor columns; the target 'y' is dropped first.
numerical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)

#### Placing Categorical Features

In [13]:
# Names of the non-numeric predictor columns; the target 'y' is dropped first.
categorical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(exclude=np.number)
    .columns
    .tolist()
)

### Pre-Processor

In [14]:
# Normalization Pre-processing for Numerical Features
numeric_preprocessor = StandardScaler()

# One-Hot-Encoding Pre-processing for Categorical Features
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [15]:
# Route each feature group to its own transformer: (name, transformer, columns).
column_transformers = [
    ("num", numeric_preprocessor, numerical_feats),
    ("cat", categorical_preprocessor, categorical_feats),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

### Pipeline

In [16]:
# Pre-processing followed by a random forest; the step names ("preprocessor", "model")
# are the prefixes used when addressing step parameters (e.g. "model__n_estimators").
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=0)),
]
pipe = Pipeline(steps=pipeline_steps)

#### Placing All Features for Training Set

In [17]:
# Separate the training split into target and predictors.
y_train = df_train['y']
X_train_full = df_train.drop(columns=['y'])

# Column dtypes and non-null counts of the training predictors
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [18]:
# Separate the test split into target and predictors.
y_test = df_test['y']
X_test_full = df_test.drop(columns=['y'])

# Column dtypes and non-null counts of the test predictors
X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculating F1-Score on Test Data without Hyperparameter Tuning

In [19]:
# Fitting The Pipeline on Train Data 
pipe.fit(X_train_full,y_train)

# Evaluating on the Test Data 
y_pred = pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))

0.5035971223021583


#### Defining The Hyperparameter Space

In [20]:
# Search space for the "model" step of the pipeline (keys follow the <step>__<param> form).
# Values are either frozen scipy distributions or lists of candidate values.
hyperparameter_space = { 
# Frozen scipy randint; per the scipy docs randint(low, high) draws integers in [low, high).
"model__n_estimators": randint(5, 200), 
"model__criterion": ["gini", "entropy"],
"model__class_weight": ["balanced","balanced_subsample"],
# Frozen scipy truncnorm; per the scipy docs a/b are bounds in standard-normal units,
# so samples lie in [loc + a*scale, loc + b*scale] = [0.005, 0.01].
"model__min_samples_split": truncnorm(a=0,b=0.5,loc=0.005, scale=0.01),
} 

### Performing the Coarse-to-Fine Search