# Hyperparameter Tuning via Scikit-Learn

## Bayesian Optimization Gaussian Process

### Loading Libraries

In [12]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# SciPy (Statistical Distributions)
import scipy
from scipy.stats import randint,truncnorm

# Scikit-Optimize (skopt)
from skopt.space import *
from skopt import BayesSearchCV

# Scikit-Learn
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [13]:
from sklearn.pipeline import Pipeline as Sklearn_Pipeline

In [29]:
class NormalizeTransform:
    """Affine rescaling between the interval ``[low, high]`` and ``[0, 1]``."""

    def __init__(self, low, high):
        """Store the interval end points used by both directions of the mapping."""
        self.low, self.high = low, high

    def transform(self, x):
        """Rescale ``x`` so that ``low`` maps to 0 and ``high`` maps to 1.

        ``x`` is converted with ``np.array``; the result is a NumPy array (or
        NumPy scalar) of the same shape.
        """
        offset = np.array(x) - self.low
        width = self.high - self.low
        return offset / width

    def inverse_transform(self, x):
        """Undo :meth:`transform`: map 0 back to ``low`` and 1 back to ``high``."""
        width = self.high - self.low
        stretched = np.array(x) * width
        return self.low + stretched

In [30]:
class Real(Dimension):
    """Search space dimension that can take on any real value.

    Parameters
    ----------
    low : float
        Lower bound of the dimension.
    high : float
        Upper bound of the dimension; must be strictly greater than ``low``.
    prior : {"uniform", "log-uniform", "truncnorm"}, default="uniform"
        Distribution that points are sampled from under the "identity"
        transform.
    base : int, default=10
        Logarithm base used by the "log-uniform" prior.
    transform : {"identity", "normalize"} or None, default=None
        Transformation applied to the dimension; ``None`` means "identity".
    name : str or None, default=None
        Name of the dimension.
    dtype : str or type, default=float
        ``float``, ``np.float16``, ``np.float32``, ``np.float64`` or one of the
        strings 'float', 'float16', 'float32', 'float64'.
    **kwargs
        Optional ``loc`` and ``scale`` of the "truncnorm" prior. They default
        to the midpoint and to half the width of ``[low, high]``.

    Raises
    ------
    ValueError
        If ``high <= low``, if ``prior``, ``dtype`` or ``transform`` is not one
        of the supported values, if a "log-uniform" prior is combined with a
        non-positive ``low``, or if the "truncnorm" ``scale`` is not positive.
    """

    _PRIORS = ("uniform", "log-uniform", "truncnorm")
    _TRANSFORMS = ("normalize", "identity")

    def __init__(self, low, high, prior="uniform", base=10, transform=None,
                 name=None, dtype=float, **kwargs):
        if high <= low:
            raise ValueError(f"the lower bound {low} has to be less than the upper bound {high}")
        if prior not in self._PRIORS:
            raise ValueError(
                f"prior should be 'uniform', 'log-uniform' or 'truncnorm'; got {prior}")
        if prior == "log-uniform" and low <= 0:
            # log10 of a non-positive bound would silently yield -inf / nan bounds.
            raise ValueError(
                f"the 'log-uniform' prior requires a positive lower bound; got {low}")

        self.low = low
        self.high = high
        self.prior = prior
        self.base = base
        self.log_base = np.log10(base)
        self.name = name
        self.dtype = dtype
        self._rvs = None
        self.transformer = None
        self.transform_ = None  # assigned by set_transformer below
        self.kwargs = kwargs

        # Validate dtype
        if isinstance(self.dtype, str) and self.dtype not in ['float', 'float16', 'float32', 'float64']:
            raise ValueError(f"dtype must be 'float', 'float16', 'float32' or 'float64'; got {self.dtype}")
        elif isinstance(self.dtype, type) and self.dtype not in [float, np.float16, np.float32, np.float64]:
            raise ValueError(f"dtype must be float, np.float16/32/64; got {self.dtype}")

        # Resolve the default *before* it is stored, so that ``transform_`` is
        # never None and the default constructor call does not fail validation.
        self.set_transformer("identity" if transform is None else transform)

    def set_transformer(self, transform="identity"):
        """Select the transformation and the matching sampling distribution.

        Parameters
        ----------
        transform : {"identity", "normalize"}, default="identity"
            - "identity": points stay in the original space (log space for the
              "log-uniform" prior) and are sampled from the prior.
            - "normalize": points are mapped to ``[0, 1]`` and sampled
              uniformly in that normalized space.

        Raises
        ------
        ValueError
            If ``transform`` is not supported, or if the "truncnorm" ``scale``
            is not positive.
        """
        # Imported locally on purpose: at module level this notebook rebinds
        # ``Pipeline`` to sklearn's Pipeline, which cannot chain skopt transformers.
        from skopt.space.transformers import Identity, LogN, Normalize
        from skopt.space.transformers import Pipeline as TransformerPipeline

        if transform not in self._TRANSFORMS:
            raise ValueError(f"transform should be 'normalize' or 'identity'; got {transform}")
        self.transform_ = transform

        if transform == "normalize":
            # NOTE(review): in normalized space every prior samples uniformly on
            # [0, 1], so "truncnorm" is only honored by the identity transform —
            # confirm this is intended.
            self._rvs = _uniform_inclusive(0., 1.)
            if self.prior == "uniform":
                self.transformer = TransformerPipeline(
                    [Identity(), Normalize(self.low, self.high)])
            elif self.prior == "log-uniform":
                self.transformer = TransformerPipeline([
                    LogN(self.base),
                    Normalize(np.log10(self.low) / self.log_base,
                              np.log10(self.high) / self.log_base)
                ])
            else:  # truncnorm
                # NOTE(review): the lower edge is shifted by 1e-6, presumably to
                # keep values strictly above ``low`` — confirm.
                self.transformer = TransformerPipeline(
                    [Identity(), Normalize(self.low + 1e-6, self.high)])
        else:
            if self.prior == "uniform":
                self._rvs = _uniform_inclusive(self.low, self.high - self.low)
                self.transformer = Identity()
            elif self.prior == "log-uniform":
                log_low = np.log10(self.low) / self.log_base
                log_high = np.log10(self.high) / self.log_base
                self._rvs = _uniform_inclusive(log_low, log_high - log_low)
                self.transformer = LogN(self.base)
            else:  # truncnorm
                loc = self.kwargs.get("loc", (self.low + self.high) / 2)
                scale = self.kwargs.get("scale", (self.high - self.low) / 2)
                if scale <= 0:
                    raise ValueError(f"scale of the 'truncnorm' prior must be positive; got {scale}")
                # scipy's ``a``/``b`` are clip points of the *standard* normal, so
                # the real bounds must be standardized with loc and scale.
                self._rvs = truncnorm(
                    a=(self.low - loc) / scale,
                    b=(self.high - loc) / scale,
                    loc=loc,
                    scale=scale
                )
                self.transformer = Identity()

    def __eq__(self, other):
        """Two dimensions are equal when type, bounds, prior and transform match."""
        return (type(self) is type(other) and
                np.allclose([self.low], [other.low]) and
                np.allclose([self.high], [other.high]) and
                self.prior == other.prior and
                self.transform_ == other.transform_)

    def __repr__(self):
        return f"Real(low={self.low}, high={self.high}, prior='{self.prior}', transform='{self.transform_}')"

    def inverse_transform(self, Xt):
        """Map transformed points back to the original space.

        The result is clipped to ``[low, high]`` and cast to ``dtype``. It is
        returned as plain Python value(s) when ``dtype`` is ``float``/'float',
        otherwise as a NumPy array/scalar.
        """
        inv_transform = super().inverse_transform(Xt)
        if isinstance(inv_transform, list):
            inv_transform = np.array(inv_transform)
        inv_transform = np.clip(inv_transform, self.low, self.high).astype(self.dtype)
        return inv_transform.tolist() if self.dtype == float or self.dtype == 'float' else inv_transform

    @property
    def bounds(self):
        """``(low, high)`` in the original space."""
        return (self.low, self.high)

    @property
    def is_constant(self):
        """Whether the dimension collapses to a single value."""
        return self.low == self.high

    def __contains__(self, point):
        """Return True when every value of ``point`` lies within ``[low, high]``.

        ``point`` may be a scalar or any array-like; an empty array-like is
        never contained.
        """
        values = np.asarray(point)
        if values.size == 0:
            return False
        # Element-wise comparison + np.all: a chained ``low <= arr <= high`` is
        # ambiguous (and raises) for arrays with more than one element.
        return bool(np.all((self.low <= values) & (values <= self.high)))

    @property
    def transformed_bounds(self):
        """Bounds of the dimension in the transformed space."""
        if self.transform_ == "normalize":
            return (0.0, 1.0)
        else:
            if self.prior in ["uniform", "truncnorm"]:
                return (self.low, self.high)
            else:  # log-uniform
                # Same base conversion as the sampler and LogN(self.base) use;
                # for the default base of 10 the divisor is exactly 1.
                return (np.log10(self.low) / self.log_base,
                        np.log10(self.high) / self.log_base)

    def distance(self, a, b):
        """Absolute difference between two points of the dimension.

        Raises
        ------
        RuntimeError
            If ``a`` or ``b`` lies outside ``[low, high]``.
        """
        if not (a in self and b in self):
            raise RuntimeError(f"Can only compute distance for values within the space, not {a} and {b}.")
        return abs(a - b)

# Aux Function
def _uniform_inclusive(loc=0.0, scale=1.0):
    return uniform(loc=loc, scale=np.nextafter(scale, scale + 1.0))

### Loading Data

In [15]:
# Load the training CSV; its fields are separated by semicolons.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer="/Users/joaquinromero/Desktop/HPTP/data/train.csv",
    sep=";",
)

In [16]:
# Encode the target column as integers: 'yes' -> 1, 'no' -> 0.
# The dtype guard keeps this cell safe to re-run: mapping the already-encoded
# 0/1 column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

### Train/Test Split

In [17]:
# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

#### Placing Numerical Features

In [18]:
# Names of all numeric predictor columns (the target 'y' is left out).
numerical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)

#### Placing Categorical Features

In [19]:
# Names of all non-numeric predictor columns (the target 'y' is left out).
categorical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(exclude=np.number)
    .columns
    .tolist()
)

### Pre-Processor

In [20]:
# Numerical features are standardized.
numeric_preprocessor = StandardScaler()

# Categorical features are one-hot encoded; handle_unknown="ignore" keeps
# categories that were not seen during fit from raising at transform time.
categorical_preprocessor = OneHotEncoder(
    handle_unknown="ignore",
)

In [21]:
# Route each group of columns to its own preprocessing step.
preprocessor = ColumnTransformer([
    ("num", numeric_preprocessor, numerical_feats),
    ("cat", categorical_preprocessor, categorical_feats),
])

### Pipeline

In [22]:
# Full modelling pipeline: preprocessing followed by a seeded random forest.
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=0)),
])

#### Placing All Features for Training Set

In [23]:
# Split the training frame into predictors and target, then summarise the predictors.
X_train_full, y_train = df_train.drop(columns='y'), df_train['y']

X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [24]:
# Split the test frame into predictors and target, then summarise the predictors.
X_test_full, y_test = df_test.drop(columns='y'), df_test['y']

X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculating F1-Score on Test Data without Hyperparameter Tuning

In [25]:
# Baseline: train the untuned pipeline on the training split ...
pipe.fit(X_train_full, y_train)

# ... then predict the held-out test split and report the F1 score.
y_pred = pipe.predict(X_test_full)
print(f1_score(y_true=y_test, y_pred=y_pred))

0.5035971223021583


#### Defining The Hyperparameter Space

In [26]:
# Search space for the random-forest step of the pipeline; the "model__"
# prefix routes each parameter to the pipeline step named "model".
# NOTE(review): scikit-learn rejects a float min_samples_split of exactly 0.0 —
# confirm the lower bound of that dimension can never be proposed.
hyperparameter_space = {
    "model__n_estimators": Integer(5, 200),
    "model__criterion": Categorical(["gini", "entropy"]),
    "model__class_weight": Categorical(["balanced", "balanced_subsample"]),
    "model__min_samples_split": Real(
        low=0,
        high=0.5,
        prior="truncnorm",
        loc=0.005,
        scale=0.01,
    ),
}

### BOGP

In [27]:
# Bayesian search with a Gaussian-process surrogate and the Expected
# Improvement acquisition function, scored by 5-fold cross-validated F1.
clf = BayesSearchCV(
    estimator=pipe,
    search_spaces=hyperparameter_space,
    n_iter=50,
    optimizer_kwargs=dict(
        base_estimator="GP",
        n_initial_points=10,
        initial_point_generator="random",
        acq_func="EI",
        acq_optimizer="auto",
        n_jobs=-1,
        random_state=0,
        acq_func_kwargs=dict(xi=0.01),
    ),
    random_state=0,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=2,
)

#### Running The `BOGP CV`

In [32]:
# Run the search on the training split (50 iterations, 5-fold CV each).
clf.fit(X=X_train_full, y=y_train)

In [33]:
# Best hyperparameters found, together with their cross-validated F1 score.
(clf.best_params_, clf.best_score_)

In [34]:
# F1 score of the refitted best pipeline on the held-out test split.
clf.score(X=X_test_full, y=y_test)