# Hyperparameter Tuning via DEAP

## Genetic Algorithm

### Loading Libraries

In [3]:
# Standard Library
import math
import multiprocessing
import operator
import os
import random
import time
from pathlib import Path

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# SciPy (statistical distributions)
from scipy.stats import randint,truncnorm,uniform

# Scikit-Learn
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score

# DEAP
from deap import base
from deap import tools
from deap import creator

# NNI
# from nni.experiment import Experiment

In [4]:
# Apply seaborn's default theme to the matplotlib figures created in this notebook.
# set_theme() is the preferred interface; seaborn documents set() as an alias
# that may be removed in a future release.
sns.set_theme()

### Loading Data

In [5]:
# Directory that holds the input CSV. It defaults to the original author's
# location; set the HPTP_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("HPTP_DATA_DIR", "/Users/joaquinromero/Desktop/HPTP/data"))

# The file is semicolon-delimited.
df = pd.read_csv(DATA_DIR / "train.csv", sep=";")

In [6]:
# Encode the target column as integers: 'yes' -> 1, 'no' -> 0.
# The dtype guard makes the cell safe to re-run: pushing an already-encoded
# (numeric) column through the same dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

### Train/Test Split

In [7]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=0,
)

#### Placing Numerical Features

In [8]:
# Names of the numeric predictor columns (the target 'y' is dropped first).
numerical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(include=np.number)
    .columns
    .tolist()
)

#### Placing Categorical Features

In [9]:
# Names of the non-numeric predictor columns (the target 'y' is dropped first).
categorical_feats = (
    df_train
    .drop(columns='y')
    .select_dtypes(exclude=np.number)
    .columns
    .tolist()
)

### Preprocessor

In [10]:
# Categorical features: one-hot encode. handle_unknown="ignore" keeps transform
# from raising on categories that were not seen during fit.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# Numerical features: standardize with StandardScaler's default settings.
numeric_preprocessor = StandardScaler()

In [11]:
# Send each feature group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_preprocessor, numerical_feats),        # numeric columns
    ("cat", categorical_preprocessor, categorical_feats),  # categorical columns
])

### Pipeline

In [12]:
# Full modelling pipeline: preprocessing followed by a random forest.
# Only the forest's seed is set here; no other hyperparameter is overridden.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=0)),
])

#### Placing All Features for Training Set

In [13]:
# Separate the target from the predictors for the training partition.
y_train = df_train['y']
X_train_full = df_train.drop(columns='y')

# Summarise column dtypes and non-null counts of the training predictors.
X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [14]:
# Separate the target from the predictors for the test partition.
y_test = df_test['y']
X_test_full = df_test.drop(columns='y')

# Summarise column dtypes and non-null counts of the test predictors.
X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculate F1-Score on Test Data without Hyperparameter Tuning

In [15]:
# Baseline: train the untuned pipeline on the training partition, then
# predict the held-out rows (fit returns the fitted pipeline itself).
y_pred = pipe.fit(X_train_full, y_train).predict(X_test_full)

# Report the F1-score on the test partition.
print(f1_score(y_test, y_pred))

0.5035971223021583


### Hyperparameter Space

In [16]:
# Search space keyed by pipeline parameter name; each entry pairs a sampling
# type with its values.
# NOTE(review): the '_type'/'_value' layout looks like an NNI search space — confirm.
hyperparameter_space = {
    param: {'_type': kind, '_value': values}
    for param, kind, values in (
        ('model__n_estimators', 'randint', [5, 200]),
        ('model__criterion', 'choice', ['gini', 'entropy']),
        ('model__class_weight', 'choice', ["balanced", "balanced_subsample"]),
        ('model__min_samples_split', 'uniform', [0, 0.1]),
    )
}

In [17]:
# Same parameters as the labelled search space, but the two categorical
# choices are expressed as the integers 0 and 1 instead of strings.
# NOTE(review): presumably 0/1 index into the string options — confirm against the consumer.
hyperparameter_space_numeric = {
    param: {'_type': kind, '_value': values}
    for param, kind, values in (
        ('model__n_estimators', 'randint', [5, 200]),
        ('model__criterion', 'choice', [0, 1]),
        ('model__class_weight', 'choice', [0, 1]),
        ('model__min_samples_split', 'uniform', [0, 0.1]),
    )
}

In [18]:
# Reduced search space: the same entries as the labelled space, minus
# 'model__n_estimators'.
hyperparameter_space_advisor = {
    param: {'_type': kind, '_value': values}
    for param, kind, values in (
        ('model__criterion', 'choice', ['gini', 'entropy']),
        ('model__class_weight', 'choice', ["balanced", "balanced_subsample"]),
        ('model__min_samples_split', 'uniform', [0, 0.1]),
    )
}

### Grid Search

In [19]:
# Hyperparameter values applied to the pipeline via set_params. They are
# hard-coded here; the search that produced them is not shown in this notebook.
best_parameters = dict(
    model__n_estimators=27,
    model__criterion="entropy",
    model__class_weight="balanced_subsample",
    model__min_samples_split=0.05,
)

In [21]:
# Copy the untuned pipeline and apply the chosen hyperparameters to the copy,
# so `pipe` itself is left unchanged. `clone` is sklearn.base.clone and must be
# imported in the imports cell; without that import this cell raises NameError
# on a fresh kernel.
tuned_pipe = clone(pipe).set_params(**best_parameters)

# Fit the tuned pipeline on the training partition.
tuned_pipe.fit(X_train_full, y_train)

# Predict the held-out rows and report the F1-score.
# Note: this overwrites the baseline predictions stored in `y_pred`.
y_pred = tuned_pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))

In [None]:
### 

#### 