# Hyperparameter Tuning via DEAP

## Genetic Algorithm

### Loading Libraries

In [2]:
# Numerical Computing
import math
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# StatsModel
from scipy.stats import randint,truncnorm,uniform

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score

# OS
import time
import random
import operator

# DEAP
from deap import base
from deap import tools
from deap import creator

#
import multiprocessing

In [3]:
sns.set()

### Loading Data

In [4]:
df = pd.read_csv("/Users/joaquinromero/Desktop/HPTP/data/train.csv", sep=";")

In [5]:
df['y'] = df['y'].map({'yes':1,'no':0})

### Train/Test Split

In [6]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=0) 

#### Placing Numerical Features

In [7]:
numerical_feats = list(df_train.drop(columns='y').select_dtypes(include=np.number).columns)

#### Placing Categorical Features

In [8]:
categorical_feats = list(df_train.drop(columns='y').select_dtypes(exclude=np.number).columns)

### Preprocessor

In [9]:
# Normalization Pre-processing for Numerical Features
numeric_preprocessor = StandardScaler()

# One-Hot-Encoding Pre-processing for Categorical Features
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocessor, numerical_feats),
        ("cat", categorical_preprocessor, categorical_feats),
    ]
)

### Pipeline

In [11]:
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("model", RandomForestClassifier(random_state=0))]
)

#### Placing All Features for Training Set

In [12]:
X_train_full = df_train.drop(columns=['y'])
y_train = df_train['y']

X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [13]:
X_test_full = df_test.drop(columns=['y'])
y_test = df_test['y']

X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculate F1-Score on Test Data without Hyperparameter Tuning

In [14]:
# Fit the pipeline on train data 
pipe.fit(X_train_full,y_train)

# Evaluate on the test data 
y_pred = pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))

0.5035971223021583


### Hyperparameter Space: # GA Parameters

In [15]:
NPOP = 50
NGEN = 15
CXPB = 0.5
MUTPB = 0.2

In [16]:
# Fixing The Seed for Reproducibility
# feel free to change it or comment out the line
random.seed(1)

# our fitness score is supposed to be maximised and there is only 1 objective
creator.create("FitnessMax", base.Fitness, weights=(1.0,))

# our individual is a list of genes, with the fitness score the higher the better
creator.create("Individual", list, fitness=creator.FitnessMax)

#### Intialize toolbox

In [17]:
toolbox = base.Toolbox()

In [18]:
# Hyperparameters Definition
PARAM_NAMES = ["model__n_estimators","model__criterion",
               "model__class_weight","model__min_samples_split"]

# definition of an individual & a population
toolbox.register("model__n_estimators", randint.rvs, 5, 200)
toolbox.register("model__criterion", random.choice, ["gini", "entropy"])
toolbox.register("model__class_weight", random.choice, ["balanced","balanced_subsample"])
toolbox.register("model__min_samples_split", truncnorm.rvs, 0, 0.5, 0.005, 0.01)

# Define genes of individuals
toolbox.register(
    "individual",
    tools.initCycle,
    creator.Individual,
    (
        toolbox.model__n_estimators,
        toolbox.model__criterion,
        toolbox.model__class_weight,
        toolbox.model__min_samples_split,
    ),
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [19]:
def mutPolynomialBoundedMix(individual, eta, low, up, is_int, indpb, discrete_params):
    
    for i in range(len(individual)):
        if discrete_params[i]:
            if random.random() < indpb:
                individual[i] = random.choice(discrete_params[i])
        else:
            individual[i] = tools.mutPolynomialBounded([individual[i]], 
                                                        eta[i], low[i], up[i], indpb)[0][0]
        
        if is_int[i]:
            individual[i] = int(individual[i])

    return individual,

In [20]:
# selection strategy
toolbox.register("select", tools.selTournament, tournsize=3)
# crossover strategy
toolbox.register("mate", tools.cxUniform, indpb=CXPB)
# mutation strategy
toolbox.register("mutate", mutPolynomialBoundedMix, 
                 eta = [0.1,None,None,0.1], 
                 low = [5,None,None,0], 
                 up = [200,None,None,1],
                 is_int = [True,False,False,False],
                 indpb=MUTPB,
                 discrete_params=[[],["gini", "entropy"],["balanced","balanced_subsample"],[]]
                )

In [21]:
def evaluate(individual):
    # convert list of parameter values into dictionary of kwargs
    strategy_params = {k: v for k, v in zip(PARAM_NAMES, individual)}
    
    if strategy_params['model__min_samples_split'] > 1 or strategy_params['model__min_samples_split'] <= 0:
        return [-np.inf]
    
    tuned_pipe = clone(pipe).set_params(**strategy_params)

    return [np.mean(cross_val_score(tuned_pipe,X_train_full, y_train, 
                                  cv=5, scoring='f1',
                                  # n_jobs=-1,
                                  )
                     )]

# fitness function
toolbox.register("evaluate", evaluate)

#### Using Parallel Processing

In [22]:
pool = multiprocessing.Pool(16)
toolbox.register("map", pool.map)

In [None]:
mean = np.ndarray(NGEN)
best = np.ndarray(NGEN)
hall_of_fame = tools.HallOfFame(maxsize=3)

t = time.perf_counter()
pop = toolbox.population(n=NPOP)
for g in range(NGEN):
    # Select the next generation individuals
    offspring = toolbox.select(pop, len(pop))
    # Clone the selected individuals
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover on the offspring
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CXPB:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    # Apply mutation on the offspring
    for mutant in offspring:
        if random.random() < MUTPB:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    # The population is entirely replaced by the offspring
    pop[:] = offspring
    hall_of_fame.update(pop)
    print(
        "HALL OF FAME:\n"
        + "\n".join(
            [
                f"    {_}: {ind}, Fitness: {ind.fitness.values[0]}"
                for _, ind in enumerate(hall_of_fame)
            ]
        )
    )

    fitnesses = [
        ind.fitness.values[0] for ind in pop if not np.isinf(ind.fitness.values[0])
    ]
    mean[g] = np.mean(fitnesses)
    best[g] = np.max(fitnesses)

Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-7:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-8:
Process SpawnPoolWorker-9:
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-11:
Process SpawnPoolWorker-12:
Process SpawnPoolWorker-13:
Process SpawnPoolWorker-14:
Process SpawnPoolWorker-15:
Process SpawnPoolWorker-16:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get at

In [None]:
params = {}
for idx_hof, param_name in enumerate(PARAM_NAMES):
    params[param_name] = hall_of_fame[0][idx_hof]
    
params

In [None]:
fig, ax = plt.subplots(sharex=True, figsize=(8, 6))

sns.lineplot(x=range(NGEN), y=mean, ax=ax, label="Average Fitness Score")
sns.lineplot(x=range(NGEN), y=best, ax=ax, label="Best Fitness Score")
ax.set_title("Fitness Score",size=20)
ax.set_xticks(range(NGEN))
ax.set_xlabel("Iteration")
plt.tight_layout()
plt.show()

In [None]:
tuned_pipe = clone(pipe).set_params(**params)

# Fit the pipeline on train data 
tuned_pipe.fit(X_train_full,y_train)

# Evaluate on the test data 
y_pred = tuned_pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))