# Hyperparameter Tuning via DEAP

## Particle Swarm Optimization

### Loading Libraries

In [5]:
# Numerical Computing
import math
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# StatsModel
from scipy.stats import randint,truncnorm,uniform

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score

# OS
import time
import random
import operator

# DEAP
from deap import base
from deap import tools
from deap import creator

#
import multiprocessing

In [6]:
sns.set()

### Loading Data

In [7]:
df = pd.read_csv("/Users/joaquinromero/Desktop/HPTP/data/train.csv", sep=";")

In [8]:
df['y'] = df['y'].map({'yes':1,'no':0})

### Train/Test Split

In [9]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=0) 

#### Placing Numerical Features

In [10]:
numerical_feats = list(df_train.drop(columns='y').select_dtypes(include=np.number).columns)

#### Placing Categorical Features

In [11]:
categorical_feats = list(df_train.drop(columns='y').select_dtypes(exclude=np.number).columns)

### Preprocessor

In [12]:
# Normalization Pre-processing for Numerical Features
numeric_preprocessor = StandardScaler()

# One-Hot-Encoding Pre-processing for Categorical Features
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocessor, numerical_feats),
        ("cat", categorical_preprocessor, categorical_feats),
    ]
)

### Pipeline

In [14]:
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("model", RandomForestClassifier(random_state=0))]
)

#### Placing All Features for Training Set

In [15]:
X_train_full = df_train.drop(columns=['y'])
y_train = df_train['y']

X_train_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40689 entries, 17974 to 2732
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40689 non-null  int64 
 1   job        40689 non-null  object
 2   marital    40689 non-null  object
 3   education  40689 non-null  object
 4   default    40689 non-null  object
 5   balance    40689 non-null  int64 
 6   housing    40689 non-null  object
 7   loan       40689 non-null  object
 8   contact    40689 non-null  object
 9   day        40689 non-null  int64 
 10  month      40689 non-null  object
 11  duration   40689 non-null  int64 
 12  campaign   40689 non-null  int64 
 13  pdays      40689 non-null  int64 
 14  previous   40689 non-null  int64 
 15  poutcome   40689 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.3+ MB


#### Placing All Features for Test Set

In [16]:
X_test_full = df_test.drop(columns=['y'])
y_test = df_test['y']

X_test_full.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4522 entries, 14001 to 25978
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4522 non-null   int64 
 1   job        4522 non-null   object
 2   marital    4522 non-null   object
 3   education  4522 non-null   object
 4   default    4522 non-null   object
 5   balance    4522 non-null   int64 
 6   housing    4522 non-null   object
 7   loan       4522 non-null   object
 8   contact    4522 non-null   object
 9   day        4522 non-null   int64 
 10  month      4522 non-null   object
 11  duration   4522 non-null   int64 
 12  campaign   4522 non-null   int64 
 13  pdays      4522 non-null   int64 
 14  previous   4522 non-null   int64 
 15  poutcome   4522 non-null   object
dtypes: int64(7), object(9)
memory usage: 600.6+ KB


#### Calculate F1-Score on Test Data without Hyperparameter Tuning

In [17]:
# Fit the pipeline on train data 
pipe.fit(X_train_full,y_train)

# Evaluate on the test data 
y_pred = pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))

0.5035971223021583


### Hyperparameter Space: PSO Parameters

In [18]:
N = 50
w = 0.5
c1 = 0.3
c2 = 0.5

num_trials = 15

In [19]:
random.seed(1)

# our fitness score is supposed to be maximised and there is only 1 objective
creator.create("FitnessMax", base.Fitness, weights=(1.0,))

# our individual is a list of genes, with the fitness score the higher the better
creator.create("Particle", list, fitness=creator.FitnessMax,
               speed=list, smin=list, smax=list, best=None)

#### Intialize toolbox

In [20]:
toolbox = base.Toolbox()

In [21]:
# Hyperparameters Definition
PARAM_NAMES = ["model__n_estimators","model__criterion",
               "model__class_weight","model__min_samples_split"]

# definition of an individual & a population
toolbox.register("model__n_estimators", randint.rvs, 5, 200)
toolbox.register("model__criterion", random.choice, [0,1])
toolbox.register("model__class_weight", random.choice, [0,1])
toolbox.register("model__min_samples_split", truncnorm.rvs, 0, 0.5, 0.005, 0.01)

In [22]:
def generate(speed_bound):
    part = tools.initCycle(creator.Particle,
                           [toolbox.model__n_estimators,
                            toolbox.model__criterion,
                            toolbox.model__class_weight,
                            toolbox.model__min_samples_split,
                           ]
                          )
    part.speed = [random.uniform(speed_bound[i]['smin'], speed_bound[i]['smax']) for i in range(len(part))]
    part.smin = [speed_bound[i]['smin'] for i in range(len(part))]
    part.smax = [speed_bound[i]['smax'] for i in range(len(part))]
    return part


# Define genes of particles
toolbox.register("particle", generate, 
                 speed_bound=[{'smin': -2.5,'smax': 2.5},
                              {'smin': -1,'smax': 1},
                              {'smin': -1,'smax': 1},
                              {'smin': -0.001,'smax': 0.001}])
toolbox.register("population", tools.initRepeat, list, toolbox.particle)

In [23]:
def updateParticle(part, best, c1, c2, w, is_int):
    w = [w for _ in range(len(part))]
    u1 = (random.uniform(0, 1)*c1 for _ in range(len(part)))
    u2 = (random.uniform(0, 1)*c2 for _ in range(len(part)))
    v_u1 = map(operator.mul, u1, map(operator.sub, part.best, part))
    v_u2 = map(operator.mul, u2, map(operator.sub, best, part))
    part.speed = list(map(operator.add, map(operator.mul, w, part.speed), map(operator.add, v_u1, v_u2)))
    for i, speed in enumerate(part.speed):
        if abs(speed) < part.smin[i]:
            part.speed[i] = math.copysign(part.smin[i], speed)
        elif abs(speed) > part.smax[i]:
            part.speed[i] = math.copysign(part.smax[i], speed)
    part[:] = list(map(operator.add, part, part.speed))
    
    for i, pos in enumerate(part):
        if is_int[i]:
            part[i] = int(pos)

# Update strategy
toolbox.register("update", updateParticle, c1=c1, c2=c2, w=w,
                is_int=[True,True,True,False]
                )

In [24]:
def evaluate(particle):
    # convert list of parameter values into dictionary of kwargs
    strategy_params = {k: v for k, v in zip(PARAM_NAMES, particle)}
    strategy_params["model__criterion"] = "gini" if strategy_params["model__criterion"]==0 else "entropy"
    strategy_params["model__class_weight"] = "balanced" if strategy_params["model__class_weight"]==0 else "balanced_subsample"
    
    if strategy_params['model__min_samples_split'] > 1 or strategy_params['model__min_samples_split'] <= 0:
        return [-np.inf]
    
    tuned_pipe = clone(pipe).set_params(**strategy_params)

    return [np.mean(cross_val_score(tuned_pipe,X_train_full, y_train, 
                                  cv=5, scoring='f1',
                                  # n_jobs=-1,
                                  )
                     )]

# fitness function
toolbox.register("evaluate", evaluate)

#### Using Parallel Processing

In [25]:
pool = multiprocessing.Pool(16)
toolbox.register("map", pool.map)

In [None]:
pop = toolbox.population(n=N)
mean_arr = np.zeros(num_trials)
best_arr = np.zeros(num_trials)
hall_of_fame = tools.HallOfFame(maxsize=3)

best = None
for g in range(num_trials):
    fitnesses = toolbox.map(toolbox.evaluate, pop)
    for part, fit in zip(pop, fitnesses):
        part.fitness.values = fit
        
        if not part.best or part.fitness.values > part.best.fitness.values:
            part.best = creator.Particle(part)
            part.best.fitness.values = part.fitness.values
        if not best or part.fitness.values > best.fitness.values:
            best = creator.Particle(part)
            best.fitness.values = part.fitness.values
    for part in pop:
        toolbox.update(part, best)
        
    hall_of_fame.update(pop)
    print(
        "HALL OF FAME:\n"
        + "\n".join(
            [
                f"    {_}: {ind}, Fitness: {ind.fitness.values[0]}"
                for _, ind in enumerate(hall_of_fame)
            ]
        )
    )
    
    fitnesses = [
        ind.fitness.values[0] for ind in pop if not np.isinf(ind.fitness.values[0])
    ]
    mean_arr[g] = np.mean(fitnesses)
    best_arr[g] = np.max(fitnesses)

Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/envs/HPTP/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'evaluate' on <module '__main__' (built-in)>
  File "/opt/anaconda3/envs/HPTP/lib/p

In [None]:
params = {}
for idx_hof, param_name in enumerate(PARAM_NAMES):
    if param_name == "model__criterion":
        params[param_name] = "gini" if hall_of_fame[0][idx_hof]==0 else "entropy"
    elif param_name == "model__class_weight":
        params[param_name] = "balanced" if hall_of_fame[0][idx_hof]==0 else "balanced_subsample"
    else:
        params[param_name] = hall_of_fame[0][idx_hof]
    
params

In [None]:
fig, ax = plt.subplots(sharex=True, figsize=(8, 6))

sns.lineplot(x=range(num_trials), y=mean_arr, ax=ax, label="Average Fitness Score")
sns.lineplot(x=range(num_trials), y=best_arr, ax=ax, label="Best Fitness Score")
ax.set_title("Fitness Score",size=20)
ax.set_xticks(range(num_trials))
ax.set_xlabel("Iteration")
plt.tight_layout()
plt.show()

In [None]:
tuned_pipe = clone(pipe).set_params(**params)

# Fit the pipeline on train data 
tuned_pipe.fit(X_train_full,y_train)

# Evaluate on the test data 
y_pred = tuned_pipe.predict(X_test_full)
print(f1_score(y_test, y_pred))