# **Hyperparameter optimalization**



## **1. Data preparation**

In [None]:
!pip install scikit-optimize

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
titanic = pd.read_excel('/content/drive/MyDrive/datasets/Titanic.xlsx')

titanic.drop(labels='Name', axis=1, inplace=True)

titanic['Sex'] = pd.get_dummies(titanic['Sex'], drop_first=True)

X = titanic.iloc[:, 1:].values
y = titanic.iloc[:, 0].values.reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

## **Decision tree**

In [6]:
clf_dtree = DecisionTreeClassifier(random_state=0)

clf_dtree = clf_dtree.fit(X_train,y_train)
y_pred_dtree = clf_dtree.predict(X_test)

print("Accuracy:",accuracy_score(y_test, y_pred_dtree))

Accuracy: 0.7474402730375427


### **Hyperparameter optimalization with GridSearchCV**

In [7]:
# based on accuracy

clf = DecisionTreeClassifier(random_state=0)
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(1,8),
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(1,10)
}

gridsearch = GridSearchCV(
    estimator = clf,
    param_grid = params,
    scoring = 'accuracy',
    cv = 5,
    n_jobs = -1,
    verbose = 0
)

grid_results = gridsearch.fit(X_train, y_train)

print(grid_results.best_params_)
print(grid_results.best_score_)

{'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 2}
0.8451217775245692


Best model on the test data set

In [8]:
y_test_pred_GS = grid_results.best_estimator_.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred_GS))

Accuracy: 0.7918088737201365


 **Random Forest**

Creating a Random Forest classifier and fitting the model with default parameters

In [9]:
clf_randomforest = RandomForestClassifier(random_state=0)

clf_randomforest = clf_randomforest.fit(X_train, y_train.ravel())

print("Accuracy:", clf_randomforest.score(X_test,y_test))

Accuracy: 0.8088737201365188


Parameter combination set based on experience:

In [10]:
clf_randomforest2 = RandomForestClassifier(n_estimators=20, criterion="entropy",  max_depth=5, random_state=0)

clf_randomforest2 = clf_randomforest2.fit(X_train, y_train.ravel())

print("Accuracy:", clf_randomforest2.score(X_test,y_test))



Accuracy: 0.8191126279863481


### **RandomizedSearchCV**

In [None]:
# based on accuracy value

clf = RandomForestClassifier(random_state=0)
params = {
    'n_estimators': [10, 20, 30, 40, 50, 100, 150, 200],
    'criterion' :['entropy', 'gini'],
    'max_depth': [2, 3, 4, 5, 6, 7],
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(1,10)
}

randomsearch = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    n_iter = 100,
    scoring = 'accuracy',
    verbose = 1
)

random_results = randomsearch.fit(X_train, y_train.ravel())
print(random_results.best_params_)
print(random_results.best_score_)



    *Note:* The code above was a RandomizedSearchCV search. A GridSearchCV search run with the same parameters takes approx. It runs for 55 minutes and returns with the following parameters:

    {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 7, 'n_estimators': 30}
    0.8283862697621421

Best models evaluation based on test :

In [None]:
y_test_pred_RS = random_results.best_estimator_.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred_RS))

### **BayesSearchCV**

In [None]:
# BayesSearchCV - runs for 10 min

clf = RandomForestClassifier(random_state=0)

params = {
    'n_estimators': [10, 20, 30, 40, 50, 100, 150, 200],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 4, 5, 6, 7]
}

bayessearch = BayesSearchCV(
    estimator=clf,
    search_spaces=params,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    verbose=1
)

bayes_results = bayessearch.fit(X_train, y_train.ravel())
print(bayes_results.best_params_)
print(bayes_results.best_score_)

In [None]:
y_test_pred_BS = bayes_results.best_estimator_.predict(X_test)
print("Accuracy :",accuracy_score(y_test, y_test_pred_BS))

MLPClassifier

In [13]:

clf_nn = MLPClassifier(random_state=0)
clf_nn.fit(X_train, y_train.ravel())



In [14]:
y_pred_nn = clf_nn.predict(X_test)

print('F1 score: ', f1_score(y_true=y_test, y_pred=y_pred_nn))
print('Accuracy score: ', str(clf_nn.score(X_test,y_test)))
print('AUC:', roc_auc_score(y_true=y_test, y_score=y_pred_nn))

F1 score:  0.7230046948356806
Accuracy score:  0.7986348122866894
AUC: 0.776819075712881


### **RandomizedSearchCV**:

In [15]:


clf = MLPClassifier(hidden_layer_sizes=(4), random_state=0)
params = {
    'activation': ['tanh', 'relu', 'logistic'],
    'hidden_layer_sizes' :[(2,), (3,), (4,), (5,)],
    'solver': ['lbfgs', 'sgd'],
    'max_iter': [50, 100, 500, 1000, 2000, 5000, 10000],
    'alpha': [0.0001, 0.001, 0.01],
    'random_state': [0]
}

randomsearch_nn = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    n_iter = 100,
    scoring = 'accuracy',
    n_jobs = -1,
    verbose = 0
)

random_results_nn = randomsearch_nn.fit(X_train, y_train.ravel())
print(random_results_nn.best_params_)
print(random_results_nn.best_score_)

{'solver': 'lbfgs', 'random_state': 0, 'max_iter': 10000, 'hidden_layer_sizes': (4,), 'alpha': 0.001, 'activation': 'logistic'}
0.8299672411337415


Evaluation of the best, on the train set:

In [16]:
y_test_pred_RS_nn = random_results_nn.best_estimator_.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred_RS_nn))

Accuracy: 0.7781569965870307


Based on AUC:



In [17]:


clf = MLPClassifier(hidden_layer_sizes=(4))
params = {
    'activation': ['tanh', 'relu', 'logistic'],
    'hidden_layer_sizes' :[(2,), (3,), (4,), (5,)],
    'solver': ['lbfgs', 'sgd'],
    'max_iter': [50, 100, 500, 1000, 2000, 5000, 10000],
    'alpha': [0.0001, 0.001, 0.01],
    'random_state': [0]
}

randomsearch_nn2 = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    n_iter = 100,
    scoring = 'roc_auc',
    n_jobs = -1,
    verbose = 0
)

random_results_nn2 = randomsearch_nn2.fit(X_train, y_train.ravel())
print(random_results_nn2.best_params_)
print(random_results_nn2.best_score_)


{'solver': 'lbfgs', 'random_state': 0, 'max_iter': 500, 'hidden_layer_sizes': (4,), 'alpha': 0.01, 'activation': 'logistic'}
0.8761524717093507


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [18]:
y_test_pred_RS_nn2 = random_results_nn2.best_estimator_.predict(X_test)
print('AUC:', roc_auc_score(y_true=y_test, y_score=y_test_pred_RS_nn2))
print("Accuracy:",accuracy_score(y_test, y_test_pred_RS_nn2))

AUC: 0.7487217305801377
Accuracy: 0.78839590443686
