# MLP Neural Network for Dog Aging Project Data
Using separate notebook because needed a different TensorFlow version for some of the exploratory work.

In [72]:
!pip install scikeras
!pip install imblearn
!pip install xgboost

import sys 

# for Google Workbook:
# !{sys.executable} -m pip install --user pandas==1.3.5
# !{sys.executable} -m pip install --user tensorflow==2.7.0 # 1.15.0 # paper said us 1.10.0, but it doesn't exist???
# !{sys.executable} -m pip install --user numpy==1.21.6 # this one breaks on the tensor to numpy issue: 1.16.0 # tested with 1.16.0 # paper said us 1.15.3, but too many dependency conflicts
# !{sys.executable} -m pip install scikeras

# for other environments:
# !{sys.executable} -m pip install pandas==1.3.5
# !{sys.executable} -m pip install tensorflow=2.7.0 # 1.15.0 # paper said us 1.10.0, but it doesn't exist???
# !{sys.executable} -m pip install numpy==1.21.6  # this one breaks on the tensor to numpy issue: 1.16.0 # tested with 1.16.0 # paper said us 1.15.3, but too many dependency conflicts
# !{sys.executable} -m pip install scikeras

In [9]:
# source: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from collections import Counter

# Try a neural net (with oversampled future stage data)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from sklearn.neural_network import MLPClassifier

# Grid Search / Cross Validation
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_val_score, PredefinedSplit, HalvingGridSearchCV

# Preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# operations
# from sklearn.model_selection import get_split_data
from sklearn import set_config
import time 

# Evaluation / Metrics
from sklearn import metrics
from ml_models import evaluate

# Data Processing
from data_proc import get_split_data

import warnings
warnings.filterwarnings("ignore")

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# set up constants
import constants
preprocessor_mlp = constants.preprocessor_mlp
df_cols = constants.df_cols_nn

In [10]:
# Key variables across model settings
rand_state = 0
test_size = 0.3
oversample_strat=1.0
undersample_strat=0.5

start_time = time.time()

# for grid search
print('balanced default data loading')
X_gs_os, X_train_def_os, X_test_def_os, X_val_def_os, y_gs_os, y_train_def_os, y_test_def_os, y_val_def_os, test_fold_def_os = get_split_data(trainsplit=80, seqlen=1, oversample=True, oversample_strat=oversample_strat, undersample=False, undersample_strat=undersample_strat, future=False)
# make the grid search and optimized model data names a little easier to write
# only using basic data for grid search, since oversampling data results get skewed
X_g, y_g = X_gs_os, y_gs_os 

# for optimized model - for General Data Oversampled AND Future Data Oversampled
print('balanced future data loading')
X_gs_fs_os, X_train_fs_os, X_test_fs_os, X_val_fs_os, y_gs_fs_os, y_train_fs_os, y_test_fs_os, y_val_fs_os, _ = get_split_data(trainsplit=80, seqlen=1, oversample=True, oversample_strat=oversample_strat, undersample=False, undersample_strat=undersample_strat, future=True)
# Future step:
X_train_fs, X_test_fs, X_val_fs, y_train_fs, y_test_fs, y_val_fs  = X_train_fs_os, X_test_fs_os, X_val_fs_os, y_train_fs_os, y_test_fs_os, y_val_fs_os

# General:
X_train, X_test, X_val, y_train, y_test, y_val = X_train_def_os, X_test_def_os, X_val_def_os, y_train_def_os, y_test_def_os, y_val_def_os

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

balanced default data loading
feature size:  42
Joined SIZE:  (73473, 47)
Oversampling training set from {0: 54650, 1: 3870} to {0: 54650, 1: 54650}
balanced future data loading
feature size:  42
Joined SIZE:  (73473, 47)
Oversampling training set from {0: 51140, 1: 451} to {0: 51140, 1: 51140}
Elapsed time to compute: 91.282 seconds


In [11]:
# Local functions and constants for pipelines
def prec_recall_auc(y_test, y_pred):
    # calculate the precision-recall auc
    precision, recall, _ = metrics.precision_recall_curve(y_test, y_pred)
    auc_score = metrics.auc(recall, precision)
    print('PR AUC: %.3f' % auc_score)
    return auc_score

scorer = metrics.make_scorer(
    prec_recall_auc, # function, taking in y_test, y_pred (which is probabilities based on 'needs_proba' param below
    greater_is_better=True, # this is optimizing for the highest PR AUC score
    needs_proba=True, # the scorer needs proba estimates for the prec_recall curve
    needs_threshold=False # for binary classification that has a predict_proba method
)

half_grid_scoring = metrics.make_scorer(metrics.recall_score) # scorer 
prc_auc_scoring = scorer
ps = PredefinedSplit(test_fold_def_os)

# Focus on Recall

In [14]:
# Set up MLP Neural Net pipeline
# ESTIMATORS / STEPS
steps_mlp = [
    ("preprocessor", preprocessor_mlp),
    ("clf", MLPClassifier(random_state=rand_state))
]

# Pipeline
pipe_mlp = Pipeline(steps_mlp)

# Grid Hyperparam Search
# BEST GRIDSEARCH SCORE (AUC): 0.997501
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}
grid_params_mlp = {
    "clf__hidden_layer_sizes": [(40,40,1)], 
    "clf__activation": ['tanh'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},
    "clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets
    "clf__alpha": [0.001], # strength of l2 regularization term-
    "clf__max_iter": [40] # could possibly go lower, but have to be careful to not cut off future improvement potential
}

pipe_gs_mlp = HalvingGridSearchCV(estimator=pipe_mlp,
                     param_grid=grid_params_mlp,
                     factor=3, # one 3rd of of candidates are selected
                     scoring=half_grid_scoring,
                     refit=True,
                     verbose=3,
                     random_state=rand_state,             
                     n_jobs=1,
                     cv=ps) # kfold3)

set_config(display="diagram")
pipe_gs_mlp

In [15]:
start_time = time.time()

gs_mlp = pipe_gs_mlp.fit(X_g, y_g)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 116648
max_resources_: 116648
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 116648
Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV 1/1] END clf__activation=tanh, clf__alpha=0.001, clf__hidden_layer_sizes=(40, 40, 1), clf__max_iter=40, clf__solver=lbfgs;, score=(train=0.922, test=0.808) total time=  40.8s
Elapsed time to compute: 92.043 seconds


In [None]:
# print(f"BEST GRIDSEARCH SCORE (RECALL): {gs_mlp.best_score_:.6f}")
# print(f"BEST GRIDSEARCH PARAMS: {gs_mlp.best_params_}")

# BEST GRIDSEARCH SCORE (AUC): 0.623840
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}

# BEST GRIDSEARCH SCORE (RECALL): 0.816738
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (50, 50, 1), 'clf__max_iter': 50, 'clf__solver': 'lbfgs'}

# BEST GRIDSEARCH SCORE (RECALL): 0.817167
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (40, 40, 1), 'clf__max_iter': 20, 'clf__solver': 'lbfgs'}

# BEST GRIDSEARCH SCORE (RECALL): 0.817167
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (40, 40, 1), 'clf__max_iter': 20, 'clf__solver': 'lbfgs'}

In [16]:
#converting the clf.cv_results to dataframe
df_mlp=pd.DataFrame.from_dict(gs_mlp.cv_results_)
df_mlp # [df_cols] # add specific columns to narrow df view
# df_mlp.columns.values

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__activation,param_clf__alpha,param_clf__hidden_layer_sizes,param_clf__max_iter,param_clf__solver,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
0,0,116648,40.665858,0.0,0.165311,0.0,tanh,0.001,"(40, 40, 1)",40,lbfgs,"{'clf__activation': 'tanh', 'clf__alpha': 0.00...",0.807725,0.807725,0.0,1,0.921543,0.921543,0.0


MLP Gridsearch Results

FINAL EDA:

STARTING:
"clf__hidden_layer_sizes": [(5,), (10,),(20,), (30,), (50,),(100,),(150,),(200,),(300,),(400,),(500,),(10,1),(10,5),(10,10),(220,120),(50,25),(100,50),(150,75),(200,50),(200,100),(500,100),(500,200),(10,10,1),(10,5,1),(10,10,10)], # could try higher in future\
"clf__activation": ['tanh','relu'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs', 'adam'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [1e-7,1.5e-6,1e-6,1e-5,1e-4,1e-3], # strength of l2 regularization term-  could try higher in future, 2e-6 and higher\
"clf__max_iter": [100,180,200,300,400,500]

### Round 1
BEST GRIDSEARCH SCORE (RECALL): 0.818221\
BEST GRIDSEARCH PARAMS: {\
'clf__activation': 'tanh',\ 
'clf__alpha': 0.001, # push up\
'clf__hidden_layer_sizes': (10, 10, 1), # try other triple layer combos\
'clf__max_iter': 100, # try other max iter around 100 (and lower)\
'clf__solver': 'lbfgs'} \
DECISION: Select 'tahn, lbfgs.  Try more with: alpha, layer sizes, max iter


### Round 2
"clf__hidden_layer_sizes": [(10,10,1),(10,10,2),(10,10,3),(20,20,1),(50,50,1)], # could try higher in future
"clf__activation": ['tanh'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [1e-4,1e-3,0.001, 0.01, 0.1], # strength of l2 regularization term-  could try higher in future, 2e-6 and higher\
"clf__max_iter": [50,60,70,80,90,100,120]\
BEST GRIDSEARCH SCORE (RECALL): 0.816738\
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (50, 50, 1), 'clf__max_iter': 50, 'clf__solver': 'lbfgs'}\
DECISION: Select Alpha, Try more layers, lower max iter

### Round 3
"clf__hidden_layer_sizes": [(40,40,1),(50,50,1),(75,75,1),(100,100,1),(150,150,1),(200,200,1)], \
"clf__activation": ['tanh'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [0.001], # strength of l2 regularization term-\
"clf__max_iter": [20,30,40,50,60]\
BEST GRIDSEARCH SCORE (RECALL): 0.817167\
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (40, 40, 1), 'clf__max_iter': 20, 'clf__solver': 'lbfgs'}\
DECISIONS: FINAL

Previous EDA:\
grid_params_mlp = {\
    "clf__hidden_layer_sizes": [(220,120)], # could try higher in future\
    "clf__activation": ['tanh'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
    "clf__solver": ['adam'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
    "clf__alpha": [1.5e-6], # strength of l2 regularization term-  could try higher in future, 2e-6 and higher\
    "clf__max_iter": [180] # could possibly go lower, but have to be careful to not cut off future improvement potential\
}

### Round 1
Fitting 5 folds for each of 24 candidates, totalling 120 fits\
NOTE: Didn't finish, was clear that alpha needs to be lower, and hidden layer sizes needs to be higher. Unknown for activation so far, and add max iterations later.\
NOTE: Adam may be slightly better, but going to use lbfgs for now to test other params, then come back and test on Adam and lbfgs later\
"clf__hidden_layer_sizes": [(5,), (10,)], (5 was AUC: (0.943-0.951), 10 was AUC 0.969-0.973 with tahn, adam/lgfgs, alpha 1e-05)\
"clf__activation": ['tanh','relu'], # \
"clf__solver": ['lbfgs', 'adam'], # \
"clf__alpha": [1e-5,1e-4,1e-3], #\
"clf__max_iter": [300]

### Round 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits\
NOTE: Stopped early again. Clear hidden layers makes a big difference.  30 gets us to AUC of 0.990. Going to stop to train on other params with a low hidden layer count and come back to that.\
NOTE: changing hidden layers to 200 to process faster as well.\
"clf__hidden_layer_sizes": [(20,), (30,), (50,)], \
"clf__activation": ['tanh','relu'], # \
"clf__solver": ['lbfgs'], # \
"clf__alpha": [1e-7,1e-6], #\
"clf__max_iter": [300]

### Round 3
Fitting 5 folds for each of 4 candidates, totalling 20 fits\
Elapsed time to compute: 172.755 seconds\
BEST GRIDSEARCH SCORE (AUC): 0.949377\
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (5,), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'}\
"clf__hidden_layer_sizes": [(5,),], \
"clf__activation": ['tanh','relu'], # \
"clf__solver": ['lbfgs'], # \
"clf__alpha": [1e-7,1e-6], #\
"clf__max_iter": [200]

### Round 4
Fitting 5 folds for each of 2 candidates, totalling 10 fits\
Elapsed time to compute: 161.426 seconds\
BEST GRIDSEARCH SCORE (AUC): 0.949377\
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (5,), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'}\
NOTES: Retry Adam with optimized alpha, activation\
"clf__hidden_layer_sizes": [(5,),], \
"clf__activation": ['tanh'], # \
"clf__solver": ['lbfgs','adam'], # 'lbfgs'\ 
"clf__alpha": [1e-6], #\
"clf__max_iter": [200]

### Round 5
Fitting 5 folds for each of 3 candidates, totalling 15 fits \
Elapsed time to compute: 163.136 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.949471 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (5,), 'clf__max_iter': 300, 'clf__solver': 'lbfgs'} \
NOTES: Try different max_iter \
"clf__hidden_layer_sizes": [(5,),], \
"clf__activation": ['tanh'], # \
"clf__solver": ['lbfgs'], # \
"clf__alpha": [1e-6], #\
"clf__max_iter": [100, 200, 300]

### Round 6
Fitting 5 folds for each of 3 candidates, totalling 15 fits \
Elapsed time to compute: 274.171 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.949973 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (5,), 'clf__max_iter': 500, 'clf__solver': 'lbfgs'} \
NOTES: More max_iter \
"clf__max_iter": [300, 400, 500] # slightest difference between them and 400,500 actually ranks lower on prec, recall, F1, so going with best of 100 vs 200 bc it is faster \
NOTES: Comparing just 100 to 200, significant enough jump from 100 to 200 with jump from 4.5 to 9.5 seconds.  Will use 200.

### Round 7
Fitting 5 folds for each of 3 candidates, totalling 15 fits \
Elapsed time to compute: 342.020 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.971669 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (10,), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'} \
NOTES: hidden_layer_sizes is set up as a tuple, where each # in tuple is a layer of neurons.  So 10,2, is 2 layers, with 10 and 2. So need to test on both dimensions! \
NOTES: Will test the multiple layers first.  Last layer will have 1 neuron for binary classification (for now, can test this theory) \
NOTES: See optimal hidden layer sizes - need to balance improvements with time to compile. If can keep compile/inference time down, can compete with other models. \
NOTES: Ultimately, probably still go with other models because they are more intepretable.  \
"clf__hidden_layer_sizes": [(10,),(10,1),(10,10,1)], \
"clf__max_iter": [200] # drop to 100 until done, takes too much time

### Round 8
Fitting 5 folds for each of 4 candidates, totalling 20 fits \
Elapsed time to compute: 507.755 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.970514 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (10, 5), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'} \
"clf__hidden_layer_sizes": [(10,),(10,10),(10,10,10)] to [(10,),(10,5),(10,5,1)] (get (10,) from above, don't need to rerun) \
NOTE: Single layer (10,) beat the rest pretty well, will ramp up just the one layer

### Round 9
Fitting 5 folds for each of 12 candidates, totalling 60 fits \
Elapsed time to compute: 2309.769 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.996928 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (200, 100), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'} \
NOTES: One big round of test different varieties and combinations.  Coffee time, this is going to take a while! \
NOTES: After first few iterations, can already see that 50+ layers gets the model to 99.3%. Probably don't need to go much higher, will monitor the times! \
"clf__hidden_layer_sizes": [(50,),(100,),(150,),(200,),(50,1),(100,1),(150,1),(200,1),(50,25),(100,50),(150,75),(200,100)] \
NOTES: Stopped early to get rid of (#,1)'s, was clearly worse \
"clf__hidden_layer_sizes": [(50,),(100,),(150,),(200,) <-already ran ,(50,25),(100,50),(150,75),(200,100)] \
(50, 25) 0.994484 \
(100, 50) 0.996361  \
(150, 75) 0.996687 \
(200, 100) 0.996928 \

### Round 10
Fitting 5 folds for each of 3 candidates, totalling 15 fits \
Elapsed time to compute: 2022.561 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.996928 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1e-06, 'clf__hidden_layer_sizes': (200, 100), 'clf__max_iter': 200, 'clf__solver': 'lbfgs'} \
"clf__hidden_layer_sizes": [(200,),(200,50),(200,100)] \
(200,) 0.996691 \
(200, 50) 0.996892 \
(200, 100) 0.996928 

"clf__hidden_layer_sizes": [(300,),(400,),(500,),(500,100),(500,200)] \
clf__activation=tanh, clf__alpha=1e-06, clf__hidden_layer_sizes=(200, 100), clf__max_iter=200, clf__solver=lbfgs; AUC: (test=0.997) Accuracy: (test=0.993) F1: (test=0.993) prec: (test=0.992)  recall: (test=0.994) total time= 2.4min \
NOTE: Server was cut off, but got through enough to see that (200, 100) is the best value to use here 


### Round 11 - FINAL!!!
Fitting 5 folds for each of 54 candidates, totalling 270 fits \
Elapsed time to compute: 32573.847 seconds \
BEST GRIDSEARCH SCORE (AUC): 0.997501 \
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'} \
PREDICT SCORE: 0.9978605050154974 \
NOTES: Last - try them all together with minor variations to see if combination changes anything \
SO FAR: END clf__activation=tanh, clf__alpha=5e-07, clf__hidden_layer_sizes=(200, 100), clf__max_iter=220, clf__solver=adam; \
"clf__hidden_layer_sizes": [(180,80),(200,100),(220,120)], # (200,100) (definitely NOT 180,100) \
"clf__activation": ['tanh'], \
"clf__solver": ['lbfgs','adam'], # adam \
"clf__alpha": [5e-7,1e-6,1.5e-6], 5e-7  \
"clf__max_iter": [180,200,220]   Doesn't seem to matter, might as well use 180

### FINAL
BEST GRIDSEARCH SCORE (AUC): 0.997505\
BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}

# FINAL MODEL - Optimized for Recall

In [8]:
# MLP NN PIPELINE - GENERAL DATA, OVERSAMPLED
# ESTIMATORS / STEPS
# SCORE TO BEAT: 0.9978605050154974
# PREV BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}

# {'clf__activation': 'tanh', 
#  'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (50, 50, 1), 
#  'clf__max_iter': 50, 'clf__solver': 'lbfgs'}

# {'clf__activation': 'tanh', 
#  'clf__alpha': 0.001, 
#  'clf__hidden_layer_sizes': (40, 40, 1), 
#  'clf__max_iter': 20, 
#  'clf__solver': 'lbfgs'}

steps_mlp_final = [ # based on grid search
    ("preprocessor", preprocessor_mlp),
    ("clf", MLPClassifier(activation         = 'tanh', 
                          alpha              = 0.001, 
                          hidden_layer_sizes = (40, 40, 1), 
                          max_iter           = 20, 
                          solver             = 'lbfgs',
                          random_state=rand_state))
]

# using steps_rf from grid_search pipeline
mlp_model = Pipeline(steps_mlp_final)

set_config(display="diagram")
mlp_model

In [9]:
# MLP NN PIPELINE - FUTURE STEP DATA, OVERSAMPLED
# ESTIMATORS / STEPS
# SCORE TO BEAT: 0.9978605050154974
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}

steps_mlp_final_fs = [ # based on grid search
    ("preprocessor", preprocessor_mlp),
    ("clf", MLPClassifier(activation         = 'tanh', 
                          alpha              = 0.001, 
                          hidden_layer_sizes = (40, 40, 1), 
                          max_iter           = 20, 
                          solver             = 'lbfgs',
                          random_state=rand_state))
]

# using steps_rf from grid_search pipeline
mlp_model_fs = Pipeline(steps_mlp_final_fs)

set_config(display="diagram")
mlp_model_fs

In [10]:
start_time = time.time()

mlp_model.fit(X_train, y_train)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

Elapsed time to compute: 23.561 seconds


In [11]:
# for optimized model - for General Data Oversampled AND Future Data Oversampled
# evaluate(mlp_model, X_test_def_os, y_test_def_os)
evaluate(mlp_model, X_test, y_test)

# ROUND 1
# Predict time to compute: 0.042 seconds
# Test predict probas time to compute: 0.018 seconds
# PR AUC: 0.133
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.74932
# Test Precision score: 0.15015
# Test Recall score:    0.59877
# Test F1 score:        0.24010
# Test ROC_AUC score:   0.70974
# Test PRC_AUC score:   0.13310
# Test Confusion Matrix:
# [[5215 1647]
#  [ 195  291]]

# ROUND 2
# Predict time to compute: 0.062 seconds
# Test predict probas time to compute: 0.088 seconds
# PR AUC: 0.217
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.84186
# Test Precision score: 0.22340
# Test Recall score:    0.56173
# Test F1 score:        0.31967
# Test ROC_AUC score:   0.76293
# Test PRC_AUC score:   0.21708
# Test Confusion Matrix:
# [[5913  949]
#  [ 213  273]]

# OR
# Predict time to compute: 0.075 seconds
# Test predict probas time to compute: 0.071 seconds
# PR AUC: 0.210
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.85738
# Test Precision score: 0.23033
# Test Recall score:    0.49383
# Test F1 score:        0.31414
# Test ROC_AUC score:   0.75383
# Test PRC_AUC score:   0.20974
# Test Confusion Matrix:
# [[6060  802]
#  [ 246  240]]

# ROUND 3
# Predict time to compute: 0.047 seconds
# Test predict probas time to compute: 0.071 seconds
# PR AUC: 0.150
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.72550
# Test Precision score: 0.14577
# Test Recall score:    0.64815
# Test F1 score:        0.23801
# Test ROC_AUC score:   0.73529
# Test PRC_AUC score:   0.15000
# Test Confusion Matrix:
# [[5016 1846]
#  [ 171  315]]

# ROUND 4 - try 200,200,1
# Predict time to compute: 0.183 seconds
# Test predict probas time to compute: 0.165 seconds
# PR AUC: 0.146
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.70618
# Test Precision score: 0.13709
# Test Recall score:    0.65021
# Test F1 score:        0.22644
# Test ROC_AUC score:   0.72871
# Test PRC_AUC score:   0.14552
# Test Confusion Matrix:
# [[4873 1989]
#  [ 170  316]]

# ROUND 5 - try max iter 100
# Predict time to compute: 0.221 seconds
# Test predict probas time to compute: 0.178 seconds
# PR AUC: 0.383
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.90582
# Test Precision score: 0.36230
# Test Recall score:    0.55761
# Test F1 score:        0.43922
# Test ROC_AUC score:   0.80266
# Test PRC_AUC score:   0.38285
# Test Confusion Matrix:
# [[6385  477]
#  [ 215  271]]

# FINAL ROUND: RECALL
# Predict time to compute: 0.053 seconds
# Test predict probas time to compute: 0.046 seconds
# PR AUC: 0.150
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.72550
# Test Precision score: 0.14577
# Test Recall score:    0.64815
# Test F1 score:        0.23801
# Test ROC_AUC score:   0.73529
# Test PRC_AUC score:   0.15000
# Test Confusion Matrix:
# [[5016 1846]
#  [ 171  315]]

Predict time to compute: 0.053 seconds
Test predict probas time to compute: 0.046 seconds
PR AUC: 0.150
--------------------------------------------------
--------------------------------------------------
Test Accuracy :       0.72550
Test Precision score: 0.14577
Test Recall score:    0.64815
Test F1 score:        0.23801
Test ROC_AUC score:   0.73529
Test PRC_AUC score:   0.15000
Test Confusion Matrix:
[[5016 1846]
 [ 171  315]]


In [12]:
start_time = time.time()

mlp_model_fs.fit(X_train_fs, y_train_fs)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

Elapsed time to compute: 22.492 seconds


In [13]:
# evaluate(mlp_model_fs,  X_test_fs_os, y_test_fs_os)
evaluate(mlp_model_fs, X_test_fs, y_test_fs)

# ROUND 1
# Predict time to compute: 0.037 seconds
# Test predict probas time to compute: 0.016 seconds
# PR AUC: 0.010
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.91957
# Test Precision score: 0.01261
# Test Recall score:    0.10526
# Test F1 score:        0.02251
# Test ROC_AUC score:   0.49590
# Test PRC_AUC score:   0.00996
# Test Confusion Matrix:
# [[5951  470]
#  [  51    6]]

# ROUND 2
# Predict time to compute: 0.100 seconds
# Test predict probas time to compute: 0.058 seconds
# PR AUC: 0.009
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.96033
# Test Precision score: 0.01456
# Test Recall score:    0.05263
# Test F1 score:        0.02281
# Test ROC_AUC score:   0.48610
# Test PRC_AUC score:   0.00937
# Test Confusion Matrix:
# [[6218  203]
#  [  54    3]]

# ROUND 3
# Predict time to compute: 0.078 seconds
# Test predict probas time to compute: 0.069 seconds
# PR AUC: 0.009
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.89904
# Test Precision score: 0.00661
# Test Recall score:    0.07018
# Test F1 score:        0.01208
# Test ROC_AUC score:   0.50103
# Test PRC_AUC score:   0.00890
# Test Confusion Matrix:
# [[5820  601]
#  [  53    4]]

# ROUND 4 - try 200,200,1
# Predict time to compute: 0.178 seconds
# Test predict probas time to compute: 0.134 seconds
# PR AUC: 0.010
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.85922
# Test Precision score: 0.01476
# Test Recall score:    0.22807
# Test F1 score:        0.02772
# Test ROC_AUC score:   0.48953
# Test PRC_AUC score:   0.00979
# Test Confusion Matrix:
# [[5553  868]
#  [  44   13]]

# ROUND 5 - try max iter 100
# Predict time to compute: 0.252 seconds
# Test predict probas time to compute: 0.175 seconds
# PR AUC: 0.019
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.98395
# Test Precision score: 0.08772
# Test Recall score:    0.08772
# Test F1 score:        0.08772
# Test ROC_AUC score:   0.49349
# Test PRC_AUC score:   0.01863
# Test Confusion Matrix:
# [[6369   52]
#  [  52    5]]

# FINAL ROUND: RECALL
# Predict time to compute: 0.051 seconds
# Test predict probas time to compute: 0.047 seconds
# PR AUC: 0.009
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.89904
# Test Precision score: 0.00661
# Test Recall score:    0.07018
# Test F1 score:        0.01208
# Test ROC_AUC score:   0.50103
# Test PRC_AUC score:   0.00890
# Test Confusion Matrix:
# [[5820  601]
#  [  53    4]]

Predict time to compute: 0.051 seconds
Test predict probas time to compute: 0.047 seconds
PR AUC: 0.009
--------------------------------------------------
--------------------------------------------------
Test Accuracy :       0.89904
Test Precision score: 0.00661
Test Recall score:    0.07018
Test F1 score:        0.01208
Test ROC_AUC score:   0.50103
Test PRC_AUC score:   0.00890
Test Confusion Matrix:
[[5820  601]
 [  53    4]]


# FOCUS ON PRC-AUC

In [28]:
# Set up MLP Neural Net pipeline
# ESTIMATORS / STEPS
steps_mlp_prc = [
    ("clf", MLPClassifier(random_state=rand_state))
]

# Pipeline
pipe_mlp_prc = Pipeline(steps_mlp_prc)

# Grid Hyperparam Search
grid_params_mlp_prc = {
    "clf__hidden_layer_sizes": [(150,150,1)],
    "clf__activation": ['relu'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},
    "clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets
    "clf__alpha": [5.0], # strength of l2 regularization term
    "clf__max_iter": [250] 
}

pipe_gs_mlp_prc = HalvingGridSearchCV(estimator=pipe_mlp_prc,
                     param_grid=grid_params_mlp_prc,
                     factor=3, # one 3rd of of candidates are selected
                     scoring=prc_auc_scoring,
                     refit=True,
                     verbose=3,
                     random_state=rand_state,             
                     n_jobs=1,
                     cv=ps) # kfold3)

set_config(display="diagram")
pipe_gs_mlp_prc

In [29]:
# Apply Standard Scaler outside of gridsearch to speed it up
sc_X = StandardScaler() 
X_g_ss = sc_X.fit_transform(X_g)
X_g_test_ss = sc_X.fit_transform(X_test_def_os)

In [30]:
start_time = time.time()

gs_mlp_prc = pipe_gs_mlp_prc.fit(X_g_ss, y_g)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

n_iterations: 1
n_required_iterations: 1
n_possible_iterations: 1
min_resources_: 116648
max_resources_: 116648
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1
n_resources: 116648
Fitting 1 folds for each of 1 candidates, totalling 1 fits
PR AUC: 0.579
PR AUC: 0.759
[CV 1/1] END clf__activation=relu, clf__alpha=5.0, clf__hidden_layer_sizes=(150, 150, 1), clf__max_iter=250, clf__solver=lbfgs;, score=(train=0.759, test=0.579) total time=   6.1s
Elapsed time to compute: 13.349 seconds


In [31]:
print(gs_mlp_prc.best_score_)
print(gs_mlp_prc.best_params_)
cv_score_mlp_prc = gs_mlp_prc.best_score_
test_score_mlp_prc = gs_mlp_prc.score(X_g_test_ss, y_test_def_os)
print(f'Cross-validation score: {cv_score_mlp_prc}\nTest score: {test_score_mlp_prc}')

# BASELINE
# 0.5792987742244744
# {'clf__activation': 'relu', 'clf__alpha': 0.1, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}
# PR AUC: 0.533
# Cross-validation score: 0.5792987742244744
# Test score: 0.5330702231899836

# 0.5792786662129976
# {'clf__activation': 'relu', 'clf__alpha': 5.0, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}
# PR AUC: 0.533
# Cross-validation score: 0.5792786662129976
# Test score: 0.5330702231899836

# 0.5792732716385411
# {'clf__activation': 'relu', 'clf__alpha': 5.0, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}
# PR AUC: 0.533
# Cross-validation score: 0.5792732716385411
# Test score: 0.5330702231899836

0.5792732716385411
{'clf__activation': 'relu', 'clf__alpha': 5.0, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}
PR AUC: 0.533
Cross-validation score: 0.5792732716385411
Test score: 0.5330702231899836


In [32]:
#converting the clf.cv_results to dataframe
df_mlp_prc=pd.DataFrame.from_dict(gs_mlp_prc.cv_results_)
df_mlp_prc # [df_cols] # add specific columns to narrow df view
# df_mlp.columns.values

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__activation,param_clf__alpha,param_clf__hidden_layer_sizes,param_clf__max_iter,param_clf__solver,params,split0_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,mean_train_score,std_train_score
0,0,116648,6.032525,0.0,0.101501,0.0,relu,5.0,"(150, 150, 1)",250,lbfgs,"{'clf__activation': 'relu', 'clf__alpha': 5.0,...",0.579273,0.579273,0.0,1,0.758975,0.758975,0.0


### MLP Gridsearch Results

### Round 1
"clf__hidden_layer_sizes": [(5,), (10,),(40,),(50,),(100,),
                                (10,1), (50,1),(100,1),(200,1),(10,10,1),
                                (10,5,1),(20,20,1),(40,40,1), (50,50,1),
                                (75,75,1),(100,100,1),(150,150,1),(200,200,1)],\
"clf__activation": ['tanh','relu'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs', 'adam'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [5e-7,1e-7,1.5e-6,1e-6,1e-5,1e-4,1e-3,0.001, 0.01, 0.1], # strength of l2 regularization term\
"clf__max_iter": [10,50,100,200,250,500] \
0.5792987742244744\
{'clf__activation': 'relu', 'clf__alpha': 0.1, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}\
PR AUC: 0.533\
Cross-validation score: 0.5792987742244744\
Test score: 0.5330702231899836\
DECISIONS: 'clf__activation': 'relu', "clf__alpha": 0.1+, \
hidden_layer_sizes': (150, 150, 1), max_iter': 250, solver': 'lbfgs'

### Round 2
"clf__hidden_layer_sizes": [(150,150,1)],\
"clf__activation": ['relu'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0], # strength of l2 regularization term\
"clf__max_iter": [250] \
0.5792786662129976\
{'clf__activation': 'relu', 'clf__alpha': 5.0, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}\
PR AUC: 0.533\
Cross-validation score: 0.5792786662129976\
Test score: 0.5330702231899836\
DECISIONS: alpha: 5.0

### FINAL
"clf__hidden_layer_sizes": [(150,150,1)],\
"clf__activation": ['relu'], # {‘identity’, ‘logistic’, ‘tanh’, ‘relu’ (default)},\
"clf__solver": ['lbfgs'], # {‘lbfgs’, ‘sgd’, ‘adam’ (default)} - lbfgs better for smaller datasets\
"clf__alpha": [5.0], # strength of l2 regularization term\
"clf__max_iter": [250]\
0.5792732716385411\
{'clf__activation': 'relu', 'clf__alpha': 5.0, 'clf__hidden_layer_sizes': (150, 150, 1), 'clf__max_iter': 250, 'clf__solver': 'lbfgs'}\
PR AUC: 0.533\
Cross-validation score: 0.5792732716385411\
Test score: 0.5330702231899836

# FINAL MODEL - Optimized for PRC-AUC

In [43]:
# MLP NN PIPELINE - GENERAL DATA, OVERSAMPLED
# ESTIMATORS / STEPS

steps_mlp_final_pr = [ # based on grid search
    ("preprocessor", preprocessor_mlp),
    ("clf", MLPClassifier(activation         = 'relu', 
                          alpha              = 5.0, 
                          hidden_layer_sizes = (150, 150, 1), 
                          max_iter           = 250, 
                          solver             = 'lbfgs',
                          random_state=rand_state))
]

# using steps_rf from grid_search pipeline
mlp_model_pr = Pipeline(steps_mlp_final_pr)

set_config(display="diagram")
mlp_model_pr

In [60]:
# MLP NN PIPELINE - FUTURE STEP DATA, OVERSAMPLED
# ESTIMATORS / STEPS
# SCORE TO BEAT: 0.9978605050154974
# BEST GRIDSEARCH PARAMS: {'clf__activation': 'tanh', 'clf__alpha': 1.5e-06, 'clf__hidden_layer_sizes': (220, 120), 'clf__max_iter': 180, 'clf__solver': 'adam'}

steps_mlp_final_fs_pr = [ # based on grid search
    ("preprocessor", preprocessor_mlp),
    ("clf", MLPClassifier(activation         = 'relu', 
                          alpha              = 5.0, 
                          hidden_layer_sizes = (150, 150, 1), 
                          max_iter           = 250, 
                          solver             = 'lbfgs',
                          random_state=rand_state))
]

# using steps_rf from grid_search pipeline
mlp_model_fs_pr = Pipeline(steps_mlp_final_fs_pr)

set_config(display="diagram")
mlp_model_fs_pr

In [61]:
start_time = time.time()

X_train_def_os_ss = sc_X.fit_transform(X_train_def_os)
X_test_def_os_ss = sc_X.fit_transform(X_test_def_os)

# X_train_def_os, X_test_def_os, X_val_def_os, y_gs_os, y_train_def_os, y_test_def_os, y_val_def_os
# X_gs_fs_os, X_train_fs_os, X_test_fs_os, X_val_fs_os, y_gs_fs_os, y_train_fs_os, y_test_fs_os, y_val_fs_os
mlp_model_pr.fit(X_train_def_os, y_train_def_os)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

Elapsed time to compute: 248.465 seconds


In [62]:
# for optimized model - for General Data Oversampled AND Future Data Oversampled
evaluate(mlp_model_pr, X_test_def_os_ss, y_test)

# Predict time to compute: 0.032 seconds
# Test predict probas time to compute: 0.031 seconds
# PR AUC: 0.072
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.84064
# Test Precision score: 0.07664
# Test Recall score:    0.12757
# Test F1 score:        0.09575
# Test ROC_AUC score:   0.52877
# Test PRC_AUC score:   0.07179
# Test Confusion Matrix:
# [[6115  747]
#  [ 424   62]]

Predict time to compute: 0.032 seconds
Test predict probas time to compute: 0.031 seconds
PR AUC: 0.072
--------------------------------------------------
--------------------------------------------------
Test Accuracy :       0.84064
Test Precision score: 0.07664
Test Recall score:    0.12757
Test F1 score:        0.09575
Test ROC_AUC score:   0.52877
Test PRC_AUC score:   0.07179
Test Confusion Matrix:
[[6115  747]
 [ 424   62]]


In [68]:
start_time = time.time()

mlp_model_fs_pr.fit(X_train_fs_os, y_train_fs_os)

elapsed_time = time.time() - start_time
print(f"Elapsed time to compute: {elapsed_time:.3f} seconds")

Elapsed time to compute: 16.770 seconds


In [71]:
X_test_fs_os_ss = sc_X.fit_transform(X_test_fs_os)

evaluate(mlp_model_fs_pr, X_test_fs_os_ss, y_test_fs)

# Predict time to compute: 0.017 seconds
# Test predict probas time to compute: 0.018 seconds
# PR AUC: 0.504
# --------------------------------------------------
# --------------------------------------------------
# Test Accuracy :       0.99120
# Test Precision score: 0.00000
# Test Recall score:    0.00000
# Test F1 score:        0.00000
# Test ROC_AUC score:   0.50000
# Test PRC_AUC score:   0.50440
# Test Confusion Matrix:
# [[6421    0]
#  [  57    0]]

Predict time to compute: 0.017 seconds
Test predict probas time to compute: 0.018 seconds
PR AUC: 0.504
--------------------------------------------------
--------------------------------------------------
Test Accuracy :       0.99120
Test Precision score: 0.00000
Test Recall score:    0.00000
Test F1 score:        0.00000
Test ROC_AUC score:   0.50000
Test PRC_AUC score:   0.50440
Test Confusion Matrix:
[[6421    0]
 [  57    0]]
