In [1]:
# Show the kernel's working directory: the relative './data' paths used by the
# loading cells are resolved against it.
# NOTE(review): the stored output of this cell contains an absolute local path;
# clear it before sharing the notebook.
import os
os.getcwd() 

'/Users/izapreev/Projects/ML-PT'

In [2]:
# Make sure edited project source code is reloaded into the kernel without a restart:
# autoreload mode 2 re-imports all loaded modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np

from src.utils.logger import logger

In [4]:
from src.utils.file_utils import load_numpy_zc

# Features: read the single 'X' array stored in the compressed ./data/X.npz file.
# `data_folder` is reused by later cells for the other data files.
data_folder = os.path.join('.', 'data')
file_name = os.path.join(data_folder, 'X.npz')
(X,) = load_numpy_zc(file_name, ['X'])

16:57:05 INFO (file_utils:31): Loading compressed numpy z file from: ./data/X.npz
16:57:05 INFO (file_utils:34): File loading and data extraction are done!


In [5]:
# Class labels: read the single 'y' array stored in the same data folder as the features
file_name = os.path.join(data_folder, 'y.npz')
(y,) = load_numpy_zc(file_name, ['y'])

16:57:05 INFO (file_utils:31): Loading compressed numpy z file from: ./data/y.npz
16:57:05 INFO (file_utils:34): File loading and data extraction are done!


In [6]:
#########################################################################
# 00: Run the initial non-tuned classifier
#

In [7]:
from src.model.classifier.models_try_out import (
    _CLASSIFIERS,
    train_test_single_model,
)

# Baseline: take the project's pre-configured classifier registered under this name
name = 'Random Forest Classifier'
model_00 = _CLASSIFIERS[name]

# Score the untuned model; the result is kept for comparison with the tuned variants
model_00_res = train_test_single_model(model_00, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

17:01:08 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9924, accuracy: 0.9927, precision: 0.9927, recall: 0.9927, time: 7.859 sec.


In [8]:
#########################################################################
# 01: Perform Random search hyperparameters tuning
#

In [9]:
# Search space for the randomized hyperparameter search.
# The feature- and class-dependent bounds are derived from the loaded data.
num_features = X.shape[1]
num_classes = np.unique(y).size
random_grid = dict(
    n_estimators=list(range(20, 81, 10)),
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_leaf=[1, 2, 4, 6],
    min_samples_split=list(range(2, 9, 2)),
    # Every possible feature count, from a single feature up to all of them
    max_features=list(range(1, num_features + 1)),
    # Unlimited, or capped at 1x / 2x / 3x the number of distinct classes
    max_leaf_nodes=[None] + [factor * num_classes for factor in (1, 2, 3)],
)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Random search of parameters over `random_grid`: 100 sampled combinations, each
# scored with 3 fold cross validation, using all available CPU cores (n_jobs = -1).
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
# The fixed random_state makes the set of sampled candidates repeatable across runs.
# NOTE(review): the fitted forests are only deterministic if `model_00` carries its
# own random_state - confirm in models_try_out.
rf_random = RandomizedSearchCV(
    estimator=model_00,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

# Fit the random search model.
logger.info('Start the Random Grid model tuning')
rf_random.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best Random search parameters: {rf_random.best_params_}')

17:01:08 INFO (943375875:7): Start the Random Grid model tuning


Fitting 3 folds for each of 100 candidates, totalling 300 fits


17:04:48 INFO (943375875:11): Found the best Random search parameters: {'n_estimators': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': 7, 'max_depth': 50, 'criterion': 'entropy'}


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Build a fresh forest from the random-search winner and score it with the same
# routine that was used for the baseline model.
# NOTE(review): the new estimator starts from the library defaults, so any
# non-default settings of `model_00` (the estimator that was actually searched)
# are not carried over - confirm that this is intended.
random_best_params = rf_random.best_params_
model_01 = RandomForestClassifier(**random_best_params)
model_01_res = train_test_single_model(model_01, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

17:10:17 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9943, accuracy: 0.9944, precision: 0.9945, recall: 0.9944, time: 10.9637 sec.


In [12]:
#########################################################################
# 02: Perform coarse Grid search hyperparameters tuning
#

In [17]:
# Coarse grid for the exhaustive search. The ranges are half-open, so
# n_estimators covers {30, 35, 40, 45} and max_depth covers {40, 45, 50, 55};
# the criterion and the leaf-node cap are each pinned to a single value.
coarse_grid = dict(
    n_estimators=range(30, 50, 5),
    criterion=['entropy'],
    max_depth=range(40, 60, 5),
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[3, 4, 5],
    max_features=list(range(5, 10)),
    max_leaf_nodes=[None],
)

In [18]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over every combination in `coarse_grid`, using 3 fold
# cross validation and all available CPU cores (n_jobs = -1)
rf_coarse = GridSearchCV(estimator=model_00, param_grid=coarse_grid, cv=3, verbose=1, n_jobs=-1)

# Fit the coarse grid search model.
logger.info('Start the coarse Grid model tuning')
rf_coarse.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best coarse Grid search parameters: {rf_coarse.best_params_}')

17:10:56 INFO (3178084993:7): Start the coarse Grid Grid model tuning


Fitting 3 folds for each of 720 candidates, totalling 2160 fits


17:41:14 INFO (3178084993:11): Found the best coarse Grid search parameters: {'criterion': 'entropy', 'max_depth': 50, 'max_features': 7, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 30}


In [19]:
# Build a fresh forest from the coarse-grid winner and score it with the same
# routine that was used for the earlier models
coarse_best_params = rf_coarse.best_params_
model_02 = RandomForestClassifier(**coarse_best_params)
model_02_res = train_test_single_model(model_02, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

17:45:21 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9942, accuracy: 0.9943, precision: 0.9944, recall: 0.9943, time: 8.2283 sec.


In [21]:
#########################################################################
# 03: Perform fine Grid search hyperparameters tuning
#

In [22]:
# Fine grid for the final exhaustive search: only n_estimators (25..35) and
# max_depth (45..55) vary, in steps of one; every other parameter is pinned
# to a single value.
fine_grid = dict(
    n_estimators=range(25, 36),
    criterion=['entropy'],
    max_depth=range(45, 56),
    min_samples_leaf=[1],
    min_samples_split=[4],
    max_features=[7],
    max_leaf_nodes=[None],
)

In [23]:
# Exhaustive grid search over every combination in `fine_grid`, using 3 fold
# cross validation and all available CPU cores (n_jobs = -1)
rf_fine = GridSearchCV(estimator=model_00, param_grid=fine_grid, cv=3, verbose=1, n_jobs=-1)

# Fit the fine grid search model.
logger.info('Start the fine Grid model tuning')
rf_fine.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best fine Grid search parameters: {rf_fine.best_params_}')

17:45:34 INFO (1003720481:5): Start the coarse Grid Grid model tuning


Fitting 3 folds for each of 121 candidates, totalling 363 fits


17:49:51 INFO (1003720481:9): Found the best fine Grid search parameters: {'criterion': 'entropy', 'max_depth': 48, 'max_features': 7, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 33}


In [24]:
# Build a fresh forest from the fine-grid winner and score it with the same
# routine that was used for the earlier models
fine_best_params = rf_fine.best_params_
model_03 = RandomForestClassifier(**fine_best_params)
model_03_res = train_test_single_model(model_03, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

17:54:23 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9942, accuracy: 0.9943, precision: 0.9944, recall: 0.9943, time: 9.072 sec.


In [28]:
from src.utils.file_utils import dump_pickle_data

# Final model: a forest configured with the fine-grid winner, fitted on the
# complete X / y (fit() returns the fitted estimator itself)
model = RandomForestClassifier(**rf_fine.best_params_).fit(X, y)

# Persist the fitted classifier into the data folder under the name 'classifier'
dump_pickle_data(data_folder, 'classifier', model)

18:12:05 INFO (file_utils:10): Dumping pickle file into: ./data/classifier.pkl


In [29]:
from src.model.classifier.models_try_out import report_model_metrics

# Evaluate the trained classifier to later compare with the online classifier scored on the same data.
# NOTE: `model` was fitted on this very X / y, so the reported numbers are
# training-set scores and not an estimate of the performance on unseen data.
y_pred = model.predict(X)
report_model_metrics(y, y_pred, 'Classifier')

18:12:07 INFO (models_try_out:76): "Classifier" model, F1-score: 0.9999, Accuracy (TP+TN): 0.9999, Precision (FP): 0.9999, Recall (FN): 0.9999
