In [1]:
import os
# Display the kernel's current working directory: the relative './data' paths
# built in the later cells are resolved against it
os.getcwd() 

'/Users/izapreev/Projects/ML-PT'

In [2]:
# Enable the IPython autoreload extension (mode 2) so that edits made to the
# imported project sources are reloaded into the running kernel automatically,
# without having to restart it
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np

from src.utils.logger import logger

In [4]:
from src.utils.file_utils import load_numpy_zc

# Read the features array 'X' from the compressed numpy file in the data folder
file_name = os.path.join(os.curdir, 'data', 'X.npz')
(X,) = load_numpy_zc(file_name, ['X'])

16:09:26 INFO (file_utils:32): Loading compressed numpy z file from: ./data/X.npz
16:09:26 INFO (file_utils:35): File loading and data extraction are done!


In [5]:
# Read the classes array 'y' from the compressed numpy file in the data folder
file_name = os.path.join(os.curdir, 'data', 'y.npz')
(y,) = load_numpy_zc(file_name, ['y'])

16:09:26 INFO (file_utils:32): Loading compressed numpy z file from: ./data/y.npz
16:09:26 INFO (file_utils:35): File loading and data extraction are done!


In [6]:
#########################################################################
# 00: Run the initial non-tuned classifier
#

In [7]:
from src.model.classifier.models_try_out import (
    _CLASSIFIERS,
    train_test_single_model,
)

# Look up the un-tuned Random Forest in the project's classifiers mapping and
# evaluate it as is, to get a baseline for the tuning steps that follow
name = 'Random Forest Classifier'
model_00 = _CLASSIFIERS[name]
model_00_res = train_test_single_model(model_00, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

16:12:55 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9937, accuracy: 0.9939, precision: 0.9939, recall: 0.9939, time: 6.8717 sec.


In [8]:
#########################################################################
# 01: Perform Random search hyperparameters tuning
#

In [9]:
# Hyperparameter space sampled by the random search; the 'max_features' and
# 'max_leaf_nodes' candidates are derived from the loaded data
num_features = X.shape[1]
num_classes = len(np.unique(y))
random_grid = dict(
    n_estimators=list(range(20, 81, 10)),
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_leaf=[1, 2, 4, 6],
    min_samples_split=[2, 4, 6, 8],
    # Every feature count from a single feature up to all of them
    max_features=[count for count in range(1, num_features + 1)],
    # Unlimited, or 1x / 2x / 3x the number of distinct classes
    max_leaf_nodes=[None] + [factor * num_classes for factor in (1, 2, 3)],
)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed for the candidate sampling, so that a re-run draws the same
# parameter combinations; without it every run explores a different subset of
# the search space and the search result cannot be reproduced.
# NOTE(review): the randomness of the forest itself is governed by the
# random_state of model_00, which is not visible in this notebook - confirm.
RANDOM_SEARCH_SEED = 42

# Random search of parameters, using 3 fold cross validation, search across
# 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=model_00,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=RANDOM_SEARCH_SEED,
)

# Fit the random search model
logger.info('Start the Random Grid model tuning')
rf_random.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best Random search parameters: {rf_random.best_params_}')

16:12:56 INFO (2524318397:10): Start the Random Grid model tuning


Fitting 3 folds for each of 100 candidates, totalling 300 fits


16:16:40 INFO (2524318397:14): Found the best Random search parameters: {'n_estimators': 50, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_leaf_nodes': None, 'max_features': 7, 'max_depth': None, 'criterion': 'entropy'}


In [11]:
# Import the estimator class in this cell, so that it does not depend on a
# name left over in the kernel and runs after Restart & Run All
from sklearn.ensemble import RandomForestClassifier

# Evaluate a new model built from the best parameters of the random search
model_01 = RandomForestClassifier(**rf_random.best_params_)
model_01_res = train_test_single_model(model_01, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

16:22:43 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9944, accuracy: 0.9945, precision: 0.9946, recall: 0.9945, time: 12.0893 sec.


In [12]:
#########################################################################
# 02: Perform coarse Grid search hyperparameters tuning
#

In [21]:
# Hyperparameter space for the coarse grid search
# (4 x 1 x 4 x 3 x 3 x 3 x 1 = 432 combinations)
coarse_grid = dict(
    n_estimators=range(40, 60, 5),
    criterion=['entropy'],
    max_depth=range(20, 40, 5),
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 3, 4],
    max_features=[7, 8, 9],
    max_leaf_nodes=[None],
)

In [22]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search over all the `coarse_grid` combinations, using
# 3 fold cross validation and all available cores
rf_coarse = GridSearchCV(
    estimator=model_00,
    param_grid=coarse_grid,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Fit the coarse grid search model
logger.info('Start the coarse Grid model tuning')
rf_coarse.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best coarse Grid search parameters: {rf_coarse.best_params_}')

16:54:46 INFO (778038824:7): Start the coarse Grid Grid model tuning


Fitting 3 folds for each of 432 candidates, totalling 1296 fits


17:16:07 INFO (778038824:11): Found the best Random search parameters: {'criterion': 'entropy', 'max_depth': 30, 'max_features': 7, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 55}


In [20]:
# Import the estimator class in this cell, so that it does not depend on a
# name left over in the kernel and runs after Restart & Run All
from sklearn.ensemble import RandomForestClassifier

# Evaluate a new model built from the best parameters of the coarse grid search
model_02 = RandomForestClassifier(**rf_coarse.best_params_)
model_02_res = train_test_single_model(model_02, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

16:54:46 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9947, accuracy: 0.9948, precision: 0.9949, recall: 0.9948, time: 13.1837 sec.


In [26]:
#########################################################################
# 03: Perform fine Grid search hyperparameters tuning
#

In [23]:
# Hyperparameter space for the fine grid search
# (9 x 1 x 9 x 1 x 1 x 3 x 1 = 243 combinations)
# NOTE(review): 'max_features' lists 5, 7, 8 and skips 6 - confirm it is intended
fine_grid = dict(
    n_estimators=range(51, 60),
    criterion=['entropy'],
    max_depth=range(26, 35),
    min_samples_leaf=[1],
    min_samples_split=[2],
    max_features=[5, 7, 8],
    max_leaf_nodes=[None],
)

In [24]:
# Exhaustive grid search over all the `fine_grid` combinations, using
# 3 fold cross validation and all available cores
rf_fine = GridSearchCV(
    estimator=model_00,
    param_grid=fine_grid,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Fit the fine grid search model; the start message names the fine search, so
# that the log can be told apart from the one of the coarse search
logger.info('Start the fine Grid model tuning')
rf_fine.fit(X, y)

# Log the best parameters found
logger.info(f'Found the best fine Grid search parameters: {rf_fine.best_params_}')

17:19:18 INFO (1003720481:5): Start the coarse Grid Grid model tuning


Fitting 3 folds for each of 243 candidates, totalling 729 fits


17:31:11 INFO (1003720481:9): Found the best fine Grid search parameters: {'criterion': 'entropy', 'max_depth': 29, 'max_features': 7, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 57}


In [25]:
# Import the estimator class in this cell, so that it does not depend on a
# name left over in the kernel and runs after Restart & Run All
from sklearn.ensemble import RandomForestClassifier

# Evaluate a new model built from the best parameters of the fine grid search
model_03 = RandomForestClassifier(**rf_fine.best_params_)
model_03_res = train_test_single_model(model_03, name, X, y)

"Random Forest Classifier" - repeated k-fold cross-validation: 0it [00:00, ?it/s]

17:37:54 INFO (models_try_out:125): The "Random Forest Classifier" model f1-score: 0.9947, accuracy: 0.9948, precision: 0.9949, recall: 0.9948, time: 13.4273 sec.


In [31]:
from src.utils.file_utils import compressed_pickle

# Persist the best parameters of the fine grid search into the data folder
output_folder = os.path.join(os.curdir, 'data')
compressed_pickle(output_folder, 'classifier_params', rf_fine.best_params_)

17:47:47 INFO (file_utils:13): Dumping compressed pickle file into: ./data/classifier_params.pbz2
17:47:47 INFO (file_utils:16): File dumping is done!
