# Data Modeling
In this step we start building the models. Before fitting them, we first search for the best hyperparameters (hyperparameter tuning) using grid search with cross-validation.

We use two types of machine learning models in this project, Random Forest and k-Nearest Neighbors, and give both the same treatment. Random Forest is also used here to check the importance of each sensor feature for classifying each class.

In each modeling process (hyperparameter tuning and model fitting) we will log the process and its result, and then save it to log files.

In [1]:
import sys
sys.path.append('..') 

In [2]:
import src.util as utils

import json
import os
from datetime import datetime as dt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV

In [3]:
# Resolve the project root only once per kernel session. Without the guard,
# every re-run of this cell would climb one more directory level, because the
# parent is computed from whatever the current working directory is by then.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.dirname(os.getcwd())

# Switch to the project root and display the resulting working directory.
os.chdir(PROJECT_ROOT)
os.getcwd()

'd:\\ML\\PACMANN INTRO PROJECT'

In [4]:
# Load the project configuration through the project's util helper.
# Later cells index it like a dict to look up data, log and model file paths.
config = utils.load_config()

### Data Loading

In [5]:
def _load_split(path_key):
    """Load the two pickles listed under ``config[path_key]``.

    Element 0 of the configured entry is loaded first and returned as the
    features object, element 1 is loaded second and returned as the target.
    """
    paths = config[path_key]
    features = utils.pickle_load(paths[0])
    target = utils.pickle_load(paths[1])
    return features, target


# Train, test and validation sets (presumably the feature-engineered
# versions produced by an earlier notebook; confirm against the config file).
X_train_feng, y_train_feng = _load_split('train_feng_set_path')
X_test_feng, y_test_feng = _load_split('test_feng_set_path')
X_val_feng, y_val_feng = _load_split('val_feng_set_path')

### Define logging functions

In [6]:
def time_stamp(to_str = False):
    """Return the current local time.

    Args:
        to_str: When truthy, return the time formatted as
            ``"YYYY-MM-DD HH:MM:SS"``; otherwise return the ``datetime``
            object itself (useful for computing elapsed time).

    Returns:
        A ``str`` or a ``datetime``, depending on ``to_str``.
    """
    now = dt.now()
    return now.strftime("%Y-%m-%d %H:%M:%S") if to_str else now

In [7]:
def log_json(current_log: dict, log_path: str):
    """Append one entry to a JSON-array log file and return the full log.

    The file at ``log_path`` is expected to hold a single JSON list. If the
    file does not exist yet, the log starts out as an empty list and the file
    is created when the updated list is written.

    Args:
        current_log: Entry to append. A shallow copy is stored, so later
            top-level changes to the caller's dict do not affect the
            returned list.
        log_path: Path of the JSON log file to read and rewrite.

    Returns:
        The list of all logged entries, including the one just appended.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
    """
    current_log = current_log.copy()

    try:
        # The `with` block closes the file; no explicit close() is needed.
        with open(log_path, "r") as file:
            last_log = json.load(file)
    except FileNotFoundError:
        # No log yet: start from an empty list instead of writing "[]" to
        # disk and reading it straight back.
        last_log = []

    last_log.append(current_log)

    with open(log_path, "w") as file:
        json.dump(last_log, file)

    return last_log

In [8]:
def log_json(logs: dict, file_path: str):
    """Append ``logs`` to ``file_path`` as one JSON document per line.

    NOTE(review): this reuses the name ``log_json`` defined in the previous
    cell, so it replaces that JSON-array version for every later cell.

    Logging is best-effort: any error is printed rather than raised, so a
    failed log write does not stop the notebook.

    Args:
        logs: JSON-serializable data to record.
        file_path: Log file to append to; it is created if it is missing.

    Returns:
        None.
    """
    try:
        # Serialize up front: json.dump() streams chunks into the file, so a
        # non-serializable value would leave a half-written record behind.
        record = json.dumps(logs)

        # Append mode creates the file when it does not exist yet.
        with open(file_path, 'a') as f:
            f.write(record + '\n')  # One record per line

    except Exception as e:
        print(f"Error while logging JSON data: {e}")

### Hyperparameter Tuning (Random Forest model)
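Using grid search with cross-validation, we will find the best number of estimators (`n_estimators`), maximum tree depth (`max_depth`), minimum samples per split (`min_samples_split`) and minimum samples per leaf (`min_samples_leaf`).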

In [9]:
# Random Forest hyperparameter search: every combination of the candidate
# values below is scored by accuracy with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Fixed seed so the forests built during the search are reproducible
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available CPU cores
)
grid_search.fit(X_train_feng, y_train_feng)

# Report the winning combination
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [10]:
# Display the cross-validation results of the Random Forest grid search as a table
# (one row per parameter combination, with timings, per-split scores and rank).
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.070870,0.009679,0.005997,0.001863,,1,2,10,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.970938,0.968296,0.951058,0.973545,0.968254,0.966418,0.007925,25
1,0.332874,0.029619,0.013863,0.002389,,1,2,50,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.970938,0.970938,0.958995,0.974868,0.969577,0.969063,0.005336,1
2,0.627023,0.048648,0.026796,0.003165,,1,2,100,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.972259,0.969617,0.960317,0.972222,0.969577,0.968798,0.004402,8
3,1.330774,0.068554,0.055037,0.003295,,1,2,200,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.972259,0.970938,0.961640,0.972222,0.968254,0.969063,0.003987,4
4,0.060414,0.003960,0.003605,0.000374,,1,5,10,"{'max_depth': None, 'min_samples_leaf': 1, 'mi...",0.961691,0.959049,0.957672,0.969577,0.965608,0.962719,0.004368,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,1.231304,0.010476,0.053834,0.005179,30,4,5,200,"{'max_depth': 30, 'min_samples_leaf': 4, 'min_...",0.959049,0.957728,0.956349,0.965608,0.962963,0.960339,0.003437,97
140,0.065155,0.003431,0.004400,0.001356,30,4,10,10,"{'max_depth': 30, 'min_samples_leaf': 4, 'min_...",0.961691,0.961691,0.956349,0.964286,0.956349,0.960073,0.003185,118
141,0.300605,0.013107,0.013824,0.000685,30,4,10,50,"{'max_depth': 30, 'min_samples_leaf': 4, 'min_...",0.960370,0.957728,0.955026,0.966931,0.961640,0.960339,0.004005,104
142,0.603601,0.027730,0.028446,0.003920,30,4,10,100,"{'max_depth': 30, 'min_samples_leaf': 4, 'min_...",0.963012,0.957728,0.953704,0.966931,0.962963,0.960868,0.004625,93


Save the hyperparameter tuning results to the JSON log file.

In [11]:
# Stamp every row of the Random Forest search results with the logging time,
# then hand the table to the JSON logger as a plain dict.
_grid_search_result_df = pd.DataFrame(grid_search.cv_results_).assign(
    timestamp=time_stamp(to_str=True)
)
log_json(
    _grid_search_result_df.to_dict(),
    config['hyperparameter_tuning_log_path'],
)

### Hyperparameter Tuning (kNN model)
Using grid search with cross-validation, we will find the best number of neighbors (`n_neighbors`), weighting scheme (`weights`), kNN search algorithm (ball tree or k-d tree) and leaf size (`leaf_size`).

In [12]:
# kNN hyperparameter search: every combination of the candidate values below
# is scored by accuracy with 5-fold cross-validation.
param_grid_knn = dict(
    n_neighbors=[5, 10, 20],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree'],
    leaf_size=[10, 30, 50],
)

knn = KNeighborsClassifier()

grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available CPU cores
)
grid_search_knn.fit(X_train_feng, y_train_feng)

# Report the winning combination
best_params = grid_search_knn.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 5, 'weights': 'distance'}


In [13]:
# Display the cross-validation results of the kNN grid search as a table
# (one row per parameter combination, with timings, per-split scores and rank).
pd.DataFrame(grid_search_knn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0032,0.0004,0.045002,0.004337,ball_tree,10,5,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.970938,0.968296,0.953704,0.96164,0.968254,0.964566,0.00624,13
1,0.004599,0.000799,0.012201,0.002227,ball_tree,10,5,distance,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.976222,0.970938,0.96164,0.964286,0.974868,0.969591,0.005744,1
2,0.007601,0.006218,0.048921,0.004315,ball_tree,10,10,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.961691,0.952444,0.953704,0.958995,0.957672,0.956901,0.003406,25
3,0.004903,0.000915,0.012905,0.00066,ball_tree,10,10,distance,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.965654,0.968296,0.960317,0.969577,0.962963,0.965361,0.003396,7
4,0.004702,0.000605,0.05,0.008576,ball_tree,10,20,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.952444,0.944518,0.94709,0.960317,0.952381,0.95135,0.005431,31
5,0.004199,0.0004,0.015395,0.000489,ball_tree,10,20,distance,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",0.965654,0.96037,0.957672,0.968254,0.96164,0.962718,0.003779,19
6,0.0038,0.000749,0.0448,0.001033,ball_tree,30,5,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n...",0.970938,0.968296,0.953704,0.96164,0.968254,0.964566,0.00624,13
7,0.004201,0.001469,0.014407,0.001661,ball_tree,30,5,distance,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n...",0.976222,0.970938,0.96164,0.964286,0.974868,0.969591,0.005744,1
8,0.003902,0.000493,0.051689,0.002757,ball_tree,30,10,uniform,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n...",0.961691,0.952444,0.953704,0.958995,0.957672,0.956901,0.003406,25
9,0.004641,0.00143,0.017402,0.001624,ball_tree,30,10,distance,"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n...",0.965654,0.968296,0.960317,0.969577,0.962963,0.965361,0.003396,7


Save the hyperparameter tuning results to the JSON log file.

In [14]:
# Stamp every row of the kNN search results with the logging time,
# then hand the table to the JSON logger as a plain dict.
_grid_search_result_df = pd.DataFrame(grid_search_knn.cv_results_).assign(
    timestamp=time_stamp(to_str=True)
)
log_json(
    _grid_search_result_df.to_dict(),
    config['hyperparameter_tuning_log_path'],
)

### Model Training
Train each model with the best parameters found by the hyperparameter tuning process.

In [15]:
# Fit the final Random Forest with the tuned hyperparameters and time
# the construction + fit.
start_time = time_stamp()

# fit() returns the fitted estimator itself, so it can be chained
clf = RandomForestClassifier(
    random_state=42,
    **grid_search.best_params_,
).fit(X_train_feng, y_train_feng)

finished_time = time_stamp()
elapsed_time = (finished_time - start_time).total_seconds()

print("Training time elapsed:", elapsed_time, "s")
print("Classifier score on training data:", clf.score(X_train_feng, y_train_feng))

Training time elapsed: 0.263005 s
Classifier score on training data: 1.0


In [16]:
# Fit the final kNN classifier with the tuned hyperparameters and time
# the construction + fit.
start_time = time_stamp()

# fit() returns the fitted estimator itself, so it can be chained
knn_clf = KNeighborsClassifier(
    **grid_search_knn.best_params_,
).fit(X_train_feng, y_train_feng)

finished_time = time_stamp()
elapsed_time = (finished_time - start_time).total_seconds()

print("Training time elapsed:", elapsed_time, "s")
print("Classifier score on training data:", knn_clf.score(X_train_feng, y_train_feng))

Training time elapsed: 0.004025 s
Classifier score on training data: 1.0


Check each trained model's accuracy score on the validation set.

In [20]:
# Score the fitted Random Forest on the validation split.
# NOTE(review): `best_val_score` is reassigned by the kNN validation cell,
# so it only holds the Random Forest score until that cell runs.
best_val_score = clf.score(X_val_feng, y_val_feng)
print(f'Best RF model accuracy score on val set: {best_val_score}')

Best RF model accuracy score on val set: 0.9703125


In [18]:
# Score the fitted kNN classifier on the validation split.
# NOTE(review): this reuses `best_val_score`, overwriting the value stored
# by the Random Forest validation cell.
best_val_score = knn_clf.score(X_val_feng, y_val_feng)
print(f'Best kNN model accuracy score on val set: {best_val_score}')

Best kNN model accuracy score on val set: 0.971875


In [19]:
# Persist both fitted models with the project's pickle helper:
# the Random Forest goes to the configured production model path,
# the kNN classifier to its own configured path.
utils.pickle_dump(clf, config['production_model_path'])
utils.pickle_dump(knn_clf, config['knn_model_path'])