# **MLProcess - Air Quality**
---
**4 - Modeling, Tuning, and Evaluation**

In [1]:
# Import the required libraries.
import yaml
import joblib
import hashlib
import json

import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.ensemble import BaggingClassifier as BGC, RandomForestClassifier as RFC, AdaBoostClassifier as ABC, GradientBoostingClassifier as GBC

from sklearn.metrics import classification_report
from sklearn.model_selection import (
    RandomizedSearchCV,
    GridSearchCV
)

from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

## **1 - Configuration File**
---

In [2]:
# Function to load configuration parameter.
def load_config(path_config):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    path_config : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters.
    """

    # Try to load config.yaml file.
    try:
        with open(path_config, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        raise RuntimeError(f"Configuration file not found in {path_config}")

    return params

In [3]:
# Function to update configuration parameter.
def update_config(key, value, params, path_config):
    """
    Update the configuration parameter values.

    Parameters:
    ----------
    key : str
        The key to be updated.

    value : any type supported in Python
        The updated value.

    params : dict
        Loaded configuration parameters.

    path_config : str
        Configuration file location.

    Returns:
    -------
    config : dict
        Updated configuration parameters.
    """

    # To maintain the raw config immutable.
    params = params.copy()

    # Update the configuration parameters.
    params[key] = value

    with open(path_config, 'w') as file:
        yaml.dump(params, file)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Reload the updated configuration parameters.
    config = load_config(path_config)

    return config

In [4]:
# Load the configuration file.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Check the configuration parameters.
config

{'columns_datetime': ['tanggal'],
 'columns_int': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'columns_object': ['stasiun', 'critical', 'category'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'impute_co': 11.0,
 'impute_no2': 18.0,
 'impute_o3': 29.0,
 'impute_pm10': {'BAIK': 28.359154929577464, 'TIDAK BAIK': 55.17809298660362},
 'impute_pm25': {'BAIK': 39.827586206896555, 'TIDAK BAIK': 82.38564668769716},
 'impute_so2': 35.306404657933044,
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'path_clean_test': ['../data/processed/X_test_clean.pkl',
  '../data/processed/y_test_clean.pkl'],
 'path_clean_train': ['../data/processed/X_train_clean.pkl',
  '../data/processed/y_train_clean.pkl'],
 'path_clean_valid': ['../data/processed/X_valid_clean.pkl',
  '../data/processed/y_valid_clean.pkl'],
 'path_data_joined': '../data/interim/joined_dataset.pkl',
 'path_data_raw': '../data/ra

## **2 - Load Data**
---

In [6]:
# Function to load preprocessed data.
def load_data(config, data_conf):
    """
    Load the preprocessed data.

    Parameters:
    ----------
    config : dict
        The loaded configuration file.

    data_conf : str
        The data configuration type.
        The value must one of these value: ['train', 'valid', 'test']
    """

    # Ensure the data_conf is valid.
    list_data_conf = ["train", "valid", "test"]

    if data_conf not in list_data_conf:
        raise RuntimeError(f"The data configuration {data_conf} is invalid.")
    else:
        data_conf = str(data_conf)
        path = f"path_clean_{data_conf}"

        X = joblib.load(config[path][0])
        y = joblib.load(config[path][1])

        return X, y

In [7]:
# Load the data.
X_train, y_train = load_data(config, "train")
X_valid, y_valid = load_data(config, "valid")
X_test, y_test = load_data(config, "test")

## **3 - Training Log**
---

In [8]:
# Functions to support training log creation.
def time_stamp():
    return datetime.now()

def create_training_log():
    logger = {
        "model_name": [],
        "model_id": [],
        "training_time": [],
        "training_date": [],
        "train_f1": [],
        "cv_f1": [],
        "data_configuration": []
    }

    return logger

In [9]:
def update_training_log(current_log, path_log):
    """
    Update the training log.

    Parameters:
    ----------
    current_log : dict
        The training log current state.

    path_log : str
        The directory of training log.

    Returns:
    -------
    last_log : dict
        The updated training log.
    """

    # Ensure the current log immutable.
    current_log = current_log.copy()

    # Open the training log file.
    try:
        with open(path_log, 'r') as file:
            last_log = json.load(file)
    # If the training log does not exists.
    except FileNotFoundError as err:
        # Create the new training log.
        with open(path_log, 'w') as file:
            file.write("[]")

        # Reload the current training log.
        with open(path_log, 'r') as file:
            last_log = json.load(file)

    last_log.append(current_log)

    # Rewrite the training log with the updated one.
    with open(path_log, 'w') as file:
        json.dump(last_log, file)

    return last_log

## **4 - Model Training**
---

In [10]:
# Function to create model object.
def create_model_object():
    """Return a list of model to be fitted."""

    # Create model object.
    knn = KNN()
    lgr = LGR()
    dtc = DTC()
    bgc = BGC()
    rfc = RFC()
    abc = ABC()
    gbc = GBC()

    # Create list of model.
    list_of_model = [
        {"model_name": knn.__class__.__name__, "model_object": knn, "model_id": ""},
        {"model_name": lgr.__class__.__name__, "model_object": lgr, "model_id": ""},
        {"model_name": dtc.__class__.__name__, "model_object": dtc, "model_id": ""},
        {"model_name": bgc.__class__.__name__, "model_object": bgc, "model_id": ""},
        {"model_name": rfc.__class__.__name__, "model_object": rfc, "model_id": ""},
        {"model_name": abc.__class__.__name__, "model_object": abc, "model_id": ""},
        {"model_name": gbc.__class__.__name__, "model_object": gbc, "model_id": ""}
    ]

    return list_of_model

In [11]:
# Function to create hyperparameter space.
def create_param_space():
    """Return a dict of model hyperparameter."""

    # Define each model hyprerparameter space.
    knn_params = {
        "n_neighbors": [2, 3, 4, 5, 6, 10, 15, 20, 25],
        "weights": ["uniform", "distance"],
        "p": [1, 2]
    }

    lgr_params = {
        "C": [0.01, 0.1, 1.0, 10.0]
    }

    # Hyperparameter for DTC, RFC, and GBC.
    DEPTH = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

    # Hyperparameter for BGC, RFC, ABC, and GBC.
    B = [10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
    
    # Hyperparameter for ABC and GBC.
    LR = [0.001, 0.01, 0.05, 0.1, 1]

    dist_params = {
        "KNeighborsClassifier": knn_params,
        "LogisticRegression": lgr_params,
        "DecisionTreeClassifier": {
            "max_depth": DEPTH
        },
        "BaggingClassifier": {
            "n_estimators": B
        },
        "RandomForestClassifier": {
            "n_estimators": B,
            "max_depth": DEPTH
        },
        "AdaBoostClassifier": {
            "n_estimators": B,
            "learning_rate": LR
        },
        "GradientBoostingClassifier": {
            "n_estimators": B,
            "learning_rate": LR,
            "max_depth": DEPTH
        }
    }

    return dist_params

In [17]:
# Function to fit & tune model (do CV + HT).
def evaluate_model(models, hyperparameters, path_log):
    """Cross validation & hyperparameter tuning."""

    # Load data train and valid.
    X_train, y_train = load_data(config, "train")

    # Create training log.
    logger = create_training_log()

    # Define a dictionary to store the trained models.
    trained_models = {}

    # For each data configuration.
    for data_conf in X_train:
        X_train_conf = X_train[data_conf]
        y_train_conf = y_train[data_conf]
        print(f"Data Conf : {str(data_conf).upper()}")
        # Fit & tune each model.
        for m, h in zip(models, hyperparameters):
            print(f"Fit & Tune Model : {m['model_name']}...")
            # Create tuner object.
            tuner = RandomizedSearchCV(
                estimator = m["model_object"],
                param_distributions = hyperparameters[h],
                n_iter = 100,
                scoring = "f1",
                cv = 5,
                return_train_score = True,
                n_jobs = -1,
                verbose = 1
            )

            # Compute the training time.
            start_time = time_stamp()
            tuner.fit(X_train_conf, y_train_conf)
            finished_time = time_stamp()

            training_time = finished_time - start_time
            training_time = training_time.total_seconds()

            # Get the model with best hyperparameters.
            best_model = tuner.best_estimator_

            # Get the scores of best model.
            best_index = tuner.best_index_
            train_f1 = tuner.cv_results_["mean_train_score"][best_index]
            cv_f1 = tuner.cv_results_["mean_test_score"][best_index]

            # Update the training log.
            model_name = f"{data_conf} - {m["model_name"]}"
            logger["model_name"].append(model_name)

            plain_id = str(training_time)
            cipher_id = hashlib.md5(plain_id.encode()).hexdigest()
            logger["model_id"].append(cipher_id)

            logger["training_time"].append(training_time)
            logger["training_date"].append(str(start_time))
            logger["train_f1"].append(train_f1)
            logger["cv_f1"].append(cv_f1)
            logger["data_configuration"].append(data_conf)

            # Store the best model.
            trained_models[model_name] = best_model
        print()
        
    training_log = update_training_log(logger, path_log)

    return trained_models, training_log

In [18]:
PATH_LOG = "../logs/training_log.json"

models = create_model_object()
hyperparameters = create_param_space()

trained_models, training_log = evaluate_model(models, hyperparameters, PATH_LOG)

Data Conf : UNDERSAMPLING
Fit & Tune Model : KNeighborsClassifier...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fit & Tune Model : LogisticRegression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fit & Tune Model : DecisionTreeClassifier...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fit & Tune Model : BaggingClassifier...
Fitting 5 folds for each of 11 candidates, totalling 55 fits
Fit & Tune Model : RandomForestClassifier...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fit & Tune Model : AdaBoostClassifier...
Fitting 5 folds for each of 55 candidates, totalling 275 fits
Fit & Tune Model : GradientBoostingClassifier...
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Data Conf : OVERSAMPLING
Fit & Tune Model : KNeighborsClassifier...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fit & Tune Model : LogisticRegression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fit & 

In [19]:
def training_log_to_df(training_log):
    training_result = pd.DataFrame()

    for log in training_log:
        training_result = pd.concat([training_result, pd.DataFrame(log)])

    training_result = training_result.sort_values(
        ["cv_f1", "training_time"],
        ascending = [False, True]
    )

    training_result = training_result.reset_index(drop=True)

    return training_result

In [20]:
training_result = training_log_to_df(training_log)

In [21]:
training_result

Unnamed: 0,model_name,model_id,training_time,training_date,train_f1,cv_f1,data_configuration
0,Undersampling - DecisionTreeClassifier,a4149d62b97da16a889ca18f9e803df4,0.42839,2026-02-16 14:38:36.740479,1.0,1.0,Undersampling
1,Undersampling - BaggingClassifier,24c7ae353d01969179e0ccc0b30a9822,13.069867,2026-02-16 14:38:37.169201,1.0,1.0,Undersampling
2,Undersampling - GradientBoostingClassifier,67c2f1e17570839b0e787db9cb36523c,46.876801,2026-02-16 14:41:26.864004,1.0,1.0,Undersampling
3,Oversampling - AdaBoostClassifier,86719065889cc71038a6101e9cc920be,78.246733,2026-02-16 14:44:36.212798,1.0,1.0,Oversampling
4,SMOTE - RandomForestClassifier,6b0f23d4e95eea49415da372572b1584,142.840787,2026-02-16 14:47:50.722258,1.0,0.999613,SMOTE
5,SMOTE - GradientBoostingClassifier,47117d3e2142759f263e30f4cdbee990,133.363865,2026-02-16 14:51:45.284785,1.0,0.999229,SMOTE
6,Oversampling - RandomForestClassifier,1acd02cf65cc73bc702e4cdbd85b43c3,119.692405,2026-02-16 14:42:36.520149,1.0,0.999228,Oversampling
7,Oversampling - DecisionTreeClassifier,08d84734268b4abe6e772d263660c94b,0.402997,2026-02-16 14:42:18.351540,1.0,0.998844,Oversampling
8,Oversampling - BaggingClassifier,df4a1d67913b1693ce5f13612ab2b6d2,17.765183,2026-02-16 14:42:18.754742,0.999711,0.998844,Oversampling
9,SMOTE - BaggingClassifier,8b178c1239d9d3626b9c60b923480d8c,23.930947,2026-02-16 14:47:26.791088,1.0,0.998844,SMOTE


## **5 - Model Serialization**
---

In [23]:
# Serialize the best model.
best = "Undersampling - DecisionTreeClassifier"
best_model = trained_models[best]

PATH_PRODUCTION_MODEL = "../models/best_model.pkl"
joblib.dump(best_model, PATH_PRODUCTION_MODEL)

['../models/best_model.pkl']

In [24]:
# Update the configuration parameter.
config = update_config(
    key = "path_production_model",
    value = PATH_PRODUCTION_MODEL,
    params = config,
    path_config = PATH_CONFIG
)

Params Updated! 
Key: path_production_model 
Value: ../models/best_model.pkl

