# Traditional Machine Learning - Multiclass classification and regression models

In [1]:
import sys
import os
sys.path.append('../')
from utils import multiclass_classification_utils

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from matplotlib.patches import Ellipse
from sklearn.decomposition import PCA

%load_ext autoreload
%autoreload 2

In [2]:
# Folder used to build the per-experiment output file paths in the cells below.
save_resuts_folder = "results/results_v1.12"

# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(save_resuts_folder, exist_ok=True)

In [None]:
# Paths of the JSON data files (v2) for the CHROMA-FIT variants ...
CHROMA_FIT_data = "training_data/v2/CHROMA-FIT_data.json"
CHROMA_FIT_CSEC_data = "training_data/v2/CHROMA-FIT_CSEC_data.json"
CHROMA_FIT_exposure_color_correction_data = (
    "training_data/v2/CHROMA-FIT_exposure_color_correction_data.json"
)

# ... and for the matching MST variants.
MST_data = "training_data/v2/MST_data.json"
MST_CSEC_data = "training_data/v2/MST_CSEC_data.json"
MST_exposure_color_correction_data = (
    "training_data/v2/MST_exposure_color_correction_data.json"
)

nr_of_test_images = 64

# Set to True when preparing data for the regression models and to False for
# the classification models.
swatches = False

# Experiment base name -> CHROMA-FIT data file.
chroma_fit_variants = {
    "CHROMA_FIT_data": CHROMA_FIT_data,
    "CHROMA_FIT_CSEC_data": CHROMA_FIT_CSEC_data,
    "CHROMA_FIT_exposure_color_correction_data": CHROMA_FIT_exposure_color_correction_data,
}

# Experiment base name -> MST data file that received the same processing.
mst_counterparts = {
    "CHROMA_FIT_data": MST_data,
    "CHROMA_FIT_CSEC_data": MST_CSEC_data,
    "CHROMA_FIT_exposure_color_correction_data": MST_exposure_color_correction_data,
}

# The four groups are evaluated top to bottom, so the preparation calls run in
# the same order as the resulting dictionary keys.
all_data = {
    # splits: each variant on its own
    **{
        name: multiclass_classification_utils.prepare_data_with_split(
            path, swatches=swatches
        )
        for name, path in chroma_fit_variants.items()
    },
    # original: each variant combined with the unprocessed CHROMA-FIT data
    **{
        f"{name}_original": multiclass_classification_utils.prepare_data_with_original(
            path, CHROMA_FIT_data, nr_of_test_images, swatches=swatches
        )
        for name, path in chroma_fit_variants.items()
    },
    # MST original: each variant combined with the unprocessed MST data
    **{
        f"{name}_MST_original": multiclass_classification_utils.prepare_data_with_MST(
            path, MST_data, swatches=swatches
        )
        for name, path in chroma_fit_variants.items()
    },
    # MST splits: each variant combined with the equally processed MST data
    **{
        f"{name}_MST": multiclass_classification_utils.prepare_data_with_MST(
            path, mst_counterparts[name], swatches=swatches
        )
        for name, path in chroma_fit_variants.items()
    },
}

# Multiclass classification

1. Extracts the pre-processed training and testing data
2. Runs the Random Forest Classifier, Logistic Regression, SVC and MLP Classifier models
3. Saves the output to the output folder

In [None]:
# Fit every classifier on every prepared experiment; all models of one
# experiment share the same output file path.
for experiment, split in all_data.items():

    print(experiment)
    output_file = f"{save_resuts_folder}/{experiment}.txt"

    X_train, X_test, y_train, y_test = split

    # class_weight='balanced' is a possible option for the first three models;
    # it is not enabled here.
    rf = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
    lr = LogisticRegression(random_state=42)
    svm = SVC(random_state=42, kernel="rbf")
    mlp = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(100,))

    # Same order as before: random forest, logistic regression, SVC, MLP.
    for classifier in (rf, lr, svm, mlp):
        multiclass_classification_utils.train_model_classification(
            classifier, experiment, output_file, X_train, X_test, y_train, y_test
        )

    print("\n")

# Multiclass regression

1. Extracts the pre-processed training and testing data
2. Runs the XGBoost, Multi-Layer Perceptron Regressor, Support Vector Regression and LightGBM models
3. Saves the output to the output folder

In [None]:
# Fit every regressor on every prepared experiment.
# NOTE(review): every call passes swatches=True, so all_data is presumably
# expected to have been prepared with swatches=True before this cell runs -
# confirm.
for experiment, split in all_data.items():

    print(experiment)
    output_file = f"{save_resuts_folder}/{experiment}.txt"

    X_train, X_test, y_train, y_test = split

    # XGBoost, wrapped so that one model is fitted per target column.
    xg_reg = xgb.XGBRegressor(
        objective="reg:squarederror", n_estimators=100, random_state=42
    )
    multi_output_xgb = MultiOutputRegressor(xg_reg)

    # The MLP regressor is passed on without a MultiOutputRegressor wrapper.
    mlp_regressor = MLPRegressor(
        random_state=42, max_iter=300, hidden_layer_sizes=(100, 100)
    )

    # SVR with default hyper-parameters. An earlier GridSearchCV run (cv=5,
    # scoring='neg_mean_squared_error') over
    #   estimator__C in [1, 10, 100, 1000],
    #   estimator__epsilon in [0.01, 0.1, 0.2],
    #   estimator__kernel in ['rbf', 'linear', 'poly'],
    #   estimator__gamma in ['scale', 'auto']
    # recorded the best parameters
    #   {'estimator__C': 100, 'estimator__epsilon': 0.1,
    #    'estimator__gamma': 'scale', 'estimator__kernel': 'rbf'},
    # i.e. SVR(kernel='rbf', C=100, epsilon=0.1, gamma='scale').
    svr = SVR(kernel="rbf")
    multi_output_svr = MultiOutputRegressor(svr)

    # LightGBM, again one model per target column.
    lgb_regressor = lgb.LGBMRegressor(random_state=42, n_estimators=100)
    multi_output_lgb = MultiOutputRegressor(lgb_regressor)

    # Same order as before: XGBoost, MLP, SVR, LightGBM.
    for regressor in (
        multi_output_xgb,
        mlp_regressor,
        multi_output_svr,
        multi_output_lgb,
    ):
        multiclass_classification_utils.train_model_regression(
            regressor,
            experiment,
            output_file,
            X_train,
            X_test,
            y_train,
            y_test,
            swatches=True,
        )

    print("\n")

## Fine-tune the SVR model, since it performed best

In [None]:
from sklearn.model_selection import GridSearchCV


# Hyper-parameter search space; the "estimator__" prefix addresses the SVR
# wrapped inside the MultiOutputRegressor.
# A tolerance axis ('estimator__tol': [1e-4, 1e-3, 1e-2]) was considered as a
# further stopping-criterion parameter but is not part of the grid.
# NOTE(review): gamma is presumably ignored by the linear kernel, so those
# combinations look redundant - confirm before trimming the grid.
param_grid = {
    "estimator__C": np.logspace(-3, 3, 7),
    "estimator__gamma": ["scale", "auto", 0.1, 0.01, 1],
    "estimator__epsilon": [0.01, 0.1, 0.2, 0.5],
    "estimator__kernel": ["rbf", "linear", "poly"],
}

# Fine-tune on one experiment only: the first key of all_data (nothing runs
# when all_data is empty).
for experiment in list(all_data)[:1]:

    output_file = f"{save_resuts_folder}/{experiment}.txt"

    X_train, X_test, y_train, y_test = all_data[experiment]

    svr = SVR(kernel="rbf")
    multi_output_svr = MultiOutputRegressor(svr)
    print("Default Parameters of multi_output_svr:", multi_output_svr.get_params())

    # 5-fold cross-validated search using all available CPU cores.
    grid_search = GridSearchCV(
        multi_output_svr,
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )

    multiclass_classification_utils.train_model_regression(
        grid_search,
        experiment,
        output_file,
        X_train,
        X_test,
        y_train,
        y_test,
        swatches=True,
    )

# Principal component analysis for classification model

1. Identify patterns or trends in high-dimensional data
2. Simplify data for visualization 

In [4]:
# Swatch colours used to draw data points in MST colours: the integer labels
# 1-10 map to [R, G, B] lists on the 0-255 scale.
SWATCHES_MAPPING = dict(
    enumerate(
        (
            [246, 237, 228],
            [243, 231, 219],
            [247, 234, 208],
            [234, 218, 186],
            [215, 189, 150],
            [160, 126, 86],
            [130, 92, 67],
            [96, 65, 52],
            [58, 49, 42],
            [41, 36, 32],
        ),
        start=1,
    )
)

# The same colours as float arrays scaled to the 0-1 range.
SWATCHES_MAPPING_NORMALIZED = {
    label: np.asarray(rgb) / 255.0 for label, rgb in SWATCHES_MAPPING.items()
}

In [None]:
# Data for the PCA plots: every CHROMA-FIT and MST variant is prepared on its
# own with prepare_data_with_split.

swatches = False

all_data = {
    name: multiclass_classification_utils.prepare_data_with_split(
        path, swatches=swatches
    )
    for name, path in (
        # splits
        ("CHROMA_FIT_data", CHROMA_FIT_data),
        ("CHROMA_FIT_CSEC_data", CHROMA_FIT_CSEC_data),
        (
            "CHROMA_FIT_exposure_color_correction_data",
            CHROMA_FIT_exposure_color_correction_data,
        ),
        # MST splits
        ("MST_data", MST_data),
        ("MST_CSEC_data", MST_CSEC_data),
        (
            "MST_exposure_color_correction_data",
            MST_exposure_color_correction_data,
        ),
    )
}

In [None]:
# Define the folder to save the PCA plots
save_folder = "/home/dasec-notebook/Thesis/visualization/PCA/MST/classification"

# plt.savefig does not create missing directories, so make sure the target
# folder exists before the first plot is written.
os.makedirs(save_folder, exist_ok=True)

# For every experiment three figures are saved: the cumulative explained
# variance, a scatter plot of the first two principal components, and the same
# scatter plot with one confidence ellipse per class.
for experiment in all_data:

    print(experiment)

    X_train, X_test, y_train, y_test = all_data[experiment]

    # concatenate the training and test data to display all data
    X_train = np.concatenate((X_train, X_test), axis=0)
    y_train = np.concatenate((y_train, y_test), axis=0)

    # Fit PCA with all components to obtain the full variance profile
    pca = PCA().fit(X_train)
    explained_variance = np.cumsum(pca.explained_variance_ratio_)

    # Plot cumulative explained variance
    plt.figure(figsize=(8, 5))
    plt.plot(
        range(1, len(explained_variance) + 1),
        explained_variance,
        marker="o",
        linestyle="--",
    )
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance")
    # plt.title('Explained Variance vs. Number of Components')
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xticks(np.arange(1, len(explained_variance) + 1, 1))
    plt.axhline(y=0.95, color="r", linestyle="--", label="95% Variance Explained")
    plt.legend()
    plt.grid()
    plt.savefig(f"{save_folder}/{experiment}_variance.png")
    plt.show()

    # Find the number of components that explain 95% of variance and plot PCA.
    # argmax returns the index of the first True, hence the +1.
    n_components = int(np.argmax(explained_variance >= 0.95)) + 1
    print(f"Number of components for 95% variance explained: {n_components}")

    # The plots below index components 0 and 1, so keep at least two
    # components even when the first one alone reaches the 95% threshold.
    pca = PCA(n_components=max(n_components, 2))
    X_train_pca = pca.fit_transform(X_train)

    # NOTE(review): with swatches=True the scaled targets are still used as
    # keys of SWATCHES_MAPPING_NORMALIZED below, which looks like it cannot
    # work; this cell is presumably only meant to run with swatches=False -
    # confirm.
    if swatches:
        y_train = y_train / 255.0

    # One swatch colour per sample, looked up by its class label
    colors = np.array([SWATCHES_MAPPING_NORMALIZED[y] for y in y_train])

    plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], color=colors)
    # plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], color='blue')
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    # The title is set after saving, so the saved PNG has no title while the
    # displayed figure does.
    plt.savefig(f"{save_folder}/{experiment}_PCA.png")
    plt.title("PCA of Training Data")
    plt.show()

    # Plot the data with confidence ellipses
    plt.figure(figsize=(8, 8))
    ax = plt.gca()
    for class_label in np.unique(y_train):
        # Projected samples belonging to the current class
        class_mask = y_train == class_label
        class_points = X_train_pca[class_mask]
        multiclass_classification_utils.plot_confidence_ellipse(
            ax,
            class_points,
            color=SWATCHES_MAPPING_NORMALIZED[class_label],
            label=f"Class {class_label}",
        )
        plt.scatter(
            class_points[:, 0],
            class_points[:, 1],
            color=SWATCHES_MAPPING_NORMALIZED[class_label],
            s=10,
            label=f"Points {class_label}",
        )

    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend()
    plt.grid()
    plt.savefig(f"{save_folder}/{experiment}_PCA_with_ellipses.png")
    plt.show()

# ROC curve for multiclass classification

1. Evaluate the performance of a model
2. A higher AUC indicates a better ability to distinguish the positive class

In [None]:
# Plot the multiclass ROC curves of every classifier for every experiment
# currently held in all_data.
for experiment, split in all_data.items():

    print(experiment)

    X_train, X_test, y_train, y_test = split

    rf = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
    lr = LogisticRegression(random_state=42)
    svm = SVC(random_state=42, kernel="rbf")
    mlp = MLPClassifier(random_state=42, max_iter=300, hidden_layer_sizes=(100,))

    # Same order as before: random forest, logistic regression, SVC, MLP.
    for classifier in (rf, lr, svm, mlp):
        multiclass_classification_utils.plot_multiclass_roc_curve(
            classifier, X_train, X_test, y_train, y_test
        )