# SVM with PCA (HSV)

In [1]:
import re
import cv2
import os
import glob
import time
import mlflow
import random
import shutil
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [2]:
# --- Notebook configuration -------------------------------------------------
# Root directory that holds one sub-folder of training images per class.
TRAIN_DIR = "../data/train"

# Sub-folder names under TRAIN_DIR; each name doubles as the class label.
CLASS_FOLDERS = [
    "Class A",
    "Class B",
    "Class C",
    "Class D",
]

# Location handed to MLflow as tracking URI / client target.
ML_FLOW_DIRECTORY = "SVM_Logs"

In [3]:
# Point MLflow at the tracking location named by ML_FLOW_DIRECTORY, then select the
# "SVM_PCA_HSV" experiment for the runs started further down in this notebook.
mlflow.set_tracking_uri(ML_FLOW_DIRECTORY)
# As the last expression of the cell, the returned Experiment object is echoed as the
# cell output. NOTE(review): that repr contains an absolute local artifact path;
# consider clearing the output before sharing the notebook.
mlflow.set_experiment("SVM_PCA_HSV")

<Experiment: artifact_location='/Users/jacob/Code/Monkey-Business/SVM_Final_Models/SVM_Logs/460119513691526511', creation_time=1702029139080, experiment_id='460119513691526511', last_update_time=1702029139080, lifecycle_stage='active', name='SVM_PCA_HSV', tags={}>

In [4]:
# Search space for the PCA + SVM grid search.
# PCA sizes are the squares of 40, 45 and 50, i.e. [1600, 2025, 2500] components.
pca_components = [side * side for side in (40, 45, 50)]
# Regularisation strengths tried for the SVC.
C_values = [9, 10, 10.5]
# Only one kernel / gamma setting is explored; kept as lists so the grid can grow.
kernel_types = ["rbf"]
gamma_values = ["scale"]

In [5]:
# Cartesian product of the four search lists: one
# (pca_components, C, kernel, gamma) tuple per configuration to evaluate.
hyperparameter_combinations = [
    *itertools.product(pca_components, C_values, kernel_types, gamma_values)
]
print(
    f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}"
)

Total number of hyperparameter combinations: 9


In [6]:
# Empty registry for PCA models.
# NOTE(review): no other cell visible in this notebook reads or writes it.
pca_models_dict = dict()

In [7]:
def train_with_params(
    pca_components,
    C,
    kernel,
    gamma,
    train_images,
    train_labels,
    n_splits=10,
    random_state=42,
    verbose=True,
):
    """Cross-validate one PCA + SVC configuration and record it as an MLflow run.

    For every stratified fold a PCA is fitted on the training part only (so the
    held-out part never influences the projection), both parts are projected, an
    ``svm.SVC`` with ``class_weight="balanced"`` is trained and then scored on the
    held-out part. Weighted F1, accuracy, weighted precision and weighted recall
    are averaged over the folds and logged to MLflow together with the
    hyperparameters.

    Parameters
    ----------
    pca_components
        Passed to ``PCA(n_components=...)``.
    C, kernel, gamma
        Passed unchanged to ``svm.SVC``.
    train_images
        Array-like with one row of features per sample.
    train_labels
        Array-like with one label per sample, aligned with ``train_images``.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling and for the PCA solver.
    verbose : bool, default True
        Forwarded to ``svm.SVC``; ``True`` prints the LibSVM optimisation log
        for every fit.

    Returns
    -------
    The MLflow run id (``run.info.run_id``) of the run created for this
    configuration.
    """
    # Fold indices are applied with NumPy fancy indexing, so accept any array-like.
    train_images = np.asarray(train_images)
    train_labels = np.asarray(train_labels)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    with mlflow.start_run() as run:
        start_time = time.time()

        # Log the configuration up front so that a run which fails part-way through
        # still records which hyperparameters it belonged to.
        mlflow.log_params(
            {"pca_components": pca_components, "C": C, "kernel": kernel, "gamma": gamma}
        )

        scores = []
        for train_index, valid_index in skf.split(train_images, train_labels):
            train_images_fold, valid_images_fold = (
                train_images[train_index],
                train_images[valid_index],
            )
            train_labels_fold, valid_labels_fold = (
                train_labels[train_index],
                train_labels[valid_index],
            )

            # Seed the PCA: depending on data shape and n_components scikit-learn may
            # choose a randomized SVD solver, which is otherwise not repeatable.
            pca = PCA(n_components=pca_components, random_state=random_state)
            pca.fit(train_images_fold)

            train_images_pca = pca.transform(train_images_fold)
            valid_images_pca = pca.transform(valid_images_fold)

            clf = svm.SVC(
                C=C,
                kernel=kernel,
                gamma=gamma,
                class_weight="balanced",
                verbose=verbose,
            )
            clf.fit(train_images_pca, train_labels_fold)
            valid_predictions = clf.predict(valid_images_pca)

            scores.append(
                {
                    "f1_weighted": f1_score(
                        valid_labels_fold, valid_predictions, average="weighted"
                    ),
                    "accuracy": accuracy_score(valid_labels_fold, valid_predictions),
                    "precision": precision_score(
                        valid_labels_fold, valid_predictions, average="weighted"
                    ),
                    "recall": recall_score(
                        valid_labels_fold, valid_predictions, average="weighted"
                    ),
                }
            )

        # Average each per-fold metric; the keys are the metric names stored in MLflow.
        mean_scores = {
            f"mean_{name}": np.mean([fold[name] for fold in scores])
            for name in ("f1_weighted", "accuracy", "precision", "recall")
        }
        mlflow.log_metrics(mean_scores)

        end_time = time.time()
        duration = end_time - start_time
        param_details = f"PCA={pca_components}, C={C}, kernel={kernel}, gamma={gamma}"
        duration_details = f"Training duration for {param_details}: {duration} seconds"
        print(duration_details)
        run_id = run.info.run_id
        return run_id

In [8]:
def load_images_from_folder(folder, class_folders=None):
    """Load every ``*.png`` under ``folder/<class>`` as a flattened HSV vector.

    Parameters
    ----------
    folder : str
        Directory that contains one sub-directory per class.
    class_folders : sequence of str, optional
        Sub-directory names to read; each name is also used as the label of the
        images found inside it. Defaults to the module-level ``CLASS_FOLDERS``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``np.array`` built from the flattened HSV images and ``np.array`` of the
        matching labels, in the same order.

    Notes
    -----
    * Files are visited in sorted order. ``glob`` makes no ordering promise, and
      the row order determines which samples land in which cross-validation
      fold, so sorting keeps downstream results reproducible across machines.
    * ``cv2.imread`` returns ``None`` for files it cannot decode; such files are
      reported and skipped instead of failing later inside ``cv2.cvtColor``.
    * The time spent on each class folder is printed.
    """
    if class_folders is None:
        class_folders = CLASS_FOLDERS

    images = []
    labels = []
    for class_folder in class_folders:
        start_time = time.time()
        class_path = os.path.join(folder, class_folder)
        image_files = sorted(glob.glob(os.path.join(class_path, "*.png")))
        for img_file in image_files:
            image = cv2.imread(img_file)
            if image is None:
                # Unreadable / corrupt file: make it visible, but keep loading.
                print(f"Skipping unreadable image: {img_file}")
                continue
            hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
            images.append(hsv_image.flatten())
            labels.append(class_folder)
        end_time = time.time()
        duration = end_time - start_time
        print(f"Processing {class_folder} took {duration} seconds")
    return np.array(images), np.array(labels)

In [9]:
# Read the whole training set into memory: one flattened HSV row per image plus
# the name of the class folder it came from.
train_images, train_labels = load_images_from_folder(folder=TRAIN_DIR)

Processing Class A took 1.6364998817443848 seconds
Processing Class B took 0.9348328113555908 seconds
Processing Class C took 0.6303431987762451 seconds
Processing Class D took 0.7784278392791748 seconds


In [10]:
# Fail fast before the long-running grid search if loading produced nothing usable:
# an empty result, or rows that do not form a rectangular feature matrix.
assert train_images.ndim == 2, (
    f"expected a 2-D feature matrix, got shape {train_images.shape}; "
    "check TRAIN_DIR and that all images have the same size"
)
assert len(train_images) == len(train_labels), "images and labels are misaligned"

print(
    f"Shape of train_images: {train_images.shape}, Shape of train_labels: {train_labels.shape}"
)

Shape of train_images: (2796, 150528), Shape of train_labels: (2796,)


In [11]:
# Evaluate every configuration of the grid and remember the MLflow run id of each.
run_ids = []

# The loop variables deliberately differ from the search-space list names
# (notably `pca_components`): rebinding that list to a single int here would make the
# grid-building cell fail when it is re-run after this cell.
for n_components, c_value, kernel_type, gamma_value in hyperparameter_combinations:
    run_id = train_with_params(
        n_components, c_value, kernel_type, gamma_value, train_images, train_labels
    )
    run_ids.append(run_id)

[LibSVM].*.*
optimization finished, #iter = 2668
obj = -650.217640, rho = 0.398430
nSV = 1080, nBSV = 14
*.*
optimization finished, #iter = 1178
obj = -165.422474, rho = -0.009663
nSV = 554, nBSV = 0
*.*
optimization finished, #iter = 1285
obj = -172.061979, rho = 0.066480
nSV = 612, nBSV = 0
.*
optimization finished, #iter = 1915
obj = -686.553331, rho = 0.277605
nSV = 764, nBSV = 4
.*
optimization finished, #iter = 1722
obj = -379.041236, rho = 0.577439
nSV = 730, nBSV = 0
.*
optimization finished, #iter = 1515
obj = -446.963155, rho = 0.517614
nSV = 616, nBSV = 5
Total nSV = 2006
[LibSVM].*.*
optimization finished, #iter = 2695
obj = -667.820591, rho = 0.380145
nSV = 1092, nBSV = 16
*.*
optimization finished, #iter = 1154
obj = -157.351241, rho = 0.024951
nSV = 537, nBSV = 0
*.*
optimization finished, #iter = 1271
obj = -166.626825, rho = 0.060058
nSV = 599, nBSV = 0
.*
optimization finished, #iter = 1907
obj = -632.861803, rho = 0.352312
nSV = 759, nBSV = 5
.*
optimization finished

.*
optimization finished, #iter = 1578
obj = -458.995359, rho = 0.510899
nSV = 622, nBSV = 5
Total nSV = 1999
[LibSVM].*.*
optimization finished, #iter = 2663
obj = -670.152745, rho = 0.402454
nSV = 1076, nBSV = 13
*.*
optimization finished, #iter = 1205
obj = -160.607432, rho = 0.002014
nSV = 555, nBSV = 0
*.*
optimization finished, #iter = 1256
obj = -171.317883, rho = 0.072496
nSV = 596, nBSV = 0
.*
optimization finished, #iter = 1896
obj = -684.618325, rho = 0.347480
nSV = 767, nBSV = 6
.*
optimization finished, #iter = 1749
obj = -390.695418, rho = 0.635949
nSV = 723, nBSV = 0
.*
optimization finished, #iter = 1554
obj = -465.772950, rho = 0.628232
nSV = 610, nBSV = 5
Total nSV = 2003
[LibSVM].*.*
optimization finished, #iter = 2670
obj = -657.387438, rho = 0.323063
nSV = 1078, nBSV = 11
*.*
optimization finished, #iter = 1167
obj = -159.822788, rho = -0.012593
nSV = 535, nBSV = 0
*.*
optimization finished, #iter = 1305
obj = -172.028842, rho = 0.075493
nSV = 609, nBSV = 0
.*
opti

.*
optimization finished, #iter = 1733
obj = -385.144696, rho = 0.701341
nSV = 727, nBSV = 0
.*
optimization finished, #iter = 1572
obj = -440.023136, rho = 0.581725
nSV = 614, nBSV = 3
Total nSV = 2002
[LibSVM].*.*
optimization finished, #iter = 2701
obj = -653.982194, rho = 0.379149
nSV = 1087, nBSV = 12
*.*
optimization finished, #iter = 1206
obj = -154.525117, rho = -0.000472
nSV = 540, nBSV = 0
*.*
optimization finished, #iter = 1305
obj = -165.255282, rho = 0.049355
nSV = 611, nBSV = 0
.*
optimization finished, #iter = 1899
obj = -698.522403, rho = 0.318087
nSV = 763, nBSV = 6
.*
optimization finished, #iter = 1735
obj = -367.209137, rho = 0.647490
nSV = 717, nBSV = 0
.*
optimization finished, #iter = 1508
obj = -434.482568, rho = 0.571785
nSV = 609, nBSV = 4
Total nSV = 2011
[LibSVM].*.*
optimization finished, #iter = 2603
obj = -657.494598, rho = 0.408548
nSV = 1067, nBSV = 11
*.*
optimization finished, #iter = 1140
obj = -162.117968, rho = 0.014358
nSV = 531, nBSV = 0
*.*
opti

*.*
optimization finished, #iter = 1287
obj = -166.875077, rho = 0.040436
nSV = 639, nBSV = 0
.*
optimization finished, #iter = 1894
obj = -594.880218, rho = 0.354067
nSV = 796, nBSV = 3
.*
optimization finished, #iter = 1714
obj = -365.269062, rho = 0.586771
nSV = 740, nBSV = 0
.*
optimization finished, #iter = 1453
obj = -387.593734, rho = 0.431957
nSV = 622, nBSV = 2
Total nSV = 2052
[LibSVM].*.*
optimization finished, #iter = 2651
obj = -610.827015, rho = 0.378249
nSV = 1133, nBSV = 4
*.*
optimization finished, #iter = 1150
obj = -160.345981, rho = -0.025743
nSV = 552, nBSV = 0
*.*
optimization finished, #iter = 1305
obj = -171.521004, rho = 0.059500
nSV = 639, nBSV = 0
.*
optimization finished, #iter = 1860
obj = -658.028403, rho = 0.311275
nSV = 800, nBSV = 3
.*
optimization finished, #iter = 1739
obj = -363.881000, rho = 0.631937
nSV = 765, nBSV = 0
.*
optimization finished, #iter = 1502
obj = -415.699194, rho = 0.495945
nSV = 634, nBSV = 3
Total nSV = 2050
[LibSVM].*.*
optimiza

*.*
optimization finished, #iter = 1135
obj = -157.854385, rho = -0.016174
nSV = 546, nBSV = 0
*.*
optimization finished, #iter = 1310
obj = -172.455260, rho = 0.053000
nSV = 636, nBSV = 0
.*
optimization finished, #iter = 1844
obj = -572.617745, rho = 0.394428
nSV = 776, nBSV = 1
.*
optimization finished, #iter = 1766
obj = -365.373509, rho = 0.687833
nSV = 761, nBSV = 0
.*
optimization finished, #iter = 1462
obj = -435.699707, rho = 0.446835
nSV = 623, nBSV = 4
Total nSV = 2049
[LibSVM].*.*
optimization finished, #iter = 2640
obj = -596.612633, rho = 0.344543
nSV = 1133, nBSV = 2
*.*
optimization finished, #iter = 1178
obj = -162.795497, rho = -0.000632
nSV = 565, nBSV = 0
*.*
optimization finished, #iter = 1296
obj = -164.167986, rho = 0.054617
nSV = 640, nBSV = 0
.*
optimization finished, #iter = 1888
obj = -625.233563, rho = 0.371885
nSV = 807, nBSV = 3
.*
optimization finished, #iter = 1728
obj = -366.499796, rho = 0.589960
nSV = 757, nBSV = 0
.*
optimization finished, #iter = 14

[LibSVM].*.*
optimization finished, #iter = 2595
obj = -589.489886, rho = 0.360304
nSV = 1154, nBSV = 1
*.*
optimization finished, #iter = 1182
obj = -161.228221, rho = 0.010871
nSV = 581, nBSV = 0
*.*
optimization finished, #iter = 1284
obj = -172.132427, rho = 0.053342
nSV = 658, nBSV = 0
.*
optimization finished, #iter = 1885
obj = -518.004751, rho = 0.315762
nSV = 830, nBSV = 0
.*
optimization finished, #iter = 1742
obj = -366.363753, rho = 0.571119
nSV = 809, nBSV = 0
.*
optimization finished, #iter = 1482
obj = -383.801646, rho = 0.393328
nSV = 662, nBSV = 3
Total nSV = 2093
Training duration for PCA=2500, C=9, kernel=rbf, gamma=scale: 1581.6211609840393 seconds
[LibSVM].*.*
optimization finished, #iter = 2591
obj = -585.770659, rho = 0.337701
nSV = 1176, nBSV = 1
*.*
optimization finished, #iter = 1204
obj = -164.184265, rho = -0.029413
nSV = 588, nBSV = 0
*.*
optimization finished, #iter = 1325
obj = -172.947269, rho = 0.022883
nSV = 656, nBSV = 0
.*
optimization finished, #ite

.*
optimization finished, #iter = 1736
obj = -356.211627, rho = 0.589726
nSV = 793, nBSV = 0
.*
optimization finished, #iter = 1498
obj = -387.977227, rho = 0.436819
nSV = 659, nBSV = 3
Total nSV = 2093
[LibSVM].*.*
optimization finished, #iter = 2595
obj = -590.116926, rho = 0.311553
nSV = 1162, nBSV = 0
*.*
optimization finished, #iter = 1173
obj = -167.481432, rho = -0.006222
nSV = 589, nBSV = 0
*.*
optimization finished, #iter = 1301
obj = -166.277924, rho = -0.029611
nSV = 636, nBSV = 0
.*
optimization finished, #iter = 1899
obj = -544.883690, rho = 0.272910
nSV = 833, nBSV = 1
.*
optimization finished, #iter = 1684
obj = -347.134856, rho = 0.494359
nSV = 771, nBSV = 0
.*
optimization finished, #iter = 1505
obj = -399.852301, rho = 0.370395
nSV = 664, nBSV = 4
Total nSV = 2084
[LibSVM].*.*
optimization finished, #iter = 2578
obj = -598.251575, rho = 0.342198
nSV = 1155, nBSV = 1
*.*
optimization finished, #iter = 1191
obj = -159.928370, rho = -0.010587
nSV = 585, nBSV = 0
*.*
opti

In [12]:
# Fetch the logged metrics and parameters of every run, keyed by run id.
# A single client serves all lookups, so it is created once rather than per run.
client = mlflow.tracking.MlflowClient(ML_FLOW_DIRECTORY)

run_metrics = {}
run_params = {}

for run_id in run_ids:
    run = client.get_run(run_id)
    run_metrics[run_id] = run.data.metrics
    run_params[run_id] = run.data.params

# Print a plain-text report, one section per run, in the order the runs were executed.
for run_id in run_ids:
    print(f"Run ID: {run_id}")
    print("Metrics:")
    for metric, value in run_metrics[run_id].items():
        print(f"\t{metric}: {value}")

    print("Parameters:")
    for param, value in run_params[run_id].items():
        print(f"\t{param}: {value}")

Run ID: 4ad3076fcc2a4faf99c3ac498439cc9f
Metrics:
	mean_f1_weighted: 0.8424340314446919
	mean_accuracy: 0.8433410138248847
	mean_recall: 0.8433410138248847
	mean_precision: 0.8451574313162407
Parameters:
	gamma: scale
	pca_components: 1600
	C: 9
	kernel: rbf
Run ID: 10fc551498f046c1a3e618f8f89a485f
Metrics:
	mean_f1_weighted: 0.8404713615510018
	mean_accuracy: 0.8415527393753199
	mean_recall: 0.8415527393753199
	mean_precision: 0.842897530160023
Parameters:
	gamma: scale
	pca_components: 1600
	C: 10
	kernel: rbf
Run ID: c290ee0b2cf7497291e2160f3448859d
Metrics:
	mean_f1_weighted: 0.8427493247550799
	mean_accuracy: 0.8436955965181772
	mean_recall: 0.8436955965181772
	mean_precision: 0.8453451989578913
Parameters:
	gamma: scale
	pca_components: 1600
	C: 10.5
	kernel: rbf
Run ID: 6a02bab131334ca7b93dab403e5e736f
Metrics:
	mean_f1_weighted: 0.8410254623791417
	mean_accuracy: 0.8419124423963135
	mean_recall: 0.8419124423963135
	mean_precision: 0.8436078175699008
Parameters:
	gamma: scale
	p

In [13]:
# Select the run with the highest mean weighted F1 across the folds; on a tie the
# run that was inserted first wins, because max() keeps the first maximum it sees.
best_run_id, best_run_metrics = max(
    run_metrics.items(), key=lambda entry: entry[1]["mean_f1_weighted"]
)
best_run_params = run_params[best_run_id]

print(f"\nBest Run ID: {best_run_id}")

print("Best Parameters:")
for param_name, param_value in best_run_params.items():
    print(f"\t{param_name}: {param_value}")

print("\nBest Run Metrics:")
for metric_name, metric_value in best_run_metrics.items():
    print(f"\t{metric_name}: {metric_value}")


Best Run ID: c290ee0b2cf7497291e2160f3448859d
Best Parameters:
	gamma: scale
	pca_components: 1600
	C: 10.5
	kernel: rbf

Best Run Metrics:
	mean_f1_weighted: 0.8427493247550799
	mean_accuracy: 0.8436955965181772
	mean_recall: 0.8436955965181772
	mean_precision: 0.8453451989578913
