# SVM with PCA (BGR)

In [1]:
import cv2
import os
import glob
import time
import mlflow
import itertools

import numpy as np

from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

In [2]:
# Directory used as the MLflow tracking URI for this notebook's runs.
ML_FLOW_DIRECTORY = "SVM_Logs"

# Root of the training split; it is expected to hold one sub-folder per class.
TRAIN_DIR = "../data/train"

# Sub-folder names under TRAIN_DIR; the same strings are used as class labels.
CLASS_FOLDERS = [
    "Class A",
    "Class B",
    "Class C",
    "Class D",
]

In [3]:
# Point MLflow at the directory named by ML_FLOW_DIRECTORY, then select the experiment
# that groups every run started by this notebook (MLflow creates it if it is missing).
mlflow.set_tracking_uri(ML_FLOW_DIRECTORY)
# NOTE: as the last expression of the cell, the returned Experiment object is echoed
# below, including its absolute artifact_location path.
mlflow.set_experiment("SVM_PCA_BGR")

2023/12/08 00:31:50 INFO mlflow.tracking.fluent: Experiment with name 'SVM_PCA_BGR' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/jacob/Code/Monkey-Business/SVM_Final_Models/SVM_Logs/910507522813637421', creation_time=1702017110382, experiment_id='910507522813637421', last_update_time=1702017110382, lifecycle_stage='active', name='SVM_PCA_BGR', tags={}>

In [4]:
# Search grid. Each PCA size is the square of a side length (40, 45, 50),
# i.e. 1600, 2025 and 2500 retained components.
pca_components = [side * side for side in (40, 45, 50)]

# SVM regularisation strengths to compare.
C_values = [9, 10, 10.5]

# Only a single kernel / gamma setting is explored in this notebook.
gamma_values = ["scale"]
kernel_types = ["rbf"]

In [5]:
# Full Cartesian product of the search grid, as (pca, C, kernel, gamma) tuples.
# The right-most grid varies fastest, which is the same order itertools.product yields.
hyperparameter_combinations = [
    (n_components, C_value, kernel_type, gamma_value)
    for n_components in pca_components
    for C_value in C_values
    for kernel_type in kernel_types
    for gamma_value in gamma_values
]

combination_count = len(hyperparameter_combinations)
print(f"Total number of hyperparameter combinations: {combination_count}")

Total number of hyperparameter combinations: 9


In [7]:
def _weighted_fold_scores(true_labels, predicted_labels):
    """Return accuracy plus weighted F1 / precision / recall for one validation fold."""
    return {
        "f1_weighted": f1_score(true_labels, predicted_labels, average="weighted"),
        "accuracy": accuracy_score(true_labels, predicted_labels),
        "precision": precision_score(
            true_labels, predicted_labels, average="weighted"
        ),
        "recall": recall_score(true_labels, predicted_labels, average="weighted"),
    }


def train_with_params(
    pca_components,
    C,
    kernel,
    gamma,
    train_images,
    train_labels,
    n_splits=10,
    random_state=42,
    verbose=True,
):
    """Cross-validate a PCA + SVC pipeline and record the result as one MLflow run.

    For every stratified fold, PCA is fitted on the training part only (so no
    information from the validation part leaks into the projection), an SVC with
    balanced class weights is trained on the projected data, and the validation
    part is scored. The per-fold scores are averaged and logged to MLflow.

    Args:
        pca_components: Value passed to ``PCA(n_components=...)``.
        C: SVC regularisation parameter.
        kernel: SVC kernel name.
        gamma: SVC kernel coefficient setting.
        train_images: 2-D array with one flattened image per row.
        train_labels: 1-D array with one label per row of ``train_images``.
        n_splits: Number of stratified folds (default 10).
        random_state: Seed used for the fold shuffling and for PCA (default 42).
        verbose: Forwarded to ``svm.SVC``; ``True`` prints the LibSVM solver log.

    Returns:
        The MLflow run id under which the parameters and mean metrics were logged.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    hyperparameters = {
        "pca_components": pca_components,
        "C": C,
        "kernel": kernel,
        "gamma": gamma,
    }

    with mlflow.start_run() as run:
        # Log the hyperparameters before the (long) training starts, so that a run
        # which fails or is interrupted can still be identified in MLflow.
        mlflow.log_params(hyperparameters)

        start_time = time.time()

        scores = []
        for train_index, valid_index in skf.split(train_images, train_labels):
            train_images_fold = train_images[train_index]
            valid_images_fold = train_images[valid_index]
            train_labels_fold = train_labels[train_index]
            valid_labels_fold = train_labels[valid_index]

            # scikit-learn documents random_state as used by the 'arpack' and
            # 'randomized' SVD solvers; fixing it keeps repeated runs comparable
            # whenever one of those solvers is selected.
            pca = PCA(n_components=pca_components, random_state=random_state)
            train_images_pca = pca.fit_transform(train_images_fold)
            valid_images_pca = pca.transform(valid_images_fold)

            clf = svm.SVC(
                C=C,
                kernel=kernel,
                gamma=gamma,
                class_weight="balanced",
                verbose=verbose,
            )
            clf.fit(train_images_pca, train_labels_fold)
            valid_predictions = clf.predict(valid_images_pca)

            scores.append(_weighted_fold_scores(valid_labels_fold, valid_predictions))

        # Average every per-fold score; keys become "mean_<score name>".
        mean_scores = {
            f"mean_{score_name}": float(np.mean([fold[score_name] for fold in scores]))
            for score_name in scores[0]
        }
        mlflow.log_metrics(mean_scores)

        duration = time.time() - start_time
        param_details = f"PCA={pca_components}, C={C}, kernel={kernel}, gamma={gamma}"
        duration_details = f"Training duration for {param_details}: {duration} seconds"
        print(duration_details)
        return run.info.run_id

In [8]:
def load_images_from_folder(folder):
    """Load every PNG under the class sub-folders of ``folder`` as flat pixel vectors.

    Each name in ``CLASS_FOLDERS`` is treated as a sub-directory of ``folder``.
    Every ``*.png`` file directly inside it is read with ``cv2.imread``, flattened
    to one dimension and labelled with the sub-directory name. The time spent on
    each class is printed.

    Args:
        folder: Directory containing one sub-directory per entry of ``CLASS_FOLDERS``.

    Returns:
        A ``(images, labels)`` tuple of NumPy arrays holding one flattened image and
        one class-folder name per file read.

    Raises:
        ValueError: If a matched file cannot be decoded by ``cv2.imread``.
    """
    images = []
    labels = []
    for class_folder in CLASS_FOLDERS:
        start_time = time.time()
        class_path = os.path.join(folder, class_folder)
        # glob returns matches in arbitrary, filesystem-dependent order. Sorting
        # fixes the sample order, so index-based splits made later with a fixed
        # seed select the same files on every machine.
        image_files = sorted(glob.glob(os.path.join(class_path, "*.png")))
        for img_file in image_files:
            processed_image = cv2.imread(img_file)
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising; fail here with the offending path.
            if processed_image is None:
                raise ValueError(f"Could not read image file: {img_file}")
            images.append(processed_image.flatten())
            labels.append(class_folder)
        duration = time.time() - start_time
        print(f"Processing {class_folder} took {duration} seconds")
    # NOTE(review): stacking into one 2-D array assumes all images have the same
    # size - confirm for any new data.
    return np.array(images), np.array(labels)

In [9]:
# Read the whole training split into memory: one flattened image per row of
# train_images, and the matching class-folder name in train_labels.
train_images, train_labels = load_images_from_folder(TRAIN_DIR)

Processing Class A took 1.5296101570129395 seconds
Processing Class B took 0.8514950275421143 seconds
Processing Class C took 0.5740070343017578 seconds
Processing Class D took 0.6946301460266113 seconds


In [10]:
# Sanity check: the number of rows must match between images and labels.
images_shape = train_images.shape
labels_shape = train_labels.shape
print(f"Shape of train_images: {images_shape}, Shape of train_labels: {labels_shape}")

Shape of train_images: (2796, 150528), Shape of train_labels: (2796,)


In [11]:
# Run one cross-validated MLflow run per hyperparameter combination and keep the
# run ids so the results can be fetched afterwards.
run_ids = []

# The loop variables deliberately do NOT reuse the name `pca_components`: that name
# holds the module-level list of PCA sizes, and overwriting it with a single int
# would break any re-execution of the cell that builds the combinations.
for n_components, svm_C, svm_kernel, svm_gamma in hyperparameter_combinations:
    run_id = train_with_params(
        n_components, svm_C, svm_kernel, svm_gamma, train_images, train_labels
    )
    # Append as each run finishes, so ids of completed runs survive an interruption.
    run_ids.append(run_id)

[LibSVM].*.*
optimization finished, #iter = 2543
obj = -1062.287663, rho = -0.045514
nSV = 811, nBSV = 84
*.*
optimization finished, #iter = 1185
obj = -193.994882, rho = -0.119254
nSV = 469, nBSV = 2
.*
optimization finished, #iter = 1582
obj = -294.669116, rho = -0.101244
nSV = 590, nBSV = 16
.*.*
optimization finished, #iter = 2014
obj = -693.834513, rho = 0.189757
nSV = 725, nBSV = 7
.*.*
optimization finished, #iter = 2152
obj = -532.779218, rho = 0.226326
nSV = 757, nBSV = 1
.*
optimization finished, #iter = 1798
obj = -528.303971, rho = 0.159033
nSV = 692, nBSV = 4
Total nSV = 1892
[LibSVM].*.*
optimization finished, #iter = 2574
obj = -1055.627906, rho = -0.013558
nSV = 809, nBSV = 86
*.*
optimization finished, #iter = 1168
obj = -183.513711, rho = -0.036922
nSV = 459, nBSV = 2
.*
optimization finished, #iter = 1541
obj = -279.011941, rho = -0.075708
nSV = 569, nBSV = 13
.*.*
optimization finished, #iter = 1993
obj = -648.621021, rho = 0.250778
nSV = 716, nBSV = 8
.*.*
optimiza

.*.*
optimization finished, #iter = 2066
obj = -517.674933, rho = 0.277494
nSV = 737, nBSV = 0
.*
optimization finished, #iter = 1794
obj = -538.405982, rho = 0.260970
nSV = 691, nBSV = 4
Total nSV = 1896
[LibSVM].*.*
optimization finished, #iter = 2548
obj = -1108.882397, rho = -0.017525
nSV = 807, nBSV = 79
*.*
optimization finished, #iter = 1126
obj = -188.617592, rho = -0.065300
nSV = 458, nBSV = 2
.*
optimization finished, #iter = 1592
obj = -296.957586, rho = -0.065904
nSV = 583, nBSV = 9
.*
optimization finished, #iter = 1939
obj = -711.055256, rho = 0.264928
nSV = 718, nBSV = 8
.*.*
optimization finished, #iter = 2172
obj = -564.799150, rho = 0.279098
nSV = 755, nBSV = 1
.*
optimization finished, #iter = 1789
obj = -547.149007, rho = 0.279065
nSV = 688, nBSV = 4
Total nSV = 1888
[LibSVM].*.*
optimization finished, #iter = 2564
obj = -1099.161392, rho = -0.053672
nSV = 799, nBSV = 73
*.*
optimization finished, #iter = 1144
obj = -184.939287, rho = -0.067591
nSV = 460, nBSV = 2
.

.*
optimization finished, #iter = 1523
obj = -287.056853, rho = -0.070425
nSV = 583, nBSV = 9
.*.*
optimization finished, #iter = 2000
obj = -715.162723, rho = 0.327700
nSV = 716, nBSV = 8
.*.*
optimization finished, #iter = 2112
obj = -551.546263, rho = 0.326310
nSV = 757, nBSV = 1
.*.*
optimization finished, #iter = 1826
obj = -526.126672, rho = 0.258477
nSV = 698, nBSV = 3
Total nSV = 1902
[LibSVM].*.*
optimization finished, #iter = 2637
obj = -1095.026755, rho = 0.023812
nSV = 809, nBSV = 62
*.*
optimization finished, #iter = 1169
obj = -179.534160, rho = -0.085262
nSV = 462, nBSV = 1
.*
optimization finished, #iter = 1549
obj = -281.283969, rho = -0.135319
nSV = 563, nBSV = 12
.*.*
optimization finished, #iter = 2022
obj = -714.277514, rho = 0.222730
nSV = 726, nBSV = 9
.*.*
optimization finished, #iter = 2149
obj = -540.401185, rho = 0.279907
nSV = 751, nBSV = 1
.*
optimization finished, #iter = 1797
obj = -522.902147, rho = 0.229699
nSV = 686, nBSV = 3
Total nSV = 1901
[LibSVM].

.*
optimization finished, #iter = 1731
obj = -517.024101, rho = 0.167414
nSV = 704, nBSV = 4
Total nSV = 1910
[LibSVM].*.*
optimization finished, #iter = 2583
obj = -1031.358262, rho = -0.009744
nSV = 820, nBSV = 59
*.*
optimization finished, #iter = 1146
obj = -180.509451, rho = -0.044956
nSV = 471, nBSV = 2
.*
optimization finished, #iter = 1532
obj = -274.599746, rho = -0.080991
nSV = 579, nBSV = 8
.*.*
optimization finished, #iter = 2008
obj = -623.258212, rho = 0.240703
nSV = 739, nBSV = 4
.*.*
optimization finished, #iter = 2115
obj = -527.067821, rho = 0.297446
nSV = 765, nBSV = 2
.*
optimization finished, #iter = 1749
obj = -478.798817, rho = 0.181376
nSV = 702, nBSV = 2
Total nSV = 1901
[LibSVM].*.*
optimization finished, #iter = 2579
obj = -1040.452472, rho = 0.068796
nSV = 844, nBSV = 52
*.*
optimization finished, #iter = 1128
obj = -189.020052, rho = -0.081971
nSV = 471, nBSV = 2
.*
optimization finished, #iter = 1567
obj = -290.092571, rho = -0.085809
nSV = 579, nBSV = 10


.*.*
optimization finished, #iter = 1978
obj = -669.816457, rho = 0.255033
nSV = 745, nBSV = 4
.*.*
optimization finished, #iter = 2161
obj = -550.819787, rho = 0.276799
nSV = 773, nBSV = 1
.*
optimization finished, #iter = 1779
obj = -531.711610, rho = 0.281674
nSV = 703, nBSV = 4
Total nSV = 1917
[LibSVM].*.*
optimization finished, #iter = 2546
obj = -1061.550420, rho = -0.060861
nSV = 816, nBSV = 64
*.*
optimization finished, #iter = 1158
obj = -180.736770, rho = -0.070692
nSV = 476, nBSV = 2
.*
optimization finished, #iter = 1605
obj = -287.493421, rho = -0.048396
nSV = 601, nBSV = 10
.*
optimization finished, #iter = 1926
obj = -610.422387, rho = 0.292110
nSV = 717, nBSV = 2
.*.*
optimization finished, #iter = 2139
obj = -530.375558, rho = 0.306056
nSV = 774, nBSV = 1
.*
optimization finished, #iter = 1781
obj = -519.395042, rho = 0.177787
nSV = 701, nBSV = 4
Total nSV = 1917
[LibSVM].*.*
optimization finished, #iter = 2480
obj = -1014.024141, rho = -0.052277
nSV = 817, nBSV = 55


*.*
optimization finished, #iter = 1144
obj = -176.270485, rho = -0.100690
nSV = 476, nBSV = 2
.*
optimization finished, #iter = 1487
obj = -269.239548, rho = -0.152742
nSV = 594, nBSV = 11
.*.*
optimization finished, #iter = 2039
obj = -644.781626, rho = 0.194445
nSV = 747, nBSV = 5
.*.*
optimization finished, #iter = 2106
obj = -517.095148, rho = 0.267651
nSV = 788, nBSV = 1
.*
optimization finished, #iter = 1786
obj = -495.943155, rho = 0.237302
nSV = 708, nBSV = 3
Total nSV = 1941
[LibSVM].*.*
optimization finished, #iter = 2520
obj = -962.549463, rho = 0.043214
nSV = 848, nBSV = 62
*.*
optimization finished, #iter = 1173
obj = -187.762145, rho = -0.076319
nSV = 491, nBSV = 2
.*
optimization finished, #iter = 1548
obj = -283.103456, rho = -0.091908
nSV = 614, nBSV = 10
.*.*
optimization finished, #iter = 1992
obj = -619.029843, rho = 0.175087
nSV = 754, nBSV = 3
.*.*
optimization finished, #iter = 2150
obj = -531.033166, rho = 0.263209
nSV = 793, nBSV = 2
.*.*
optimization finished

.*
optimization finished, #iter = 1800
obj = -473.674721, rho = 0.185077
nSV = 708, nBSV = 2
Total nSV = 1933
[LibSVM].*.*
optimization finished, #iter = 2613
obj = -1005.988600, rho = 0.058476
nSV = 863, nBSV = 48
*.*
optimization finished, #iter = 1104
obj = -188.291239, rho = -0.091933
nSV = 472, nBSV = 2
.*
optimization finished, #iter = 1569
obj = -283.888250, rho = -0.094417
nSV = 604, nBSV = 5
.*.*
optimization finished, #iter = 2030
obj = -672.974906, rho = 0.215865
nSV = 757, nBSV = 4
.*.*
optimization finished, #iter = 2035
obj = -521.055974, rho = 0.202990
nSV = 777, nBSV = 1
.*
optimization finished, #iter = 1790
obj = -503.670990, rho = 0.198979
nSV = 714, nBSV = 3
Total nSV = 1944
[LibSVM].*.*
optimization finished, #iter = 2584
obj = -983.230891, rho = 0.018754
nSV = 848, nBSV = 48
*.*
optimization finished, #iter = 1223
obj = -191.556380, rho = -0.103110
nSV = 500, nBSV = 2
.*
optimization finished, #iter = 1542
obj = -257.559000, rho = -0.121288
nSV = 599, nBSV = 4
.*.

In [12]:
# Fetch the logged metrics and parameters of every run started above, then print them.
run_metrics = {}
run_params = {}

# A single client is enough for all look-ups; there is no need to rebuild it per run.
client = mlflow.tracking.MlflowClient(ML_FLOW_DIRECTORY)

for run_id in run_ids:
    run_data = client.get_run(run_id).data
    run_metrics[run_id] = run_data.metrics
    run_params[run_id] = run_data.params

for run_id in run_ids:
    print(f"Run ID: {run_id}")
    print("Metrics:")
    for metric, value in run_metrics[run_id].items():
        print(f"\t{metric}: {value}")

    print("Parameters:")
    for param, value in run_params[run_id].items():
        print(f"\t{param}: {value}")

Run ID: 2c5e0889a25d4f6198011c3f1c940674
Metrics:
	mean_f1_weighted: 0.858136518892557
	mean_accuracy: 0.8587045570916538
	mean_recall: 0.8587045570916538
	mean_precision: 0.859396922731795
Parameters:
	gamma: scale
	pca_components: 1600
	C: 9
	kernel: rbf
Run ID: 48928baac2144e85b8ef021edabcbbc0
Metrics:
	mean_f1_weighted: 0.8588437866954063
	mean_accuracy: 0.8594226830517153
	mean_recall: 0.8594226830517153
	mean_precision: 0.8601183763650667
Parameters:
	gamma: scale
	pca_components: 1600
	C: 10
	kernel: rbf
Run ID: 1607c9810d9a43fdbe31847e79b7da53
Metrics:
	mean_f1_weighted: 0.8581366597704709
	mean_accuracy: 0.8587083973374297
	mean_recall: 0.8587083973374297
	mean_precision: 0.8594076881137994
Parameters:
	gamma: scale
	pca_components: 1600
	C: 10.5
	kernel: rbf
Run ID: 5e04c83533d5462fa994ab4ee02e06ce
Metrics:
	mean_f1_weighted: 0.8588472464951025
	mean_accuracy: 0.8594188428059395
	mean_recall: 0.8594188428059395
	mean_precision: 0.8603170575935852
Parameters:
	gamma: scale
	pc

In [13]:
def _mean_f1_of(candidate_id):
    """Look up the mean weighted F1 recorded for one run id."""
    return run_metrics[candidate_id]["mean_f1_weighted"]


# The winner is the run with the highest mean weighted F1 across the folds.
best_run_id = max(run_metrics, key=_mean_f1_of)
best_run_metrics = run_metrics[best_run_id]
best_run_params = run_params[best_run_id]

print(f"\nBest Run ID: {best_run_id}")
print("Best Parameters:")
for param_name, param_value in best_run_params.items():
    print(f"\t{param_name}: {param_value}")

print("\nBest Run Metrics:")
for metric_name, metric_value in best_run_metrics.items():
    print(f"\t{metric_name}: {metric_value}")


Best Run ID: c3b0c7ddbe66406d9a01c744de9894a9
Best Parameters:
	gamma: scale
	pca_components: 2500
	C: 9
	kernel: rbf

Best Run Metrics:
	mean_f1_weighted: 0.8595412904504144
	mean_accuracy: 0.8601369687660011
	mean_recall: 0.8601369687660011
	mean_precision: 0.8610459586099418
