# SVM: Version 2

## SVM with PCA

In [1]:
import os
import glob
import time
import mlflow
import shutil
import itertools

import numpy as np
import pandas as pd

from PIL import Image
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [2]:
TRAIN_DIR = "../data/train"
CLASS_FOLDERS = ["Class A", "Class B", "Class C", "Class D"]
ML_FLOW_DIRECTORY = "SVM_Logs"

In [3]:
mlflow.set_tracking_uri(ML_FLOW_DIRECTORY)
mlflow.set_experiment("SVM_PCA")

<Experiment: artifact_location='/Users/jacob/Code/Monkey-Business/SVM/SVM_Logs/216794599421195463', creation_time=1701926432685, experiment_id='216794599421195463', last_update_time=1701926432685, lifecycle_stage='active', name='SVM_PCA', tags={}>

In [4]:
pca_components = [40 * 40, 45 * 45, 50 * 50]
C_values = [0.1, 1, 10]
kernel_types = ["linear", "rbf"]
gamma_values = ["scale", "auto"]

In [5]:
hyperparameter_combinations = list(
    itertools.product(pca_components, C_values, kernel_types, gamma_values)
)
print(
    f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}"
)

Total number of hyperparameter combinations: 36


In [6]:
pca_models_dict = {}

In [15]:
def train_with_params(pca_components, C, kernel, gamma, train_images, train_labels):
    # Check if PCA model for pca_components already exists
    if pca_components in pca_models_dict:
        pca = pca_models_dict[pca_components]
        print(f"Using existing PCA model for {pca_components} components")
    else:
        # Apply PCA
        pca = PCA(n_components=pca_components)
        pca.fit(train_images)
        pca_models_dict[pca_components] = pca

    # Transform images with PCA
    train_images_pca = pca.transform(train_images)

    with mlflow.start_run() as run:
        start_time = time.time()

        # Initialize SVM model
        svm_model = svm.SVC(
            C=C, kernel=kernel, gamma=gamma, class_weight="balanced", verbose=True
        )

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scoring = {
            "f1_weighted": make_scorer(f1_score, average="weighted"),
            "accuracy": "accuracy",
            "precision": make_scorer(precision_score, average="weighted"),
            "recall": make_scorer(recall_score, average="weighted"),
        }
        cv_results = cross_validate(
            svm_model,
            train_images_pca,
            train_labels,
            cv=skf,
            scoring=scoring,
            verbose=2,
        )

        # Log hyperparameters and metrics
        mlflow.log_params(
            {"pca_components": pca_components, "C": C, "kernel": kernel, "gamma": gamma}
        )
        mlflow.log_metrics(
            {
                "mean_f1_weighted": cv_results["test_f1_weighted"].mean(),
                "mean_accuracy": cv_results["test_accuracy"].mean(),
                "mean_precision": cv_results["test_precision"].mean(),
                "mean_recall": cv_results["test_recall"].mean(),
            }
        )
        end_time = time.time()
        duration = end_time - start_time
        param_details = f"PCA={pca_components}, C={C}, kernel={kernel}, gamma={gamma}"
        duration_details = f"Training duration for {param_details}: {duration} seconds"
        print(duration_details)
        run_id = run.info.run_id
        return run_id

In [10]:
def load_image(img_path):
    img = Image.open(img_path)
    img = img.resize((224, 224))
    img = np.array(img)
    img = img.astype("float32") / 255.0
    return img

In [11]:
def load_images_from_folder(folder):
    images = []
    labels = []
    for class_folder in CLASS_FOLDERS:
        start_time = time.time()
        class_path = os.path.join(folder, class_folder)
        image_files = glob.glob(os.path.join(class_path, "*.png"))
        for img_file in image_files:
            processed_image = load_image(img_file)
            flattened_image = processed_image.flatten()
            images.append(flattened_image)
            labels.append(class_folder)
        end_time = time.time()
        duration = end_time - start_time
        print(f"Processing {class_folder} took {duration} seconds")
    return np.array(images), np.array(labels)

In [12]:
train_images, train_labels = load_images_from_folder(TRAIN_DIR)

Processing Class A took 2.7697389125823975 seconds
Processing Class B took 1.5491418838500977 seconds
Processing Class C took 1.0512456893920898 seconds
Processing Class D took 1.2895679473876953 seconds


In [13]:
print(
    f"Shape of train_images: {train_images.shape}, Shape of train_labels: {train_labels.shape}"
)

Shape of train_images: (2796, 150528), Shape of train_labels: (2796,)


In [16]:
run_ids = []

for params in hyperparameter_combinations:
    pca_components, C, kernel, gamma = params
    run_id = train_with_params(pca_components, C, kernel, gamma, train_images, train_labels)
    run_ids.append(run_id)

Using existing PCA model for 1600 components
[LibSVM]........*....*
optimization finished, #iter = 12881
obj = -0.771869, rho = 0.707572
nSV = 417, nBSV = 0
.*.*
optimization finished, #iter = 2126
obj = -0.049240, rho = -0.509753
nSV = 198, nBSV = 0
...*
optimization finished, #iter = 3675
obj = -0.125562, rho = -0.444424
nSV = 250, nBSV = 0
....*..*
optimization finished, #iter = 4551
obj = -0.355675, rho = -1.404315
nSV = 326, nBSV = 0
....*..*
optimization finished, #iter = 4932
obj = -0.198372, rho = -1.102659
nSV = 329, nBSV = 0
....*..*
optimization finished, #iter = 4256
obj = -0.187187, rho = -0.097329
nSV = 343, nBSV = 0
Total nSV = 1138
[CV] END .................................................... total time=   2.3s
[LibSVM]...........*...*
optimization finished, #iter = 14267
obj = -0.644068, rho = 0.609817
nSV = 404, nBSV = 0
.*
optimization finished, #iter = 1626
obj = -0.045683, rho = -0.497757
nSV = 187, nBSV = 0
..*..*
optimization finished, #iter = 4178
obj = -0.11522

*
optimization finished, #iter = 985
obj = -66.599216, rho = 0.802390
nSV = 981, nBSV = 856
.*
optimization finished, #iter = 1081
obj = -71.691437, rho = 0.806233
nSV = 1082, nBSV = 964
*.*
optimization finished, #iter = 734
obj = -78.591399, rho = 0.727298
nSV = 726, nBSV = 696
*
optimization finished, #iter = 802
obj = -82.496963, rho = 0.640699
nSV = 803, nBSV = 798
*
optimization finished, #iter = 666
obj = -82.987286, rho = -0.503800
nSV = 667, nBSV = 666
Total nSV = 1856
[CV] END .................................................... total time=   5.4s
Training duration for PCA=1600, C=0.1, kernel=rbf, gamma=auto: 16.22430992126465 seconds
Using existing PCA model for 1600 components
[LibSVM]........*....*
optimization finished, #iter = 12881
obj = -0.771869, rho = 0.707572
nSV = 417, nBSV = 0
.*.*
optimization finished, #iter = 2126
obj = -0.049240, rho = -0.509753
nSV = 198, nBSV = 0
...*
optimization finished, #iter = 3675
obj = -0.125562, rho = -0.444424
nSV = 250, nBSV = 0
..

.*
optimization finished, #iter = 1202
obj = -260.584754, rho = 0.090125
nSV = 682, nBSV = 26
.*
optimization finished, #iter = 1357
obj = -293.445811, rho = 0.146035
nSV = 776, nBSV = 31
.*
optimization finished, #iter = 1171
obj = -268.747736, rho = 0.059182
nSV = 650, nBSV = 15
Total nSV = 1718
[CV] END .................................................... total time=   5.2s
[LibSVM].*
optimization finished, #iter = 1479
obj = -328.009736, rho = 0.091179
nSV = 1046, nBSV = 294
*.*
optimization finished, #iter = 1160
obj = -215.878852, rho = 0.106905
nSV = 777, nBSV = 268
.*
optimization finished, #iter = 1347
obj = -252.228094, rho = 0.158089
nSV = 905, nBSV = 249
.*
optimization finished, #iter = 1174
obj = -272.692451, rho = 0.065058
nSV = 687, nBSV = 32
.*
optimization finished, #iter = 1383
obj = -300.534765, rho = 0.116123
nSV = 777, nBSV = 35
.*
optimization finished, #iter = 1146
obj = -268.535211, rho = 0.055695
nSV = 653, nBSV = 12
Total nSV = 1736
[CV] END .................

.*
optimization finished, #iter = 1192
obj = -278.130454, rho = 0.042625
nSV = 650, nBSV = 0
Total nSV = 1730
[CV] END .................................................... total time=   5.2s
[LibSVM].*
optimization finished, #iter = 1901
obj = -381.994215, rho = -0.029247
nSV = 1012, nBSV = 1
*.*
optimization finished, #iter = 1322
obj = -223.756987, rho = 0.019164
nSV = 758, nBSV = 0
.*
optimization finished, #iter = 1629
obj = -261.937079, rho = 0.069555
nSV = 880, nBSV = 0
.*
optimization finished, #iter = 1223
obj = -271.243970, rho = 0.081812
nSV = 686, nBSV = 0
.*
optimization finished, #iter = 1447
obj = -302.765824, rho = 0.135319
nSV = 778, nBSV = 0
.*
optimization finished, #iter = 1180
obj = -280.454178, rho = 0.059483
nSV = 649, nBSV = 0
Total nSV = 1709
[CV] END .................................................... total time=   5.1s
[LibSVM].*
optimization finished, #iter = 1948
obj = -377.788600, rho = -0.033860
nSV = 1027, nBSV = 1
*.*
optimization finished, #iter = 1355

[LibSVM].
*
optimization finished, #iter = 1174
obj = -74.849526, rho = 0.735258
nSV = 1149, nBSV = 975
*
optimization finished, #iter = 906
obj = -60.482055, rho = 0.791350
nSV = 917, nBSV = 756
*.*
optimization finished, #iter = 1056
obj = -67.386977, rho = 0.806074
nSV = 1046, nBSV = 912
*
optimization finished, #iter = 731
obj = -76.940355, rho = 0.721930
nSV = 720, nBSV = 688
*.*
optimization finished, #iter = 808
obj = -81.880620, rho = 0.560861
nSV = 804, nBSV = 800
*.*
optimization finished, #iter = 670
obj = -80.680270, rho = -0.359335
nSV = 668, nBSV = 664
Total nSV = 1823
[CV] END .................................................... total time=   6.9s
[LibSVM].
*
optimization finished, #iter = 1173
obj = -74.241931, rho = 0.753150
nSV = 1148, nBSV = 957
*
optimization finished, #iter = 912
obj = -58.068628, rho = 0.805640
nSV = 905, nBSV = 729
*.*
optimization finished, #iter = 1030
obj = -64.867525, rho = 0.837337
nSV = 1023, nBSV = 856
*
optimization finished, #iter = 727


*
optimization finished, #iter = 912
obj = -162.510095, rho = -0.112036
nSV = 541, nBSV = 194
*.*
optimization finished, #iter = 961
obj = -311.914146, rho = 0.363166
nSV = 574, nBSV = 180
.*
optimization finished, #iter = 1122
obj = -308.560981, rho = 0.408240
nSV = 638, nBSV = 149
.*
optimization finished, #iter = 1018
obj = -296.384121, rho = 0.187265
nSV = 560, nBSV = 182
Total nSV = 1548
[CV] END .................................................... total time=   4.8s
Training duration for PCA=2025, C=1, kernel=rbf, gamma=scale: 14.5554358959198 seconds
Using existing PCA model for 2025 components
[LibSVM].*
optimization finished, #iter = 1472
obj = -313.825656, rho = 0.077087
nSV = 1003, nBSV = 253
*.*
optimization finished, #iter = 1138
obj = -184.931122, rho = 0.105625
nSV = 724, nBSV = 167
*.*
optimization finished, #iter = 1293
obj = -221.973906, rho = 0.155115
nSV = 844, nBSV = 192
.*
optimization finished, #iter = 1153
obj = -270.682698, rho = 0.097286
nSV = 685, nBSV = 41
.

.*.*
optimization finished, #iter = 1615
obj = -420.361256, rho = 0.329328
nSV = 607, nBSV = 0
.*.*
optimization finished, #iter = 1356
obj = -400.851110, rho = 0.239639
nSV = 540, nBSV = 3
Total nSV = 1458
[CV] END .................................................... total time=   5.0s
[LibSVM].*.*
optimization finished, #iter = 2007
obj = -757.551263, rho = 0.042503
nSV = 651, nBSV = 40
*
optimization finished, #iter = 924
obj = -154.163671, rho = -0.102287
nSV = 394, nBSV = 0
*.*
optimization finished, #iter = 1268
obj = -233.305112, rho = -0.139696
nSV = 485, nBSV = 8
.*.*
optimization finished, #iter = 1553
obj = -464.283825, rho = 0.211848
nSV = 575, nBSV = 2
.*.*
optimization finished, #iter = 1685
obj = -442.283389, rho = 0.370484
nSV = 610, nBSV = 1
.*.*
optimization finished, #iter = 1406
obj = -373.212080, rho = 0.267802
nSV = 556, nBSV = 1
Total nSV = 1468
[CV] END .................................................... total time=   5.9s
Training duration for PCA=2025, C=10, 

[LibSVM].
*
optimization finished, #iter = 1065
obj = -65.470662, rho = 0.276171
nSV = 1044, nBSV = 934
*
optimization finished, #iter = 675
obj = -39.009434, rho = 0.404410
nSV = 652, nBSV = 586
*
optimization finished, #iter = 923
obj = -49.930091, rho = 0.664934
nSV = 868, nBSV = 760
*
optimization finished, #iter = 631
obj = -62.499263, rho = 0.632806
nSV = 610, nBSV = 553
*
optimization finished, #iter = 784
obj = -66.550671, rho = 0.777931
nSV = 716, nBSV = 653
*
optimization finished, #iter = 637
obj = -68.351363, rho = 0.240614
nSV = 614, nBSV = 583
Total nSV = 1768
[CV] END .................................................... total time=   7.4s
[LibSVM].
*
optimization finished, #iter = 1077
obj = -66.401064, rho = 0.183213
nSV = 1058, nBSV = 944
*
optimization finished, #iter = 713
obj = -41.589473, rho = 0.401366
nSV = 684, nBSV = 629
*
optimization finished, #iter = 941
obj = -52.098586, rho = 0.607114
nSV = 892, nBSV = 812
*
optimization finished, #iter = 650
obj = -64.211

*
optimization finished, #iter = 674
obj = -120.421554, rho = -0.127203
nSV = 411, nBSV = 121
*
optimization finished, #iter = 943
obj = -164.838898, rho = -0.142429
nSV = 549, nBSV = 189
.*
optimization finished, #iter = 1021
obj = -313.664062, rho = 0.220205
nSV = 584, nBSV = 155
.*
optimization finished, #iter = 1094
obj = -303.295345, rho = 0.288621
nSV = 636, nBSV = 140
.*
optimization finished, #iter = 956
obj = -299.095518, rho = 0.145961
nSV = 550, nBSV = 180
Total nSV = 1533
[CV] END .................................................... total time=   6.0s
[LibSVM]*.*
optimization finished, #iter = 1111
obj = -330.866522, rho = 0.181753
nSV = 759, nBSV = 443
*
optimization finished, #iter = 708
obj = -110.958340, rho = 0.046116
nSV = 392, nBSV = 104
*
optimization finished, #iter = 939
obj = -160.823338, rho = 0.001430
nSV = 546, nBSV = 190
*.*
optimization finished, #iter = 925
obj = -282.231217, rho = 0.298787
nSV = 548, nBSV = 146
.*
optimization finished, #iter = 1114
obj = 

...*..*
optimization finished, #iter = 3559
obj = -0.120439, rho = -0.226264
nSV = 386, nBSV = 0
Total nSV = 1211
[CV] END .................................................... total time=   3.8s
Training duration for PCA=2500, C=10, kernel=linear, gamma=auto: 11.408762216567993 seconds
Using existing PCA model for 2500 components
[LibSVM].*.*
optimization finished, #iter = 2104
obj = -795.543085, rho = 0.027316
nSV = 684, nBSV = 36
*
optimization finished, #iter = 940
obj = -157.748807, rho = -0.098671
nSV = 402, nBSV = 0
*.*
optimization finished, #iter = 1249
obj = -243.442477, rho = -0.070749
nSV = 492, nBSV = 7
.*.*
optimization finished, #iter = 1550
obj = -500.493827, rho = 0.126819
nSV = 603, nBSV = 3
.*.*
optimization finished, #iter = 1678
obj = -421.630114, rho = 0.219979
nSV = 628, nBSV = 2
.*.*
optimization finished, #iter = 1382
obj = -384.709388, rho = 0.135291
nSV = 557, nBSV = 2
Total nSV = 1483
[CV] END .................................................... total time=  

In [17]:
run_metrics = {}

for run_id in run_ids:
    client = mlflow.tracking.MlflowClient(ML_FLOW_DIRECTORY)
    run = client.get_run(run_id)
    metrics = run.data.metrics
    run_metrics[run_id] = metrics

for run_id, metrics in run_metrics.items():
    print(f"Run ID: {run_id}")
    for metric, value in metrics.items():
        print(f"\t{metric}: {value}")

Run ID: 72f7477f192e41d38f9e98b7f94a62b0
	mean_f1_weighted: 0.785832523572699
	mean_accuracy: 0.7871959942775394
	mean_recall: 0.7871959942775394
	mean_precision: 0.7859339856424669
Run ID: 20236e442d0e44d8a4ff391561a2b148
	mean_f1_weighted: 0.785832523572699
	mean_accuracy: 0.7871959942775394
	mean_recall: 0.7871959942775394
	mean_precision: 0.7859339856424669
Run ID: 66c4fe78d2494bb19cf22ac9d22dd2be
	mean_f1_weighted: 0.6136776175878488
	mean_accuracy: 0.6133762517882689
	mean_recall: 0.6133762517882689
	mean_precision: 0.6202479873572102
Run ID: 7ff96bca30564451b44df226fa9a2171
	mean_f1_weighted: 0.3227990208754488
	mean_accuracy: 0.34442060085836906
	mean_recall: 0.34442060085836906
	mean_precision: 0.6543248698056973
Run ID: 4b65874ea04f4d4595fe5b1142eed655
	mean_f1_weighted: 0.785832523572699
	mean_accuracy: 0.7871959942775394
	mean_recall: 0.7871959942775394
	mean_precision: 0.7859339856424669
Run ID: f8954268417647bcbe68a502d99b71b0
	mean_f1_weighted: 0.785832523572699
	mean_ac

In [19]:
best_run_id = max(run_metrics, key=lambda x: run_metrics[x]["mean_f1_weighted"])
best_params = client.get_run(best_run_id).data.params
best_run_metrics = run_metrics[best_run_id]

print(f"\nBest Run ID: {best_run_id}")
print("Best Parameters:")
for key, value in best_params.items():
    print(f"\t{key}: {value}")

print("\nBest Run Metrics:")
for metric, value in best_run_metrics.items():
    print(f"\t{metric}: {value}")


Best Run ID: ded76353831241588f602ffeb2d15344
Best Parameters:
	gamma: scale
	pca_components: 2025
	C: 10
	kernel: rbf

Best Run Metrics:
	mean_f1_weighted: 0.8376340921446065
	mean_accuracy: 0.8390557939914164
	mean_recall: 0.8390557939914164
	mean_precision: 0.8368662942713628
