# SVM: Version 1

Checking to see what the baseline metric for the SVM is.

In [1]:
import os
import cv2
import glob
import time
import mlflow
import shutil
import itertools

import numpy as np
import pandas as pd

from PIL import Image
from sklearn import svm
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [2]:
TRAIN_DIR = "../data/train"
CLASS_FOLDERS = ["Class A", "Class B", "Class C", "Class D"]
ML_FLOW_DIRECTORY = "SVM_Logs"

In [3]:
mlflow.set_tracking_uri(ML_FLOW_DIRECTORY)
mlflow.set_experiment("SVM_V1")

<Experiment: artifact_location='/Users/jacob/Code/Monkey-Business/SVM/SVM_V1_Logs/331033974301432080', creation_time=1701907697832, experiment_id='331033974301432080', last_update_time=1701907697832, lifecycle_stage='active', name='SVM_V1', tags={}>

In [4]:
C_values = [0.1, 1, 10]
kernel_types = ['linear', 'rbf']
gamma_values = ['scale', 'auto']

In [5]:
hyperparameter_combinations = list(
    itertools.product(C_values, kernel_types, gamma_values)
)
print(f"Total number of hyperparameter combinations: {len(hyperparameter_combinations)}")

Total number of hyperparameter combinations: 12


In [6]:
def train_with_params(C, kernel, gamma, train_images, train_labels):
    with mlflow.start_run() as run:
        start_time = time.time()

        # Initialize SVM model
        svm_model = svm.SVC(
            C=C, kernel=kernel, gamma=gamma, class_weight="balanced", verbose=True
        )

        # Perform cross-validation
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scoring = {
            "f1_weighted": make_scorer(f1_score, average="weighted"),
            "accuracy": "accuracy",
            "precision": make_scorer(precision_score, average="weighted"),
            "recall": make_scorer(recall_score, average="weighted"),
        }
        cv_results = cross_validate(
            svm_model, train_images, train_labels, cv=skf, scoring=scoring, verbose=2
        )

        # Log hyperparameters and metrics
        mlflow.log_params({"C": C, "kernel": kernel, "gamma": gamma})
        mlflow.log_metrics(
            {
                "mean_f1_weighted": cv_results["test_f1_weighted"].mean(),
                "mean_accuracy": cv_results["test_accuracy"].mean(),
                "mean_precision": cv_results["test_precision"].mean(),
                "mean_recall": cv_results["test_recall"].mean(),
            }
        )
        end_time = time.time()
        duration = end_time - start_time
        param_details = f"C={C}, kernel={kernel}, gamma={gamma}"
        duration_details = f"Training duration for {param_details}: {duration} seconds"
        print(duration_details)
        run_id = run.info.run_id
        return run_id

In [7]:
def load_images_from_folder(folder):
    images = []
    labels = []
    for class_folder in CLASS_FOLDERS:
        start_time = time.time()
        class_path = os.path.join(folder, class_folder)
        image_files = glob.glob(os.path.join(class_path, "*.png"))
        for img_file in image_files:
            processed_image = cv2.imread(img_file)
            flattened_image = processed_image.flatten()
            images.append(flattened_image)
            labels.append(class_folder)
        end_time = time.time()
        duration = end_time - start_time
        print(f"Processing {class_folder} took {duration} seconds")
    return np.array(images), np.array(labels)

In [8]:
train_images, train_labels = load_images_from_folder(TRAIN_DIR)

Processing Class A took 1.707284927368164 seconds
Processing Class B took 0.9642601013183594 seconds
Processing Class C took 0.681333065032959 seconds
Processing Class D took 0.7671737670898438 seconds


In [9]:
print(
    f"Shape of train_images: {train_images.shape}, Shape of train_labels: {train_labels.shape}"
)

Shape of train_images: (2796, 150528), Shape of train_labels: (2796,)


In [15]:
run_ids = []

for params in hyperparameter_combinations:
    C, kernel, gamma = params
    run_id = train_with_params(C, kernel, gamma, train_images, train_labels)
    run_ids.append(run_id)

[LibSVM].....*...*
optimization finished, #iter = 8063
obj = -0.431008, rho = -1.120097
nSV = 526, nBSV = 0
.*
optimization finished, #iter = 1708
obj = -0.044959, rho = -1.695860
nSV = 219, nBSV = 0
..*.*
optimization finished, #iter = 3496
obj = -0.101756, rho = -0.157115
nSV = 292, nBSV = 0
...*..*
optimization finished, #iter = 4196
obj = -0.178770, rho = -1.377789
nSV = 390, nBSV = 0
...*..*
optimization finished, #iter = 4624
obj = -0.152781, rho = 0.765236
nSV = 389, nBSV = 0
...*..*
optimization finished, #iter = 3634
obj = -0.129693, rho = 2.038100
nSV = 391, nBSV = 0
Total nSV = 1260
[CV] END .................................................... total time= 4.1min
[LibSVM].......*..*
optimization finished, #iter = 9477
obj = -0.391359, rho = -0.979554
nSV = 506, nBSV = 0
.*
optimization finished, #iter = 1673
obj = -0.041186, rho = -2.073674
nSV = 207, nBSV = 0
..*.*
optimization finished, #iter = 3482
obj = -0.095444, rho = -0.104243
nSV = 290, nBSV = 0
...*.*
optimization fi

*
optimization finished, #iter = 720
obj = -82.526111, rho = 0.136465
nSV = 717, nBSV = 715
.
*
optimization finished, #iter = 806
obj = -87.139503, rho = 0.249973
nSV = 804, nBSV = 798
*
optimization finished, #iter = 666
obj = -87.770022, rho = -0.249904
nSV = 667, nBSV = 666
Total nSV = 1864


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END .................................................... total time= 8.8min
Training duration for C=0.1, kernel=rbf, gamma=auto: 1577.7833938598633 seconds
[LibSVM].....*...*
optimization finished, #iter = 8063
obj = -0.431008, rho = -1.120097
nSV = 526, nBSV = 0
.*
optimization finished, #iter = 1708
obj = -0.044959, rho = -1.695860
nSV = 219, nBSV = 0
..*.*
optimization finished, #iter = 3496
obj = -0.101756, rho = -0.157115
nSV = 292, nBSV = 0
...*..*
optimization finished, #iter = 4196
obj = -0.178770, rho = -1.377789
nSV = 390, nBSV = 0
...*..*
optimization finished, #iter = 4624
obj = -0.152781, rho = 0.765236
nSV = 389, nBSV = 0
...*..*
optimization finished, #iter = 3634
obj = -0.129693, rho = 2.038100
nSV = 391, nBSV = 0
Total nSV = 1260
[CV] END .................................................... total time= 4.0min
[LibSVM].......*..*
optimization finished, #iter = 9477
obj = -0.391359, rho = -0.979554
nSV = 506, nBSV = 0
.*
optimization finished, #iter = 1673
obj = -0.

*
optimization finished, #iter = 771
obj = -686.270233, rho = 0.733353
nSV = 697, nBSV = 675
*
optimization finished, #iter = 632
obj = -636.837565, rho = 0.879092
nSV = 554, nBSV = 520
Total nSV = 1699
[CV] END .................................................... total time= 7.3min
Training duration for C=1, kernel=rbf, gamma=auto: 1296.235414981842 seconds
[LibSVM].....*...*
optimization finished, #iter = 8063
obj = -0.431008, rho = -1.120097
nSV = 526, nBSV = 0
.*
optimization finished, #iter = 1708
obj = -0.044959, rho = -1.695860
nSV = 219, nBSV = 0
..*.*
optimization finished, #iter = 3496
obj = -0.101756, rho = -0.157115
nSV = 292, nBSV = 0
...*..*
optimization finished, #iter = 4196
obj = -0.178770, rho = -1.377789
nSV = 390, nBSV = 0
...*..*
optimization finished, #iter = 4624
obj = -0.152781, rho = 0.765236
nSV = 389, nBSV = 0
...*..*
optimization finished, #iter = 3634
obj = -0.129693, rho = 2.038100
nSV = 391, nBSV = 0
Total nSV = 1260
[CV] END .............................

*
optimization finished, #iter = 785
obj = -2124.633780, rho = -2.225724
nSV = 437, nBSV = 324
*.*
optimization finished, #iter = 820
obj = -3816.297795, rho = 1.367152
nSV = 462, nBSV = 334
.*
optimization finished, #iter = 1015
obj = -3724.158268, rho = 0.676151
nSV = 532, nBSV = 327
.*
optimization finished, #iter = 988
obj = -3704.394569, rho = -0.563276
nSV = 436, nBSV = 274
Total nSV = 1437
[CV] END .................................................... total time= 5.7min
Training duration for C=10, kernel=rbf, gamma=auto: 1018.8664572238922 seconds


In [41]:
run_metrics = {}

for run_id in run_ids:
    client = mlflow.tracking.MlflowClient(ML_FLOW_DIRECTORY)
    run = client.get_run(run_id)
    metrics = run.data.metrics
    run_metrics[run_id] = metrics

for run_id, metrics in run_metrics.items():
    print(f"Run ID: {run_id}")
    for metric, value in metrics.items():
        print(f"\t{metric}: {value}")

Run ID: bcf3d811341a4a01a4c79a5cc36dadf4
	mean_f1_weighted: 0.8047625046491292
	mean_accuracy: 0.8065092989985695
	mean_recall: 0.8065092989985695
	mean_precision: 0.8045501982916231
Run ID: f2048ec06f324fb4938416124f694c43
	mean_f1_weighted: 0.31760977436605303
	mean_accuracy: 0.3758941344778255
	mean_recall: 0.3758941344778255
	mean_precision: 0.46745896353895006
Run ID: 137e9db0e44241cea2ff791ee35388b8
	mean_f1_weighted: 0.8047625046491292
	mean_accuracy: 0.8065092989985695
	mean_recall: 0.8065092989985695
	mean_precision: 0.8045501982916231
Run ID: 6783e97d3bad4ddfb1b6d703ea8bb29c
	mean_f1_weighted: 0.8364481220648057
	mean_accuracy: 0.8376251788268956
	mean_recall: 0.8376251788268956
	mean_precision: 0.8358215197593659
Run ID: b25e739433be4709895e7545e01bb8e6
	mean_f1_weighted: 0.8078734926798968
	mean_accuracy: 0.8104434907010014
	mean_recall: 0.8104434907010014
	mean_precision: 0.8077360245207114
Run ID: 5602976e36c04bc6b7095f1dafc2a37d
	mean_f1_weighted: 0.7673151517777246
	mea

In [43]:
best_run_id = max(run_metrics, key=lambda x: run_metrics[x]["mean_f1_weighted"])
best_params = client.get_run(best_run_id).data.params

print(f"\nBest Run ID: {best_run_id}")
print("Best Parameters:")
for key, value in best_params.items():
    print(f"\t{key}: {value}")


Best Run ID: 6783e97d3bad4ddfb1b6d703ea8bb29c
Best Parameters:
	gamma: scale
	C: 10
	kernel: rbf
