In [20]:
import pandas as pd
import numpy as np
import glob
import os
import joblib
import time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Prepare Data

In [3]:
# Load Training Data
def collect_parquets(train_path):
    """Load every ``*.parquet`` file directly inside ``train_path`` into one DataFrame.

    Files are read in sorted path order so that the row order of the result
    (and therefore any seeded split or fold assignment made downstream) does
    not depend on the filesystem; ``glob.glob`` on its own returns matches in
    arbitrary order.

    Parameters
    ----------
    train_path : str
        Directory containing the parquet files. Only the directory itself is
        searched, not its sub-directories.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.

    Raises
    ------
    ValueError
        If no ``*.parquet`` file is found under ``train_path``.
    """
    parquet_files = sorted(glob.glob(f"{train_path}/*.parquet"))
    if not parquet_files:
        # pd.concat([]) would raise the same exception type, but its message
        # ("No objects to concatenate") hides the real cause: a wrong path.
        raise ValueError(f"No .parquet files found in {train_path!r}")
    dfs = [pd.read_parquet(file) for file in parquet_files]
    return pd.concat(dfs, ignore_index = True)

In [9]:
# Directory (relative to this notebook) holding the training-split feature files.
PATH = '../../../Data/Features/All Features/train'
# Read every parquet file in that directory into a single frame and report its size.
training_data = collect_parquets(train_path = PATH)
print('Training data: ', training_data.shape)

All training data:  (6003, 4893)


In [10]:
# Display the column index as the cell output, to see which columns are
# labels/bookkeeping and which are embedding features.
training_data.columns

Index(['Class', 'harmonized_filename', 'image_path_blur', 'image_path_no_blur',
       'ViT_Embedding_Element_0', 'ViT_Embedding_Element_1',
       'ViT_Embedding_Element_2', 'ViT_Embedding_Element_3',
       'ViT_Embedding_Element_4', 'ViT_Embedding_Element_5',
       ...
       'VGG_Embedding_Element_502', 'VGG_Embedding_Element_503',
       'VGG_Embedding_Element_504', 'VGG_Embedding_Element_505',
       'VGG_Embedding_Element_506', 'VGG_Embedding_Element_507',
       'VGG_Embedding_Element_508', 'VGG_Embedding_Element_509',
       'VGG_Embedding_Element_510', 'VGG_Embedding_Element_511'],
      dtype='object', length=4893)

In [11]:
# Label column that the classifier is trained to predict.
target = training_data.loc[:, 'Class']

In [12]:
# Model inputs: every column except the label and the filename / image-path columns.
features = training_data.drop(
    ['Class', 'harmonized_filename', 'image_path_blur', 'image_path_no_blur'],
    axis = 'columns',
)

In [17]:
# Search space handed to train_SVM: candidate values for each SVC parameter
# plus the number of cross-validation folds.
hyperparameter_grid = dict(
    c_values = np.logspace(-1, 2, 4),      # 4 log-spaced values from 10**-1 to 10**2
    kernel_grid = ['rbf', 'poly'],
    gamma_grid = np.logspace(-1, 2, 4),    # same 4 log-spaced values, separate array
    degree_grid = [2, 3, 5, 7],
    k_folds = 5,
)

## Train Model

In [18]:
def _build_param_grid(hyperparameter_grid):
    """Build one GridSearchCV sub-grid per kernel, with only the parameters that kernel uses."""
    kernels = hyperparameter_grid['kernel_grid']
    if isinstance(kernels, str):
        # A bare kernel name would otherwise be iterated character by character.
        kernels = [kernels]

    sub_grids = []
    for kernel in kernels:
        sub_grid = {'C': hyperparameter_grid['c_values'], 'kernel': [kernel]}
        # gamma is the kernel coefficient for rbf/poly/sigmoid; linear has none.
        if kernel != 'linear':
            sub_grid['gamma'] = hyperparameter_grid['gamma_grid']
        # degree is ignored by every kernel except poly, so crossing it with
        # the other kernels only repeats identical fits.
        if kernel == 'poly':
            sub_grid['degree'] = hyperparameter_grid['degree_grid']
        sub_grids.append(sub_grid)
    return sub_grids


def train_SVM(path, hyperparameter_grid, test_size = 0.2, random_state = 42,
              n_jobs = None, model_path = "best_svm_model.joblib"):
    """Grid-search an SVC on the features under ``path`` and save the best model.

    The data is loaded with ``collect_parquets``, split into training and
    validation parts, and the grid search (with cross-validation) is run on
    the training part only. The refit best estimator is scored on both parts
    and written to ``model_path`` with joblib.

    Parameters
    ----------
    path : str
        Directory of parquet feature files, passed to ``collect_parquets``.
    hyperparameter_grid : dict
        Must provide ``'c_values'``, ``'kernel_grid'``, ``'gamma_grid'`` and
        ``'degree_grid'`` (candidate values) and ``'k_folds'`` (used as the
        ``cv`` argument of ``GridSearchCV``).
    test_size : float, default 0.2
        Validation share, forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the train/validation split.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``; ``-1`` uses all cores.
    model_path : str, default "best_svm_model.joblib"
        Where the best estimator is written.

    Returns
    -------
    output_dict : dict
        ``'fit_time'`` (seconds spent inside ``GridSearchCV.fit`` only),
        ``'train_accuracy'`` and ``'validation_accuracy'``.
    optimal_hyperparameters : dict
        Values of ``C``, ``degree``, ``gamma`` and ``kernel`` held by the best
        estimator. A parameter the winning kernel does not use was not
        searched and shows the SVC default.
    model_path : str
        Path the model was saved to.
    """
    # Load the data and separate the label from the model inputs
    training_data = collect_parquets(path)
    target = training_data['Class']
    features = training_data.drop(
        columns = ['Class', 'harmonized_filename', 'image_path_blur', 'image_path_no_blur']
    )

    # Hold out a validation set that the grid search never sees
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size = test_size, random_state = random_state
    )

    # Per-fold training scores are not requested: cv_results_ is never read
    # here, so computing them would only add prediction time to every fit.
    grid_search = GridSearchCV(
        SVC(),
        _build_param_grid(hyperparameter_grid),
        cv = hyperparameter_grid['k_folds'],
        scoring = 'accuracy',
        n_jobs = n_jobs,
    )

    # Time the search alone, so fit_time excludes data loading and splitting
    fit_start = time.perf_counter()
    grid_search.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    # Best estimator, as refit by GridSearchCV
    best_model = grid_search.best_estimator_

    # Accuracy on the data it was fit on and on the held-out validation set
    train_accuracy = accuracy_score(y_train, best_model.predict(X_train))
    val_accuracy = accuracy_score(y_val, best_model.predict(X_val))

    # Save the best model
    joblib.dump(best_model, model_path)

    output_dict = {
        'fit_time': fit_time,
        'train_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy
    }

    # Read the tuned parameters back from the estimator so the result always
    # carries the same four keys, whichever kernel's sub-grid won.
    fitted_params = best_model.get_params()
    optimal_hyperparameters = {
        name: fitted_params[name] for name in ('C', 'degree', 'gamma', 'kernel')
    }

    return output_dict, optimal_hyperparameters, model_path

## Run Training and Report Results

In [None]:
# Run the grid search on the training features, then show the metrics and the
# selected hyper-parameters on separate lines.
PATH = '../../../Data/Features/All Features/train'
output_dict, optimal_hyperparameters, joblib_file = train_SVM(
    path = PATH,
    hyperparameter_grid = hyperparameter_grid,
)
print(output_dict, optimal_hyperparameters, sep = '\n')