In [1]:
import pandas as pd
import numpy as np
import glob
import os
import joblib
import time
import xlsxwriter
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Prepare Data

In [2]:
# Load Training Data
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    if directory_path[-1] != '/':
        directory_path += '/'
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    return combined_df

In [3]:
PATH = '../../../Data/Features/All Features/train'
training_data = combine_directory_parquets(PATH)
print('Training data: ', training_data.shape)

Training data:  (6003, 4893)


In [4]:
training_data.columns

Index(['Class', 'harmonized_filename', 'image_path_blur', 'image_path_no_blur',
       'ViT_Embedding_Element_0', 'ViT_Embedding_Element_1',
       'ViT_Embedding_Element_2', 'ViT_Embedding_Element_3',
       'ViT_Embedding_Element_4', 'ViT_Embedding_Element_5',
       ...
       'VGG_Embedding_Element_502', 'VGG_Embedding_Element_503',
       'VGG_Embedding_Element_504', 'VGG_Embedding_Element_505',
       'VGG_Embedding_Element_506', 'VGG_Embedding_Element_507',
       'VGG_Embedding_Element_508', 'VGG_Embedding_Element_509',
       'VGG_Embedding_Element_510', 'VGG_Embedding_Element_511'],
      dtype='object', length=4893)

In [5]:
target = training_data['Class']

In [6]:
features = training_data.drop(columns = ['Class', 'harmonized_filename', 'image_path_blur', 'image_path_no_blur'])

## Train Model

In [12]:
def prepare_matrices(data):
    '''
    Takes in a dataframe and returns X and y matrices.
    '''
    # Create matrices for training
    # X is all numeric columns, y is 'Class'
    num_cols = data.select_dtypes(include=np.number).columns
    X = data[num_cols]
    y = data['Class']

    # Preprocess with standard scalar
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y

    ##################################################################################################

def fit_svm_classifier(X_train, y_train, classifier_name):
    '''
    Fits an SVM classifier to the training data matrices.
    '''
    output_dir = '../../../Output/Classifier Fitting/SVM/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    hyperparameter_grid = {
    'c_values' :  np.logspace(-1, 2, num = 3),
    'kernel_grid' :  ['poly'],
    'degree_grid' : [2, 3, 5],
    'k_folds' : 5
    }
    
    # Setup the hyperparameter grid
    param_grid = {
        'C': hyperparameter_grid['c_values'],
        'kernel': hyperparameter_grid['kernel_grid'],
        'degree': hyperparameter_grid['degree_grid']
    }

    start_time = time.time()

    # Initialize the SVM classifier
    svm = SVC()
    
    # Setup GridSearchCV
    gs = GridSearchCV(svm, param_grid, cv = hyperparameter_grid['k_folds'], scoring = 'accuracy', return_train_score = True)
    
    # Fit the model
    gs.fit(X_train, y_train)

    # Training time
    fit_time = time.time() - start_time
    
    # Best model
    best_model = gs.best_estimator_
    
    # Save the model
    joblib.dump(best_model, output_dir + classifier_name + ' Best Model.joblib')

    runtime_minutes = fit_time / 60
    print("Training time in minutes: ", runtime_minutes)
    runtime_per_image = runtime_minutes / len(y_train)
    print("Training time per image in minutes: ", runtime_per_image)
    train_accuracy_best_model = gs.best_estimator_.score(X_train, y_train)
    print("Train accuracy of best model: ", train_accuracy_best_model)
    mean_cross_validated_accuracy = gs.best_score_
    print("Mean cross validated accuracy of best model: ", mean_cross_validated_accuracy)

    training_statistics_df = pd.DataFrame({
        'runtime_minutes': [runtime_minutes],
        'runtime_per_image': [runtime_per_image],
        'train_accuracy_best_model': [train_accuracy_best_model],
        'mean_cross_validated_accuracy': [mean_cross_validated_accuracy]
    })

    training_statistics_df.to_excel(output_dir + classifier_name + ' Training Statistics.xlsx')

    print("Hyperparameters searched: ", hyperparameter_grid)
    print("Tuned hyperparameters: ", gs.best_params_)

    joblib.dump(hyperparameter_grid, output_dir + classifier_name + ' Hyperparameter Settings.joblib')
    joblib.dump(gs.best_params_, output_dir + classifier_name + ' Tuned Hyperparameters.joblib')
    
    ##################################################################################################

## Save Results

In [13]:
# Training:
training_data = combine_directory_parquets(PATH)
X_train, y_train = prepare_matrices(training_data)
X_train = X_train[:500]
y_train = y_train[:500]
fit_svm_classifier(X_train, y_train, 'SVM_Classifier')


Training time in minutes:  0.5085772355397542
Training time per image in minutes:  0.0010171544710795084
Train accuracy of best model:  1.0
Mean cross validated accuracy of best model:  0.65
Hyperparameters searched:  {'c_values': array([  0.1       ,   3.16227766, 100.        ]), 'kernel_grid': ['poly'], 'degree_grid': [2, 3, 5], 'k_folds': 5}
Tuned hyperparameters:  {'C': 100.0, 'degree': 2, 'kernel': 'poly'}


In [14]:
def make_predictions(test_data, X_test, classifier_name):
    '''
    Makes predictions on the test data using the best SVM model.
    '''
    output_dir = '../../../Output/Classifier Fitting/SVM/'
    inference_dir = '../../../Output/Classifier Inference/SVM/'
    predictions_dir = '../../../Data/Predictions/SVM/'

    if not os.path.exists(inference_dir):
        os.makedirs(inference_dir)
    if not os.path.exists(predictions_dir):
        os.makedirs(predictions_dir)

    best_model = joblib.load(output_dir + classifier_name + ' Best Model.joblib')

    start_time = time.time()

    predictions = best_model.predict(X_test)

    end_time = time.time()

    runtime_minutes = (end_time - start_time) / 60
    print("Prediction time in minutes: ", runtime_minutes)
    runtime_per_image = runtime_minutes / len(test_data)
    print("Prediction time per image in minutes: ", runtime_per_image)

    prediction_statistics_df = pd.DataFrame({
        'runtime_minutes': [runtime_minutes],
        'runtime_per_image': [runtime_per_image]
    })

    prediction_statistics_df.to_excel(inference_dir + classifier_name + ' Prediction Statistics.xlsx', index=False)

    test_data['SVM_Classification'] = predictions

    limited_test_data = test_data[[col for col in test_data.columns if col not in test_data.select_dtypes(include = np.number).columns]]

    limited_test_data.to_excel(predictions_dir + 'SVM_Classifier_Predictions_' + classifier_name + '.xlsx', index = False)


In [15]:
# Testing
PATH_test = '../../../Data/Features/All Features/test'
test_data = combine_directory_parquets(PATH_test)
X_test, y_test = prepare_matrices(test_data)
X_test = X_test[:500]
y_test = y_test[:500]
make_predictions(test_data[:500], X_test, 'SVM_Classifier')

Prediction time in minutes:  0.005554600556691488
Prediction time per image in minutes:  1.1109201113382976e-05


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['SVM_Classification'] = predictions
