In [1]:
import pandas as pd
from sklearn import svm
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import cross_val_score
from joblib import Memory
from pathlib import Path


In [2]:
train_path = Path("artifacts/a9a_test.txt")
test_path = Path("artifacts/a9a_training.txt")

mem = Memory("./mycache")

@mem.cache
def get_data(path):
    data = load_svmlight_file(
        f=path,
        n_features=123)
    return data[0], data[1]

X_train, y_train = get_data(train_path)
X_test, y_test = get_data(test_path)

## Preparing SVM Model Class

We will run the SVM models using two types of kernel - Linear and RBF over a set of gamma and C (regularisation) hyperparameters  
- Linear SVM
- RBF SVM

In [4]:
class svc_model:
    def __init__ (self, X_train, y_train, X_test, y_test, C:float=1, kernel:str='rbf', gamma:float='scale'):
        """
        Instantiate a SVM model with default parameters
        Args:
            X_train
            y_train
            x_test
            y_test
            C [float]: regularisation hyperparameter
            kernel [str]: choice of kernel hyperparameter
            gamma [float]: kernel coefficient. default is 'scale'

        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        if kernel == 'linear':
            self.svc = svm.LinearSVC(C=C,dual=False)
        elif kernel == 'rbf':
            self.svc = svm.SVC(C=C,kernel=kernel,gamma=gamma)
        else:
            print("Kernel specified not in list of kernels available in this class")

    def fit (self):
        self.svc.fit(self.X_train, self.y_train)

    def predict(self):
        """
        Class function for prediction method using X test set

        Returns:
            pred []: 
        """
        return self.svc.predict(self.X_test)
    
    def score(self):
        """
        Class function to score using test data

        Returns:
            scores [ndarray]:A rray of scores of the estimator 
        """
        return self.svc.score(self.X_test, self.y_test)
    
    def cross_val_score(self, cv:float, scoring:str, data_type:str='training'):
        """
        Class function to score using k-fold cross validations

        Args:
            cv [float]: k-folds for cross validation
            scoring [str]: A str or a scorer callable object/function with signature scorer(estimator, X, y) which should return only a single value. default = None
            data_type [str]: scoring using either training or test set. Default is training

        Returns:
            scores [ndarray]: Array of scores of the estimator for each run of the cross validation
        """

        if data_type == 'training':
            scores = cross_val_score(estimator=self.svc, X=self.X_train, y=self.y_train, cv=cv)
        elif data_type == 'test':
            scores = cross_val_score(estimator=self.svc, X=self.X_test, y=self.y_test, cv=cv)

        return scores

### Linear Kernel

In [9]:
# regularisation parameters 
C_lst = [0.01, 0.05, 0.1, 0.5, 1]

# accuracy results using linear kernel

# initialising empty dict to store results
linear_accuracy_results = {}

# initialise counter
counter = 0

for C in C_lst:
    print(f'Running SVM model with the following model hyperparameters: C={C}, kernel: linear')
    # instantiate SVC model
    svc = svc_model(X_train, y_train, X_test, y_test,C=C,kernel='linear')
    svc.fit() # fit data
    # compute cross-validated metrics
    scores = svc.cross_val_score(cv=3, scoring='accuracy',data_type='training') 
    linear_accuracy_results[counter] = {'C':C, 'accuracy':round(scores.mean(),5)}
    
    counter +=1

Running SVM model with the following model hyperparameters: C=0.01, kernel: linear
Running SVM model with the following model hyperparameters: C=0.05, kernel: linear
Running SVM model with the following model hyperparameters: C=0.1, kernel: linear
Running SVM model with the following model hyperparameters: C=0.5, kernel: linear
Running SVM model with the following model hyperparameters: C=1, kernel: linear


In [10]:
for i in linear_accuracy_results:
    print(linear_accuracy_results[i])

{'C': 0.01, 'accuracy': 0.84958}
{'C': 0.05, 'accuracy': 0.85038}
{'C': 0.1, 'accuracy': 0.85038}
{'C': 0.5, 'accuracy': 0.8505}
{'C': 1, 'accuracy': 0.85032}


### RBF kernel

In [12]:
# regularisation parameters 
C_lst = [0.01, 0.05, 0.1, 0.5, 1]
# gamma parameters
gamma_list = [0.01, 0.05, 0.1, 0.5, 1]

# accuracy results using rbf kernel

# initialising empty dict to store results
rbf_accuracy_results = {}
counter = 0

for C in C_lst:
    for gamma in gamma_list:
        print(f'Running SVM model with the following model hyperparameters: C={C}, gamma={gamma}, kernel: rbf')

        # instantiate SVC model
        svc = svc_model(X_train, y_train, X_test, y_test,C=C,kernel='rbf',gamma=gamma)
        svc.fit() # fit data
        # compute cross-validated metrics
        scores = svc.cross_val_score(cv=3,scoring='accuracy',data_type='training') 
        rbf_accuracy_results[counter] = {'C':C, 'gamma':gamma, 'accuracy': round(scores.mean(),5)}
        counter += 1

Running SVM model with the following model hyperparameters: C=0.01, gamma=0.01, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.01, gamma=0.05, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.01, gamma=0.1, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.01, gamma=0.5, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.01, gamma=1, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.05, gamma=0.01, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.05, gamma=0.05, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.05, gamma=0.1, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.05, gamma=0.5, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.05, gamma=1, kernel: rbf
Running SVM model with the following model hyperparameters: C=0.1, gamma=0.01, kernel: rbf

In [13]:
for i in rbf_accuracy_results:
    print(rbf_accuracy_results[i])

{'C': 0.01, 'gamma': 0.01, 'accuracy': 0.76377}
{'C': 0.01, 'gamma': 0.05, 'accuracy': 0.76433}
{'C': 0.01, 'gamma': 0.1, 'accuracy': 0.77096}
{'C': 0.01, 'gamma': 0.5, 'accuracy': 0.76377}
{'C': 0.01, 'gamma': 1, 'accuracy': 0.76377}
{'C': 0.05, 'gamma': 0.01, 'accuracy': 0.78871}
{'C': 0.05, 'gamma': 0.05, 'accuracy': 0.83293}
{'C': 0.05, 'gamma': 0.1, 'accuracy': 0.83029}
{'C': 0.05, 'gamma': 0.5, 'accuracy': 0.76961}
{'C': 0.05, 'gamma': 1, 'accuracy': 0.76377}
{'C': 0.1, 'gamma': 0.01, 'accuracy': 0.83226}
{'C': 0.1, 'gamma': 0.05, 'accuracy': 0.83754}
{'C': 0.1, 'gamma': 0.1, 'accuracy': 0.83717}
{'C': 0.1, 'gamma': 0.5, 'accuracy': 0.79092}
{'C': 0.1, 'gamma': 1, 'accuracy': 0.76377}
{'C': 0.5, 'gamma': 0.01, 'accuracy': 0.84411}
{'C': 0.5, 'gamma': 0.05, 'accuracy': 0.84546}
{'C': 0.5, 'gamma': 0.1, 'accuracy': 0.84559}
{'C': 0.5, 'gamma': 0.5, 'accuracy': 0.82507}
{'C': 0.5, 'gamma': 1, 'accuracy': 0.77772}
{'C': 1, 'gamma': 0.01, 'accuracy': 0.84682}
{'C': 1, 'gamma': 0.05, '

## Optimal Hyperparameters

Determine which hyperparameters (gamma, C) and kernel used has the best model accuracy

In [14]:
def find_optimal_parameter (accuracy_results):
    """
    Function to determine optimal index for accuracy results

    Args:
        accuracy_results [dict] = two layers nested dictionary of accuracy results E.g. {0: {accuracy: value}, 1: {accuracy: value}}
    Returns:
        index [int]: index of accuracy_results which has the highest accuracy score
    """
    # initialise reference using first row
    accuracy = accuracy_results[0]['accuracy']
    index = 0

    # check for highest accuracy and record index
    for i in range(len(accuracy_results))[:-1]:
        if accuracy_results[i+1]['accuracy'] > accuracy:
            accuracy = rbf_accuracy_results[i+1]['accuracy']
            index = i+1

    return index

Optimal Parameter for Linear SVM

In [15]:
linear_accuracy_results[find_optimal_parameter(linear_accuracy_results)]

{'C': 1, 'accuracy': 0.85032}

Optimal Parameter for RBF SVM

In [16]:
rbf_accuracy_results[find_optimal_parameter(rbf_accuracy_results)]

{'C': 1, 'gamma': 0.01, 'accuracy': 0.84682}

The best kernel and best parameter setting will be the linear kernel with the following parameter settings: C - 1

In [17]:
kernel = 'linear'
C = 1

# instantiate SVC model
svc = svc_model(X_train, y_train, X_test, y_test,C=C,kernel=kernel)
svc.fit() # fit data
# compute cross-validated metrics
scores = svc.cross_val_score(cv=3,scoring='accuracy',data_type='test') 

In [18]:
print(round(scores.mean(),5))

0.84733
