# SVM Classification model

Use this file as a reference for building an SVM classification model. The functions here are generic and can be reused across datasets. Each function is described in its docstring.

In [None]:
import collections
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## Training block

The code block below uses the training features and labels to fit an SVM classifier.

In [1]:
def Train_classifier(train_features, train_labels, kernel="linear"):
    """Fit an SVM classifier on the given features and labels.

    args:
        train_features: scipy.sparse.csr.csr_matrix -- sparse matrix of features
        train_labels : numpy.ndarray(bool): binary vector of class labels
        kernel : str -- kernel function to be used with classifier, must be (linear|poly|rbf|sigmoid)

    return : sklearn.svm.classes.SVC -- the fitted classifier

    raises:
        ValueError -- if kernel is not one of the supported kernel names
    """
    # Validate before fitting: training can be expensive, so an unsupported
    # kernel should be rejected up front rather than after the work is done.
    supported_kernels = ("linear", "poly", "rbf", "sigmoid")
    if kernel not in supported_kernels:
        raise ValueError(
            "kernel must be one of %s, got %r" % (supported_kernels, kernel)
        )

    classifier = SVC(kernel=kernel, gamma='auto')
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(train_features, train_labels)

## Model Metric

In [2]:
def f1(pred, ground):
    """Compute the binary F1 score of predictions against ground truth.

    Both inputs are coerced to boolean arrays, so any non-zero / truthy
    value counts as the positive class.

    args:
        pred: numpy.ndarray(bool) -- predictions
        ground: numpy.ndarray(bool) -- known ground-truth values

    return : double -- the F1 score of the predictions; 0.0 when there are
        no true positives, false positives or false negatives at all

    raises:
        ValueError -- if pred and ground do not have the same shape
    """
    pred = np.asarray(pred, dtype=bool)
    ground = np.asarray(ground, dtype=bool)

    if pred.shape != ground.shape:
        raise ValueError(
            "pred and ground must have the same shape, got %s and %s"
            % (pred.shape, ground.shape)
        )

    true_pos = np.count_nonzero(pred & ground)
    false_pos = np.count_nonzero(pred & ~ground)
    false_neg = np.count_nonzero(~pred & ground)

    # F1 = 2*TP / (2*TP + FP + FN), the harmonic mean of precision and recall.
    denominator = 2 * true_pos + false_pos + false_neg
    if denominator == 0:
        # No positives predicted and none present: the score is undefined,
        # report 0.0 instead of dividing by zero.
        return 0.0
    return 2.0 * true_pos / denominator

In [None]:
def confusion_matrix_accuracy(test_labels, pred_labels):
    """Build the confusion matrix and accuracy for a set of predictions.

    The accuracy is also printed to standard output.

    args:
        test_labels: ground-truth labels
        pred_labels: labels predicted by the model, in the same order

    return : tuple -- (confusion matrix, accuracy score)
    """
    # Both metrics must be computed from the same predictions passed in by
    # the caller (the original referenced an undefined name here).
    matrix = confusion_matrix(test_labels, pred_labels)
    accuracy = accuracy_score(test_labels, pred_labels)
    # print the accuracy score on the test data
    print('Accuracy Score :', accuracy)
    return matrix, accuracy
    

## Evaluation of the model

In [3]:
def evaluate(train_features, train_labels, kernel="linear"):
    """Train the classifier and predict on the training set.

    args:
        train_features: scipy.sparse.csr.csr_matrix -- sparse matrix of features
        train_labels : numpy.ndarray(bool): binary vector of class labels
        kernel : str -- kernel function to be used with classifier, must be (linear|poly|rbf|sigmoid)

    return : tuple -- (predictions on the training features, fitted model)
    """
    # Forward the requested kernel; without it the classifier would always
    # be trained with the default linear kernel.
    model = Train_classifier(train_features, train_labels, kernel)
    preds = model.predict(train_features)

    return preds, model

## Data loading and cleaning

In [None]:
def file_read_clean(filename,
                    dataset_directory="D:\\Coursework\\Capstone\\clean_repo\\data_load\\dataset",
                    test_size=0.3,
                    random_state=None):
    """Load a CSV dataset, drop unused columns and split it into train/test sets.

    args:
        filename: str -- name of the CSV file inside dataset_directory
        dataset_directory: str -- directory that contains the CSV file
        test_size: float -- fraction of the rows held out as test data
        random_state: seed passed to train_test_split; None gives a
            different split on every call

    return : tuple -- (train_features, test_features, train_labels, test_labels)
        as NumPy arrays
    """
    bank_vs_demo = pd.read_csv(os.path.join(dataset_directory, filename))

    # Drop the columns that are not used as features.
    bank_vs_demo = bank_vs_demo.drop(
        ['index', 'year', 'state', 'zip', 'bank_open', 'bank_close', 'bank_net'],
        axis=1,
    )

    # 'ground_truth' holds the labels; every remaining column is a feature.
    data_list = bank_vs_demo.drop('ground_truth', axis=1)
    label_list = bank_vs_demo['ground_truth']

    # Convert to NumPy arrays with .values (DataFrame.as_matrix() was removed
    # in pandas 1.0) and split into train and test data.
    train_features, test_features, train_labels, test_labels = train_test_split(
        data_list.values,
        label_list.values,
        test_size=test_size,
        random_state=random_state,
    )
    return train_features, test_features, train_labels, test_labels

## Putting it all together

In [None]:
# Name of the CSV file to load; replace the placeholder before running.
filename = "<data_set_file_name>"

# Load the data, fit a linear-kernel SVM and predict on the held-out split.
(train_features, test_features,
 train_labels, test_labels) = file_read_clean(filename)
train_preds, model = evaluate(train_features, train_labels, kernel="linear")
test_preds = model.predict(test_features)

# Metrics on the test split (the accuracy is printed by the helper).
test_f1_score = f1(test_preds, test_labels)
matrix, accuracy = confusion_matrix_accuracy(test_labels, test_preds)

# Tick labels for the confusion-matrix axes.
labels = ['Open', 'Constant', 'Close']

# Draw the confusion matrix for the test data as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(
    matrix,
    annot=True,
    fmt="d",
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Test Data')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('Groundtruth')

plt.show()