# Math 582 Miniproject 3 - Model Development

The purpose of this notebook is implment dual SVM convex quadratic optimization for the purposes of binary classification.

In [359]:
# Imports

import numpy as np
import pandas as pd
from functools import reduce
from qpsolvers import solve_qp
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [360]:
# read in data
df = pd.read_csv('./data/test-data/test_data.csv')
df

Unnamed: 0,age,interest,success
0,23.657801,18.859917,0.0
1,22.573729,17.969223,0.0
2,32.553424,29.463651,0.0
3,6.718035,25.704665,1.0
4,14.401919,16.770856,0.0
...,...,...,...
292,27.697220,18.799309,0.0
293,15.150959,72.000352,1.0
294,22.264378,68.453459,1.0
295,25.677420,90.118212,1.0


In [361]:
# This function performs the following:
# - maps classifiers values -1 or +1
# - separates feature columns from the classifier column
# - splits the data into training and testing sets
# returns: xs_train, xs_test, ys_train, ys_test

def prep_data(df, classifier_column_name, classifier_vals, train_size=0.75):

    if len(classifier_vals) != 2:
        raise ValueError("classifier_vals argument must be length 2 (binary classifier)")
    
    # map each binary classifier value to either 1 or -1
    df[classifier_column_name] = df[classifier_column_name].apply(lambda b: -1 if b == classifier_vals[0] else 1)

    # separate the features from the classifications
    colnames = df.columns.tolist()
    feature_column_names = list(filter(lambda colname: colname != classifier_column_name, colnames))

    xs = df[feature_column_names]
    ys = df[classifier_column_name]

    # split the data into training and testing data
    datasets = train_test_split(xs, ys, train_size=train_size)

    # map all the training data into numpy arrays
    xs_train, xs_test, ys_train, ys_test = list(map(lambda s: s.to_numpy(), datasets))

    # return the training and testing data
    return xs_train, xs_test, ys_train, ys_test


In [362]:
# compute the kernel matrix K
def kernel_matrix(xs_train, k):
    N = xs_train.shape[0]
    K = np.zeros(shape=(N,N))
    for i in range(0, N):
        for j in range(0, i + 1):
            K[i][j] = K[j][i] = k(xs_train[i], xs_train[j])
    return K

# assume A is an NxN matrix
def make_positive_definite(A):
    N = A.shape[0]
    # if all eigenvals are not > 0, then add perturbation and try again
    while not np.all(np.linalg.eigvals(A) > 0):
        epsilon = 1e-10
        perturbation = epsilon * np.identity(N)
        A += perturbation
    return A



In [363]:
# define the function used to get the optimal lagrange multipliers (alpha) for given training data, kernel function, and cost function
def optimize(xs_train, ys_train, k, C):
    N = xs_train.shape[0]

    # compute the entries in the kernel matrix
    K = make_positive_definite(kernel_matrix(xs_train, k))
    Y = np.diag(ys_train)

    # quadratic program parameters
    P = Y @ K @ Y
    q = np.ones(N)
    G = np.vstack([ys_train, -1 * ys_train, -1 * np.identity(N), np.identity(N)])
    h = np.concatenate([np.zeros((N+2)), C * np.ones((N))])

    return solve_qp(P, q, G, h)

In [364]:
def build_svm_classifier(xs_train, ys_train, k, C):
    N = xs_train.shape[0]
    alpha = optimize(xs_train, ys_train, k, C)

    fw = lambda n: alpha[n] * ys_train[n] * xs_train[n]
    w = np.array(list(reduce(lambda v1, v2: v1 + v2, list(map(fw, range(0, N))))))

    fb = lambda n: np.abs(ys_train[n] - np.dot(w, xs_train[n]))
    b = np.median(np.array(list(map(fb, range(0, N)))))

    def classifier(xs_test):
        return np.array(list(map(lambda x_test: 1 if np.dot(w, x_test) + b >= 0 else -1, list(xs_test))))

    return classifier


In [365]:
def poly_kernel(d):
    if d < 1:
        raise ValueError("Invalid polynomial dimension for polynomial kernel")

    def inner(xi, xj):
        return (1 + np.dot(xi, xj)) ** d

    return inner

In [366]:
xs_train, xs_test, ys_train, ys_test = prep_data(df, "success", [0.0, 1.0], train_size=0.5)

k = poly_kernel(2)
C = 1
classifier = build_svm_classifier(xs_train, ys_train, k, C)

KeyboardInterrupt: 

In [None]:
def evaluate_classifier(classifier, xs_test, ys_test):
    numtests = xs_test.shape[0]
    results = classifier(xs_test) == ys_test
    numcorrect = len(list(filter(lambda b: b, list(results))))
    successrate = numcorrect / numtests
    return successrate

In [None]:
evaluate_classifier(classifier, xs_test, ys_test)

0.5369127516778524

In [None]:
linsvc = SVC(kernel = 'polynomial', degree=2, C=C)
linsvc.fit(xs_train, ys_train)
evaluate_classifier(linsvc.predict, xs_test, ys_test)

0.8993288590604027

In [None]:
classifier(xs_test) == ys_test

array([False,  True,  True,  True, False, False, False, False,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True, False, False,  True, False,
        True, False,  True, False, False, False, False, False,  True,
       False,  True, False, False, False,  True,  True,  True, False,
       False, False, False, False,  True, False,  True,  True, False,
       False,  True,  True, False, False,  True, False,  True, False,
       False,  True, False, False, False, False,  True, False, False,
        True, False, False,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True, False, False,  True,
        True, False, False, False,  True, False,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True, False, False, False,
        True,  True,