# Math 582 Miniproject 3 - Model Development

The purpose of this notebook is implment dual SVM convex quadratic optimization for the purposes of binary classification.

In [53]:
# Imports

import numpy as np
import pandas as pd
from qpsolvers import solve_qp
from sklearn.model_selection import train_test_split

In [54]:
# read in data
df = pd.read_csv('./data/test-data/test_data.csv')
df

Unnamed: 0,age,interest,success
0,23.657801,18.859917,0.0
1,22.573729,17.969223,0.0
2,32.553424,29.463651,0.0
3,6.718035,25.704665,1.0
4,14.401919,16.770856,0.0
...,...,...,...
292,27.697220,18.799309,0.0
293,15.150959,72.000352,1.0
294,22.264378,68.453459,1.0
295,25.677420,90.118212,1.0


In [55]:
# This function performs the following:
# - maps classifiers values -1 or +1
# - separates feature columns from the classifier column
# - splits the data into training and testing sets
# returns: xs_train, xs_test, ys_train, ys_test

def prep_data(df, classifier_column_name, classifier_vals):

    if len(classifier_vals) != 2:
        raise ValueError("classifier_vals argument must be length 2 (binary classifier)")
    
    # map each binary classifier value to either 1 or -1
    df[classifier_column_name] = df[classifier_column_name].apply(lambda b: -1 if b == classifier_vals[0] else 1)

    # separate the features from the classifications
    colnames = df.columns.tolist()
    feature_column_names = list(filter(lambda colname: colname != classifier_column_name, colnames))

    xs = df[feature_column_names]
    ys = df[classifier_column_name]

    # split the data into training and testing data
    datasets = train_test_split(xs, ys)

    # map all the training data into numpy arrays
    xs_train, xs_test, ys_train, ys_test = list(map(lambda s: s.to_numpy(), datasets))

    # return the training and testing data
    return xs_train, xs_test, ys_train, ys_test

xs_train, xs_test, ys_train, ys_test = prep_data(df, "success", [0.0, 1.0])
N = xs_train.shape[0]


In [56]:
# compute the kernel matrix K
def kernel_matrix(xs_train, k):
    N = xs_train.shape[0]
    K = np.zeros(shape=(N,N))
    for i in range(0, N):
        for j in range(0, i + 1):
            K[i][j] = K[j][i] = k(xs_train[i], xs_train[j])
    return K


In [57]:
# define the function used to get the optimal lagrange multipliers (alpha) for given training data, kernel function, and cost function
def optimize(xs_train, ys_train, k, C):
    N = xs_train.shape[0]

    # compute the entries in the kernel matrix
    K = kernel_matrix(xs_train, k)
    Y = np.diag(ys_train)

    # quadratic program parameters
    P = Y @ K @ Y
    q = np.ones(N)
    G = np.vstack([ys_train, -1 * ys_train, -1 * np.identity(N), np.identity(N)])
    h = np.concatenate([np.zeros((N+2)), C * np.ones((N))])

    return solve_qp(P, q, G, h)

In [58]:
# optimize(xs_train, ys_train, np.dot, 1)
K = kernel_matrix(xs_train, np.dot)
Y = np.diag(ys_train)
P = Y @ K @ Y
np.linalg.eigvals(P)

array([ 6.86326219e+05+0.00000000e+00j,  2.94470417e+04+0.00000000e+00j,
       -5.44546323e-11+0.00000000e+00j, -3.32570225e-11+3.87341395e-11j,
       -3.32570225e-11-3.87341395e-11j,  4.72696709e-11+1.78037351e-12j,
        4.72696709e-11-1.78037351e-12j,  1.17403765e-11+4.20030361e-11j,
        1.17403765e-11-4.20030361e-11j,  2.24729264e-11+3.42515607e-11j,
        2.24729264e-11-3.42515607e-11j,  6.64511565e-13+4.09863889e-11j,
        6.64511565e-13-4.09863889e-11j, -4.13462930e-11+0.00000000e+00j,
       -3.36855991e-11+2.27653019e-11j, -3.36855991e-11-2.27653019e-11j,
        3.78850114e-11+6.00961583e-12j,  3.78850114e-11-6.00961583e-12j,
        2.85404010e-11+2.58772629e-11j,  2.85404010e-11-2.58772629e-11j,
       -1.24399546e-11+3.47731175e-11j, -1.24399546e-11-3.47731175e-11j,
       -3.58625372e-11+0.00000000e+00j,  2.70945340e-11+1.34212527e-11j,
        2.70945340e-11-1.34212527e-11j, -2.57300071e-11+1.05664890e-11j,
       -2.57300071e-11-1.05664890e-11j,  5.11924695

In [59]:
optimize(xs_train, ys_train, np.dot, 1)

ValueError: matrix P is not positive definite