# Validation

In [33]:
import numpy as np
import pandas as pd
from __future__ import division
from random import uniform, randint
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron


In [22]:
# Load training and testing data.
# Both files are whitespace-separated with no header row; each row is x1, x2, y.
# sep=r'\s+' replaces the deprecated delim_whitespace=True and parses identically.
train = pd.read_csv('~/ml/in_dta.txt', sep=r'\s+', header=None, names=['x1', 'x2', 'y'])
test = pd.read_csv('~/ml/out_dta.txt', sep=r'\s+', header=None, names=['x1', 'x2', 'y'])

def transform(data):
    """Apply the non-linear feature map so linear regression can fit a non-linear target.

    phi(x1, x2) = (1, x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|)

    The new features are stored as columns x0 and x3..x7 next to the existing
    x1, x2 and y columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'x1' and 'x2'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus x0, x3, ..., x7, with
        the columns sorted by name (x0, x1, ..., x7, y).
    """
    # Work on a copy so the caller's frame is left untouched and the
    # function can safely be called repeatedly on the same input.
    out = data.copy()
    x1 = out['x1']
    x2 = out['x2']
    out['x0'] = 1
    out['x3'] = x1 ** 2
    out['x4'] = x2 ** 2
    out['x5'] = x1 * x2
    out['x6'] = (x1 - x2).abs()
    out['x7'] = (x1 + x2).abs()
    # Sort columns into the correct order. reindex(columns=...) is used
    # because reindex_axis no longer exists in current pandas.
    return out.reindex(columns=sorted(out.columns))
    
def lin_regression(data):
    """Fit ordinary least-squares linear regression to the dataset.

    Solves the normal equations (x^T x) w = x^T y, i.e. w = (x^T x)^-1 x^T y.

    Parameters
    ----------
    data : pandas.DataFrame
        Target values in column 'y'; every other column is used as a feature,
        in the frame's column order.

    Returns
    -------
    numpy.matrix
        Column vector of weights with shape (n_features, 1).

    Raises
    ------
    numpy.linalg.LinAlgError
        If x^T x is singular.
    """
    # .values replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas; .values is available in old and new versions alike.
    features = np.asarray(data.drop(labels='y', axis=1).values, dtype=float)
    targets = np.asarray(data['y'].values, dtype=float).reshape(-1, 1)
    gram = features.T.dot(features)
    moment = features.T.dot(targets)
    # Solving the linear system avoids forming the explicit inverse.
    return np.matrix(np.linalg.solve(gram, moment))

def lin_regression_weights(data, k):
    """Fit linear regression with weight decay (ridge regularisation).

    Solves (x^T x + lambda * I) w = x^T y with lambda = 10**k.

    Parameters
    ----------
    data : pandas.DataFrame
        Target values in column 'y'; every other column is used as a feature,
        in the frame's column order.
    k : int or float
        Base-10 exponent of the regularisation strength (lambda = 10**k).

    Returns
    -------
    numpy.matrix
        Column vector of weights with shape (n_features, 1).
    """
    # .values replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas; .values is available in old and new versions alike.
    features = np.asarray(data.drop(labels='y', axis=1).values, dtype=float)
    targets = np.asarray(data['y'].values, dtype=float).reshape(-1, 1)
    gram = features.T.dot(features)
    # Use a float base so negative exponents work for every integer type of k.
    decay = (10.0 ** k) * np.identity(len(gram))
    # Solving the linear system avoids forming the explicit inverse.
    return np.matrix(np.linalg.solve(gram + decay, features.T.dot(targets)))

def error(data, w):
    """Classification error of the linear hypothesis w on a transformed dataset.

    A point counts as misclassified when sign(x . w) differs from its label,
    so a prediction of exactly 0 is always counted as an error.

    Parameters
    ----------
    data : pandas.DataFrame
        Labels in column 'y'; every other column is used as a feature, in the
        frame's column order.
    w : array-like
        Weight vector, either shape (n_features, 1) or (n_features,).

    Returns
    -------
    float
        Proportion of misclassified points.

    Raises
    ------
    ZeroDivisionError
        If data has no rows.
    """
    # .values replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas; .values is available in old and new versions alike.
    features = np.asarray(data.drop(labels='y', axis=1).values, dtype=float)
    labels = np.asarray(data['y'].values).ravel()
    predicted = np.asarray(features.dot(np.asarray(w))).ravel()
    misclassified = np.count_nonzero(np.sign(predicted) != labels)
    # Explicit float so the result does not depend on `from __future__ import division`.
    return float(misclassified) / len(labels)

# Split the in-sample data: the first 25 rows are used for training and the
# last 10 rows for validation. Evaluate the error on the validation set.
train_t = transform(train)
val_t = train_t.tail(n=10)
training = train_t.head(n=25)
test_t = transform(test)
# best holds (lowest error seen so far, k that produced it)
best = (float('inf'), -1)
for k in range(3, 8):
    # Model k uses the features x0..xk only, so drop x{k+1}..x7
    unused = ['x{}'.format(j) for j in range(k + 1, 8)]
    data = training.drop(labels=unused, axis=1)
    validation = val_t.drop(labels=unused, axis=1)
    w = lin_regression(data)
    e = error(validation, w)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(e)
    if e < best[0]:
        best = (e, k)
print(best)


0.3
0.5
0.2
0.0
0.1
(0.0, 6)


In [23]:
# Evaluate out of sample classification error:
# train on the first 25 rows, measure the error on the test set.
train_t = transform(train)
val_t = train_t.tail(n=10)
training = train_t.head(n=25)
test_t = transform(test)
# best holds (lowest error seen so far, k that produced it)
best = (float('inf'), -1)
for k in range(3, 8):
    # Model k uses the features x0..xk only, so drop x{k+1}..x7
    unused = ['x{}'.format(j) for j in range(k + 1, 8)]
    data = training.drop(labels=unused, axis=1)
    validation = test_t.drop(labels=unused, axis=1)
    w = lin_regression(data)
    e = error(validation, w)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(e)
    if e < best[0]:
        best = (e, k)
print(best)

0.42
0.416
0.188
0.084
0.072
(0.072, 7)


In [25]:
# Train with the last 10 examples, validate with the first 25
train_t = transform(train)
val_t = train_t.head(n=25)
training = train_t.tail(n=10)
test_t = transform(test)
# best holds (lowest error seen so far, k that produced it)
best = (float('inf'), -1)
for k in range(3, 8):
    # Model k uses the features x0..xk only, so drop x{k+1}..x7
    unused = ['x{}'.format(j) for j in range(k + 1, 8)]
    data = training.drop(labels=unused, axis=1)
    validation = val_t.drop(labels=unused, axis=1)
    w = lin_regression(data)
    e = error(validation, w)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(e)
    if e < best[0]:
        best = (e, k)
print(best)

0.28
0.36
0.2
0.08
0.12
(0.08, 6)


In [26]:
# Evaluate again on out of sample data:
# train on the last 10 rows, measure the error on the test set.
train_t = transform(train)
val_t = train_t.head(n=25)
training = train_t.tail(n=10)
test_t = transform(test)
# best holds (lowest error seen so far, k that produced it)
best = (float('inf'), -1)
for k in range(3, 8):
    # Model k uses the features x0..xk only, so drop x{k+1}..x7
    unused = ['x{}'.format(j) for j in range(k + 1, 8)]
    data = training.drop(labels=unused, axis=1)
    validation = test_t.drop(labels=unused, axis=1)
    w = lin_regression(data)
    e = error(validation, w)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(e)
    if e < best[0]:
        best = (e, k)
print(best)

0.396
0.388
0.284
0.192
0.196
(0.192, 6)


# PLA vs SVM

In [94]:
def f():
    """Draw a random line through the square [-1,1] x [-1,1].

    Two points are sampled uniformly from the square and the line through
    them is returned as a (gradient, intercept) pair.
    """
    # First point, then second point; each as (x, y)
    px, py = uniform(-1, 1), uniform(-1, 1)
    qx, qy = uniform(-1, 1), uniform(-1, 1)

    gradient = (qy - py) / (qx - px)
    intercept = py - gradient * px
    return (gradient, intercept)

def arrays(n):
    """Generate n random points in the square [-1,1] x [-1,1].

    Parameters
    ----------
    n : int
        Number of points to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 3). Column 0 is the constant x_0 = 1; columns 1
        and 2 are coordinates drawn uniformly from [-1, 1]. For n == 0 the
        result has shape (0, 3), so column indexing keeps working on it.
    """
    points = np.ones((n, 3))
    # Rows are filled from the last to the first, so a seeded `random`
    # produces the same points as the earlier loop-based version.
    for row in range(n - 1, -1, -1):
        points[row, 1] = uniform(-1, 1)
        points[row, 2] = uniform(-1, 1)
    return points

def yarray(x, line, n=None):
    """Label each point in x according to which side of a line it lies on.

    Parameters
    ----------
    x : numpy.ndarray
        Points, one per row; column 1 is the horizontal coordinate and
        column 2 the vertical coordinate.
    line : tuple
        (m, c), the gradient and intercept of the line.
    n : int, optional
        Number of rows in the returned array. Defaults to len(x).

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 1). Row i is +1 when the point lies strictly above
        the line (x[i, 2] > m * x[i, 1] + c) and -1 otherwise. Rows beyond
        len(x) are left at 0.

    Raises
    ------
    IndexError
        If x has more rows than n.
    """
    # The line is unpacked here because tuple parameters in the signature
    # are a syntax error on Python 3; callers still pass (m, c) positionally.
    m, c = line
    x = np.asarray(x)
    if n is None:
        n = len(x)
    ys = np.zeros((n, 1))
    if len(x) == 0:
        return ys
    if len(x) > n:
        raise IndexError("x has {} rows but only n={} labels were requested".format(len(x), n))
    above = x[:, 2] > m * x[:, 1] + c
    ys[:len(x), 0] = np.where(above, 1, -1)
    return ys

def e_in(w, x, y):
    """Return the error E_in of weights w on the points x with labels y.

    The error is the fraction of points i for which sign(x[i] . w) is not
    equal to y[i].
    """
    wrong = sum(1 for idx in range(len(x))
                if np.sign(np.dot(x[idx], w)) != y[idx])
    return wrong / len(x)

def PLA(w, x, y):
    """Run the perceptron learning algorithm starting from weight vector w.

    Each pass collects every point i with sign(x[i] . w) != y[i], picks one
    of them at random and adds y[i] * x[i] to w. The loop stops as soon as no
    point is misclassified, or after 1E6 passes, and the final w is returned.
    """
    max_passes = 1E6
    passes = 0
    while passes < max_passes:
        # h(x) = sign(w^T.x); gather the indices where h disagrees with y
        wrong = [idx for idx in range(len(x))
                 if np.sign(np.dot(x[idx], w)) != y[idx]]
        passes += 1
        # nothing misclassified: we have converged on g
        if len(wrong) == 0:
            break
        # randomly choose one misclassified point to drive the update
        chosen = wrong[randint(0, len(wrong) - 1)]
        # w <- w + y_i * x_i
        step = np.array(x[chosen]) * np.array(y[chosen])
        w = w + step.T
    return w

In [120]:
# Average clf.n_support_[0] (the support-vector count sklearn reports for the
# first class) over up to `repeat` random targets with N training points each.
# NOTE(review): n_support_[0] covers one class only; use clf.n_support_.sum()
# if the total number of support vectors is what should be averaged.
N = 100
repeat = 100
count = 0
completed = 0
for i in range(repeat):
    (m, c) = f()
    x = arrays(N)
    y = yarray(x, (m, c), N)
    # Check if all points have same classification, if so skip
    Sum = np.sum(y)
    if Sum == N or Sum == -N:
        continue
    completed += 1
    w = PLA(np.zeros(3), x, y)
    clf = SVC(C=np.inf, shrinking=False, kernel='poly')
    # sklearn expects a 1-D label vector, not an (N, 1) column
    clf.fit(x, y.ravel())
    count += clf.n_support_[0]
    # Generate test points
    x_test = arrays(1000)
    y_test = yarray(x_test, (m, c), 1000)
    # NOTE(review): acc and E_pla are computed but never compared or reported
    acc = clf.score(x_test, y_test.ravel())
    E_pla = e_in(w, x_test, y_test)
# Average over the runs that were actually fitted; skipped runs must not
# dilute the mean.
average = float(count) / completed if completed else float('nan')
print("{}".format(average))

2.43


In [None]:
# Scratch cell: fit a hard-margin SVM (default kernel) on the x, y left over
# from the last run of the experiment loop.
# NOTE(review): the original call passed no data to fit(); x and y are assumed
# to be the intended training set - confirm.
clf = SVC(C=np.inf)  # the keyword is upper-case C; lower-case c raises TypeError
clf.fit(x, y.ravel())