In [1]:
import numpy as np


def get_inout_data(filename):
    """
    Read a whitespace-separated numeric text file into an array.

    Every non-blank line becomes one row; each whitespace-delimited token
    on the line is parsed with float().

    Inputs:
        filename (str), path of the text file to read

    Outputs:
        full_data (ndarray), one row per non-blank line of the file

    Raises:
        ValueError if a token cannot be parsed as a float
    """
    # the with-block closes the handle even if parsing a line fails
    with open(filename) as file:
        full_data = [
            [float(e) for e in line.split()]
            for line in file
            if line.strip()  # a blank line would otherwise add an empty, ragged row
        ]
    return np.asarray(full_data)


def split_training_validation(full_data, n_training=25):
    """
    Split a data set into a leading training part and a trailing validation part.

    Inputs:
        full_data (sequence or ndarray), the rows to split
        n_training (int), number of leading rows placed in the training set;
            defaults to 25, the split that was previously hard-coded

    Outputs:
        (training_set, validation_set), the first n_training rows and the
        remaining rows, obtained by slicing full_data
    """
    training_set = full_data[:n_training]
    validation_set = full_data[n_training:]
    return (training_set, validation_set)


def x_to_z_space(full_data, k):
    """
    Map (x1, x2, label) rows to a truncated nonlinear feature space.

    The full feature vector of a point is
        (1, x1, x2, x1^2, x2^2, x1 * x2, |x1 - x2|, |x1 + x2|)
    and only its first k + 1 entries are kept. The vector has eight
    entries, so k is expected to be at most 7.

    Inputs:
        full_data (ndarray), rows of exactly three values: x1, x2, label
        k (int), index of the last feature to keep

    Outputs:
        Z_space (ndarray), shape (len(full_data), k + 1)
        Y (ndarray), shape (len(full_data), ), the label of every row
    """
    n_points = len(full_data)
    Z_space = np.zeros((n_points, k + 1))
    Y = np.zeros(n_points)

    for row, point in enumerate(full_data):
        x1, x2, label = point
        all_features = (
            1, x1, x2,
            x1**2, x2**2, x1 * x2,
            abs(x1 - x2), abs(x1 + x2),
        )
        Z_space[row] = all_features[:k + 1]
        Y[row] = label

    return (Z_space, Y)


def g_weight(Z_space, Y):
    """
    Fit linear-regression weights with the pseudo-inverse.

    Computes pinv(Z_space) . Y.

    Inputs:
        Z_space (ndarray), feature matrix, one row per point
        Y (ndarray), target value of every point

    Outputs:
        g_weights (ndarray), one weight per column of Z_space
    """
    # No pre-allocation is needed: the product below creates the result array.
    return np.dot(np.linalg.pinv(Z_space), Y)


def g_classify(Z_space, g_weights):
    """
    Classify every point with the hypothesis sign(Z_space . g_weights).

    Inputs:
        Z_space (ndarray), feature matrix, one row per point
        g_weights (ndarray), weight vector of the hypothesis

    Outputs:
        g_classification (ndarray), the sign of every point's signal as a
        float; np.sign yields 0 for a signal of exactly 0
    """
    signal_signs = np.sign(np.dot(Z_space, g_weights))
    return np.asarray(list(map(float, signal_signs)))


def error_freq(g_classification, Y):
    """
    Fraction of points whose predicted label differs from the true label.

    Inputs:
        g_classification (ndarray), predicted labels
        Y (ndarray), true labels

    Outputs:
        error (float), number of mismatches divided by len(Y)
    """
    mismatches = g_classification != Y
    return np.sum(mismatches) / len(Y)


def _run_experiment(k, swap_sets, in_file, out_file):
    """
    Shared body of experiment() and reverse_experiment().

    Loads both data files, splits the in-sample data, fits regression
    weights on the training part in the feature space truncated at index k,
    and measures the classification error on all three sets.

    Inputs:
        k (int), index of the last nonlinear feature to keep
        swap_sets (bool), if True the two parts returned by
            split_training_validation trade roles
        in_file (str), path of the in-sample data file
        out_file (str), path of the out-of-sample data file

    Outputs:
        (training_error, validation_error, out_error)
    """
    in_dta = get_inout_data(in_file)
    out_dta = get_inout_data(out_file)

    first_part, second_part = split_training_validation(in_dta)
    if swap_sets:
        training, validation = second_part, first_part
    else:
        training, validation = first_part, second_part

    # the hypothesis is fitted on the training part only
    z_training, y_training = x_to_z_space(training, k)
    g_weights = g_weight(z_training, y_training)

    errors = []
    for data_set in (training, validation, out_dta):
        z_set, y_set = x_to_z_space(data_set, k)
        errors.append(error_freq(g_classify(z_set, g_weights), y_set))

    training_error, validation_error, out_error = errors
    return (training_error, validation_error, out_error)


def experiment(k, in_file="in.txt", out_file="out.txt"):
    """
    Train on the leading part of the in-sample data, validate on the rest.

    Inputs:
        k (int), index of the last nonlinear feature to keep
        in_file (str), path of the in-sample data file
        out_file (str), path of the out-of-sample data file

    Outputs:
        (training_error, validation_error, out_error)
    """
    return _run_experiment(k, False, in_file, out_file)


def reverse_experiment(k, in_file="in.txt", out_file="out.txt"):
    """
    Same as experiment(), with the training and validation parts swapped:
    trains on the trailing part of the in-sample data and validates on the
    leading part.

    Inputs:
        k (int), index of the last nonlinear feature to keep
        in_file (str), path of the in-sample data file
        out_file (str), path of the out-of-sample data file

    Outputs:
        (training_error, validation_error, out_error)
    """
    return _run_experiment(k, True, in_file, out_file)

'''
#Questions 1-5

for k in range(3, 8):
    print ("k:", k, ":", experiment(k))

print ('\n')

for k in range(3, 8):
    print ("k:", k, ":", reverse_experiment(k))

#output
k: 3 : (0.44, 0.3, 0.42)
k: 4 : (0.32, 0.5, 0.416)
k: 5 : (0.08, 0.2, 0.188)
k: 6 : (0.04, 0.0, 0.084)
k: 7 : (0.04, 0.1, 0.072)

k: 3 : (0.4, 0.28, 0.396)
k: 4 : (0.3, 0.36, 0.388)
k: 5 : (0.2, 0.2, 0.284)
k: 6 : (0.0, 0.08, 0.192)
k: 7 : (0.0, 0.12, 0.196)

'''

'\n#Questions 1-5\n\nfor k in range(3, 8):\n    print ("k:", k, ":", experiment(k))\n\nprint (\'\n\')\n\nfor k in range(3, 8):\n    print ("k:", k, ":", reverse_experiment(k))\n\n#output\nk: 3 : (0.44, 0.3, 0.42)\nk: 4 : (0.32, 0.5, 0.416)\nk: 5 : (0.08, 0.2, 0.188)\nk: 6 : (0.04, 0.0, 0.084)\nk: 7 : (0.04, 0.1, 0.072)\n\nk: 3 : (0.4, 0.28, 0.396)\nk: 4 : (0.3, 0.36, 0.388)\nk: 5 : (0.2, 0.2, 0.284)\nk: 6 : (0.0, 0.08, 0.192)\nk: 7 : (0.0, 0.12, 0.196)\n\n'

In [26]:
import numpy as np
import random
import sklearn.svm
import math

def generate_point(boundary1, boundary2, dimension):
    """
    Generate a random point whose coordinates are each drawn uniformly
    from [boundary1, boundary2).

    Inputs:
        boundary1 (float or int), lower bound of every coordinate
        boundary2 (float or int), upper bound of every coordinate
        dimension (int), number of coordinates

    Outputs:
        random_point (ndarray), random_point.shape = (dimension, )
    """
    # One vectorized draw consumes the same random stream as `dimension`
    # single draws, and avoids storing a size-1 array into a scalar element
    # (deprecated since NumPy 1.25).
    return np.random.uniform(boundary1, boundary2, dimension)

#question 6
def expected_min_value(boundary1, boundary2, runs):
    """
    Estimate E[min(e1, e2)] by simulation, where e1 and e2 are independent
    random variables distributed uniformly over [boundary1, boundary2].

    Inputs:
        boundary1 (float or int)
        boundary2 (float or int)
        runs (int), number of simulated (e1, e2) pairs to average over

    Outputs:
        e_min_expected (float), mean of the simulated minima
    """
    def draw():
        # a single coordinate of a one-dimensional random point
        return generate_point(boundary1, boundary2, 1)[0]

    total_of_minima = sum(min(draw(), draw()) for _ in range(runs))
    return total_of_minima / runs
    

def generate_target_f():
    """
    Draw two random points in [-1, 1] x [-1, 1] and return the line through them.

    Outputs:
        (slope, intercept) of the line connecting the two points
    """
    x1, y1 = generate_point(-1, 1, 2)
    x2, y2 = generate_point(-1, 1, 2)

    rise = y2 - y1
    run = x2 - x1
    slope = rise / run
    # the line passes through both points; the second one anchors the intercept
    intercept = y2 - slope * x2
    return (slope, intercept)


def classify_point(random_point, slope, intercept):
    """
    Label a point relative to the line y = slope * x + intercept.

    Inputs:
        random_point, indexable (x, y) pair
        slope (float)
        intercept (float)

    Outputs:
        +1 if the point lies strictly above the line,
        -1 otherwise (below the line or exactly on it)
    """
    line_height = slope * random_point[0] + intercept
    return 1 if random_point[1] > line_height else -1

def create_training_data(N, slope, intercept, svm = False):
    """
    Creates N points for training data using target f(X) = slope * X + intercept;
    Notes:
        Points are drawn from X space [-1, 1] x [-1, 1]
        Classification is based on whether points lie above or below f
        Data sets where all points lie in the same region are rejected and
        redrawn, so the returned set always contains both classes

    Inputs:
        N (int), number of points; must be at least 2
        slope (float), slope of the target line
        intercept (float), intercept of the target line
        svm (bool), if True the constant x0 column is dropped from X_training
    Outputs:
        X_training (ndarray), X_training.shape = (N, 3) with x0 = 1,
            or (N, 2) when svm is True
        Y_training (ndarray), Y_training.shape = (N, )
    Raises:
        ValueError if N < 2
    """
    # With fewer than two points the labels can never be mixed, so the
    # rejection loop below would never terminate.
    if N < 2:
        raise ValueError("N must be at least 2 so that both classes can be present")

    while True:
        # create matrix X, where x0 is always 1, to accommodate w0
        X_training = np.ones((N, 3))
        Y_training = np.zeros(N)
        for i in range(N):
            random_point = generate_point(-1, 1, 2)
            X_training[i, 1:3] = random_point
            Y_training[i] = classify_point(random_point, slope, intercept)

        # |sum of labels| equals N exactly when every label is the same:
        # reject that draw and try again
        if abs(np.sum(Y_training)) == N:
            continue

        # svm does not use x0
        if svm:
            X_training = X_training[:, 1:3]
        return (X_training, Y_training)

def create_testing_data(N, slope, intercept, svm = False):
    """
    Creates N points for testing data using target f(X) = slope * X + intercept;
    Notes:
        Points are drawn from X space [-1, 1] x [-1, 1]
        Classification is based on whether points lie above or below f
        No filtering is applied: all points may share the same label

    Inputs:
        N (int), number of points
        slope (float), slope of the target line
        intercept (float), intercept of the target line
        svm (bool), if True the constant x0 column is dropped from X_testing
    Outputs:
        X_testing (ndarray), X_testing.shape = (N, 3) with x0 = 1,
            or (N, 2) when svm is True
        Y_testing (ndarray), Y_testing.shape = (N, )
    """
    # column 0 stays at 1 so that w0 has a matching input coordinate
    features = np.ones((N, 3))
    labels = np.zeros(N)

    for row in range(N):
        point = generate_point(-1, 1, 2)
        label = classify_point(point, slope, intercept)
        features[row, 1:3] = point
        labels[row] = label

    # svm does not use x0
    if svm == True:
        return (features[:, 1:3], labels)
    return (features, labels)


def perceptron(X_training, Y_training):
    """
    Perceptron learning with an immediate update on every misclassified point.

    Starts with the weight as a zero-vector and sweeps over the points in
    order, adjusting the weight as soon as a misclassified point is met.
    Sweeps are repeated until one full sweep needs no adjustment.

    Inputs:
        X_training (ndarray), complete input features of all points
        Y_training (ndarray), complete training classification of all points

    Outputs:
        weight (ndarray), final hypothesis g(X) = Y for perceptron learning

    """
    weight = np.zeros(X_training.shape[1])

    while True:
        updates_this_sweep = 0
        for i, x in enumerate(X_training):
            y = Y_training[i]
            if isit_misclassified_point(x, y, weight):
                weight = adjust_weight(x, y, weight)
                updates_this_sweep += 1
        if not updates_this_sweep:
            return weight

def perceptron_adjustment_per_iteration(X_training, Y_training):
    """
    Perceptron learning with one weight adjustment per iteration.

    Starts with the weight as a zero-vector. Every iteration collects all
    points misclassified by the current weight, then adjusts the weight
    using one of them chosen at random. Stops as soon as an iteration finds
    no misclassified point.

    Inputs:
        X_training (ndarray), complete input features of all points
        Y_training (ndarray), complete training classification of all points

    Outputs:
        weight (ndarray), final hypothesis g(X) = Y for perceptron learning

    """
    weight = np.zeros(X_training.shape[1])

    while True:
        wrong_indices = [
            i for i in range(len(X_training))
            if isit_misclassified_point(X_training[i], Y_training[i], weight)
        ]
        if not wrong_indices:
            return weight

        picked = random.choice(wrong_indices)
        weight = adjust_weight(X_training[picked], Y_training[picked], weight)

def adjust_weight(x_misclassified, y_misclassified, weight):
    """
    Perceptron update for a single misclassified point:
    returns weight + y_misclassified * x_misclassified.

    Inputs:
        x_misclassified (ndarray)
        y_misclassified (+/- 1)
        weight (ndarray)

    Outputs:
        adjusted_weight (ndarray), a new array; weight itself is not modified
    """
    correction = np.multiply(y_misclassified, x_misclassified)
    return weight + correction


def isit_misclassified_point(x, y, weight):
    """
    Check whether the hypothesis g(x) = sign(weight . x) disagrees with the
    label y of a single point.

    Inputs:
        x (ndarray)
        y (+/- 1)
        weight (must be ndarray)

    Outputs:
        misclassified (boolean), True when sign(weight . x) != y; a signal
        of exactly 0 has sign 0 and therefore never matches a +/- 1 label

    """
    predicted_sign = np.sign(np.dot(weight.T, x))
    return bool(predicted_sign != y)

def perceptron_error_single_run(X_testing, Y_testing, weight):
    """
    Fraction of testing points misclassified by g(x) = sign(weight . x).

    Inputs:
        X_testing (ndarray), complete input features of testing set
        Y_testing (ndarray), complete accurate classification of testing set, from f(x)
        weight (ndarray), weight vector of the hypothesis under test
    Outputs:
        pla_error_freq (float), P[f(x)!= g_pla(x)] estimated on the testing set
    """
    n_points = X_testing.shape[0]
    misclassified_flags = [
        isit_misclassified_point(X_testing[i], Y_testing[i], weight)
        for i in range(n_points)
    ]
    # each True counts as 1 in the sum
    return sum(misclassified_flags) / n_points
    
    
def svm_error(support_vector_machine, X_testing, Y_testing):
    """
    Error frequency of a fitted classifier on a testing set.

    Inputs:
        support_vector_machine, object exposing score(X, Y)
        X_testing (ndarray), input features of the testing set
        Y_testing (ndarray), true classification of the testing set
    Outputs:
        svm_error_freq (float), 1 minus the value returned by score()
    """
    score = support_vector_machine.score(X_testing, Y_testing)
    return 1 - score
    
def experiment_error_comparison(N_training, N_testing, runs, kernel = 'linear', C = math.inf):
    """
    Compare the testing error of an SVM against the perceptron over many runs.

    Every run draws a fresh target line, a training set and a testing set,
    fits both learners on the same training set and evaluates both on the
    same testing set. Progress is printed every 100 runs.

    Inputs:
        N_training: (int), number of training points per run
        N_testing: (int), number of testing points per run
        runs: (int), number of svm vs. pla comparisons
        kernel: kernel passed to sklearn.svm.SVC
        C: C passed to sklearn.svm.SVC
    Outputs:
        svm_better_freq (float), fraction of runs in which the svm testing
        error was strictly lower than the pla testing error
    """
    svm_wins = 0

    for run_number in range(1, runs + 1):
        if run_number % 100 == 0:
            print ("run:", run_number, "\n")

        # a new random target f(X) for every run
        slope, intercept = generate_target_f()

        # perceptron: fit on the training set, evaluate on the testing set
        X_training, Y_training = create_training_data(N_training, slope, intercept)
        weight = perceptron_adjustment_per_iteration(X_training, Y_training)
        X_testing, Y_testing = create_testing_data(N_testing, slope, intercept)
        pla_error_freq = perceptron_error_single_run(X_testing, Y_testing, weight)

        # svm: same data without the constant x0 column
        support_vector_machine = sklearn.svm.SVC(kernel = kernel, C = C)
        support_vector_machine.fit(X_training[:, 1:3], Y_training)
        svm_error_freq = svm_error(support_vector_machine, X_testing[:, 1:3], Y_testing)

        if svm_error_freq < pla_error_freq:
            svm_wins += 1

    return svm_wins / runs

def experiment_svm_count(N, runs, kernel = 'linear', C = math.inf):
    """
    Average number of support vectors of an SVM fitted on random training sets.

    Every run draws a fresh target line and training set and fits a new
    sklearn.svm.SVC on it. Progress is printed every 100 runs.

    Inputs:
        N: (int), number of training points to generate & svm fit
        runs: (int), number of svm runs, to average over
        kernel: kernel passed to sklearn.svm.SVC
        C: C passed to sklearn.svm.SVC
    Outputs:
        svm_count_average (float), mean of len(support_) over all runs
    """
    support_vector_total = 0

    for run_number in range(1, runs + 1):
        if run_number % 100 == 0:
            print ("run:", run_number, "\n")

        # a new random target f(X) and training set for every run
        slope, intercept = generate_target_f()
        X_training, Y_training = create_training_data(N, slope, intercept)

        # fit on the two real coordinates only; svm does not use x0
        support_vector_machine = sklearn.svm.SVC(kernel = kernel, C = C)
        support_vector_machine.fit(X_training[:, 1:3], Y_training)

        support_vector_total += len(support_vector_machine.support_)

    return support_vector_total / runs

#question 6
#expected_min_value(0, 1, 10000) #result: .334

#question 8
#experiment_error_comparison(10, 10000, 1000, kernel = 'linear', C = math.inf) # result: .601

#question 9
#experiment_error_comparison(100, 10000, 1000, kernel = 'linear', C = math.inf) #result: .629

#question 10
#experiment_svm_count(100, 1000, kernel = 'linear', C = math.inf) #result: 2.996


run: 100 

run: 200 

run: 300 

run: 400 

run: 500 

run: 600 

run: 700 

run: 800 

run: 900 

run: 1000 



2.997