In [41]:
import numpy as np
import pandas as pd
import itertools
import sklearn.svm

def fetch_data(filename):
    """
    Read a header-less digit-feature file into a DataFrame.

    Inputs:
        filename: passed straight to pd.read_csv; fields are split on two
            consecutive spaces (python parsing engine)

    Outputs:
        (pandas df), Columns: Number, Intensity, Symmetry
    """
    column_names = ["Number", "Intensity", "Symmetry"]
    features = pd.read_csv(filename, header=None, sep="  ", engine='python')
    #the file carries no header row, so name the three columns here
    features.columns = column_names
    return features

def label_data(raw_data_deep, num1, num2 = None):
    """
    Turn the digit table into a +1 / -1 classification problem.

    Inputs:
        raw_data_deep: (pandas df), Columns: Number, Intensity, Symmetry;
            it is never modified
        num1: (int), the digit labelled +1
        num2: (None or int), None for a num1 vs all classifier; otherwise
            only rows whose digit is num1 or num2 are kept

    Outputs:
        X: (ndarray), X.shape = (relevant data points, 2), intensity, symmetry
        Y: (ndarray), Y.shape = (relevant data points, ), +1 for num1, -1 otherwise

    """
    frame = raw_data_deep
    digits = frame['Number']

    #num1 vs num2: drop every row that shows neither of the two digits
    if num2 != None:
        is_other_digit = (digits != num1) & (digits != num2)
        frame = frame.drop(digits[is_other_digit].index) #drop returns a new df
        digits = frame['Number']

    #everything that is not num1 becomes -1 first, and only then num1 becomes 1;
    #mask/replace build new Series, so the caller's df stays untouched
    labels = digits.mask(digits != num1, -1).replace(num1, 1)

    X = frame[['Intensity', 'Symmetry']].to_numpy()
    Y = labels.to_numpy()

    return (X, Y)


def svm_error(support_vector_machine, X_testing, Y_testing):
    """
    Error frequency of an already fitted classifier on a data set.

    Inputs:
        support_vector_machine: fitted object exposing score(X, Y)
        X_testing (ndarray), inputs to evaluate on
        Y_testing (ndarray), true classifications of X_testing

    Outputs:
        (float), 1 - score(X_testing, Y_testing)
    """
    accuracy = support_vector_machine.score(X_testing, Y_testing)
    return 1 - accuracy


def svm_single_run_error(
    X_in, Y_in, error_type = "Ein", X_out = None, Y_out = None, kernel = 'poly', C = 1, degree = 2
    ):
    """
    The "brains" of the experimental parts for questions 2-6, 9-10;
    fits a support vector machine on X_in and Y_in and returns its error.

    Inputs:
        X_in (ndarray), in-sample X generated from label_data; X_in.shape = (relevant points, 2)
        Y_in (ndarray), in-sample Y generated from label_data; Y_in.shape = (relevant points, )
        error_type = "Ein" (string), either "Ein" or "Eout"
        X_out = None (None or ndarray), must be given when error_type = "Eout"
        Y_out = None (None or ndarray), must be given when error_type = "Eout"
        kernel = 'poly' (string), other options "linear", "rbf"
        C = 1 (float), soft margin penalty, usually chosen by validation
        degree = 2 (int), the order of the polynomial when using "poly"; otherwise disregarded

    Outputs:
        svm_error_freq (float), in-sample or out-of-sample error of the fitted machine

    Raises:
        ValueError (a subclass of Exception) if error_type is neither "Ein" nor
        "Eout", or if "Eout" is requested without both X_out and Y_out.

    """
    #validate the request before spending time on training
    if error_type not in ("Ein", "Eout"):
        raise ValueError("Invalid error_type, please start again \n")
    #identity test on purpose: `ndarray != None` compares elementwise and the
    #resulting array has no single truth value, which made the Eout path crash
    if error_type == "Eout" and (X_out is None or Y_out is None):
        raise ValueError("Calculating Eout, but did not provide X_out and Y_out")

    #train/ fit svm
    support_vector_machine = sklearn.svm.SVC(kernel = kernel, C = C, degree = degree, coef0 = 1, gamma = 1)
    support_vector_machine.fit(X_in, Y_in)

    #svm error on the requested sample
    if error_type == "Ein":
        svm_error_freq = svm_error(support_vector_machine, X_in, Y_in)
    else:
        svm_error_freq = svm_error(support_vector_machine, X_out, Y_out)
    return (svm_error_freq)

def svm_count(X_in, Y_in, kernel = 'poly', C = 0.01, degree = 2):
    """
    Fit a support vector machine and report how many support vectors it uses.

    Inputs:
        X_in (ndarray), in-sample X generated from label_data; X_in.shape = (relevant points, 2)
        Y_in (ndarray), in-sample Y generated from label_data; Y_in.shape = (relevant points, )
        kernel = 'poly' (string), other options "linear", "rbf"
        C = 0.01 (float), soft margin penalty
        degree = 2 (int), the order of the polynomial when using "poly"; otherwise disregarded

    Outputs:
        (int), number of support vectors, i.e. len(support_) of the fitted SVC
    """
    svc_settings = dict(kernel = kernel, C = C, degree = degree, coef0 = 1, gamma = 1)

    #train/ fit svm
    classifier = sklearn.svm.SVC(**svc_settings)
    classifier.fit(X_in, Y_in)

    #support_ holds one index per support vector
    return len(classifier.support_)


def svm_cross_validation_error(X_partition_tuple, Y_partition_tuple, kernel = 'poly', C = 1, degree = 2):
    """
    Average validation error over the folds of a cross validation.

    Inputs:
        X_partition_tuple (tuple of ndarrays), one block of inputs per fold
        Y_partition_tuple (tuple of ndarrays), matching blocks of classifications;
            its length defines the number of folds
        kernel = 'poly' (string), other options "linear", "rbf"
        C = 1 (float), soft margin penalty
        degree = 2 (int), the order of the polynomial when using "poly"; otherwise disregarded

    Outputs:
        (float), mean over the folds of the error on the held-out block
    """
    n_fold = len(Y_partition_tuple)
    fold_errors = []
    for held_out in range(n_fold):
        #training data: the points of every block except the held-out one,
        #flattened into a single ndarray
        X_training = np.asarray([
            point
            for k, block in enumerate(X_partition_tuple) if k != held_out
            for point in block
            ])
        Y_training = np.asarray([
            label
            for k, block in enumerate(Y_partition_tuple) if k != held_out
            for label in block
            ])

        #train/ fit svm on the remaining folds
        classifier = sklearn.svm.SVC(kernel = kernel, C = C, degree = degree, coef0 = 1, gamma = 1)
        classifier.fit(X_training, Y_training)

        #validate on the block that was left out
        fold_errors.append(
            svm_error(classifier, X_partition_tuple[held_out], Y_partition_tuple[held_out])
            )
    return sum(fold_errors) / n_fold

def partition_for_cross_validation(X, Y, n_fold):
    """
    Randomly split X and Y into n_fold aligned blocks.

    Inputs:
        X (ndarray)
        Y (ndarray)
        n_fold (int) # of partitions, usually 10

    Outputs:
        X_partition_tuple = (tuple of ndarrays), len(X_partition_tuple) = n_fold
        Y_partition_tuple = (tuple of ndarrays), len(Y_partition_tuple) = n_fold

    Notes:
        Block k of X and block k of Y describe the same points: one random
        permutation of the row indices is drawn with np.random.permutation,
        and block k takes every n_fold-th entry of it starting at position k.
        The shuffle lives here because no other problem needs a random
        permutation, so earlier helpers such as label_data() stay deterministic.
    """
    shuffled_indices = np.random.permutation(len(X))

    #one index array per fold; the same array is applied to X and to Y
    fold_indices = [shuffled_indices[start::n_fold] for start in range(n_fold)]

    X_partition_tuple = tuple(X[indices] for indices in fold_indices)
    Y_partition_tuple = tuple(Y[indices] for indices in fold_indices)

    return(X_partition_tuple, Y_partition_tuple)






#X_partition_tuple, Y_partition_tuple = partition_for_cross_validation(X, Y, 10)
#svm_cross_validation_error(X_partition_tuple, Y_partition_tuple, kernel = 'poly', C = .001, degree = 2)
#svm_count(X_in, Y_in, kernel = 'poly', C = 1, degree = 2)



#Driver: ask which homework question to run, load both data sets, then run
#the experiment that belongs to that question.
print('Enter question number, 2-10')
question = int(input())


raw_data_train = fetch_data("number_train.txt")
raw_data_test = fetch_data("number_test.txt")

if question == 2:
    #even digits versus all: report the classifier with the highest Ein
    error_list = []
    for i in range(0, 9, 2):
        X_in, Y_in = label_data(raw_data_train, i)
        e_in = svm_single_run_error(X_in, Y_in, error_type = "Ein", X_out = None, Y_out = None, kernel = 'poly', C = 0.01, degree = 2)
        error_list.append([i, e_in])
        print ("{i} versus all: {e_in} \n".format(i = i, e_in = e_in))
    i, worst_error = max(error_list, key=lambda x: x[1])
    print ('Worst classifier is {i} versus all at {worst_error}'.format(
        i = i, worst_error = worst_error))
    
if question == 3:
    #odd digits versus all: report the classifier with the lowest Ein
    error_list = []
    for i in range(1, 10, 2):
        X_in, Y_in = label_data(raw_data_train, i)
        e_in = svm_single_run_error(X_in, Y_in, error_type = "Ein", X_out = None, Y_out = None, kernel = 'poly', C = 0.01, degree = 2)
        error_list.append([i, e_in])
        print ("{i} versus all: {e_in} \n".format(i = i, e_in = e_in))
    i, best_error = min(error_list, key=lambda x: x[1])
    #report best_error; worst_error only exists when question 2 was run, so
    #formatting with it raised a NameError here
    print ('Best classifier is {i} versus all at {best_error}'.format(
        i = i, best_error = best_error))

if question == 4:
    #difference in the number of support vectors: 0 versus all minus 1 versus all
    numbers = [0, 1]
    svm_count_list = []
    for i in numbers:
        X_in, Y_in = label_data(raw_data_train, i)
        svm_tally = svm_count(X_in, Y_in, kernel = 'poly', C = 0.01, degree = 2)
        svm_count_list.append(svm_tally)
    svm_difference = svm_count_list[0] - svm_count_list[1]
    print ('Difference between svms for 0 vs all and 1 vs all: {svm_difference}'.format(
        svm_difference = svm_difference
        ))

    

Enter question number, 2-10


 4


Difference between svms for 0 vs all and 1 vs all: 1793
[2179, 386]


In [23]:
#Ein of the 3 versus all classifier (polynomial kernel, degree 2, C = 0.01);
#error_type, X_out and Y_out are left at their defaults ("Ein", None, None)
raw_data = fetch_data("number_train.txt")
X_in, Y_in = label_data(raw_data, num1 = 3)
svm_single_run_error(X_in, Y_in, kernel = 'poly', C = 0.01, degree = 2)

0.09024825126868741

In [28]:
#sanity check: max() with a key returns the pair whose second entry is largest
a = [[9, 2], [3, 4]]
max(a, key=lambda pair: pair[1])

[3, 4]

In [25]:
#Ein of the 7 versus all classifier (polynomial kernel, degree 2, C = 0.01);
#error_type, X_out and Y_out are left at their defaults ("Ein", None, None)
raw_data = fetch_data("number_train.txt")
X_in, Y_in = label_data(raw_data, num1 = 7)
svm_single_run_error(X_in, Y_in, kernel = 'poly', C = 0.01, degree = 2)

0.08846523110684401

In [26]:
#Ein of the 9 versus all classifier (polynomial kernel, degree 2, C = 0.01);
#error_type, X_out and Y_out are left at their defaults ("Ein", None, None)
raw_data = fetch_data("number_train.txt")
X_in, Y_in = label_data(raw_data, num1 = 9)
svm_single_run_error(X_in, Y_in, kernel = 'poly', C = 0.01, degree = 2)

0.08832807570977919

In [134]:
import itertools
#flatten a ragged list of lists into one 1-d ndarray
list2d = [[1,2,3], [4,5,6], [7], [8,9]]
merged = np.asarray(list(itertools.chain.from_iterable(list2d)))
merged

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
def append_103(x=[]):
    """
    Print x, append 103 to it in place, print it again, then a blank line.

    The default list is created once, when the function is defined, so calls
    that omit x keep growing the same list (this looks like the point of the
    demo, so the mutable default is kept on purpose).
    """
    print(x)
    x.append(103)
    #list followed by an empty line, same output as print(x) then print()
    print(x, end="\n\n")



#three calls without an argument all share the one default list
append_103()
append_103()
append_103()

[]
[103]

[103]
[103, 103]

[103, 103]
[103, 103, 103]



In [27]:
def append_103(a):
    """Print a, append 103 to it in place and hand the same list back."""
    print(a)
    a.append(103)
    return a
x = []
#each call returns the list it was given, so nesting the calls is the same
#as calling append_103(x) three times in a row
append_103(append_103(append_103(x)))

print(append_103(x))

NameError: name 'a' is not defined