In [120]:
import numpy as np
import pandas as pd
import itertools
import sklearn.svm

def fetch_data(filename):
    """
    Load a headerless, double-space separated digit-feature file.

    Inputs:
        filename: (str), path handed straight to pd.read_csv

    Outputs:
        digits: (pandas df), Columns: Number, Intensity, Symmetry
    """
    #a separator longer than one character is treated as a regex, which needs the python engine
    digits = pd.read_csv(filename, sep="  ", header=None, engine = 'python')
    digits.columns = ["Number", "Intensity", "Symmetry"]
    return digits

def label_data(raw_data_deep, num1, num2 = None):
    """
    Build the (X, Y) arrays for a num1 vs all or a num1 vs num2 classifier.

    Inputs:
        raw_data_deep: (pandas df), Columns: Number, Intensity, Symmetry; never modified
        num1: (int), the digit labelled +1
        num2: (None or int), None for num1 vs all; otherwise only rows whose
              Number is num1 or num2 are kept

    Outputs:
        X: (ndarray), X.shape = (relevant data points, 2), intensity, symmetry
        Y: (ndarray), Y.shape = (relevant data points, ), +1 where Number == num1
           and -1 everywhere else, stored with the dtype of the Number column

    """
    #If doing num1 vs num2 classifier, keep only those two digits.
    #Boolean selection returns a new frame, so the caller's frame is left untouched
    #and no up-front deep copy is needed.
    if num2 is not None:
        numbers = raw_data_deep['Number']
        relevant = raw_data_deep.loc[(numbers == num1) | (numbers == num2)]
    else:
        relevant = raw_data_deep

    #columns 1 & 2 are the X inputs
    X = relevant.loc[:, ['Intensity', 'Symmetry']].to_numpy()

    #label num1 with 1 and everything else (all other digits, or num2) with -1 in one
    #step, so the result does not depend on the order of two in-place relabellings
    is_num1 = (relevant['Number'] == num1).to_numpy()
    Y = np.where(is_num1, 1, -1).astype(relevant['Number'].dtype)

    return (X, Y)


def phi(X, transform = False):
    """
    Map raw inputs to the feature space used by the linear model.

    Inputs:
        X: (ndarray), X.shape = (points, 2); the columns are read as x_1, x_2
        transform: (bool), False -> (1, x_1, x_2);
                   True -> (1, x_1, x_2, x_1 * x_2, x_1 ** 2, x_2 ** 2)

    Outputs:
        Z: (ndarray), one row per point, bias coordinate first
    """
    n_points = len(X)

    if not transform:
        #prepend a column of ones as the bias coordinate
        return np.hstack((np.ones([n_points, 1]), X))

    first, second = X[:, 0], X[:, 1]
    feature_rows = np.array([
        np.ones(n_points), first, second, first * second, first ** 2, second ** 2,
    ])
    #feature_rows holds one row per feature; flip it to one row per data point
    return feature_rows.T
    
    
def weights_reg(Z_train, Y, lamb = 1):
    """
    Regularized least-squares weights, w = (Z^T Z + lamb * I)^-1 Z^T Y.

    Inputs:
        Z_train: (ndarray), Z_train.shape = (points, features)
        Y: (ndarray), targets with one entry per row of Z_train
        lamb: (float), regularization strength multiplying the identity

    Outputs:
        weights: (ndarray), one weight per column of Z_train
    """
    #refer to slide 11 of lecture 12
    Z_transposed = np.transpose(Z_train)
    regularized_gram = np.dot(Z_transposed, Z_train) + lamb * np.identity(Z_train.shape[1])
    projected_targets = np.dot(Z_transposed, Y)

    #solve the linear system directly instead of forming the inverse and multiplying;
    #this is cheaper and numerically better behaved
    return np.linalg.solve(regularized_gram, projected_targets)

def g_classification(Z, weights_reg):
    """
    Classify every row of Z by the sign of its linear score.

    Inputs:
        Z: (ndarray), feature matrix, one row per point
        weights_reg: (ndarray), weight vector applied to each row

    Outputs:
        (ndarray), np.sign of Z . weights_reg; a score of exactly 0 yields 0
    """
    scores = np.dot(Z, weights_reg)
    return np.sign(scores)


def error_freq(g_classification, Y):
    """
    Fraction of points whose predicted label differs from the true label.

    Inputs:
        g_classification: (ndarray), predicted labels
        Y: (ndarray), true labels

    Outputs:
        (float), number of mismatching entries divided by len(Y)
    """
    mismatches = g_classification != Y
    return np.sum(mismatches) / len(Y)


def experiment(
    transform = False,
    lamb = 1,
    error_type = "Ein",
    num1_range_low = 0,
    num1_range_high = 10,
    num2 = None
):
    """
    Train a regularized linear classifier for each digit in
    range(num1_range_low, num1_range_high) and print its error(s).

    Inputs:
        transform = False (bool), whether we just add bias or also take to 2-order polynomial
        lamb = 1 (float), lambda regularizer parameter
        error_type = "Ein" (string), "Ein", "Eout" or "Both"
        num1_range_low = 0 (int), first digit used as num1
        num1_range_high = 10 (int), one past the last digit used as num1;
            for a single number, pass (num, num + 1)
        num2 = None (None or int), for one vs one number comparisons; None means one vs all

    Outputs:
        True, after printing one report per digit

    Raises:
        ValueError if error_type is not one of the three accepted values
    """
    #validate first so a typo fails before any file is read or any model is trained
    if error_type not in ("Ein", "Eout", "Both"):
        raise ValueError(f"error_type must be 'Ein', 'Eout' or 'Both', got {error_type!r}")

    #for print statement
    vs = num2 if num2 is not None else "all"

    #fetch data; label_data does not modify its input, so we only need to fetch once
    raw_data_train = fetch_data("number_train.txt")
    raw_data_test = fetch_data("number_test.txt")

    for i in range(num1_range_low, num1_range_high):
        #label data
        X_train, Y_train = label_data(raw_data_train, i, num2 = num2)
        X_test, Y_test = label_data(raw_data_test, i, num2 = num2)

        #add bias / transform
        Z_train = phi(X_train, transform = transform)
        Z_test = phi(X_test, transform = transform)

        #train model
        weights = weights_reg(Z_train, Y_train, lamb = lamb)

        #evaluate model
        if error_type == "Both":
            #in sample
            error_in = error_freq(g_classification(Z_train, weights), Y_train)

            #out of sample
            error_out = error_freq(g_classification(Z_test, weights), Y_test)
            print(f'{i} vs {vs}: \n Ein:  {error_in} \n Eout: {error_out} \n')
        else:
            if error_type == "Ein":
                Z, Y = Z_train, Y_train
            else:
                Z, Y = Z_test, Y_test

            error = error_freq(g_classification(Z, weights), Y)
            print(f'{i} vs {vs}: \n {error_type} : {error} \n \n')

    return True

In [121]:
#problem 7: in-sample error, bias-only features, lambda = 1, each of 5..9 vs all
experiment(transform=False, lamb=1, error_type="Ein",
           num1_range_low=5, num1_range_high=10, num2=None)

# 5 vs all: 
#  Ein : 0.07625840076807022 
 
# 6 vs all: 
#  Ein : 0.09107118365107666 
 
# 7 vs all: 
#  Ein : 0.08846523110684405 
 
# 8 vs all: 
#  Ein : 0.07433822520916199 
 
# 9 vs all: 
#  Ein : 0.08832807570977919 

5 vs all: 
 Ein : 0.07625840076807022 
 

6 vs all: 
 Ein : 0.09107118365107666 
 

7 vs all: 
 Ein : 0.08846523110684405 
 

8 vs all: 
 Ein : 0.07433822520916199 
 

9 vs all: 
 Ein : 0.08832807570977919 
 



True

In [122]:
#problem 8: out-of-sample error, 2nd-order transform, lambda = 1, each of 0..4 vs all
experiment(transform=True, lamb=1, error_type="Eout",
           num1_range_low=0, num1_range_high=5, num2=None)

# 0 vs all: 
#  Eout : 0.10662680617837568 
 

# 1 vs all: 
#  Eout : 0.02192326856003986 
 

# 2 vs all: 
#  Eout : 0.09865470852017937 
 

# 3 vs all: 
#  Eout : 0.08271051320378675 
 

# 4 vs all: 
#  Eout : 0.09965122072745392 

0 vs all: 
 Eout : 0.10662680617837568 
 

1 vs all: 
 Eout : 0.02192326856003986 
 

2 vs all: 
 Eout : 0.09865470852017937 
 

3 vs all: 
 Eout : 0.08271051320378675 
 

4 vs all: 
 Eout : 0.09965122072745392 
 



True

In [125]:
#problem 9: for every digit vs all, compare Ein/Eout without and with the transform
for i in range(10):
    for banner, use_transform in (('Without transform', False), ('With transform', True)):
        print(banner)
        experiment(
            transform=use_transform,
            lamb=1,
            error_type="Both",
            num1_range_low=i,
            num1_range_high=i + 1,
            num2=None,
        )

# Without transform
# 0 vs all: 
#  Ein:  0.10931285146070498 
#  Eout: 0.11509715994020926 

# With transform
# 0 vs all: 
#  Ein:  0.10231792621039638 
#  Eout: 0.10662680617837568 

# Without transform
# 1 vs all: 
#  Ein:  0.01522424907420107 
#  Eout: 0.02242152466367713 

# With transform
# 1 vs all: 
#  Ein:  0.012343985735838706 
#  Eout: 0.02192326856003986 

# Without transform
# 2 vs all: 
#  Ein:  0.10026059525442327 
#  Eout: 0.09865470852017937 

# With transform
# 2 vs all: 
#  Ein:  0.10026059525442327 
#  Eout: 0.09865470852017937 

# Without transform
# 3 vs all: 
#  Ein:  0.09024825126868742 
#  Eout: 0.08271051320378675 

# With transform
# 3 vs all: 
#  Ein:  0.09024825126868742 
#  Eout: 0.08271051320378675 

# Without transform
# 4 vs all: 
#  Ein:  0.08942531888629818 
#  Eout: 0.09965122072745392 

# With transform
# 4 vs all: 
#  Ein:  0.08942531888629818 
#  Eout: 0.09965122072745392 

# Without transform
# 5 vs all: 
#  Ein:  0.07625840076807022 
#  Eout: 0.07972097658196313 

# With transform
# 5 vs all: 
#  Ein:  0.07625840076807022 
#  Eout: 0.07922272047832586 

# Without transform
# 6 vs all: 
#  Ein:  0.09107118365107666 
#  Eout: 0.08470353761833582 

# With transform
# 6 vs all: 
#  Ein:  0.09107118365107666 
#  Eout: 0.08470353761833582 

# Without transform
# 7 vs all: 
#  Ein:  0.08846523110684405 
#  Eout: 0.07324364723467862 

# With transform
# 7 vs all: 
#  Ein:  0.08846523110684405 
#  Eout: 0.07324364723467862 

# Without transform
# 8 vs all: 
#  Ein:  0.07433822520916199 
#  Eout: 0.08271051320378675 

# With transform
# 8 vs all: 
#  Ein:  0.07433822520916199 
#  Eout: 0.08271051320378675 

# Without transform
# 9 vs all: 
#  Ein:  0.08832807570977919 
#  Eout: 0.08819133034379671 

# With transform
# 9 vs all: 
#  Ein:  0.08832807570977919 
#  Eout: 0.08819133034379671 

Without transform
0 vs all: 
 Ein:  0.10931285146070498 
 Eout: 0.11509715994020926 

With transform
0 vs all: 
 Ein:  0.10231792621039638 
 Eout: 0.10662680617837568 

Without transform
1 vs all: 
 Ein:  0.01522424907420107 
 Eout: 0.02242152466367713 

With transform
1 vs all: 
 Ein:  0.012343985735838706 
 Eout: 0.02192326856003986 

Without transform
2 vs all: 
 Ein:  0.10026059525442327 
 Eout: 0.09865470852017937 

With transform
2 vs all: 
 Ein:  0.10026059525442327 
 Eout: 0.09865470852017937 

Without transform
3 vs all: 
 Ein:  0.09024825126868742 
 Eout: 0.08271051320378675 

With transform
3 vs all: 
 Ein:  0.09024825126868742 
 Eout: 0.08271051320378675 

Without transform
4 vs all: 
 Ein:  0.08942531888629818 
 Eout: 0.09965122072745392 

With transform
4 vs all: 
 Ein:  0.08942531888629818 
 Eout: 0.09965122072745392 

Without transform
5 vs all: 
 Ein:  0.07625840076807022 
 Eout: 0.07972097658196313 

With transform
5 vs all: 
 Ein:  0.07625840076807022 
 Eout: 0.07922

In [127]:
#problem 10: 1 vs 5 with the transform, comparing two regularization strengths
one_vs_five = dict(
    transform=True,
    error_type="Both",
    num1_range_low=1,
    num1_range_high=2,
    num2=5,
)

print('lambda = 0.01')
experiment(lamb=0.01, **one_vs_five)

print('lambda = 1')
experiment(lamb=1, **one_vs_five)

# lambda = 0.01
# 1 vs 5: 
#  Ein:  0.004484304932735426 
#  Eout: 0.02830188679245283 

# lambda = 1
# 1 vs 5: 
#  Ein:  0.005124919923126201 
#  Eout: 0.025943396226415096 

lambda = 0.01
1 vs 5: 
 Ein:  0.004484304932735426 
 Eout: 0.02830188679245283 

lambda = 1
1 vs 5: 
 Ein:  0.005124919923126201 
 Eout: 0.025943396226415096 



True

In [None]:
def svm_error(support_vector_machine, X_testing, Y_testing):
    """
    Error frequency of a fitted classifier on the given points.

    Inputs:
        support_vector_machine: fitted model exposing .score(X, Y)
        X_testing: inputs passed to .score
        Y_testing: labels passed to .score

    Outputs:
        (float), 1 minus the value returned by .score
    """
    accuracy = support_vector_machine.score(X_testing, Y_testing)
    return 1 - accuracy


def svm_single_run_error(
    X_in, Y_in, error_type = "Ein", X_out = None, Y_out = None, kernel = 'poly', C = 1, degree = 2
    ):
    """
    Fit one SVC on the in-sample data and return its in-sample or out-of-sample error.

    Inputs:
        X_in (ndarray), in-sample X generated from label_data; X_in.shape = (relevant points, 2)
        Y_in (ndarray), in-sample Y generated from label_data; Y_in.shape = (relevant points, )
        error_type = "Ein" (string), either "Ein" or "Eout"; otherwise will throw Error
        X_out = None (None or ndarray), if error_type = "Eout", X_out must be defined
        Y_out = None (None or ndarray), if error_type = "Eout", Y_out must be defined
        kernel = 'poly' (string), other options "linear", "rbf"
        C = 1 (float), can define and usually determined using validation; soft margin error
        degree = 2 (int), the order of the polynomial when using "poly"; otherwise disregarded

    Outputs:
        svm_error_freq (float), (in-sample or out-of-sample) error of the support vector machine

    Raises:
        ValueError if error_type is neither "Ein" nor "Eout", or if "Eout" is
        requested without both X_out and Y_out

    """
    #pick the evaluation set (and reject bad arguments) before paying for the fit
    if error_type == "Ein":
        X_eval, Y_eval = X_in, Y_in
    elif error_type == "Eout":
        #identity test on purpose: `ndarray != None` compares elementwise, and using
        #that array in a boolean context raises "truth value ... is ambiguous"
        if X_out is None or Y_out is None:
            raise ValueError("Calculating Eout, but did not provide X_out and Y_out")
        X_eval, Y_eval = X_out, Y_out
    else:
        raise ValueError("Invalid error_type, please start again \n")

    #train/ fit svm; coef0 and gamma are fixed at 1 for every run
    support_vector_machine = sklearn.svm.SVC(kernel = kernel, C = C, degree = degree, coef0 = 1, gamma = 1)
    support_vector_machine.fit(X_in, Y_in)

    #svm error
    return svm_error(support_vector_machine, X_eval, Y_eval)

