In [1]:
# IMPORTS

import csv
import numpy as np

In [2]:
# reading file and partitioning data
# TODO: write each data partition to seperate files

NUM_TRAINING = 20000
NUM_VALIDATION = 5000
NUM_TEST = 5000

data = [] 
with open('list_attr_celeba.csv') as Fin:
    reader = csv.reader(Fin, skipinitialspace = True, quotechar = "'")
    for row in reader:
        data.append(row)
        
# removes feature labels
del data[0]

training = []
validation = []
test = []

training_label = []
validation_label = []
test_label = []


for i in range(NUM_TRAINING):
    index = np.random.randint(0, len(data))
    training.append(data.pop(index))
    training[i][0] = 1
    training_label.append(training[i].pop(21))
    
for i in range(NUM_VALIDATION):
    index = np.random.randint(0, len(data))
    validation.append(data.pop(index))
    validation[i][0] = 1
    validation_label.append(validation[i].pop(21))

for i in range(NUM_TEST):
    index = np.random.randint(0, len(data))
    test.append(data.pop(index))
    test[i][0] = 1
    test_label.append(test[i].pop(21))

    
# FEATURE SPACES
training = np.array(training)
training = training.astype(np.float)

validation = np.array(validation)
validation = validation.astype(np.float)

test = np.array(test)
test = test.astype(np.float)


# LABEL SPACES
training_label = np.array(training_label)
training_label = training_label.astype(np.float)

validation_label = np.array(validation_label)
validation_label = validation_label.astype(np.float)

test_label = np.array(test_label)
test_label = test_label.astype(np.float)


# print(training_label.shape)
# print(validation_label.shape)
# print(test_label.shape)
# print(training.shape)
# print(validation.shape)
# print(test.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'list_attr_celeba.csv'

In [35]:
# CLASSIFIER FUNCTIONS

def logistic_regression_SGD(data, label, max_iter, learning_rate): 
    '''
    The logistic regression classifier function
    using Sochastic Gradient Descent.

    Args:
    data: train data with shape (20000, 40), which means 20000 samples and 
          each sample has 40 features
    label: train data's label with shape (20000,1). 
           1 for male and -1 for female
    max_iter: max iteration numbers
    learning_rate: learning rate for weight update
    Returns:
        w: the seperater with shape (40, 1). You must initilize it with w = np.zeros((d,1))
    '''
    N = len(data)
    d = len(data[0])
    # initialize w0
    w = np.zeros((d, 1))
    w = np.transpose(w)

    for t in range(max_iter):
        # pick random point
        i = np.random.randint(0, N)
        # calculate gradient
        gradient = (-label[i] * data[i]) / (1 + np.exp(label[i] * w * data[i]))
        # update weights
        w -= (learning_rate * gradient)

    return np.transpose(w)



def logistic_regression(data, label, max_iter, learning_rate):
    '''
    The logistic regression classifier function.

    Args:
    data: train data with shape (1561, 3), which means 1561 samples and 
          each sample has 3 features.(1, symmetry, average internsity)
    label: train data's label with shape (1561,1). 
           1 for digit number 1 and -1 for digit number 5.
    max_iter: max iteration numbers
    learning_rate: learning rate for weight update

    Returns:
        w: the seperater with shape (3, 1). You must initilize it with w = np.zeros((d,1))
    '''
    N = len(data)
    d = len(data[0])
    # initialize w0
    w = np.zeros((d, 1))
    w = np.transpose(w)

    for t in range(max_iter):
        # calculate gradient
        gradientSum = np.zeros((1, d))
        for n in range(N):
            gradientSum += ( (label[n] * data[n]) / (1 + np.exp(label[n]* np.dot(w, data[n]))) )
        gradient = (-1/N) * gradientSum
        # update weights 
        w -= (learning_rate *gradient)

    return np.transpose(w)



def accuracy(x, y, w):
    '''
    This function is used to compute accuracy of a logsitic regression model.

    Args:
    x: input data with shape (n, d), where n represents total data samples and d represents
        total feature numbers of a certain data sample.
    y: corresponding label of x with shape(n, 1), where n represents total data samples.
    w: the seperator learned from logistic regression function with shape (d, 1),
        where d represents the total feature numbers of a certain data sample.

    Return 
    accuracy: total percentage of correctly classified samples. Set the threshold as 0.5,
    which means, if the predicted probability > 0.5, classify as 1; Otherwise, classify as -1.
    mistakes = 0
    '''
    mistakes = 0
    n = len(y)
    w = np.transpose(w)
    for z in range(n):
        y_pred = 1.0 if sigmoid(np.dot(w,x[z])) > .5 else -1.0
        if(y_pred != y[z]):
            mistakes += 1
        
    return (n-mistakes)/n


def sigmoid(s):
    return (1/(1 + np.exp(-s)))

In [36]:
# TESTING 
learning_rate = [.1, .2, .5]
max_iter = [500, 1000, 10000, 100000]


print('TESTING WITH SGD\n')
for i, m_iter in enumerate(max_iter):
    w = logistic_regression_SGD(training, training_label, m_iter, learning_rate[1])
    Ain, Aout = accuracy(training, training_label, w), accuracy(test, test_label, w)
    print("max iteration testcase%d: Train accuracy: %f, Test accuracy: %f"%(i, Ain, Aout))

for i, l_rate in enumerate(learning_rate):
    w = logistic_regression_SGD(training, training_label, max_iter[3], l_rate)
    Ain, Aout = accuracy(training, training_label, w), accuracy(test, test_label, w)
    print("learning rate testcase%d: Train accuracy: %f, Test accuracy: %f"%(i, Ain, Aout))
    

    
print('\n\nTESTING LOGISTIC REGRESSION\n')
for i, m_iter in enumerate(max_iter):
    w = logistic_regression(training, training_label, m_iter, learning_rate[1])
    Ain, Aout = accuracy(training, training_label, w), accuracy(test, test_label, w)
    print("max iteration testcase%d: Train accuracy: %f, Test accuracy: %f"%(i, Ain, Aout))

for i, l_rate in enumerate(learning_rate):
    w = logistic_regression(training, training_label, max_iter[3], l_rate)
    Ain, Aout = accuracy(training, training_label, w), accuracy(test, test_label, w)
    print("learning rate testcase%d: Train accuracy: %f, Test accuracy: %f"%(i, Ain, Aout))


TESTING WITH SGD


max iteration testcase0: Train accuracy: 0.848150, Test accuracy: 0.844300
max iteration testcase1: Train accuracy: 0.658600, Test accuracy: 0.660000
max iteration testcase2: Train accuracy: 0.845400, Test accuracy: 0.844300
max iteration testcase3: Train accuracy: 0.796300, Test accuracy: 0.796050
learning rate testcase0: Train accuracy: 0.628050, Test accuracy: 0.630750
learning rate testcase1: Train accuracy: 0.667550, Test accuracy: 0.670000
learning rate testcase2: Train accuracy: 0.612250, Test accuracy: 0.614750

TESTING LOGISTIC REGRESSION


max iteration testcase0: Train accuracy: 0.931550, Test accuracy: 0.930650
max iteration testcase1: Train accuracy: 0.932850, Test accuracy: 0.930650


KeyboardInterrupt: 