**<span style="font-family:opensans; font-size:1.5em;">Gaussian Classifiers for Digits and Spam</span>**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
from numpy import linalg as LA
from scipy.io import loadmat

##data cleaning
# Load both datasets from their MATLAB files. The cells below read the
# 'training_data', 'training_labels' and 'test_data' entries of each result.
spam = loadmat('spam_data.mat')
mnist = loadmat('mnist_data.mat')

def l2_norm(vector):
    """Return ``vector`` cast to double precision and scaled to unit L2 norm.

    A vector whose norm is exactly zero cannot be scaled, so it is returned
    unchanged apart from the cast.
    """
    as_double = vector.astype(np.double)
    # Evaluate the norm once; this function is called for every data row.
    magnitude = np.linalg.norm(vector)
    if magnitude == 0:
        return as_double
    return as_double / magnitude
def normalize(inpt):
    """Rescale every row of ``inpt`` to unit L2 norm, in place.

    Each row is passed through ``l2_norm`` and the result is written back to
    the same position of ``inpt``. Nothing is returned.
    """
    for row_index in range(len(inpt)):
        inpt[row_index] = l2_norm(inpt[row_index])

# Cast every feature matrix to double precision before normalizing.
mnist_training = mnist['training_data'].astype(np.double)
mnist_test = mnist['test_data'].astype(np.double)
spam_training = spam['training_data'].astype(np.double)
spam_test = spam['test_data'].astype(np.double)

# Scale each sample (row) to unit L2 norm; normalize() works in place.
for feature_matrix in (mnist_training, mnist_test, spam_training, spam_test):
    normalize(feature_matrix)

# Take the first entry of each label row to get a flat list of labels.
mnist_labels = [label_row[0] for label_row in mnist['training_labels']]
spam_labels = [label_row[0] for label_row in spam['training_labels']]


In [None]:
# Group the MNIST training rows by digit, then fit a mean vector and a
# covariance matrix for each digit.
mnist_map = {digit: [] for digit in range(10)}
for position, digit in enumerate(mnist_labels):
    mnist_map[digit].append(mnist_training[position])

mnist_mean = {}
mnist_cov = {}

for label, rows in mnist_map.items():
    matrix = np.asmatrix(rows)
    mnist_mean[label] = matrix.mean(axis = 0)
    # np.cov expects variables in rows, hence the transpose.
    mnist_cov[label] = np.cov(matrix.T)

**Classification**

In [None]:
import random
import math

def random_sample(s, train_data, labels, n):
    """Shuffle the sample positions with seed ``s`` and split off ``n`` of them.

    Returns the 4-tuple ``(validation_data, validation_labels, training_data,
    training_labels)``. The first ``n`` shuffled positions select the
    validation split and the remaining positions select the training split.
    ``labels`` is converted to a NumPy array so it can be indexed by a list of
    positions; ``train_data`` is indexed the same way. Seeding goes through
    the module-level ``random`` state.
    """
    label_array = np.array(labels)
    random.seed(s)
    total = len(train_data)
    permutation = random.sample(range(total), total)
    held_out, kept = permutation[:n], permutation[n:]

    validation = (train_data[held_out], label_array[held_out])
    training = (train_data[kept], label_array[kept])
    return validation + training

# MNIST: hold out 10000 shuffled samples for validation, train on the rest.
(
    mnist_validation_set,
    mnist_validation_labels,
    mnist_training_set,
    mnist_training_labels,
) = random_sample(120, mnist_training, mnist_labels, 10000)

# Spam: n=0 holds nothing out, so every shuffled sample lands in the training split.
(
    spam_validation_set,
    spam_validation_labels,
    spam_training_set,
    spam_training_labels,
) = random_sample(120, spam_training, spam_labels, 0)


In [None]:
def mle_estimate(training_dat, training_labels, spam = False):
    """Fit LDA parameters: per-class means, class priors and one shared covariance.

    Parameters
    ----------
    training_dat : indexable collection of 1-D feature vectors, one per sample.
    training_labels : labels aligned with ``training_dat``; each label must be
        one of ``0 .. num_classes - 1``.
    spam : when True the problem has 2 classes, otherwise 10.

    Returns
    -------
    (training_mean, inv_Cov, priors, logdet) where
      * ``training_mean[label]`` is a 1 x dim matrix of feature means,
      * ``inv_Cov`` is the inverse of the regularized shared covariance,
      * ``priors[label]`` is the fraction of samples carrying that label,
      * ``logdet`` is the log-determinant of that covariance multiplied by the
        determinant's sign, as returned by ``slogdet``.
    """
    num_classes = 2 if spam else 10
    dim = training_dat[0].shape[0]
    num_samples = len(training_dat)

    # Group the samples by label.
    _map = {k: [] for k in range(num_classes)}
    for position, label in enumerate(training_labels):
        _map[label].append(training_dat[position])

    training_mean = {}
    class_covariances = []
    priors = []

    # Mean, covariance and prior of each class.
    for label, rows in _map.items():
        matrix = np.asmatrix(rows)
        training_mean[label] = matrix.mean(axis = 0)
        # np.cov expects variables in rows, hence the transpose.
        class_covariances.append(np.cov(matrix.T))
        priors.append(len(rows)/num_samples)

    # Shared covariance: plain average of the class covariances, plus a tiny
    # ridge on the diagonal so the matrix can be inverted.
    shared_cov = np.array(class_covariances).mean(axis=0)
    shared_cov += np.eye(dim, dtype=float)*0.0000001

    # Inverse obtained by solving shared_cov @ X = I.
    inv_Cov = LA.solve(shared_cov, np.identity(dim))
    # Fix: the log-determinant was previously taken of an undefined name
    # (`covariance`), which raised NameError on every call.
    sign, logdet = LA.slogdet(shared_cov)
    logdet *= sign

    return training_mean, inv_Cov, priors, logdet

def error_rt(predicted, labels):
    """Return the fraction of positions where ``predicted`` differs from ``labels``.

    Only the first ``len(labels)`` predictions are compared.
    """
    matches = []
    for position in range(len(labels)):
        matches.append(predicted[position] == labels[position])
    accuracy = np.mean(matches)
    return 1 - accuracy

def Qc(x, mu, pi, inv_Cov, logdet):
    """Return the Gaussian discriminant score of sample ``x`` for one class.

    Evaluates ``-0.5*logdet + log(pi) - 0.5 * (x - mu) inv_Cov (x - mu)^T``
    and returns it as a Python float. ``x`` and ``mu`` may be flat arrays or
    single-row matrices; ``inv_Cov`` may be an array or a matrix.
    """
    # Work on flat arrays so the quadratic form is a true scalar. The previous
    # version called float() on a 1x1 np.matrix, an array-to-scalar conversion
    # that NumPy has deprecated for arrays with ndim > 0.
    diff = np.asarray(x, dtype=float).ravel() - np.asarray(mu, dtype=float).ravel()
    quadratic = diff.dot(np.asarray(inv_Cov)).dot(diff)
    return float(-0.5*logdet + np.log(pi) - 0.5*quadratic)

def LDA_predict(x, means, priors, inv_Cov, logdet):
    """Return the label whose discriminant score ``Qc`` is largest for ``x``.

    ``means`` maps label -> class mean and ``priors`` is indexed by label.
    The inverse covariance and log-determinant are shared by every class.
    Raises ValueError when ``means`` is empty.
    """
    if not means:
        raise ValueError("no class means to score against")

    best_label = None
    best_score = None
    for label, mean in means.items():
        score = Qc(x, mean, priors[label], inv_Cov, logdet)
        # `>=` lets the later label win an exact tie, which is what keying a
        # dict by score and overwriting the entry does.
        if best_score is None or score >= best_score:
            best_label, best_score = label, score
    return best_label




**LDA**

In [None]:
# Fit LDA on growing prefixes of the shuffled training split and record the
# training and validation error rate for each size.
training_sets = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000]
t_errors = []
v_errors = []

for n in training_sets:
    dat = mnist_training_set[:n]
    lab = mnist_training_labels[:n]
    mean, inv_cov, priors, logdet = mle_estimate(dat, lab)

    predicted_train = [
        LDA_predict(x, mean, priors, inv_cov, logdet) for x in dat
    ]
    predicted_valid = [
        LDA_predict(x, mean, priors, inv_cov, logdet) for x in mnist_validation_set
    ]

    t_errors.append(error_rt(predicted_train, lab))
    v_errors.append(error_rt(predicted_valid, mnist_validation_labels))
    


In [None]:
# Keep references to the results of the last LDA fit under LDA-specific
# names, because later cells rebind mean, priors, t_errors and friends.
final_LDA_mean_MNIST = mean
final_LDA_inv_cov_MNIST = inv_cov
final_LDA_logdet_MNIST = logdet
final_LDA_priors_MNIST = priors
final_LDA_t_errors = t_errors
final_LDA_v_errors = v_errors
final_predicted_train = predicted_train
final_lab = lab
final_predicted_valid = predicted_valid


**LDA Error Rate Plotting**

In [None]:
##Error rate broken down by true class (digit)
def classified_error(p, l, n):
    """Return an array of length ``n`` with the error rate of each true class.

    ``p`` holds predictions and ``l`` the true labels; labels are used as
    indices into the per-class counters, so they must lie in ``0 .. n-1``.
    Entry ``k`` is ``1 - correct_k / total_k`` over samples whose true label
    is ``k``.
    """
    hits = np.zeros(n, dtype=int)
    misses = np.zeros(n, dtype=int)
    for position, truth in enumerate(l):
        tally = hits if p[position] == truth else misses
        tally[truth] += 1
    return 1 - hits / (hits + misses)

# Per-digit LDA error rates: training error uses the predictions and labels
# saved from the last iteration of the LDA loop; validation error uses the
# held-out split. 10 is the number of digit classes.
dig_tr_error = classified_error(final_predicted_train, final_lab, 10)
dig_v_error = classified_error(final_predicted_valid, mnist_validation_labels, 10)

In [None]:
# Left panel: LDA error rate against training-set size.
# Right panel: per-digit error of the last LDA fit, training and validation
# bars side by side.
index = np.arange(10)
bar_width = 0.35

fig, axs= plt.subplots(1, 2, figsize=(12,5))
a, b = axs[0],axs[1]
a.plot(training_sets, final_LDA_t_errors, label = "Training Error")
a.plot(training_sets, final_LDA_v_errors, label = "Validation Error")
a.legend(loc='upper right')
a.set_title("MNIST LDA Error")
a.set_xlabel("# of Training Points")
a.set_ylabel("Error Rate")

b.bar(index, dig_tr_error, bar_width, alpha=0.4, color='black', label='Training Error')
b.bar(index + bar_width, dig_v_error, bar_width,alpha=0.4, color='rosybrown', label='Validation Error')
# Centre one tick under each pair of bars and label it with the digit.
b.set_xticks(index + bar_width / 2)
b.set_xticklabels(index)
# The bars carry labels, but without this call no legend is drawn and the two
# series can only be told apart by colour.
b.legend(loc='upper right')
b.set_title("MNIST LDA Error by Digit")
b.set_xlabel("Digit Class")
b.set_ylabel("Error Rate")
plt.show()


**QDA**

In [None]:
def mle_estimate(training_dat, training_labels, spam = False):
    """Fit QDA parameters: a mean, prior, inverse covariance and log-determinant per class.

    ``spam`` selects 2 classes instead of 10. Labels must lie in
    ``0 .. num_classes - 1``.

    Returns ``(training_mean, priors, training_inverse_cov, training_logdet)``.
    The mean, inverse covariance and log-determinant are dicts keyed by label;
    ``priors`` is a list indexed by label. Note that this return order differs
    from the LDA version of ``mle_estimate``.
    """
    class_count = 2 if spam else 10
    dim = training_dat[0].shape[0]
    total = len(training_dat)

    # Group the samples by label.
    by_label = {k: [] for k in range(class_count)}
    for position, label in enumerate(training_labels):
        by_label[label].append(training_dat[position])

    training_mean = {}
    training_inverse_cov = {}
    training_logdet = {}
    priors = []

    for label, rows in by_label.items():
        matrix = np.asmatrix(rows)
        training_mean[label] = matrix.mean(axis = 0)

        # Class covariance with a tiny ridge on the diagonal so it can be inverted.
        cov = np.cov(matrix.T) + np.eye(dim, dtype=float)*0.0000001
        # Inverse obtained by solving cov @ X = I.
        training_inverse_cov[label] = LA.solve(cov, np.identity(dim))

        sign, logdet = LA.slogdet(cov)
        training_logdet[label] = logdet * sign
        priors.append(len(rows)/total)

    return training_mean, priors, training_inverse_cov, training_logdet

def error_rt(predicted, labels):
    """Return the share of entries in ``labels`` that ``predicted`` gets wrong.

    Predictions beyond ``len(labels)`` are ignored.
    """
    agreement = [predicted[pos] == truth for pos, truth in enumerate(labels)]
    return 1 - np.mean(agreement)

def Qc(x, mu, pi, inv_Cov, logdet):
    """Return the Gaussian discriminant score of sample ``x`` for one class.

    Evaluates ``-0.5*logdet + log(pi) - 0.5 * (x - mu) inv_Cov (x - mu)^T``
    and returns it as a Python float. ``x`` and ``mu`` may be flat arrays or
    single-row matrices; ``inv_Cov`` may be an array or a matrix.
    """
    # Flatten both vectors so the quadratic form comes out as a true scalar.
    # Calling float() on a 1x1 np.matrix, as before, is an array-to-scalar
    # conversion that NumPy has deprecated for arrays with ndim > 0.
    offset = np.asarray(x, dtype=float).ravel() - np.asarray(mu, dtype=float).ravel()
    quadratic = offset.dot(np.asarray(inv_Cov)).dot(offset)
    return float(-0.5*logdet + np.log(pi) - 0.5*quadratic)

def QDA_predict(x, means, priors, inv_Covs, logdets):
    """Return the label whose discriminant score ``Qc`` is largest for ``x``.

    Unlike the LDA predictor, each class is scored with its own inverse
    covariance ``inv_Covs[label]`` and log-determinant ``logdets[label]``.
    Raises ValueError when ``means`` is empty.
    """
    if not means:
        raise ValueError("no class means to score against")

    winner = None
    top_score = None
    for label, mean in means.items():
        score = Qc(x, mean, priors[label], inv_Covs[label], logdets[label])
        # `>=` lets the later label win an exact tie, which is what keying a
        # dict by score and overwriting the entry does.
        if top_score is None or score >= top_score:
            winner, top_score = label, score
    return winner




In [None]:
# Fit QDA on growing prefixes of the shuffled training split and record the
# training and validation error rate for each size.
training_sets = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000]
t_errors = []
v_errors = []

for n in training_sets:
    dat = mnist_training_set[:n]
    lab = mnist_training_labels[:n]
    training_mean, priors, training_inverse_cov, training_logdet = mle_estimate(dat, lab)

    predicted_train = [
        QDA_predict(x, training_mean, priors, training_inverse_cov, training_logdet)
        for x in dat
    ]
    predicted_valid = [
        QDA_predict(x, training_mean, priors, training_inverse_cov, training_logdet)
        for x in mnist_validation_set
    ]

    t_errors.append(error_rt(predicted_train, lab))
    v_errors.append(error_rt(predicted_valid, mnist_validation_labels))
    


In [None]:
# Keep references to the results of the last QDA fit under QDA-specific names.
final_QDA_mean_MNIST = training_mean
final_QDA_inv_cov_MNIST = training_inverse_cov
final_QDA_logdet_MNIST = training_logdet
final_QDA_priors_MNIST = priors
# These are the same list objects as t_errors / v_errors, not copies.
final_QDA_t_errors = t_errors
final_QDA_v_errors = v_errors

In [None]:
# Per-digit QDA error rates. predicted_train / predicted_valid are the lists
# left behind by the last iteration of the QDA loop.
# NOTE(review): final_lab was saved from the LDA loop. It is built from the
# same mnist_training_labels[:n] slice as the QDA loop's `lab`, so this line
# relies on the LDA cells having been run first.
dig_tr_error_QDA = classified_error(predicted_train, final_lab, 10)
dig_v_error_QDA = classified_error(predicted_valid, mnist_validation_labels, 10)

**QDA Error Rate Plotting**

In [None]:
index = np.arange(10)
bar_width = 0.35


def _plot_error_summary(model_name, size_train_errors, size_valid_errors,
                        digit_train_errors, digit_valid_errors):
    """Draw one two-panel figure for the classifier called ``model_name``.

    Left panel: training and validation error rate against the sizes in
    ``training_sets``. Right panel: per-digit training and validation error
    as side-by-side bars positioned with ``index`` and ``bar_width``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(12,5))
    a, b = axs[0], axs[1]
    a.plot(training_sets, size_train_errors, label = "Training Error")
    a.plot(training_sets, size_valid_errors, label = "Validation Error")
    a.legend(loc='upper right')
    a.set_title("MNIST " + model_name + " Error")
    a.set_xlabel("# of Training Points")
    a.set_ylabel("Error Rate")

    b.bar(index, digit_train_errors, bar_width, alpha=0.4, color='black', label='Training Error')
    b.bar(index + bar_width, digit_valid_errors, bar_width, alpha=0.4, color='rosybrown', label='Validation Error')
    # Centre one tick under each pair of bars and label it with the digit.
    b.set_xticks(index + bar_width / 2)
    b.set_xticklabels(index)
    # The bars carry labels, but a legend only appears when asked for.
    b.legend(loc='upper right')
    b.set_title("MNIST " + model_name + " Error by Digit")
    b.set_xlabel("Digit Class")
    b.set_ylabel("Error Rate")
    plt.show()


# The LDA figure is drawn again here so both classifiers can be compared in
# one place, followed by the QDA figure.
_plot_error_summary("LDA", final_LDA_t_errors, final_LDA_v_errors, dig_tr_error, dig_v_error)
_plot_error_summary("QDA", final_QDA_t_errors, final_QDA_v_errors, dig_tr_error_QDA, dig_v_error_QDA)


<p style="page-break-before: always"></p>

In [None]:
def results_to_csv(y_test):
    """Write the predictions in ``y_test`` to 'submission.csv'.

    The file has an 'Id' column counting from 1 and a 'Category' column with
    the predictions cast to int. ``y_test`` must support ``.astype``.
    """
    categories = y_test.astype(int)
    frame = pd.DataFrame({'Category': categories})
    frame.index += 1  # Ids start at 1 rather than 0.
    frame.to_csv('submission.csv', index_label='Id')


# predicted_test is filled in by the Kaggle prediction cell further down, so
# that cell has to be executed before this call.
results_to_csv(np.array(predicted_test))

In [None]:
#KAGGLE SPAM DO NOT RE-RUN
# NOTE(review): this unpacking order and LDA_predict match the LDA version of
# mle_estimate, which returns (mean, inv_cov, priors, logdet). The QDA cell
# redefines mle_estimate with a different return order, so run this only
# while the LDA definition is the active one.
mean, inv_cov, priors, logdet = mle_estimate(spam_training_set, spam_training_labels, True)
predicted_test = [LDA_predict(x, mean, priors, inv_cov, logdet) for x in spam_test]

# A stray `t_errors.append(error_rt(predicted_train, lab))` was removed here.
# It pushed a stale MNIST error onto the list that final_QDA_t_errors refers
# to, leaving it one entry longer than training_sets and breaking the plots.

##KAGGLE MNIST DO NOT RE-RUN
# Use the LDA parameters saved right after the LDA training loop. The names
# previously used here (final_mean, final_priors, final_inv_cov, final_logdet)
# are never assigned anywhere in this notebook.
# This rebinds predicted_test, so write the spam submission before running it.
predicted_test = [
    LDA_predict(
        x,
        final_LDA_mean_MNIST,
        final_LDA_priors_MNIST,
        final_LDA_inv_cov_MNIST,
        final_LDA_logdet_MNIST,
    )
    for x in mnist_test
]