# Preprocessing Step

In [1]:
import sys


def progress(count, total, suffix=''):
    """Draw a single-line text progress bar on stdout.

    The line ends with a carriage return so the next call overwrites it.

    Args:
        count: Number of work items completed so far.
        total: Total number of work items. A value of 0 is rendered as an
            empty bar at 0.0% instead of raising ZeroDivisionError.
        suffix: Text appended after the percentage.
    """
    bar_len = 60
    if total:
        raw_filled = bar_len * count / float(total)
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to measure against: show an empty bar rather than dividing by zero.
        raw_filled = 0
        percents = 0.0

    # Clamp so an out-of-range count can never draw a bar wider than bar_len
    # or report a percentage outside 0-100.
    filled_len = min(bar_len, max(0, int(round(raw_filled))))
    percents = min(100.0, max(0.0, percents))
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()  # As suggested by Rom Ruben

In [21]:
import pickle 
import numpy as np
import math
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp

In [3]:
"""
Extracting data into a file
"""
data = None
labels = []

file = "images/data_batch_1"
with open(file, 'rb') as fo:
    print("extracting file "+file+"...")
    dict = pickle.load(fo, encoding='bytes')
    temp_data = dict[b'data']
    try:
        data = np.concatenate((data, temp_data), axis=0)
    except:
        data = temp_data
    labels = labels + dict[b'labels']
labels = np.array(labels)
labels = labels.reshape(-1,1)
print("Finished Extracting Features")
print(np.shape(data))

extracting file images/data_batch_1...
Finished Extracting Features
(10000, 3072)


## Preprocessing and Feature Learning For Images Dataset
For preprocessing, we center the dataset to zero mean; then, for feature learning, we apply PCA to reduce the dimensionality from 3072 to the smallest number of components that contains 95% of the variance of our data.

In [11]:
# Scaler configured to subtract each column's mean (with_mean=True) while
# leaving the column variances as they are (with_std=False).
stand_scaler = sklpp.StandardScaler(with_mean=True, with_std=False)
# Learn the column means from the data matrix, then apply the centering to it.
stand_scaler.fit(data)
centered_ImageData = stand_scaler.transform(data)

In [15]:
# Build a PCA object whose fractional n_components asks for as many components
# as are needed to keep 95% of the variance of the centered data matrix.
pca_obj = skldecomp.PCA(n_components = 0.95, svd_solver = 'auto')
# Fit on the centered images and project them onto the retained components.
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
# Persist the reduced matrix to disk; the QDA section reloads this file.
np.save('dim_reducedImageData.npy', dim_reducedImageData)

In [16]:
# Report how many feature columns remain after the PCA projection.
reduced_feature_count = dim_reducedImageData.shape[1]
print('Data has been reduced to {} features after PCA'.format(reduced_feature_count))

Data has been reduced to 209 features after PCA


# QDA

In [38]:
"""
Pull pca data from saved
"""
data = np.load('dim_reducedImageData.npy')
data = np.concatenate((data,labels),axis=1)
number_of_parameters = len(data[0]) - 1 #209
number_of_total_samples = len(data)
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
for i in range(1,k+1):
    training_data = np.concatenate((data[0:(i-1)*int(number_of_total_samples/k)],data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    print(len(testing_data))
    """
    Define Parameters 
    """
    mu = [[0 for j in range(number_of_parameters)] for i in range(10)] # initializing means 
    sigmas = [[[0 for k in range(number_of_parameters)] for j in range(number_of_parameters)] for i in range(10)] # initializing variances
    occurences = [0 for i in range(10)]
    pi = [0 for i in range(10)]
        
    """
    Learn mu
    """
    # Find Sums and Occurences
    for image in training_data:
        label = int(image[-1])
        occurences[label] += 1
        mu[label] = np.add(mu[label],image[:-1])
    # Calculate Averages and Prior Probabilities 
    for label,sums in enumerate(mu):
        mu[label] = np.multiply(1/(occurences[label] - 1), sums).reshape(-1,1) #unbaised estimator
        pi[label] = occurences[label] / number_of_total_samples
    """
    Learn sigma^2
    """
    time = 0
    for image in training_data:
        time += 1
        if time%10 == 0:
            progress(time,number_of_training_samples,suffix="training k= "+str(i))
        label = int(image[-1])
        image = image[:-1].reshape(-1,1)
        difference = np.subtract(image,mu[label])
        sigma = difference.dot(difference.T)
        sigmas[label] = np.add(sigmas[label],sigma)
    for label,sigma in enumerate(sigmas):
        sigmas[label] = np.multiply(1/(occurences[label]-1),sigma)
    sys.stdout.flush()
    """
    Method to find the best discriminant score
    """
    def estimateBestLabel(image):
        scores = [0 for _ in range(10)]
        # find score for each label 

        for label in range(10):
            inverted_variance = np.linalg.inv(sigmas[label])
            first_term = -0.5*image.T.dot(inverted_variance).dot(image)
            second_term = image.T.dot(inverted_variance).dot(mu[label])
            third_term = -0.5*mu[label].T.dot(inverted_variance).dot(mu[label])
            fourth_term = -0.5*math.log(np.linalg.det(sigmas[label]))
            fifth_term = math.log(pi[label])
            score = first_term + second_term + third_term + fourth_term + fifth_term
            scores[label] = score[0][0]
        return(scores.index(max(scores)))
    correct = 0
    print("")
    for index, image in enumerate(testing_data):
        if index % 9 == 0:
            progress(index+1,1000,suffix="testing k= "+str(i))
        image = image[:-1].reshape(-1,1)
        if testing_data[index][-1] == estimateBestLabel(image):
            correct += 1
    sys.stdout.flush()
    print(correct/1000)
    final_estimates.append(correct/1000)
print("")
print(final_estimates)

1000
1000
[=-----------------------------------------------------------] 1.0% ...testing k= 2

KeyboardInterrupt: 

### Testing 

# Support Vector Machine

(10000, 209)
[[-1.66786333e+03 -7.04951956e+02  3.55547396e+02 ...  7.55617243e+00
  -1.14070900e+00  4.09839209e+00]
 [ 1.68718819e+02  2.14158957e+02  1.55982239e+03 ... -4.81034742e-01
   1.70525575e+02 -9.91542640e+01]
 [ 1.96093846e+03  2.93034428e+03 -6.31292284e+02 ...  1.08215500e+02
  -4.71960005e+00  9.61573393e+00]
 ...
 [-3.30327785e+02  2.43792701e+03 -3.74213598e+02 ...  3.09124236e+01
   8.47475854e+01  6.67568700e+01]
 [ 1.79608664e+02 -4.50775000e+02 -4.10255349e+02 ... -4.43966999e+01
   1.06159623e+01  9.79147309e+01]
 [-6.32447973e+02 -1.60751972e+03  4.78398657e+02 ...  1.08987396e+01
   1.66306361e+00  6.50439553e+01]]


(10000,)


In [16]:
# Coerce the labels into a single-column array, then show its shape and contents.
labels = np.array(labels).reshape(-1, 1)
print(labels.shape)
print(labels)

(10000, 1)
[[6]
 [9]
 [9]
 ...
 [1]
 [1]
 [5]]
