# Preprocessing Step

In [9]:
import pickle 
import numpy as np
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp

In [10]:
"""
Extracting data into a file
"""
data = None
labels = []

file = "images/data_batch_1"
with open(file, 'rb') as fo:
    print("extracting file "+file+"...")
    dict = pickle.load(fo, encoding='bytes')
    temp_data = dict[b'data']
    try:
        data = np.concatenate((data, temp_data), axis=0)
    except:
        data = temp_data
    labels = labels + dict[b'labels']

print("Finished Extracting Features")
print(np.shape(data))
        

extracting file images/data_batch_1...
Finished Extracting Features
(10000, 3072)


## Preprocessing and Feature Learning For Images Dataset
For preprocessing, we center the dataset to zero mean; then, for feature learning, we apply PCA to reduce the 3072 dimensions to a smaller number of features that retains 95% of the variance of our data.

In [11]:
# Scaler that subtracts each column's mean (with_mean=True) while leaving the
# spread of the data as it is (with_std=False).
stand_scaler = sklpp.StandardScaler(
    with_std=False,
    with_mean=True,
)
# Learn the column means from `data` and remove them in a single call.
centered_ImageData = stand_scaler.fit_transform(data)

In [15]:
# PCA configured with a fractional n_components: keep as many components as
# are needed to retain 95% of the variance of the centered matrix.
pca_obj = skldecomp.PCA(svd_solver='auto', n_components=0.95)
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
# Write the projected matrix to disk in NumPy's .npy format.
np.save('dim_reducedImageData.npy', dim_reducedImageData)

In [16]:
# Report how many columns are left after the PCA projection.
print('Data has been reduced to {} features after PCA'.format(np.shape(dim_reducedImageData)[1]))

Data has been reduced to 209 features after PCA


# QDA

In [22]:
"""
Define Parameters 
"""
mu = [_ for i in range(10)] # initializing means 
sigmas = [_ for i in range(10)] # initializing variances
pi = [.1 for i in range(10)] # initializing prior probabiltiies 

In [23]:
"""
Create Dictionary of indexes sorted by label 
"""
label_dictionary = {} 
for index,label in enumerate(labels):
    if label not in label_dictionary:
        label_dictionary[label] = [index]
    else:
        label_dictionary[label].append(index)

`label_dictionary` is a dictionary with **key**: label number and **value**: list of the indexes in `data` (and `labels`) of the samples that have that label.
### Learn Parameters

In [46]:

# NOTE(review): `numLabel` is kept only because the "Learn sigma^2" cell reads
# it. It is no longer used here: 5000 per label cannot match the 10000 rows
# loaded above if they are spread over the 10 classes, so each mean is computed
# from the label's actual sample count instead.
numLabel = 5000
"""
Learn mu
"""
for label, image_indexes in label_dictionary.items():
    # Sample mean of every feature over the rows of this label. The mean
    # divides by n (the number of rows); the n - 1 correction belongs to the
    # variance estimator, not to the mean.
    class_mean = np.mean(data[image_indexes], axis=0, dtype=np.float64)
    # Stored as a 1 x n_features row vector, the shape the original cell produced.
    mu[label] = class_mean.reshape(1, -1)

In [55]:
"""
Learn sigma^2
"""
for label in label_dictionary:
    print("learning variance for label",label,"...")
    sum_array = np.array([[0 for i in range(3072)] for i in range(3072)])
    label_average = mu[label][0]
    for imageIndex in label_dictionary[label]:
        difference = np.subtract(data[imageIndex].reshape(-1,3072),label_average)
        np.add(sum_array,difference.T.dot(difference))
    Sigmas = np.multiply(sum_array,(1/(numLabel - 1))) # unbaised estimator formula 
    print(np.shape(Sigmas))
    break
    #sigmas[label] = variance

learning variance for label 6 ...


KeyboardInterrupt: 

In [47]:
"""

"""
#def estimateBestLabel(image):
print(np.shape(data[1]))

(3072,)


# Support Vector Machine