# Preprocessing Step

In [1]:
import pickle 
import numpy as np
import pandas as pd
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp

In [2]:
"""
Extract the image data and labels from the five pickled batch files
"""
# Read the five pickled batch files and stack them into one dataset.
#   data   -- rows of every batch's b'data' entry, concatenated in batch order
#   labels -- every batch's b'labels' entries, flattened in the same order
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this on batch files that come from a trusted source.
batches = []
labels = []
for batch_number in range(1, 6):
    batch_path = "images/data_batch_" + str(batch_number)
    with open(batch_path, 'rb') as fo:
        print("extracting file "+batch_path+"...")
        batch = pickle.load(fo, encoding='bytes')
    batches.append(batch[b'data'])
    labels.extend(batch[b'labels'])

# Concatenate once at the end: no need to special-case the first batch with a
# try/except, and a genuine shape mismatch now raises instead of silently
# replacing everything loaded so far.
data = np.concatenate(batches, axis=0)

print("Finished Extracting Features")
        

extracting file images/data_batch_1...
extracting file images/data_batch_2...
extracting file images/data_batch_3...
extracting file images/data_batch_4...
extracting file images/data_batch_5...
Finished Extracting Features


### Preprocessing image dataset
Each sample in our image data matrix has 3072 raw features, so we reduce the dimensionality of the data with PCA to cut computation time. We run PCA so that 95% of the variance is preserved.

In [3]:
## PCA expects a zero-mean data matrix, so the features are centered first with SciKit-Learn
# with_mean=True subtracts each feature's mean; with_std=False leaves the variance unscaled
mean_datascaler = sklpp.StandardScaler(with_std=False, with_mean=True)
# Learn the per-feature means from data, then subtract them to get the centered matrix
centered_ImageData = mean_datascaler.fit(data).transform(data)

In [4]:
# Split the centered matrix into five consecutive, non-overlapping chunks.
# A slice excludes its stop index, so each chunk has to start exactly where the
# previous one stopped. Starting the second chunk at 10001 dropped row 10000,
# shifted every later chunk by one row, and left the last chunk one row short.
centered_ImageData1 = centered_ImageData[0:10000]
centered_ImageData2 = centered_ImageData[10000:20000]
centered_ImageData3 = centered_ImageData[20000:30000]
centered_ImageData4 = centered_ImageData[30000:40000]
centered_ImageData5 = centered_ImageData[40000:]
# Displayed as the cell output to sanity-check the chunk size.
centered_ImageData1.shape

(10000, 3072)

In [5]:
## Next we use SciKit-Learn's PCA to perform linear dimensionality reduction, preserving 95% of the variance
# Create a PCA object with n_components = 0.95 so that the retained components account for 95% of the variance
pca_object = skldecomp.PCA(n_components = 0.95,svd_solver = 'auto')
# Fit the PCA on centered_ImageData1 and project that chunk onto the retained principal components in one call;
# the decomposition and the choice of how many components to keep both happen inside fit_transform.
# NOTE(review): the basis is learned from this first chunk only, not from the full centered_ImageData -- confirm this is intended.
dim_reducedImageData1 = pca_object.fit_transform(centered_ImageData1)
# Persist the reduced chunk to disk as a .npy file
np.save('dim_reducedImageData1.npy',dim_reducedImageData1)

In [6]:
# Reuse the PCA that is already fitted instead of refitting it on this chunk.
# fit_transform would learn a new basis (and, with n_components=0.95, possibly a
# different number of components) for this chunk alone, so its columns would not
# be comparable with those of the other reduced chunks.
dim_reducedImageData2 = pca_object.transform(centered_ImageData2)
# Persist the reduced chunk to disk as a .npy file
np.save('dim_reducedImageData2.npy',dim_reducedImageData2)

In [7]:
# Reuse the PCA that is already fitted instead of refitting it on this chunk.
# fit_transform would learn a new basis (and, with n_components=0.95, possibly a
# different number of components) for this chunk alone, so its columns would not
# be comparable with those of the other reduced chunks.
dim_reducedImageData3 = pca_object.transform(centered_ImageData3)
# Persist the reduced chunk to disk as a .npy file
np.save('dim_reducedImageData3.npy',dim_reducedImageData3)

In [8]:
# Reuse the PCA that is already fitted instead of refitting it on this chunk.
# fit_transform would learn a new basis (and, with n_components=0.95, possibly a
# different number of components) for this chunk alone, so its columns would not
# be comparable with those of the other reduced chunks.
dim_reducedImageData4 = pca_object.transform(centered_ImageData4)
# Persist the reduced chunk to disk as a .npy file
np.save('dim_reducedImageData4.npy',dim_reducedImageData4)

In [9]:
# Reuse the PCA that is already fitted instead of refitting it on this chunk.
# fit_transform would learn a new basis (and, with n_components=0.95, possibly a
# different number of components) for this chunk alone, so its columns would not
# be comparable with those of the other reduced chunks.
dim_reducedImageData5 = pca_object.transform(centered_ImageData5)
# Persist the reduced chunk to disk as a .npy file
np.save('dim_reducedImageData5.npy',dim_reducedImageData5)

In [10]:
import dill
# Serialize the current interpreter session to notebook_env.db.
# The function is dump_session; "dumb_session" does not exist in dill and
# would raise AttributeError once the package is installed.
dill.dump_session('notebook_env.db')

ModuleNotFoundError: No module named 'dill'

# QDA