In [37]:
# Python Standard Libraries for importing data from binary file
import os.path #for accessing the file path
import struct  #for unpacking the binary data

import time    #for calculating time

#core packages
import numpy as np
import matplotlib.pyplot as plt

#custom module
from dataPrep import retrive_data, sample_origDataset , dev_test_split, prep_dataset
from dataPrep import load_dataset, sample_prepDataset


## Preparing the sampled dataset

In [38]:
help(retrive_data)

Help on function retrive_data in module dataPrep:

retrive_data(dataset='training-set')
    Retrive MNIST dataset from  the binary file into numpy arrays        
    
    Dataset Obtained From:
        - link -- http://yann.lecun.com/exdb/mnist/
        
    Dataset retrival code adapted from(but modified to our need making data retrival 6-8 times faster):
        - link -- https://www.cs.virginia.edu/~connelly/class/2015/large_scale/proj2/mnist_python
        
    Argument:
        - **dataset** -- type of dataset to be loaded. may be either 'training' or 'test'
    Returns:
        - **images** -- 3D array consisting of no. of examples, rows, columns of images 
        - **labels** -- array  containing labels for each images



In [39]:
#retriving the data
train_x_orig, train_y_orig = retrive_data(dataset="training-set")
test_x_temp, test_y_temp = retrive_data(dataset="test-set")

#displaying the retrival info
print("Data\t\t\t","Datatype\t\t","Shape")
print("=================================================================")
print("Training Set Images:\t" + str(type(train_x_orig))+"\t",str(train_x_orig.shape))
print("Training Set Labels:\t" + str(type(train_y_orig))+"\t",str(train_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_temp))+"\t",str(test_x_temp.shape))
print("Test Set Labels:\t" + str(type(test_y_temp))+"\t",str(test_y_temp.shape))
print("=================================================================")


Data			 Datatype		 Shape
Training Set Images:	<class 'numpy.ndarray'>	 (60000, 28, 28)
Training Set Labels:	<class 'numpy.ndarray'>	 (60000, 1)
Test Set Images:	<class 'numpy.ndarray'>	 (10000, 28, 28)
Test Set Labels:	<class 'numpy.ndarray'>	 (10000, 1)


In [40]:
help(sample_origDataset)

Help on function sample_origDataset in module dataPrep:

sample_origDataset(x, y, dataVol=25)
    Returns a sample dataset from the fully processed dataset
    
    Arguments:
        - **x** -- original input data
        - **y** -- original output labels
        - **dataVol** -- sample volume in percentage (default 10%)
    Returns:
        - **x_sample** -- input sample  from original dataset of size ( dataVol% of x)
        - **y_sample** -- output sample  from original dataset of size (datavol% of y)
        - **dataVol** -- sample volume in percentage



In [41]:
train_Vol,train_x_sample, train_y_sample = sample_origDataset(train_x_orig,train_y_orig,dataVol=25)
test_Vol,test_x_sample,test_y_sample = sample_origDataset(test_x_temp,test_y_temp,dataVol=25)

print("Data\t\t\t","Complete Data Shape\t","Sample Data Shape\t","Sample Size")
print("=====================================================================================")
print("Training Set Images:\t"+ str(train_x_orig.shape)+"\t\t"+ str(train_x_sample.shape)+"\t\t"+str(train_Vol)+"%")
print("Training Set Labels:\t"+ str(train_y_orig.shape)+"\t\t"+ str(train_y_sample.shape))
print("Test Set Images:\t"+str(test_x_temp.shape)+"\t\t"+ str(test_x_sample.shape)+"\t\t"+str(test_Vol)+"%")
print("Test Set Labels:\t"+str(test_y_temp.shape)+"\t\t"+ str(test_y_sample.shape))
print("=====================================================================================")


Data			 Complete Data Shape	 Sample Data Shape	 Sample Size
Training Set Images:	(60000, 28, 28)		(15000, 28, 28)		25%
Training Set Labels:	(60000, 1)		(15000, 1)
Test Set Images:	(10000, 28, 28)		(2500, 28, 28)		25%
Test Set Labels:	(10000, 1)		(2500, 1)


In [42]:
help(dev_test_split)

Help on function dev_test_split in module dataPrep:

dev_test_split(test_x, test_y)
    Randomly splits the test set to dev and test set
    
    Arguments:
        - **test_x** -- test set images of size (10000,28,28) 
        - **test_y** -- test set labels of size (10000,1)
    
    Returns:
        - **dev_x**  -- dev set images of size (n,28,28) 
        - **dev_y**  -- dev set labels of size (n,1) 
        - **test_x** -- test set images of size (n,28,28) 
        - **test_y** -- test set labels of size (n,1)



In [43]:
dev_x_orig,dev_y_orig,test_x_orig,test_y_orig = dev_test_split(test_x_sample,test_y_sample)

print("Data\t\t\t","Datatype\t\t","Shape")
print("=================================================================")
print("Dev Set Images:\t\t" + str(type(dev_x_orig))+"\t",str(dev_x_orig.shape))
print("Dev Set Labels:\t\t" + str(type(dev_y_orig))+"\t",str(dev_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_orig))+"\t",str(test_x_orig.shape))
print("Test Set Labels:\t" + str(type(test_y_orig))+"\t",str(test_y_orig.shape))
print("=================================================================")


Data			 Datatype		 Shape
Dev Set Images:		<class 'numpy.ndarray'>	 (1250, 28, 28)
Dev Set Labels:		<class 'numpy.ndarray'>	 (1250, 1)
Test Set Images:	<class 'numpy.ndarray'>	 (1250, 28, 28)
Test Set Labels:	<class 'numpy.ndarray'>	 (1250, 1)


In [44]:
help(prep_dataset)

Help on function prep_dataset in module dataPrep:

prep_dataset(train_x_orig, train_y_orig, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig)
    Flatten and Normalize the input images and encode the output labels
    
    Arguments:
        - **train_x_orig** --  training set images of size (60000,28,28)
        - **train_y_orig** --  training set labels of size (60000,1)
        - **dev_x_orig**   -- dev set images of size (5000,28,28)
        - **dev_y_orig**   -- dev set labels of size (5000,1)
        - **test_x_orig**  -- test set images of size (5000,28,28)
        - **test_y_orig**  -- test set labels of size (5000,1)
    Returns:
        - **train_x_norm** -- flattened and normalized training set input data
        - **dev_norm**     -- flattened and normalized training set dev data
        - **test_x_norm**  -- flattened and normalized test set input data
        - **train_y_encoded** -- encoded label of training set
        - **dev_y_encoded**   -- encoded label of dev set
 

In [45]:
train_x_norm,train_y_encoded, dev_x_norm,dev_y_encoded, test_x_norm, test_y_encoded = prep_dataset(train_x_sample, train_y_sample, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig)
print("Data\t\t\t","Before Processing\t","After Processing")
print("=================================================================")
print("Training Set Images:\t" + str(train_x_orig.shape)+"\t\t"+ str(train_x_norm.shape))
print("Training Set Labels:\t" + str(train_y_orig.shape)+"\t\t"+ str(train_y_encoded.shape))
print("Dev Set Images:\t\t" + str(dev_x_orig.shape)+"\t\t"+ str(dev_x_norm.shape))
print("Dev Set Labels:\t\t" + str(dev_y_orig.shape)+"\t\t"+ str(dev_y_encoded.shape))
print("Test Set Images:\t" + str(test_x_orig.shape)+"\t\t"+ str(test_x_norm.shape))
print("Test Set Labels:\t" + str(test_y_orig.shape)+"\t\t"+ str(test_y_encoded.shape))
print("=================================================================")

Data			 Before Processing	 After Processing
Training Set Images:	(60000, 28, 28)		(784, 15000)
Training Set Labels:	(60000, 1)		(11, 15000)
Dev Set Images:		(1250, 28, 28)		(784, 1250)
Dev Set Labels:		(1250, 1)		(11, 1250)
Test Set Images:	(1250, 28, 28)		(784, 1250)
Test Set Labels:	(1250, 1)		(11, 1250)


## Sampling the Prepared Dataset

In [46]:
train_x_orig, train_y_orig, dev_x_orig,dev_y_orig,test_x_orig,test_y_orig = load_dataset()

print("Data\t\t\t","Datatype\t\t","Shape")
print("=================================================================")
print("Training Set Images:\t" + str(type(train_x_orig))+"\t",str(train_x_orig.shape))
print("Training Set Labels:\t" + str(type(train_y_orig))+"\t",str(train_y_orig.shape))
print("Dev Set Images:\t\t" + str(type(dev_x_orig))+"\t",str(dev_x_orig.shape))
print("Dev Set Labels:\t\t" + str(type(dev_y_orig))+"\t",str(dev_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_orig))+"\t",str(test_x_orig.shape))
print("Test Set Labels:\t" + str(type(test_y_orig))+"\t",str(test_y_orig.shape))
print("=================================================================")


Data			 Datatype		 Shape
Training Set Images:	<class 'numpy.ndarray'>	 (60000, 28, 28)
Training Set Labels:	<class 'numpy.ndarray'>	 (60000, 1)
Dev Set Images:		<class 'numpy.ndarray'>	 (5000, 28, 28)
Dev Set Labels:		<class 'numpy.ndarray'>	 (5000, 1)
Test Set Images:	<class 'numpy.ndarray'>	 (5000, 28, 28)
Test Set Labels:	<class 'numpy.ndarray'>	 (5000, 1)


In [47]:
train_x_norm,train_y_encoded, dev_x_norm,dev_y_encoded, test_x_norm, test_y_encoded = prep_dataset(train_x_orig, train_y_orig, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig)
print("Data\t\t\t","Before Processing\t","After Processing")
print("=================================================================")
print("Training Set Images:\t" + str(train_x_orig.shape)+"\t\t"+ str(train_x_norm.shape))
print("Training Set Labels:\t" + str(train_y_orig.shape)+"\t\t"+ str(train_y_encoded.shape))
print("Dev Set Images:\t\t" + str(dev_x_orig.shape)+"\t\t"+ str(dev_x_norm.shape))
print("Dev Set Labels:\t\t" + str(dev_y_orig.shape)+"\t\t"+ str(dev_y_encoded.shape))
print("Test Set Images:\t" + str(test_x_orig.shape)+"\t\t"+ str(test_x_norm.shape))
print("Test Set Labels:\t" + str(test_y_orig.shape)+"\t\t"+ str(test_y_encoded.shape))
print("=================================================================")

Data			 Before Processing	 After Processing
Training Set Images:	(60000, 28, 28)		(784, 60000)
Training Set Labels:	(60000, 1)		(11, 60000)
Dev Set Images:		(5000, 28, 28)		(784, 5000)
Dev Set Labels:		(5000, 1)		(11, 5000)
Test Set Images:	(5000, 28, 28)		(784, 5000)
Test Set Labels:	(5000, 1)		(11, 5000)


In [48]:
help(sample_prepDataset)

Help on function sample_prepDataset in module dataPrep:

sample_prepDataset(x, y, dataVol=25)
    Returns a sample dataset from the fully processed dataset
    
    Arguments:
        - **x** -- prepared input data
        - **y** -- prepared output encoded labels
        - dataVol -- sample volume in percentage (default 10%)
    Returns:
        - **x_sample** -- input sample  from processed dataset of size ( dataVol% of x)
        - **y_sample** -- output sample from processed dataset of size (datavol% of y)
        - **dataVol** -- sample volume in percentage



In [49]:
train_Vol,train_x_sample, train_y_sample = sample_prepDataset(train_x_norm,train_y_encoded)
dev_Vol,dev_x_sample,dev_y_sample = sample_prepDataset(dev_x_norm,dev_y_encoded)
test_Vol,test_x_sample,test_y_sample = sample_prepDataset(test_x_norm,test_y_encoded)

print("Data\t\t\t","Complete Data Shape\t","Sample Data Shape\t","Sample Size")
print("=====================================================================================")
print("Training Set Images:\t"+ str(train_x_norm.shape)+"\t\t"+ str(train_x_sample.shape)+"\t\t"+str(train_Vol)+"%")
print("Training Set Labels:\t"+ str(train_y_encoded.shape)+"\t\t"+ str(train_y_sample.shape))
print("Dev Set Images:\t\t"+str(dev_x_norm.shape)+"\t\t"+ str(dev_x_sample.shape)+"\t\t"+str(dev_Vol)+"%")
print("Dev Set Labels:\t\t"+str(dev_y_encoded.shape)+"\t\t"+ str(dev_y_sample.shape))
print("Test Set Images:\t"+str(test_x_norm.shape)+"\t\t"+ str(test_x_sample.shape)+"\t\t"+str(test_Vol)+"%")
print("Test Set Labels:\t"+str(test_y_encoded.shape)+"\t\t"+ str(test_y_sample.shape))
print("=====================================================================================")


Data			 Complete Data Shape	 Sample Data Shape	 Sample Size
Training Set Images:	(784, 60000)		(784, 15000)		25%
Training Set Labels:	(11, 60000)		(11, 15000)
Dev Set Images:		(784, 5000)		(784, 1250)		25%
Dev Set Labels:		(11, 5000)		(11, 1250)
Test Set Images:	(784, 5000)		(784, 1250)		25%
Test Set Labels:	(11, 5000)		(11, 1250)
