In [11]:
# Python Standard Libraries for importing data from binary file
import os.path #for accessing the file path
import struct  #for unpacking the binary data

import time    #for calculating time

#core packages
import numpy as np
import matplotlib.pyplot as plt

#custom module
from dataPrep import load_dataset, prep_dataset, sample_dataset


In [12]:
# Print load_dataset's docstring so the returned splits and their shapes are visible here.
help(load_dataset)

Help on function load_dataset in module dataPrep:

load_dataset()
    Retrieve the dataset from file into training, dev and test sets.
    
    Returns: 
    train_x_orig -  training set images consisting of no. of examples, rows, columns of images, 
                    size(60000,28,28)
    train_y_orig -  training set output consisting of image labels, size(60000,1)
    dev_x_orig  - dev set images of size (5000,28,28)
    dev_y_orig  - dev set labels of size (5000,1)
    test_x_orig - test set images of size (5000,28,28)
    test_y_orig - test set labels of size (5000,1)



In [13]:
# Read the raw train / dev / test images and labels via the dataPrep helper.
(train_x_orig, train_y_orig,
 dev_x_orig, dev_y_orig,
 test_x_orig, test_y_orig) = load_dataset()

# One (row label, array) pair per line of the summary table.
divider = "================================================================="
raw_summary = (
    ("Training Set Images:\t", train_x_orig),
    ("Training Set Labels:\t", train_y_orig),
    ("Dev Set Images:\t\t", dev_x_orig),
    ("Dev Set Labels:\t\t", dev_y_orig),
    ("Test Set Images:\t", test_x_orig),
    ("Test Set Labels:\t", test_y_orig),
)

# Report the container type and shape of every raw array.
print("Data\t\t\t", "Datatype\t\t", "Shape")
print(divider)
for row_label, row_array in raw_summary:
    print(row_label + str(type(row_array)) + "\t", str(row_array.shape))
print(divider)


Data			 Datatype		 Shape
Training Set Images:	<class 'numpy.ndarray'>	 (60000, 28, 28)
Training Set Labels:	<class 'numpy.ndarray'>	 (60000, 1)
Dev Set Images:		<class 'numpy.ndarray'>	 (5000, 28, 28)
Dev Set Labels:		<class 'numpy.ndarray'>	 (5000, 1)
Test Set Images:	<class 'numpy.ndarray'>	 (5000, 28, 28)
Test Set Labels:	<class 'numpy.ndarray'>	 (5000, 1)


In [14]:
# Print prep_dataset's docstring (expected inputs and the processed outputs it returns).
help(prep_dataset)

Help on function prep_dataset in module dataPrep:

prep_dataset(train_x_orig, train_y_orig, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig)
    Flatten and Normalize the input images and encode the output labels
    
    Arguments:
        train_x_orig -  training set images of size (60000,28,28)
        train_y_orig -  training set labels of size (60000,1)
        dev_x_orig   - dev set images of size (5000,28,28)
        dev_y_orig   - dev set labels of size (5000,1)
        test_x_orig  - test set images of size (5000,28,28)
        test_y_orig  - test set labels of size (5000,1)
    Returns:
        train_x_norm    - flattened and normalized training set input data
        train_y_encoded - encoded labels of training set
        dev_x_norm      - flattened and normalized dev set input data
        dev_y_encoded   - encoded labels of dev set
        test_x_norm     - flattened and normalized test set input data
        test_y_encoded  - encoded labels of test set



In [15]:
# Flatten and normalize the images and encode the labels for all three splits.
(train_x_norm, train_y_encoded,
 dev_x_norm, dev_y_encoded,
 test_x_norm, test_y_encoded) = prep_dataset(
    train_x_orig, train_y_orig,
    dev_x_orig, dev_y_orig,
    test_x_orig, test_y_orig,
)

# One (row label, raw array, processed array) triple per line of the table.
divider = "================================================================="
shape_changes = (
    ("Training Set Images:\t", train_x_orig, train_x_norm),
    ("Training Set Labels:\t", train_y_orig, train_y_encoded),
    ("Dev Set Images:\t\t", dev_x_orig, dev_x_norm),
    ("Dev Set Labels:\t\t", dev_y_orig, dev_y_encoded),
    ("Test Set Images:\t", test_x_orig, test_x_norm),
    ("Test Set Labels:\t", test_y_orig, test_y_encoded),
)

# Compare each array's shape before and after preprocessing.
print("Data\t\t\t", "Before Processing\t", "After Processing")
print(divider)
for row_label, before, after in shape_changes:
    print(row_label + str(before.shape) + "\t\t" + str(after.shape))
print(divider)

Data			 Before Processing	 After Processing
Training Set Images:	(60000, 28, 28)		(784, 60000)
Training Set Labels:	(60000, 1)		(11, 60000)
Dev Set Images:		(5000, 28, 28)		(784, 5000)
Dev Set Labels:		(5000, 1)		(11, 5000)
Test Set Images:	(5000, 28, 28)		(784, 5000)
Test Set Labels:	(5000, 1)		(11, 5000)


In [16]:
# Print sample_dataset's docstring (arguments, default sample volume, and return values).
help(sample_dataset)

Help on function sample_dataset in module dataPrep:

sample_dataset(x, y, dataVol=10)
    Returns a sample dataset from the fully processed dataset
    
    Arguments:
        x - input data
        y - output data
        dataVol - sample volume in percentage
    Returns:
        dataVol  - sample volume in percentage
        x_sample - input sample of size (dataVol% of x)
        y_sample - output sample of size (dataVol% of y)



In [17]:
# Take a subsample of every split using sample_dataset's default volume;
# the call is unpacked as (volume, inputs, labels).
train_Vol, train_x_sample, train_y_sample = sample_dataset(train_x_norm, train_y_encoded)
dev_Vol, dev_x_sample, dev_y_sample = sample_dataset(dev_x_norm, dev_y_encoded)
test_Vol, test_x_sample, test_y_sample = sample_dataset(test_x_norm, test_y_encoded)

# One (row label, full array, sampled array, trailing text) entry per table line.
# Only the image rows carry the sample-size percentage; label rows end after the shapes.
wide_divider = "====================================================================================="
sample_rows = (
    ("Training Set Images:\t", train_x_norm, train_x_sample, "\t\t" + str(train_Vol) + "%"),
    ("Training Set Labels:\t", train_y_encoded, train_y_sample, ""),
    ("Dev Set Images:\t\t", dev_x_norm, dev_x_sample, "\t\t" + str(dev_Vol) + "%"),
    ("Dev Set Labels:\t\t", dev_y_encoded, dev_y_sample, ""),
    ("Test Set Images:\t", test_x_norm, test_x_sample, "\t\t" + str(test_Vol) + "%"),
    ("Test Set Labels:\t", test_y_encoded, test_y_sample, ""),
)

# Show full versus sampled shapes side by side.
print("Data\t\t\t", "Complete Data Shape\t", "Sample Data Shape\t", "Sample Size")
print(wide_divider)
for row_label, full_array, sampled_array, size_suffix in sample_rows:
    print(row_label + str(full_array.shape) + "\t\t" + str(sampled_array.shape) + size_suffix)
print(wide_divider)


Data			 Complete Data Shape	 Sample Data Shape	 Sample Size
Training Set Images:	(784, 60000)		(784, 6000)		10%
Training Set Labels:	(11, 60000)		(11, 6000)
Dev Set Images:		(784, 5000)		(784, 500)		10%
Dev Set Labels:		(11, 5000)		(11, 500)
Test Set Images:	(784, 5000)		(784, 500)		10%
Test Set Labels:	(11, 5000)		(11, 500)
