## Loading Dependencies

In [1]:
# Python Standard Libraries for importing data from binary file
import os
# import os.path #for accessing the file path
import struct  #for unpacking the binary data

import time    #for calculating time

from urllib.request import urlopen #for downloading the dataset
from urllib.error import URLError, HTTPError

import gzip as unzip #to unzip the downloaded dataset

#core packages
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Downloading Dataset

In [2]:
#help from  https://gist.github.com/kissgyorgy/6102803 for bufferring part
def download_dataset(dataset, to_path):
    """
        Downloads the four gzip archives of a dataset into an existing directory.

        Files that are already present in the destination are skipped. A file whose
        download fails or ends prematurely is removed again, so that a later call
        does not mistake a partial file for a finished one.

        Arguments:
            - **dataset** -- name of the dataset, either 'mnist' or 'fashion_mnist'
            - **to_path** -- destination directory; it must already exist
                             (a trailing path separator is optional)
        Returns:
            - None. Progress and the final status are printed. When the destination
              directory does not exist a notice is printed and nothing is downloaded.
        Raises:
            - ValueError -- if the dataset name is not supported
    """
    if dataset == "mnist":
        urls = ["http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
                "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
                "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
                "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"]

    elif dataset == "fashion_mnist":
        urls = ["http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz",
                "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz",
                "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz",
                "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz"]
    else:
        raise ValueError("Only 'mnist' and 'fashion_mnist' dataset are supported")

    path = to_path #destination path
    if not os.path.exists(path):
        print("Directory not found: It should be handled from the load_dataset(parent) module")
        return

    down_status = "succeeded"
    block_size = 1024 #number of bytes requested from the url per read
    bytes_per_mb = 1024 * 1024

    for url in urls:
        filename = url.split('/')[-1] # getting the filename from the url
        #os.path.join works whether or not the caller supplied a trailing separator
        file_path = os.path.join(path, filename)

        if os.path.exists(file_path):
            print("\n%s: already exists."%filename)
            continue

        print("\n%s: downloading..."%filename)
        completed = False
        try:
            #both the HTTP response and the output file are closed on every exit path
            with urlopen(url) as response, open(file_path, 'wb') as data_file:
                #the server is not obliged to send Content-Length; 0 means "unknown"
                content_length = response.info()["Content-Length"]
                file_size = int(content_length) if content_length else 0
                file_size_mb = file_size / bytes_per_mb

                downloaded_file_size = 0
                while True:
                    buffer = response.read(block_size)
                    if not buffer:
                        break
                    downloaded_file_size += len(buffer)
                    downloaded_file_size_mb = downloaded_file_size / bytes_per_mb
                    data_file.write(buffer)

                    if file_size > 0:
                        #progress bar with one '=' per completed 10%
                        down_percent = downloaded_file_size * 100. / file_size
                        inc = int(down_percent)//10
                        print ("%.3f MB  [%.3f MB done %s>%s %.0f%%]" %( file_size_mb, downloaded_file_size_mb, '=' * inc,'.'*(10-inc), down_percent), end = "\r")
                    else:
                        #total size unknown: only the running amount can be shown
                        print ("[%.3f MB done]" % downloaded_file_size_mb, end = "\r")

            if file_size > 0 and downloaded_file_size != file_size:
                #the stream ended before the announced number of bytes arrived
                print('Download Failed: ', "received %d of %d bytes" % (downloaded_file_size, file_size))
                down_status = "failed"
            else:
                completed = True
        except URLError as e: #HTTPError is a subclass of URLError, so both are handled here
            print('Download Failed: ', e)
            down_status = "failed"
        finally:
            #the file did not exist before this iteration, so anything left behind is a partial download
            if not completed and os.path.exists(file_path):
                os.remove(file_path)

    print("\n\nDataset download %s...\n"%down_status)

In [None]:
# Fetch the four MNIST archives into dataset/mnist/.
# download_dataset does not create the directory: if it is missing, it only
# prints a notice and returns without downloading anything.
download_dataset(dataset = "mnist", to_path = "dataset/mnist/")

In [None]:
# Same as above for Fashion-MNIST; dataset/fashion_mnist/ must already exist.
download_dataset(dataset ="fashion_mnist" , to_path = "dataset/fashion_mnist/")

## Decompressing the gzip Dataset files to the desired binary files

In [3]:
# https://www.tutorialspoint.com/working-with-zip-files-in-python
# https://www.geeksforgeeks.org/os-walk-python/


def is_gzip(filename):
    """
        Tells whether a file name carries the gzip extension.

        Arguments:
            - **filename** -- name of the file (string)
        Returns:
            - True if the text after the last '.' is exactly "gz", otherwise False
    """
    #comparing the last dot-separated component; always yields a real boolean
    return filename.split(".")[-1] == "gz"

def get_files(path,file_type = "all"):
    """
        Collects the names of the files found under a directory.

        The directory tree is walked recursively with os.walk and only the bare
        file names are returned (no directory part), so a file that lives in a
        sub-directory cannot be told apart from one located directly in `path`.

        Arguments:
            - **path** -- directory to scan
            - **file_type** -- "gzip" keeps only names accepted by is_gzip;
                               any other value keeps every file
        Returns:
            - list of file names (strings)
    """
    gzip_only = file_type == "gzip"
    collected = []
    for _root, _dirs, names in os.walk(path):
        #is_gzip is consulted only when the gzip filter was requested
        collected.extend(name for name in names if not gzip_only or is_gzip(name))
    return collected
      

    
def decompress_dataset(path, keep_original = True):
    """
        Decompresses every gzip file reported by get_files for a directory.

        Each archive is written next to the original under the name obtained by
        cutting the file name at its first '.', e.g. "train-images-idx3-ubyte.gz"
        becomes "train-images-idx3-ubyte".

        Arguments:
            - **path** -- directory holding the gzip files (trailing separator optional)
            - **keep_original** -- when False, each archive is deleted after it has
                                   been decompressed successfully
        Returns:
            - None. The outcome is reported through printed messages.
    """
    import zlib  #local import: only needed to recognise corrupt compressed data

    files = get_files(path, file_type = "gzip")

    if len(files) == 0:
        print("No gzip file to decompress.")
        return

    failed = []
    for filename in files:
        source_path = os.path.join(path, filename)
        target_path = os.path.join(path, filename.split(".")[0])
        try:
            #reading the archive BEFORE creating the output file, so that an invalid
            #archive does not leave an empty decompressed file behind
            with unzip.open(source_path, 'rb') as fzip:
                file_data = fzip.read()
        except (OSError, EOFError, zlib.error):
            #gzip reports a bad header as OSError (gzip.BadGzipFile is a subclass),
            #a truncated stream as EOFError and corrupt data as zlib.error
            print('Error: Invalid gzip file encountered.')
            failed.append(filename)
            continue

        with open(target_path, 'wb') as fp:
            fp.write(file_data)
        if keep_original == False:
            os.remove(source_path) #removing the gzip file after decompression

    if failed:
        print("Dataset decompression failed for: " + ", ".join(failed))
        return

    print("Dataset decompression succeeded...")
    if keep_original == False:
        print("Original gzip files removed...")



In [None]:
# Decompress the Fashion-MNIST archives in place; keep_original defaults to True,
# so the .gz files stay next to the decompressed binaries.
decompress_dataset(path = "dataset/fashion_mnist/")

## Retrieving data from binary file

In [4]:
def retrive_dataset(path, filename):
    """
        Reads the training and test images/labels from four binary files into numpy arrays.

        Dataset Obtained From:
            - link -- http://yann.lecun.com/exdb/mnist/

        Dataset retrieval code adapted from:
            - link -- https://www.cs.virginia.edu/~connelly/class/2015/large_scale/proj2/mnist_python

        Arguments:
            - **path** -- directory prefix that is concatenated directly with each
                          file name (so it has to end with a path separator)
            - **filename** -- sequence of four file names in the order: training images,
                              training labels, test images, test labels
        Returns:
            - **train_images** -- uint8 array of shape (60000, 28, 28)
            - **train_labels** -- int8 array of shape (60000, 1)
            - **test_images** -- uint8 array of shape (10000, 28, 28)
            - **test_labels** -- int8 array of shape (10000, 1)
    """
    #resolving all four paths up front
    train_img_path, train_lbl_path, test_img_path, test_lbl_path = [path + filename[k] for k in range(4)]

    def read_split(image_path, label_path):
        #parses one image file together with its label file
        with open(image_path, 'rb') as image_file, open(label_path, 'rb') as label_file:
            #label header: two big-endian unsigned ints, the second one is the number of items
            _, count = struct.unpack(">II", label_file.read(8))
            labels = np.frombuffer(label_file.read(), dtype=np.int8).reshape(count, 1)
            #image header: four big-endian unsigned ints, the last two are rows and columns
            _, _, rows, cols = struct.unpack(">IIII", image_file.read(16))
            #the item count taken from the label header also shapes the images
            images = np.frombuffer(image_file.read(), dtype=np.uint8).reshape(count, rows, cols)
        return images, labels

    train_images, train_labels = read_split(train_img_path, train_lbl_path)
    test_images, test_labels = read_split(test_img_path, test_lbl_path)

    #the function only accepts the full 60000/10000 split of 28x28 images
    assert train_images.shape == (60000, 28, 28)
    assert train_labels.shape == (60000, 1)
    assert test_images.shape == (10000, 28, 28)
    assert test_labels.shape == (10000, 1)

    return train_images, train_labels, test_images, test_labels



In [None]:
# Start timestamp (stored in `toc`; `tic` below holds the end timestamp).
toc = time.time()
path = "dataset/mnist/"
    
# File order expected by retrive_dataset: train images, train labels, test images, test labels.
filename = ["train-images-idx3-ubyte",
            "train-labels-idx1-ubyte",
            "t10k-images-idx3-ubyte",
            "t10k-labels-idx1-ubyte"] 
# Retrieving the data. test_x_temp / test_y_temp are reused later by the dev/test split cell.
train_x_orig, train_y_orig, test_x_temp, test_y_temp = retrive_dataset(path, filename)

tic = time.time()
# Displaying the retrieval info; the elapsed time is converted from seconds to milliseconds.
print("Time to load data from binary file using numpy: " + str(1000*(tic-toc)) + "ms\n")
print("Data\t\t\t","Datatype\t\t","Shape")
print("=================================================================")
print("Training Set Images:\t" + str(type(train_x_orig))+"\t",str(train_x_orig.shape))
print("Training Set Labels:\t" + str(type(train_y_orig))+"\t",str(train_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_temp))+"\t",str(test_x_temp.shape))
print("Test Set Labels:\t" + str(type(test_y_temp))+"\t",str(test_y_temp.shape))
print("=================================================================")


## Sample a portion of the retrieved Dataset

In [5]:
#retriving a small sample of the original dataset for model development and experimentation
def sample_dataset(x,y, size_in_per):
    """
        Returns a random sample (without replacement) of a dataset

        Arguments:
            - **x** -- input data, first axis indexes the examples
            - **y** -- output labels, first axis indexes the examples
            - **size_in_per** -- sample volume in percentage of the dataset, 0 to 100;
                                 100 returns the whole dataset in shuffled order
        Returns:
            - **x_sample** -- input sample holding size_in_per% of the examples of x
            - **y_sample** -- the labels belonging to x_sample, in the same order
        Raises:
            - ValueError -- if size_in_per lies outside the range 0 to 100
    """
    if not 0 <= size_in_per <= 100:
        raise ValueError("size_in_per must be between 0 and 100, got %r" % (size_in_per,))

    m = y.shape[0]
    sample_m = int(m * (size_in_per / 100))

    #drawing a random order and keeping only its first sample_m positions, so that
    #only the sampled rows are copied instead of the whole shuffled dataset
    chosen = np.random.permutation(m)[:sample_m]
    x_sample = x[chosen]
    y_sample = y[chosen]

    #the sample keeps the per-example shape of its source
    assert(x_sample.shape == (sample_m,) + x.shape[1:])
    assert(y_sample.shape == (sample_m,) + y.shape[1:])

    return x_sample, y_sample

## Load the dataset

In [6]:
def load_dataset(dataset, size_in_per = 100):
    """
        Makes sure a dataset is available under dataset/<dataset>/ and loads it.

        The destination directory is created when missing, the archives are
        downloaded when the directory is empty or some of them are missing, and
        they are decompressed when any of the four binary files is absent.

        Arguments:
            - **dataset** -- dataset name, used for the directory and passed to download_dataset
            - **size_in_per** -- percentage of each split to return (see sample_dataset)
        Returns:
            - **train_x_orig**, **train_y_orig** -- shuffled sample of the training split
            - **test_x_orig**, **test_y_orig** -- shuffled sample of the test split
    """
    path = 'dataset/%s/'%(dataset)

    filename = ["train-images-idx3-ubyte",
                "train-labels-idx1-ubyte",
                "t10k-images-idx3-ubyte",
                "t10k-labels-idx1-ubyte"]
    gzip_filename = ["%s.gz" % name for name in filename]

    #creating the destination directory (recursively) if it does not exist yet
    if not os.path.exists(path):
        print("No destination directory exists to load the data from:\nCreating '" + path + "' as a new directory...\n")
        os.makedirs(path)

    #an empty location means nothing has been downloaded so far
    if not get_files(path):
        print ("Downloading the %s dataset..."%dataset)
        download_dataset(dataset, to_path = path)

    #snapshot of the directory content after the optional download
    present = set(get_files(path))

    if not set(filename).issubset(present):
        #some binary file is missing: fetch missing archives first, then decompress
        if not set(gzip_filename).issubset(present):
            print ("Downloading missing gzip files of %s dataset..."%dataset)
            download_dataset(dataset, to_path = path)
        print ("Decompressing the %s dataset..."%dataset)
        decompress_dataset(path,keep_original = True)

    train_x_temp, train_y_temp, test_x_temp, test_y_temp = retrive_dataset(path, filename)

    #size_in_per = 100 means the entire data is shuffled and returned
    train_x_orig, train_y_orig = sample_dataset(train_x_temp,train_y_temp, size_in_per)
    test_x_orig, test_y_orig = sample_dataset(test_x_temp,test_y_temp, size_in_per)

    return train_x_orig, train_y_orig, test_x_orig, test_y_orig
        

In [7]:
# Percentage of each split to load; 100 keeps every example (in shuffled order).
dataset_size_in_per = 100

# Start timestamp (stored in `toc`; `tic` below holds the end timestamp).
toc = time.time()
train_x_orig, train_y_orig, test_x_orig, test_y_orig = load_dataset(dataset = "mnist", size_in_per = dataset_size_in_per)

tic = time.time()
# Displaying the retrieval info. The measured time covers the whole load_dataset call,
# including any download and decompression it had to perform.
print("\nTime to load data from binary file using numpy: %.4f s\n"%(tic-toc))

print("Sample Size : %d%%\n"%(dataset_size_in_per))

print("Data\t\t\t","Datatype\t\t","Dataset Size")
print("=================================================================")
print("Training Set Images:\t" + str(type(train_x_orig))+"\t",str(train_x_orig.shape))
print("Training Set Labels:\t" + str(type(train_y_orig))+"\t",str(train_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_orig))+"\t",str(test_x_orig.shape))
print("Test Set Labels:\t" + str(type(test_y_orig))+"\t",str(test_y_orig.shape))
print("=================================================================")



Time to load data from binary file using numpy: 0.0866 s

Sample Size : 100%

Data			 Datatype		 Dataset Size
Training Set Images:	<class 'numpy.ndarray'>	 (60000, 28, 28)
Training Set Labels:	<class 'numpy.ndarray'>	 (60000, 1)
Test Set Images:	<class 'numpy.ndarray'>	 (10000, 28, 28)
Test Set Labels:	<class 'numpy.ndarray'>	 (10000, 1)


In [None]:
# #for fashion_mnist
# labels = {0:"T-shirt/top",
#           1:"Trouser",
#           2:"Pullover",
#           3:"Dress",
#           4:"Coat",
#           5:"Sandal",
#           6:"Shirt",
#           7:"Sneaker",
#           8:"Bag",
#           9:"Ankle boot"}
# print(labels[0])
# plt.imshow(test_x_orig[2000], cmap = "Greys")
# a = int(test_y_orig[2000])
# print(type(a))
# plt.title(str(a)+" "+labels[a])
# # print(255-test_x_orig[2])

## Dev-Test split

In [None]:
def dev_test_split(test_x,test_y):
    """
        Randomly splits the test set into a dev set and a test set

        The dev set receives m // 2 examples and the test set the remaining
        m - m // 2, so an odd number of examples is handled as well (the test
        set is then one example larger).

        Arguments:
            test_x - test set images, first axis indexes the examples, e.g. (10000,28,28)
            test_y - test set labels, first axis indexes the examples, e.g. (10000,1)

        Returns:
            dev_x  - dev set images, e.g. (5000,28,28)
            dev_y  - dev set labels, e.g. (5000,1)
            test_x - remaining test set images, e.g. (5000,28,28)
            test_y - remaining test set labels, e.g. (5000,1)

        Raises:
            ValueError - if test_x and test_y hold a different number of examples
    """
    m = test_y.shape[0]
    if test_x.shape[0] != m:
        raise ValueError("test_x and test_y must contain the same number of examples")
    n = m // 2

    #shuffling the example indices once and cutting the order into two parts
    randCol = np.random.permutation(m)
    dev_idx, test_idx = randCol[:n], randCol[n:]

    dev_x = test_x[dev_idx]
    dev_y = test_y[dev_idx]

    #new names are used so that the inputs stay untouched until the shape checks below
    split_x = test_x[test_idx]
    split_y = test_y[test_idx]

    assert(dev_x.shape == (n,) + test_x.shape[1:])
    assert(dev_y.shape == (n,) + test_y.shape[1:])
    assert(split_x.shape == (m - n,) + test_x.shape[1:])
    assert(split_y.shape == (m - n,) + test_y.shape[1:])

    return dev_x,dev_y,split_x,split_y

In [None]:
# Split the test data in two halves. The inputs test_x_temp / test_y_temp come from the
# earlier cell that calls retrive_dataset directly, not from the load_dataset cell.
# NOTE: this rebinds test_x_orig / test_y_orig, replacing the values assigned by the load_dataset cell.
dev_x_orig,dev_y_orig,test_x_orig,test_y_orig = dev_test_split(test_x_temp, test_y_temp)

print("Data\t\t\t","Datatype\t\t","Shape")
print("=================================================================")
print("Dev Set Images:\t\t" + str(type(dev_x_orig))+"\t",str(dev_x_orig.shape))
print("Dev Set Labels:\t\t" + str(type(dev_y_orig))+"\t",str(dev_y_orig.shape))
print("Test Set Images:\t" + str(type(test_x_orig))+"\t",str(test_x_orig.shape))
print("Test Set Labels:\t" + str(type(test_y_orig))+"\t",str(test_y_orig.shape))
print("=================================================================")


## Visualizing and Validating Raw datasets

In [None]:
#dataset visualization using charts
def visual_charts(train_y_orig, dev_y_orig, test_y_orig):
    """
        Plots bar graph showing the number of examples in each class

        One chart is drawn per dataset (training, dev, test), stacked vertically.
        The seaborn plot style is activated globally through plt.style.use and
        stays active after the call.

        Arguments:
            train_y_orig - labels of training set
            dev_y_orig - labels of dev set
            test_y_orig - labels of test set
    """
    datasets = {"Training Set":train_y_orig,"Dev Set": dev_y_orig,"Test Set": test_y_orig}

    #setting the plot style: Matplotlib 3.6 renamed the bundled 'seaborn' style to
    #'seaborn-v0_8' and later dropped the old name, so the first available one is used
    style_name = next((name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available), None)
    if style_name is not None:
        plt.style.use(style_name)

    #creating subplots
    fig, axes = plt.subplots(nrows=3, ncols=1,figsize=(10,15))
    fig.subplots_adjust(hspace=.2)

    #plotting the bar graph for each dataset labels
    for ax, (dataset, datalabel) in zip(axes, datasets.items()):
        unique, counts = np.unique(datalabel, return_counts=True)
        ax.bar(unique, counts)
        max_value = np.max(counts)
        #leaving 10% head room above the tallest bar
        ax.set(xticks = unique, ylim = (0,max_value + max_value // 10))
        ax.set_title("Number of Examples in " + dataset , fontsize = 16)
        ax.set_xlabel("Classes", fontsize = 12)
        ax.set_ylabel("Number of Examples", fontsize = 12)

    plt.show()

In [None]:
# Class distribution of the training, dev and test labels.
visual_charts(train_y_orig, dev_y_orig, test_y_orig)

In [None]:
# NOTE(review): train_y_sample, dev_y_sample and test_y_sample are not assigned anywhere in
# the visible notebook, so this cell raises NameError on a fresh "Restart & Run All" unless
# they are defined elsewhere - confirm, or define them (e.g. via sample_dataset) or drop the cell.
visual_charts(train_y_sample, dev_y_sample, test_y_sample)

In [None]:
def visualize_dataset(x_orig, y_orig, dataset = "training"):
    """
        Plots up to 10 sample images from the dataset with labels

        The images are taken from a fixed window of consecutive indices that depends
        on the dataset type. When the dataset is too small to contain that window
        (for instance after sampling a small percentage), the window is moved to
        the last 10 examples instead of failing with an IndexError.

        Arguments:
            x_orig - 3D array representation of input images
            y_orig - 2D array of labels, one row per image
            dataset - type of dataset, can be training, dev or test

        Raises:
            ValueError - if dataset is not one of the three supported types
    """
    #recovering matplotlib defaults
    plt.rcParams.update(plt.rcParamsDefault)

    #checking dataset type: title and first index of the preferred window
    if(dataset == "training"):
        visual_title, first = "Training Data Set", 1040
    elif(dataset == "dev"):
        visual_title, first = "Dev Data Set", 100
    elif(dataset == "test"):
        visual_title, first = "Test Data Set", 540
    else:
        raise ValueError("Dataset set must be training or dev or test set")

    #clamping the window so that it always lies inside the available examples
    num_images = 10
    available = len(x_orig)
    first = max(0, min(first, available - num_images))
    rng = range(first, min(first + num_images, available))

    #creating subplots
    fig, axes = plt.subplots(nrows=2, ncols=5,figsize=(16,8))
    fig.subplots_adjust(hspace=.1)
    fig.suptitle(visual_title)

    #plotting the sample images along with their labels
    #(zip stops early when fewer than 10 examples exist, leaving the spare axes empty)
    for ax,i in zip(axes.flatten(),rng):
        ax.imshow(x_orig[i].squeeze(),interpolation='nearest')
        ax.set(title = "Label: "+ str(y_orig[i,0]))

In [None]:
# Ten sample images from the training set with their labels.
visualize_dataset(train_x_orig, train_y_orig, dataset = "training")

In [None]:
# Ten sample images from the dev set with their labels.
visualize_dataset(dev_x_orig, dev_y_orig, dataset = "dev")

In [None]:
# Ten sample images from the test set with their labels.
visualize_dataset(test_x_orig, test_y_orig, dataset="test")

# Preparing Dataset

## Flattening the images

In [None]:
def flatten_input(train_x_orig,dev_x_orig,test_x_orig):
    """
        Flattens every image of the three datasets into a column vector

        The three datasets may hold different numbers of examples; they only have
        to agree on the number of pixels per image.

        Arguments:
            train_x_orig - training set images, e.g. of size (60000,28,28)
            dev_x_orig   - dev set images, e.g. of size (5000,28,28)
            test_x_orig  - test set images, e.g. of size (5000,28,28)

        Returns:
            train_x_flatten - flattened training set input data, e.g. of size (784,60000)
            dev_x_flatten   - flattened dev set input data, e.g. of size (784,5000)
            test_x_flatten  - flattened test set input data, e.g. of size (784,5000)
    """
    flattened = []
    for images in (train_x_orig, dev_x_orig, test_x_orig):
        num_examples = images.shape[0]
        #the "-1" makes reshape flatten the remaining dimensions; the transpose
        #turns each example into a column
        flat = images.reshape(num_examples, -1).T
        #every dataset is checked against its OWN number of examples
        assert(flat.shape[1] == num_examples)
        flattened.append(flat)

    train_x_flatten, dev_x_flatten, test_x_flatten = flattened

    #all datasets must describe images with the same number of pixels
    assert(dev_x_flatten.shape[0] == train_x_flatten.shape[0])
    assert(test_x_flatten.shape[0] == train_x_flatten.shape[0])

    return train_x_flatten, dev_x_flatten, test_x_flatten

In [None]:
# Flatten each image into a column; the printed shapes are (pixels, examples).
train_x_flatten,dev_x_flatten,test_x_flatten = flatten_input(train_x_orig,dev_x_orig,test_x_orig)

print("Data\t\t\t","Shape")
print("=====================================")
print ("Input Training set:\t" + str(train_x_flatten.shape))
print ("Input Dev set:\t\t" + str(dev_x_flatten.shape))
print ("Input Test set:\t\t" + str(test_x_flatten.shape))
print("=====================================")

## Normalizing the images

In [None]:
def normalize_input(train_x_flatten,dev_x_flatten,test_x_flatten ):
    """
        Normalizes the pixel values of the flattened images by dividing them by 255

        Pixel values in the range 0-255 are therefore mapped to the range 0-1.
        The three datasets may hold different numbers of examples.

        Arguments:
            train_x_flatten - flattened training set input data, e.g. of size (784,60000)
            dev_x_flatten   - flattened dev set input data, e.g. of size (784,5000)
            test_x_flatten  - flattened test set input data, e.g. of size (784,5000)
        Returns:
            train_x_norm - normalized training set input data
            dev_x_norm   - normalized dev set input data
            test_x_norm  - normalized test set input data
    """
    # Dividing by the float 255. yields floating point results even for integer pixels.
    train_x_norm = np.divide(train_x_flatten,255.)
    dev_x_norm = np.divide(dev_x_flatten,255.)
    test_x_norm = np.divide(test_x_flatten,255.)

    # Normalization is element-wise: every output keeps the shape of its own input.
    assert(train_x_norm.shape == train_x_flatten.shape)
    assert(dev_x_norm.shape == dev_x_flatten.shape)
    assert(test_x_norm.shape == test_x_flatten.shape)

    return train_x_norm, dev_x_norm, test_x_norm

In [None]:
# Scale the flattened pixel values by 1/255; the shapes stay (pixels, examples).
train_x_norm, dev_x_norm, test_x_norm = normalize_input(train_x_flatten,dev_x_flatten,test_x_flatten)

print("Data\t\t\t","Shape")
print("=====================================")
print ("Input Training set:\t" + str(train_x_norm.shape))
print ("Input Dev set:\t\t" + str(dev_x_norm.shape))
print ("Input Test set:\t\t" + str(test_x_norm.shape))
print("=====================================")

## Encoding the labels

In [None]:
def one_hot_encoding(y_orig,num_classes = 10):
    """
        Transform the output labels into the one-hot encoding representation

        Arguments:
            y_orig - integer class labels; any shape is accepted (for example (1,m),
                     (m,1) or (m,)) because the labels are flattened before encoding
            num_classes - number of the classes based on which the transformation is to be made
        Returns:
            y_encoded - float ndarray of shape (num_classes, m); column j holds a single 1.0
                        in the row given by the j-th label and 0.0 everywhere else
        Raises:
            ValueError - if a label lies outside the range 0 to num_classes - 1
    """
    labels = np.asarray(y_orig).reshape(-1)

    #a negative label would otherwise be accepted silently, because negative indices
    #count from the end of the identity matrix and select a wrong class
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError("labels must lie in the range [0, %d)" % num_classes)

    #encoding the labels: row k of the identity matrix is the one-hot vector of class k
    y_encoded = np.eye(num_classes)[labels].T

    assert(y_encoded.shape == (num_classes, labels.size))
    return y_encoded

In [None]:
# Start timestamp (stored in `toc`; `tic` below holds the end timestamp).
toc = time.time()
# Encoding the labels of the training, dev and test sets.
# The (m,1) label arrays are transposed to (1,m) before being encoded.
train_y_encoded = one_hot_encoding(train_y_orig.T)
dev_y_encoded = one_hot_encoding(dev_y_orig.T)
test_y_encoded = one_hot_encoding(test_y_orig.T)
tic = time.time()
# Elapsed time converted from seconds to milliseconds.
print("Time to encode: " + str(1000*(tic-toc)) + " ms\n")

print("Data\t\t\t","Shape")
print("===================================")
print ("Output Training set:\t" + str(train_y_encoded.shape))
print ("Output Dev set:\t\t" + str(dev_y_encoded.shape))
print ("Output Test set:\t" + str(test_y_encoded.shape))
print("===================================")

## Prep Dataset

In [None]:
def prep_dataset(train_x_orig, train_y_orig, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig):
    """
        Runs the whole preparation pipeline: flatten, normalize, one-hot encode

        Arguments:
            train_x_orig - training set images of size (60000,28,28)
            train_y_orig - training set labels of size (60000,1)
            dev_x_orig   - dev set images of size (5000,28,28)
            dev_y_orig   - dev set labels of size (5000,1)
            test_x_orig  - test set images of size (5000,28,28)
            test_y_orig  - test set labels of size (5000,1)
        Returns (in this order):
            train_x_norm    - flattened and normalized training set input data
            train_y_encoded - encoded label of training set
            dev_x_norm      - flattened and normalized dev set input data
            dev_y_encoded   - encoded label of dev set
            test_x_norm     - flattened and normalized test set input data
            test_y_encoded  - encoded label of test set
    """
    #images: flatten first, then feed the three flattened arrays to the normalizer
    flattened = flatten_input(train_x_orig,dev_x_orig,test_x_orig)
    train_x_norm, dev_x_norm, test_x_norm = normalize_input(*flattened)

    #labels: each (m,1) array is transposed to (1,m) before being encoded
    train_y_encoded, dev_y_encoded, test_y_encoded = [
        one_hot_encoding(labels.T) for labels in (train_y_orig, dev_y_orig, test_y_orig)
    ]

    return train_x_norm,train_y_encoded, dev_x_norm,dev_y_encoded, test_x_norm, test_y_encoded

In [None]:
# Run the full preparation pipeline and compare the shapes before and after processing.
train_x_norm,train_y_encoded, dev_x_norm,dev_y_encoded, test_x_norm, test_y_encoded = prep_dataset(train_x_orig, train_y_orig, dev_x_orig, dev_y_orig, test_x_orig, test_y_orig)
print("Data\t\t\t","Before Processing\t","After Processing")
print("=================================================================")
print("Training Set Images:\t" + str(train_x_orig.shape)+"\t\t"+ str(train_x_norm.shape))
print("Training Set Labels:\t" + str(train_y_orig.shape)+"\t\t"+ str(train_y_encoded.shape))
print("Dev Set Images:\t\t" + str(dev_x_orig.shape)+"\t\t"+ str(dev_x_norm.shape))
print("Dev Set Labels:\t\t" + str(dev_y_orig.shape)+"\t\t"+ str(dev_y_encoded.shape))
print("Test Set Images:\t" + str(test_x_orig.shape)+"\t\t"+ str(test_x_norm.shape))
print("Test Set Labels:\t" + str(test_y_orig.shape)+"\t\t"+ str(test_y_encoded.shape))
print("=================================================================")