In [0]:
# Imports
import numpy as np
import pickle

# For functions pertaining to the operating system
import os

# Module for working with URLs
import urllib

# For performing operations on zip files
import zipfile

# Makes it possible to read and write tar archives including gzip, bz2 and lzma compression
import tarfile

# Module for working with URLs
from six.moves import urllib

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Deep Learning
import tensorflow as tf
from tensorflow.contrib.training import HParams
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Conv2D, Activation, Flatten
from tensorflow.keras.layers import BatchNormalization, AveragePooling2D, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Visualization
import itertools
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

In [0]:
# Hyperparameters and filesystem locations shared by the rest of the notebook
params = HParams(**{

    # Number of target classes
    # NOTE(review): data_url elsewhere in this notebook points at the CIFAR-100
    # archive - confirm that 10 classes is really intended
    'n_class': 10,

    # Step size applied to each weight update
    'learning_rate': 1e-4,

    # Number of data points per forward/backward pass while training
    'train_batch_size': 32,

    # Number of data points per pass while validating
    'val_batch_size': 32,

    # Number of data points per pass while testing
    'test_batch_size': 32,

    # Number of full passes over the training data
    'n_epochs': 10,

    # Name given to the network's input layer
    'input_name': 'input_one',

    # Directory that holds the downloaded dataset
    'data_dir': '/tmp/data/',

    # Directory in which project checkpoints are saved
    'checkpoint_dir': '/tmp/checkpoints',

})

In [0]:
# Remote location of the Cifar 100 (python version) archive
data_url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"

# Images are square: n_pixels x n_pixels
n_pixels = 32

# Color channels per image
n_channels = 3

# Number of values in one flattened image (height * width * channels)
size_flat = n_pixels * n_pixels * n_channels

# Number of classes considered, taken from the hyperparameters
n_classes = params.n_class

# Number of training files assumed to make up the dataset
# NOTE(review): 5 files of 10000 images matches the Cifar 10 layout; the
# Cifar 100 archive is believed to ship one `train` file - confirm
_n_files_train = 5

# Number of images assumed to be stored in each training file
_n_images_per_file = 10000

# Total number of training images
_n_images_train = _n_images_per_file * _n_files_train

In [0]:
def download(source_url, filename, destination):

    """
    Description - Downloads the remote file `source_url + filename` into the
                  local directory `destination`, unless that file is already
                  present there.

    Arguments   - source_url:  base URL; `filename` is appended to it verbatim,
                               so it normally has to end with a '/'.
                  filename:    name of the remote file, also used locally.
                  destination: local directory that receives the file; it is
                               created (including parents) when missing.

    Returns - N/A
    """

    # Create the final file path
    destination_path = os.path.join(destination, filename)

    # Nothing to do when the file has already been downloaded. Only a regular
    # file counts: a directory sitting at this path is not a finished download.
    if os.path.isfile(destination_path):
        return

    # Create the *directory* that will hold the file (not a directory named
    # after the file itself, which would block the download below)
    if destination:
        os.makedirs(destination, exist_ok=True)

    # Print a status message
    print(filename, "currently downloading...")

    # Create URL by combining the base URL with the filename
    url = source_url + filename

    # Download the file using urllib, whether or not the directory
    # already existed
    urllib.request.urlretrieve(url=url, filename=destination_path)

In [0]:
def download_and_extract(url=data_url, destination=params.data_dir):
    """
    Description - Downloads the archive at `url` into `destination` and
                  extracts it there. When the archive file is already present
                  locally, nothing is downloaded or extracted.

    Arguments   - url:         address of a .zip, .tar.gz or .tgz archive.
                  destination: local directory that receives the archive and
                               its extracted contents; created when missing.

    Returns - N/A
    """

    # The archive keeps the name it has in the URL (text after the last '/')
    filename = url.split('/')[-1]

    # Create the final file path
    destination_path = os.path.join(destination, filename)

    # Only a regular file counts as "already downloaded"; a directory left at
    # this path must not be mistaken for the archive
    if os.path.isfile(destination_path):

        # Print a status message
        print("Data has apparently already been saved locally and extracted.")
        return

    # Create the *directory* that will hold the archive (creating a directory
    # at destination_path itself would make the download below impossible)
    if destination:
        os.makedirs(destination, exist_ok=True)

    # Download the dataset using urllib
    destination_path, _ = urllib.request.urlretrieve(url=url, filename=destination_path)

    # Print a status message
    print()
    print("Step (1/2) - Files have been downloaded.")

    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.

    # If we are dealing with a zip file,
    if destination_path.endswith(".zip"):

        # Extract the zipfile into the destination directory
        with zipfile.ZipFile(file=destination_path, mode="r") as archive:
            archive.extractall(destination)

    # If we are dealing with a tar file, extract with tarfile
    elif destination_path.endswith((".tar.gz", ".tgz")):

        # Extract the tarfile into the destination directory
        with tarfile.open(name=destination_path, mode="r:gz") as archive:
            archive.extractall(destination)

    # Any other file type is left as downloaded; there is nothing to extract
    else:
        return

    # Print a status message
    print("Step (2/2) - Files have been extracted.")

In [0]:
def _get_file_path(filename=""):
    """
    Description - Finds and returns the data path location.
    """

    # Use the join method to create the path
    return os.path.join(params.data_dir, "cifar-100-python/", filename)

In [0]:
def _unpickle(filename):
    """
    Description - Unpickle (de-serialize) the given file pieces and return the 
                  aggregated data chunk.
    """

    # Get the file path using the function defined above
    file_path = _get_file_path(filename)

    # Print a status message
    print("Currently loading data from: " + file_path)

    # Open the file located at file_path
    with open(file_path, mode='rb') as file:

        # Load the data into a new variable using Pickle's load method
        data = pickle.load(file, encoding='bytes')

    return data

In [0]:
def _convert_images(raw_data):
    """
    Description - Preprocesses raw image data and convert to a 4-dimensional 
    array: [image_number, height, width, channel]
   
    Returns - The preprocessed and scaled image data.
    """

    # Scale the pixel data
    scaled_data = np.array(raw_data, dtype=float) / 255.0

    # Change the shape of the array to 4-D
    images = scaled_data.reshape([-1, n_channels, img_size, img_size])

    # Reindex the array
    images = images.transpose([0, 2, 3, 1])

    return images

In [0]:
def _load_data(filename):
    """
    Description - Unpickles (de-serializes) the input file and converts the data
                  to the data shape specified in the _convert_data function.
    
    Returns - The converted data and the class label for each image.
    """

    # Unpickle the data
    data = _unpickle(filename)

    # Retrieve the raw image pixel data
    raw_images = data[b'data']

    # Arrange the class labels into a numpy array
    classes = np.array(data[b'labels'])

    # Convert the image pixel size/orientation
    images = _convert_images(raw_images)

    return images, classes

In [0]:
def load_class_names():
    """
    Description - Loads the human-readable class names from the dataset's
                  metadata file.

    Returns - A list of class names as strings.

    Raises  - KeyError when the metadata holds neither a b'label_names' nor a
              b'fine_label_names' entry.
    """

    # The Cifar 100 archive keeps its metadata in a file called "meta"
    # ("batches.meta" is the Cifar 10 name and is absent from this folder)
    meta = _unpickle(filename="meta")

    # b'label_names' is kept for compatibility; Cifar 100 stores the names of
    # its fine-grained classes under b'fine_label_names'
    if b'label_names' in meta:
        raw_classes = meta[b'label_names']
    elif b'fine_label_names' in meta:
        raw_classes = meta[b'fine_label_names']
    else:
        raise KeyError("no b'label_names' or b'fine_label_names' entry in metadata")

    # Convert from bytes to a list of strings
    names = [x.decode('utf-8') for x in raw_classes]

    return names

In [0]:
def load_training_data(filenames=("train",)):
    """
    Description - Builds numpy arrays containing the image data and the class
                  labels of the training set.

    Arguments   - filenames: names of the training files inside the dataset
                  folder, loaded in order and stacked. The default is the
                  single "train" file of the Cifar 100 archive; a dataset
                  split over several files can pass all of their names.

    Returns - The images and class labels for each training image data point
              in the set.

    Raises  - ValueError when no file name is given.
    """

    # A bare string would otherwise be iterated character by character
    if isinstance(filenames, str):
        filenames = [filenames]

    filenames = list(filenames)
    if not filenames:
        raise ValueError("at least one training file name is required")

    # Load every file exactly once, keeping the order given
    batches = [_load_data(filename=name) for name in filenames]

    # Stack the per-file arrays along the image axis. Sizes come from the
    # files themselves, so nothing can overflow a pre-allocated buffer.
    images = np.concatenate([batch_images for batch_images, _ in batches], axis=0)
    classes = np.concatenate([batch_classes for _, batch_classes in batches], axis=0)

    return images, classes



In [0]:
def load_validation_data():
    """
    Description - Loads the test file of the Cifar 100 dataset and keeps
                  everything after its first 5000 data points to be used for
                  validation.

    Returns - The images and class labels for each validation image data point
              in the set.
    """

    # Load the images and class labels from the test file. In the Cifar 100
    # archive that file is called "test" ("test_batch" is the Cifar 10 name).
    images, classes = _load_data(filename="test")

    # Define the validation data to be all but the first 5000 data points
    images = images[5000:, :, :, :]
    classes = classes[5000:]

    return images, classes

In [0]:
def load_testing_data():
    """
    Description - Loads the test file of the Cifar 100 dataset and keeps its
                  first 5000 data points to be used for testing.

    Returns - The images and class labels for each testing image data point
              in the set.
    """

    # Load the unpickled and converted data. In the Cifar 100 archive the
    # file is called "test" ("test_batch" is the Cifar 10 name).
    images, classes = _load_data(filename="test")

    # Define the testing data to be the first 5000 data points
    images = images[:5000, :, :, :]
    classes = classes[:5000]

    return images, classes

In [70]:
# Get the Cifar 100 data: download the archive into params.data_dir and
# extract it there (uses the function's default url and destination)
download_and_extract()

Data has apparently already been saved locally and extracted.


In [71]:
# Load the training images (X_train) and their class labels (y_train)
X_train, y_train = load_training_data()

Currently loading data from: /tmp/data/cifar-100-python/train


FileNotFoundError: ignored

In [0]:
# Scratch cell: the steps of the training-data loading, run one at a time

# Zero-filled float array with room for every training image,
# laid out as [image, height, width, channel]
images = np.zeros((_n_images_train, n_pixels, n_pixels, n_channels), dtype=float)

In [0]:
# Zero-filled integer array holding one class label per training image
classes = np.zeros((_n_images_train,), dtype=int)

In [0]:
# Index that is specific to the current batch
# (presumably the row offset at which the next loaded batch is written - confirm)
start = 0

In [0]:
# Get the path of the "train" file inside the extracted
# "cifar-100-python/" folder under params.data_dir
file_path = os.path.join(params.data_dir, "cifar-100-python/", "train")

In [67]:
# Print a status message showing which file is about to be read
print("Currently loading data from: " + file_path)

Currently loading data from: /tmp/data/cifar-100-python/train


In [68]:
# Open the file located at file_path in binary mode; this raises
# FileNotFoundError when the archive has not been downloaded and extracted
with open(file_path, mode='rb') as file:

     # Load the data into a new variable using Pickle's load method
     # (encoding='bytes' leaves the stored strings as bytes objects)
     # NOTE(review): pickle.load can execute arbitrary code - trusted files only
     data = pickle.load(file, encoding='bytes')

FileNotFoundError: ignored