In [None]:
## Data preparation
# Preparing datasets for further use
# Preprocessing loaded MNIST datasets for further use in classifier
# Saving datasets into file


"""Importing library for object serialization
which we'll use for saving and loading serialized models"""
import pickle

# Importing other standard libraries
import gzip
import numpy as np
import matplotlib.pyplot as plt


# Defining function for loading MNIST images
def load_data(file, number_of_images):
    """Load the first 'number_of_images' images from a gzipped MNIST image file.

    Parameters:
        file: path to the gzip-compressed image file.
        number_of_images: how many 28x28 single-channel images to read.

    Returns:
        numpy.ndarray of shape (number_of_images, 28, 28, 1) and dtype
        'float64', holding the raw pixel values (0..255) as floats.

    Raises:
        ValueError: if the file holds fewer pixel bytes than requested.
    """
    # Every image is 28 x 28 pixels with one byte per pixel
    expected_bytes = number_of_images * 28 * 28

    # Opening file for reading in binary mode
    with gzip.open(file) as bytestream:
        # Skipping the 16 header bytes that precede the pixel data
        bytestream.read(16)
        # Reading data
        buf = bytestream.read(expected_bytes)

    # Failing with a clear message instead of an obscure reshape error
    if len(buf) != expected_bytes:
        raise ValueError(
            'Expected {} bytes for {} images in {!r}, but got {}'.format(
                expected_bytes, number_of_images, file, len(buf)))

    # Making for every image a separate one-channel matrix (28, 28, 1)
    # Converting to float in one step, as pre-processing needs float arithmetic
    # 'astype' returns a new writable array, so no zero-filled buffer is needed
    array_of_image = np.frombuffer(buf, dtype=np.uint8) \
        .reshape(number_of_images, 28, 28, 1) \
        .astype(np.float64)

    # Returning array of loaded images from file
    return array_of_image


# Defining function for loading MNIST labels
def load_labels(file, number_of_labels):
    """Load the first 'number_of_labels' labels from a gzipped MNIST label file.

    Parameters:
        file: path to the gzip-compressed label file.
        number_of_labels: how many one-byte labels to read.

    Returns:
        numpy.ndarray of shape (number_of_labels,) and dtype 'int64'.

    Raises:
        ValueError: if the file holds fewer labels than requested.
    """
    # Opening file for reading in binary mode
    with gzip.open(file) as bytestream:
        # Skipping the 8 header bytes that precede the label data
        bytestream.read(8)
        # Reading data, one byte per label
        buf = bytestream.read(number_of_labels)

    # A short read would otherwise silently return fewer labels than images
    if len(buf) != number_of_labels:
        raise ValueError(
            'Expected {} labels in {!r}, but got {}'.format(
                number_of_labels, file, len(buf)))

    # Placing data in numpy array and converting it into 'int64' type
    labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)

    # Returning array of loaded labels from file
    return labels


# Preparing function for preprocessing MNIST datasets for further use in classifier
def pre_process_mnist(x_train, y_train, x_test, y_test,
                      number_of_training=59000, number_of_validation=1000):
    """Normalize image data and split it into training, validation and test sets.

    Images are scaled by 1/255, then centered with the mean image and divided
    by the per-pixel standard deviation, both computed from the training split
    only. The input arrays are not modified.

    Side effect: writes the dictionary {'mean_image': ..., 'std': ...} into
    'mean_and_std.pickle' in the current working directory, so the same
    statistics can be applied to images at classification time.

    Parameters:
        x_train: array of images, channels last, e.g. (N, 28, 28, 1).
        y_train: array of N labels matching 'x_train'.
        x_test: array of test images with the same per-image shape.
        y_test: array of test labels; returned as is.
        number_of_training: the first 'number_of_training' samples are used
            for training (default 59000).
        number_of_validation: the following 'number_of_validation' samples
            are used for validation (default 1000).

    Returns:
        dict with keys 'x_train', 'y_train', 'x_validation', 'y_validation',
        'x_test', 'y_test'; image arrays are transposed to channels first.

    Raises:
        ValueError: if the requested split sizes are not sensible.
        IndexError: if 'x_train' or 'y_train' has fewer samples than
            number_of_training + number_of_validation.
    """
    if number_of_training <= 0 or number_of_validation < 0:
        raise ValueError(
            'number_of_training must be positive and '
            'number_of_validation must not be negative')

    split_end = number_of_training + number_of_validation
    if len(x_train) < split_end or len(y_train) < split_end:
        raise IndexError(
            'Training data has too few samples for a split of {} + {}'.format(
                number_of_training, number_of_validation))

    # Normalizing whole data by dividing /255.0
    # Dividing into new arrays, so the arrays of the caller stay untouched
    x_scaled = x_train[:split_end] / 255.0
    x_test = x_test / 255.0

    # Preparing data for training and validation
    # Slices are non-overlapping views into 'x_scaled', no further copy is made
    x_validation = x_scaled[number_of_training:split_end]
    y_validation = y_train[number_of_training:split_end].copy()
    x_train = x_scaled[:number_of_training]
    y_train = y_train[:number_of_training].copy()

    # Normalizing data by subtracting mean image and dividing by standard deviation
    # Subtracting the mean image centers the data
    # Both statistics come from the training split only, along 'axis=0'
    mean_image = np.mean(x_train, axis=0)
    std = np.std(x_train, axis=0)

    # Pixels that never change have zero deviation, replacing it with 1
    # In order to avoid dividing by 0
    std[std == 0] = 1.0

    # Saving calculated 'mean_image' and 'std' into 'pickle' file
    # They are needed to pre-process input images for classifying the same way
    dictionary = {'mean_image': mean_image, 'std': std}
    with open('mean_and_std.pickle', 'wb') as f_mean_std:
        pickle.dump(dictionary, f_mean_std)

    # Subtracting mean image and dividing by standard deviation
    # In-place is safe here, as these arrays are owned by this function
    for images in (x_train, x_validation, x_test):
        images -= mean_image
        images /= std

    # Transposing every dataset to make channels come first
    # (N, height, width, channels) -> (N, channels, height, width)
    x_train = x_train.transpose(0, 3, 1, 2)
    x_validation = x_validation.transpose(0, 3, 1, 2)
    x_test = x_test.transpose(0, 3, 1, 2)

    # Returning result as dictionary
    d_processed = {'x_train': x_train, 'y_train': y_train,
                   'x_validation': x_validation, 'y_validation': y_validation,
                   'x_test': x_test, 'y_test': y_test}

    return d_processed


# Reading raw images and labels from the gzipped dataset files
# 60000 samples are requested for training and 1000 samples for testing
x_train = load_data('datasets/train-images-idx3-ubyte.gz', 60000)
y_train = load_labels('datasets/train-labels-idx1-ubyte.gz', 60000)
x_test = load_data('datasets/t10k-images-idx3-ubyte.gz', 1000)
y_test = load_labels('datasets/t10k-labels-idx1-ubyte.gz', 1000)

# Normalizing and splitting the data
# This call also writes 'mean_and_std.pickle'
data = pre_process_mnist(x_train, y_train, x_test, y_test)

# Showing name and shape of every prepared array
for i, j in data.items():
    print(f'{i}:', j.shape)

# Expected output:
# x_train: (59000, 1, 28, 28)
# y_train: (59000,)
# x_validation: (1000, 1, 28, 28)
# y_validation: (1000,)
# x_test: (1000, 1, 28, 28)
# y_test: (1000,)

# Serializing dictionary with prepared datasets into 'pickle' file
with open('data0.pickle', 'wb') as f:
    pickle.dump(data, f)
