# Neural Network for number recognition
How do we know if a neural network is working properly?

# Parameters

In [1]:
# Training hyperparameters.
BATCH_SIZE = 128  # batch size handed to get_training_dataset()
EPOCHS = 10  # not used in this section; presumably the number of training epochs

# Dataset files read from Google Cloud Storage (IDX-style file names).
# NOTE(review): the bucket is spelled 'mist-public' here; this may be a typo
# for 'mnist-public' -- confirm the bucket name before running.
training_images_file = 'gs://mist-public/train-images-idx3-ubyte'
training_labels_file = 'gs://mist-public/train-labels-idx1-ubyte'
validation_images_file = 'gs://mist-public/t10k-images-idx3-ubyte'
# Fixed: this previously pointed at 't10k-images-idx1-ubyte', which is not a
# label file name; the validation labels live in 't10k-labels-idx1-ubyte'.
validation_labels_file = 'gs://mist-public/t10k-labels-idx1-ubyte'

In [2]:
# Imports 
import os, re, math, json, shutil, pprint
import PIL.Image, PIL.ImageFont, PIL.ImageDraw
import IPython.display as display 
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
# Show which TensorFlow release this notebook is running against.
print(f'Tensorflow version {tf.__version__}')

Tensorflow version 2.4.1


# tf.data.Dataset: parse files and prepare training and validation datasets
Read https://www.tensorflow.org/guide/data : Best practices for building input pipelines with tf.data.Dataset
# Summary
The data is loaded into tensor structures: every array read from the files becomes a tensor.

In [15]:
# Autotune sentinel; passed to prefetch() below so tf.data picks the buffer size.
AUTO = tf.data.experimental.AUTOTUNE

def read_label(tf_bytestring):
    """Decode one raw label record into a one-hot vector.

    Args:
        tf_bytestring: byte-string tensor holding a single label record.

    Returns:
        A one-hot tensor of depth 10 for the decoded label.
    """
    raw = tf.io.decode_raw(tf_bytestring, tf.uint8)
    # Collapse the decoded bytes to a scalar so one_hot yields a 1-D vector.
    scalar = tf.reshape(raw, [])
    return tf.one_hot(scalar, 10)
def read_image(tf_bytestring):
    """Decode one raw image record into a flat float32 vector.

    The record bytes are read as uint8, cast to float32 and divided by 256.0
    (so values land in [0, 1)), then reshaped to a 1-D tensor of 28*28
    elements.

    Args:
        tf_bytestring: byte-string tensor holding a single image record.

    Returns:
        A float32 tensor of shape [28*28].
    """
    pixels = tf.io.decode_raw(tf_bytestring, tf.uint8)
    scaled = tf.cast(pixels, tf.float32) / 256.0
    return tf.reshape(scaled, [28 * 28])

def load_dataset(image_file, label_file):
    """Build a dataset of (image, label) pairs from two fixed-record files.

    Args:
        image_file: path of the image file; read as 28*28-byte records after
            a 16-byte header.
        label_file: path of the label file; read as 1-byte records after an
            8-byte header.

    Returns:
        A tf.data.Dataset that zips the decoded images with the decoded
        labels.
    """
    images = tf.data.FixedLengthRecordDataset(
        image_file, 28 * 28, header_bytes=16
    ).map(read_image, num_parallel_calls=16)
    labels = tf.data.FixedLengthRecordDataset(
        label_file, 1, header_bytes=8
    ).map(read_label, num_parallel_calls=16)
    return tf.data.Dataset.zip((images, labels))

#####--------------- Functions for Loading training and validation dataset------------############
def get_training_dataset(image_file, label_file, batch_size):
    """Build the training input pipeline.

    Pipeline stages, in order: cache, shuffle, repeat, batch, prefetch.

    Args:
        image_file: path of the training image file.
        label_file: path of the training label file.
        batch_size: number of examples per batch.

    Returns:
        An endlessly repeating tf.data.Dataset of fixed-size batches.
    """
    pipeline = (
        load_dataset(image_file, label_file)
        # The dataset is small enough to be cached entirely in RAM; on TPU
        # this matters for performance.
        .cache()
        .shuffle(5000, reshuffle_each_iteration=True)
        # Repeat indefinitely (the original note says Keras requires this).
        .repeat()
        # drop_remainder keeps the batch size fixed, which is important on TPU.
        .batch(batch_size, drop_remainder=True)
        # Fetch the next batches while training on the current one; the
        # buffer size is autotuned.
        .prefetch(AUTO)
    )
    return pipeline

def get_validation_dataset(image_file, label_file):
    """Build the validation input pipeline.

    Pipeline stages, in order: cache, batch (10000 per batch), repeat.

    Args:
        image_file: path of the validation image file.
        label_file: path of the validation label file.

    Returns:
        An endlessly repeating tf.data.Dataset of 10000-example batches.
    """
    pipeline = (
        load_dataset(image_file, label_file)
        # The dataset is small enough to be cached entirely in RAM; on TPU
        # this matters for performance.
        .cache()
        # The original note says the eval set has 10000 items, so this puts
        # all of them in one batch.
        .batch(10000, drop_remainder=True)
        # Repeat indefinitely (the original note says Keras requires this).
        .repeat()
    )
    return pipeline

# Instantiate the training and validation datasets.
training_dataset = get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)
validation_dataset = get_validation_dataset(validation_images_file, validation_labels_file)


# For TPU, we need zero-argument functions that return the dataset.
def training_input_fn():
    """Build and return a new training dataset from the configured files."""
    return get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)


def validation_input_fn():
    """Build and return a new validation dataset from the configured files."""
    return get_validation_dataset(validation_images_file, validation_labels_file)