In [1]:
import tensorflow as tf
# Show which TensorFlow version this notebook is running against.
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.8.0


In [2]:
import numpy as np
import tensorflow as tf
# TensorFlow Datasets (tfds) is the data provider for MNIST, so we import it to load the MNIST dataset.
import tensorflow_datasets as tfds
# tensorflow_datasets offers a large number of datasets that are ready for modelling.


In [3]:
# Load MNIST through TensorFlow Datasets.
#   with_info=True     -> also return an info object (version, features, split sizes)
#   as_supervised=True -> every element is an (input, target) 2-tuple;
#                         as_supervised=False would return dictionaries instead
mnist_dataset, mnist_info = tfds.load(
    name='mnist',
    with_info=True,
    as_supervised=True,
)

# The dataset comes with a 'train' and a 'test' split only, so a validation set
# has to be carved out of the training split by us (10% of it).
mnist_train = mnist_dataset['train']
mnist_test = mnist_dataset['test']

# Size of the validation set: 10% of the training examples.
# Multiplying by 0.1 yields a float, so tf.cast turns it into an integer tensor
# (a float count may cause an error further along).
num_validation_samples = tf.cast(
    0.1 * mnist_info.splits['train'].num_examples,
    tf.int64,
)

# Keep the size of the test split in a dedicated variable, also as an int64 tensor.
num_test_samples = tf.cast(mnist_info.splits['test'].num_examples, tf.int64)
# Scaling the inputs to the range [0, 1] makes the results more convenient to
# work with and numerically stable.
def scale(image, label):
    """Convert an MNIST image to float32 and rescale its pixels to [0, 1].

    Args:
        image: image tensor; MNIST pixels hold values from 0 to 255
            (256 shades of grey).
        label: the target of the image; returned unchanged.

    Returns:
        A (scaled_image, label) tuple, the same structure that was passed in,
        so the function can be used with Dataset.map.
    """
    # Cast to float first so that the division produces fractional values.
    pixels = tf.cast(image, tf.float32)
    # The trailing dot in 255. makes the divisor a float literal.
    return pixels / 255., label
# ---------------------------------------------------------------------------
# Scale
# ---------------------------------------------------------------------------
# Dataset.map applies a custom transformation to every element of a dataset.
# The function must take (image, label) and return (image, label), which is
# exactly what `scale` does.
# The validation data will be taken from mnist_train, so scale that split as a whole...
scaled_train_and_validation_data = mnist_train.map(scale)
# ...and scale the test split the same way.
test_data = mnist_test.map(scale)

# ---------------------------------------------------------------------------
# Shuffle and split into train / validation
# ---------------------------------------------------------------------------
# Shuffling keeps the same data but in a different order.
# shuffle() fills a buffer with BUFFER_SIZE elements and draws randomly from it,
# so an enormous dataset never has to fit in memory all at once:
#   BUFFER_SIZE = 1             -> no shuffling actually happens
#   BUFFER_SIZE >= num samples  -> uniform shuffle in one go
#   anything in between         -> approximate shuffle with bounded memory
BUFFER_SIZE = 10000
# Fixed seed so the train/validation split is the same on every run.
SHUFFLE_SEED = 42

# IMPORTANT: the split below is done with take()/skip() on this shuffled dataset.
# By default shuffle() produces a NEW order every time the dataset is iterated.
# model.fit re-iterates train_data every epoch, so skip() would drop a different
# set of samples each epoch while the validation batch (materialised once below)
# stays fixed -> validation samples would leak into the training data.
# reshuffle_each_iteration=False freezes the order, which makes take() and skip()
# a true, disjoint partition.
# NOTE: because BUFFER_SIZE is smaller than the dataset, the validation samples
# are drawn from the earlier part of the stored order, not uniformly.
shuffled_train_and_validation_data = scaled_train_and_validation_data.shuffle(
    BUFFER_SIZE,
    seed=SHUFFLE_SEED,
    reshuffle_each_iteration=False,
)

# Validation set: the first num_validation_samples (10% of train) elements.
validation_data = shuffled_train_and_validation_data.take(num_validation_samples)
# Training set: everything after those elements.
train_data = shuffled_train_and_validation_data.skip(num_validation_samples)

# ---------------------------------------------------------------------------
# Batch
# ---------------------------------------------------------------------------
# Gradient descent is an iterative optimization algorithm that minimizes a loss
# function by repeatedly adjusting the parameters using the partial derivatives
# of the loss.
#   batch size = 1            -> stochastic gradient descent
#   batch size = all samples  -> (full-batch) gradient descent
#   something in between      -> mini-batch gradient descent (used here)
# Example: with 10,000 training samples and a batch size of 1,000 there are 10
# batches; the network is trained on the first 1,000 samples, then on the next
# 1,000, and so on. Once the last batch is processed, one epoch is finished.
BATCH_SIZE = 100

# Reshuffle ONLY the training split on every epoch (the split itself stays
# fixed, see above), then form the mini-batches.
train_data = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Validation and test data are only forward-propagated (no back-propagation),
# so mini-batches are not needed there. The datasets still have to be batched
# to have the same structure as the training data, so each one becomes a single
# batch that contains all of its samples.
validation_data = validation_data.batch(num_validation_samples)
test_data = test_data.batch(num_test_samples)

# model.fit is later given the validation data as an (inputs, targets) tuple.
# iter() creates an iterator without loading anything; next() fetches the one
# and only validation batch.
validation_inputs, validation_targets = next(iter(validation_data))

# ---------------------------------------------------------------------------
# Outline of the model (defined in the next cell)
# ---------------------------------------------------------------------------
# 784 input nodes (28 * 28), 2 hidden layers with 50 nodes each, and 10 output
# nodes, one for each digit (0 to 9).
# Parameters are what the model uses to make predictions, e.g. the weight
# coefficients in a linear regression model.
# Hyperparameters are the settings that steer the learning process, e.g. the
# size of the hidden layers.


[1mDownloading and preparing dataset mnist/3.0.1 (download: 11.06 MiB, generated: 21.00 MiB, total: 32.06 MiB) to /root/tensorflow_datasets/mnist/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



Dl Completed...:   0%|          | 0/4 [00:00<?, ? file/s]


[1mDataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.[0m


In [4]:
# Network dimensions.
input_size = 784         # 28 * 28 * 1 values per image once flattened
output_size = 10         # one output per digit (0 to 9)
hidden_layer_size = 50   # number of units in each hidden layer

# Keras is used to lay down the model, i.e. to stack the layers one after another.
model = tf.keras.Sequential()

# Input layer.
# Each observation is a 28x28x1 tensor (rank 3): 28 pixels high, 28 pixels wide
# and one grey-scale channel. The dense layers expect a vector per observation,
# and convolutional networks are not used here, so Flatten turns every image
# into a vector of shape (784,) before it reaches the first dense layer.
model.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))

# Hidden layers.
# Dense(units, ...) produces an output of size `units`, here the hidden layer size.
# The activation function adds non-linearity; without one the stacked layers
# would collapse into a plain linear model and the hidden layers would be useless.
# relu returns 0 for a negative input and the input itself when it is positive.
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))

# Output layer.
# softmax maps every output into the [0, 1] range in such a way that all outputs
# sum to 1, so they can be read as class probabilities.
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

In [5]:
# Choose the optimizer, the loss function and the metric to report.
#
# optimizer: Adam, a stochastic gradient descent method.
# loss:      sparse categorical cross-entropy.
#            - categorical cross-entropy expects one-hot encoded targets, e.g.
#              [1,0,0], [0,1,0] and [0,0,1] for a 3-class problem;
#            - sparse categorical cross-entropy expects integer-encoded targets,
#              e.g. [1], [2] and [3] for a 3-class problem.
#            The targets here are plain integers, so the sparse variant is used
#            and no one-hot encoding is needed.
# metrics:   'accuracy' reports the accuracy during training and evaluation.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [6]:
# Train the model.
# Number of epochs, i.e. full passes over the training data.
NUM_EPOCHS = 5

# The validation data is passed as an (inputs, targets) tuple.
# verbose=2 prints one summary line per epoch.
model.fit(
    train_data,
    epochs=NUM_EPOCHS,
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)
# The step counter in the log is the number of training batches per epoch:
# train_data contains 54,000 samples and the batch size is 100, so there are
# 540 batches and model.fit logs 540/540.
# (With a batch size of 1,000 there would be 54 batches and the log would show 54/54.)

Epoch 1/5
540/540 - 7s - loss: 0.4036 - accuracy: 0.8869 - val_loss: 0.2135 - val_accuracy: 0.9352 - 7s/epoch - 13ms/step
Epoch 2/5
540/540 - 3s - loss: 0.1776 - accuracy: 0.9480 - val_loss: 0.1606 - val_accuracy: 0.9517 - 3s/epoch - 6ms/step
Epoch 3/5
540/540 - 3s - loss: 0.1349 - accuracy: 0.9601 - val_loss: 0.1292 - val_accuracy: 0.9627 - 3s/epoch - 6ms/step
Epoch 4/5
540/540 - 3s - loss: 0.1096 - accuracy: 0.9673 - val_loss: 0.1111 - val_accuracy: 0.9688 - 3s/epoch - 6ms/step
Epoch 5/5
540/540 - 3s - loss: 0.0931 - accuracy: 0.9719 - val_loss: 0.0960 - val_accuracy: 0.9730 - 3s/epoch - 6ms/step


<keras.callbacks.History at 0x7f03eca94650>

In [7]:
# Test the model.
# The validation accuracy reported during training is not the final accuracy of
# the model. We train on the training data and validate on the validation data
# to check that the weights and biases do not overfit. We then fiddle with the
# hyperparameters (learning rate, layer sizes, ...) and keep the ones that give
# the best validation accuracy. Those are not "the best" hyperparameters in
# general, only the ones that fit the validation data best, so by tuning them
# we overfit the validation set.
# The validation set is therefore a benchmark for how good the model is, while
# the test set is the reality check: data the model has truly never seen.
#
# model.evaluate returns the loss value and the metric values of the model in
# test mode; store the loss and the accuracy in separate variables.
test_loss, test_accuracy = model.evaluate(test_data)



In [8]:
# Report the test loss and the test accuracy (as a percentage).
print(
    'Test loss: {0:.2f}. Test accuracy : {1:.2f}%'.format(
        test_loss,
        test_accuracy * 100.,
    )
)
# A test accuracy far below the validation accuracy (say 50% or 60%) would mean
# the model has overfit and will fail on new data; a value close to the
# validation accuracy shows that it has not.
# The test accuracy indicates how the model is expected to perform in the real world.

Test loss: 0.10. Test accuracy : 96.97%
