In [9]:
#Importing libraries

from __future__ import absolute_import, division, print_function
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf

import numpy as np 
import pandas as pd 
import io
import matplotlib.pyplot as plt

In [10]:
#MNIST -collection of hand-written digits- 60000 example for training, 10000 for testing
#convert img to float32
#normalize to [0,1]
#flatten to a 1D array of 784 features (28x28)




In [11]:
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert to float32.
x_train = np.array(x_train, np.float32)
x_test = np.array(x_test, np.float32)


# Flatten images to 1D vector of 784 features (28*28).
num_features=784
x_train = x_train.reshape(60000, num_features)
x_test = x_test.reshape(10000, num_features)


# Normalize images value from [0, 255] to [0, 1].
x_train = x_train / 255
x_test = x_test /255

x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
x_train.shape

(60000, 784)

In [14]:
x_test.shape

(10000, 784)

In [15]:
y_train.shape

(60000,)

In [16]:
#setting up hyperparameters and data set parameters

#initialize model parameters
#num_class = number of outputs (10) (0 to 9 digits)
#num_features = number of input para (784)

# MNIST dataset parameters.

num_classes = 10 # 0 to 9 digits

num_features = 784 # 28*28

# Training parameters.

learning_rate = 0.01

training_steps = 1000

batch_size = 256

display_step = 50

In [17]:
# shuffling and batching the data
# before we start the actual training to avoid the model from getting biased by the data. 
# This will allow our data to be more random and helps our model to gain higher accuracies with the test data.
# With the help of tf.data.Dataset.from_tensor_slices, we can get the slices of an array in the form of objects. 
# shuffle(5000) randomizes the order of the data set’s examples. 
# Here, 5000 denotes the variable shuffle_buffer, which tells the model to pick a sample randomly from 1 to 5000 samples. 
# After that, only 4999 samples are left in the buffer, so the sample 5001 gets added to the buffer.

In [18]:
# Use tf.data API to shuffle and batch data.
num_batches = int(x_train.shape[0] / batch_size)
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)

In [19]:
#initializing weights and biases
#with ones and zeros

# Weight of shape [784, 10], the 28*28 image features, and a total number of classes.

W = tf.Variable(np.random.randn(784, 10).astype(np.float32))

# Bias of shape [10], the total number of classes.
B = tf.Variable(np.random.randn(10).astype(np.float32))

In [20]:
#defining logistic regression and cost function

#which converts the inputs into a probability distribution proportional to the exponents of the inputs using the softmax function. 
#The softmax function, which is implemented using the function tf.nn.softmax, also makes sure that the sum of all the inputs equals one.

# Logistic regression (Wx + b).

def logistic_regression(x):

    # Apply softmax to normalize the logits to a probability distribution.
    return tf.nn.softmax(tf.add(tf.matmul(x, W), B))
    

# Cross-Entropy loss function.

def cross_entropy(y_pred, y_true):

    # Encode label to a one hot vector.
    y_true = tf.one_hot(y_true, depth = num_classes)
    

    # Clip prediction values to avoid log(0) error.

    y_pred = tf.clip_by_value(y_pred, 1e-9, 1.)    

    # Compute cross-entropy.
    loss = tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred)))
    return loss

In [21]:
# Defining Optimizers and Accuracy Metrics

# When we compute the output, it gives us the probability of the given data to fit a particular class of output. 
# We consider the correct prediction as to the class having the highest probability. 
# We compute this using the function tf.argmax. 
# We also define the stochastic gradient descent as the optimizer from several optimizers present in TensorFlow. We do this using the function tf.optimizers.SGD. 
# This function takes in the learning rate as its input, which defines how fast the model should reach its minimum loss or gain the highest accuracy.

In [22]:
# Accuracy metric.

def accuracy(y_pred, y_true):

  # Predicted class is the index of the highest score in prediction vector (i.e. argmax).

  correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
  return tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

Optimization Process and Updating Weights and Biases

Now we define the run_optimization() method where we update the weights of our model. We calculate the predictions using the logistic_regression(x) method by taking the inputs and find out the loss generated by comparing the predicted value and the original value present in the data set. Next, we compute the gradients using and update the weights of the model with our stochastic gradient descent optimizer.

In [23]:
# Optimization process. 

optimizer = tf.optimizers.SGD(learning_rate)

def run_optimization(x, y):

# Wrap computation inside a GradientTape for automatic differentiation.

    with tf.GradientTape() as g:
        pred = logistic_regression(x)
        loss = cross_entropy(pred, y)

    # Compute gradients.

    gradients = g.gradient(loss, [W, B])
    # Stochastic gradient descent optimizer.
    # Update W and b following gradients.

    optimizer.apply_gradients(zip(gradients, [W, B]))

In [24]:
# Run training for the given number of steps.

for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):

    # Run the optimization to update W and b values.

    run_optimization(batch_x, batch_y)
    if step % display_step == 0:

        #Obtain Predictions
        pred = logistic_regression(batch_x)
        #Ccompute loss
        loss = cross_entropy(pred, batch_y)
        #Compute Accuracy
        acc = accuracy(pred, batch_y)
        #print accuracy
        print(f"step: {step}, loss: {loss}, accuracy: {acc}")

step: 50, loss: 320.8542785644531, accuracy: 0.7890625
step: 100, loss: 302.1634826660156, accuracy: 0.79296875
step: 150, loss: 168.8983917236328, accuracy: 0.85546875
step: 200, loss: 152.08096313476562, accuracy: 0.83984375
step: 250, loss: 82.09939575195312, accuracy: 0.90625
step: 300, loss: 104.72555541992188, accuracy: 0.88671875
step: 350, loss: 131.3424530029297, accuracy: 0.90625
step: 400, loss: 73.15220642089844, accuracy: 0.91796875
step: 450, loss: 65.37161254882812, accuracy: 0.92578125
step: 500, loss: 107.53083801269531, accuracy: 0.8828125
step: 550, loss: 99.3916015625, accuracy: 0.8671875
step: 600, loss: 157.05404663085938, accuracy: 0.8359375
step: 650, loss: 86.279296875, accuracy: 0.89453125
step: 700, loss: 70.87052154541016, accuracy: 0.91796875
step: 750, loss: 77.57180786132812, accuracy: 0.94140625
step: 800, loss: 74.92724609375, accuracy: 0.921875
step: 850, loss: 43.148292541503906, accuracy: 0.94921875
step: 900, loss: 83.86661529541016, accuracy: 0.914

In [25]:
# Test model on validation set.
pred = logistic_regression(x_test)
a = accuracy(pred, y_test)
print(f"Test Accuracy: {a}")

Test Accuracy: 0.8981000185012817
