### Three Layer Neural Net Tensorflow using Low Level API

In [1]:
import tensorflow as tf 
from tensorflow.keras.datasets import mnist #We use mnist dataset packaged with tensorflow
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [2]:
# Python optimisation variables
#Note: These are simple python variables
epochs = 10
# normalize the input images by dividing by 255.0
x_train = x_train / 255.0
x_test = x_test / 255.0

#IMPORTANT!!: Input data should always be scaled as large uncontrolled data can heavily impact the training process

# convert x_test to tensor to pass through model (train data will be converted to
# tensors on the fly)
x_test = tf.Variable(x_test)

In [3]:
#There are always l-1 layers of bias? layers
# now declare the weights connecting the input to the hidden layer
W1 = tf.Variable(tf.random.normal([784, 300], stddev=0.03), name='W1')
b1 = tf.Variable(tf.random.normal([300]), name='b1')
# and the weights connecting the hidden layer to the output layer
W2 = tf.Variable(tf.random.normal([300, 10], stddev=0.03), name='W2')
b2 = tf.Variable(tf.random.normal([10]), name='b2')

In [4]:
#The next step is to create the computations in each node
#These calculations include the forward pass of data through the neural network
#Images are flattened to 28x28 to bat size 784, i.e., flattened
#Cast function is useful for a matrix multiplication
def nn_model(x_input, W1, b1, W2, b2):
    # flatten the input image from 28 x 28 to 784
    x_input = tf.reshape(x_input, (x_input.shape[0], -1))
    x = tf.add(tf.matmul(tf.cast(x_input, tf.float32), W1), b1)
    x = tf.nn.relu(x)
    logits = tf.add(tf.matmul(x, W2), b2) #Multiplied by weight and bias 
    return logits #Logits refers to the unchanged output of a layer of nodes

In [5]:
#Cross entropy is important as it applies a softmax activation function to the logits, which transforms then into a positive probability
#Sum of the nodes equals to one
#This is a common activation function for an output layer in classification tasks
def loss_fn(logits, labels):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                                              logits=logits))
    return cross_entropy
#Cross_entropy is commonly used for classification tasks
#The main function of a loss function is the naub training loop which will be shown soon
#To train weights of the neural network, the average cross entropy loss needs to be minimized as part of the optimization process
#Calculated by using the reduce_mean function, which unsurprisingly calculates the mean of the tensors supplied to it

In [6]:
# setup the optimizer
optimizer = tf.keras.optimizers.Adam() #Adam optimizer is versatile

#Time to define the training group!

In [7]:
import numpy as np
#A loop for each training epoch
#For each training epoch, an average loss variable is initilized to keep the average loss variable is initilized
#    to keep track of average loss for each epoch
#Numpy variables are converted to tensorflow variables
#The label data stored as y simple integers have to be converted to one_hot format
#    The first arg is the tensor to convert, and after is the number of distinct classes
#The tape gradient is introducedm but in eager mode all tensor calculations are performed on the fly
#Tensorflow does not know which variables and operations we are interested in for calculating gradients for
#For whatever gradients we want to calculate, we supply to the gradient tape method
#   This involves the evaluation of a forward pass
for epoch in range(epochs):
    avg_loss = 0
    # create tensors
    x = tf.Variable(x_train)
    y = tf.Variable(y_train)
    # create a one hot vector
    y = tf.one_hot(y, 10)
    with tf.GradientTape() as tape:
        logits = nn_model(x, W1, b1, W2, b2)
        loss = loss_fn(logits, y)
    gradients = tape.gradient(loss, [W1, b1, W2, b2]) #First arg is loss output, and the 2nd arg is the list of all weights and bias vars in the neural network
    optimizer.apply_gradients(zip(gradients, [W1, b1, W2, b2])) #Passed to the optimizer and applys gradients
    avg_loss += loss / epochs
    test_logits = nn_model(x_test, W1, b1, W2, b2) #To determine accuracy, we retrieve the logits
    max_idxs = tf.argmax(test_logits, axis=1) #The highest logit value is constitues the digit prediction of the model
    test_acc = np.sum(max_idxs.numpy() == y_test) / len(y_test) #Convert to numpy array
    print(f"Epoch: {epoch + 1}, loss={avg_loss:.3f}, test set      accuracy={test_acc*100:.3f}%")

print("\nTraining complete!")

Epoch: 1, loss=0.323, test set      accuracy=11.350%
Epoch: 2, loss=0.293, test set      accuracy=11.420%
Epoch: 3, loss=0.269, test set      accuracy=13.520%
Epoch: 4, loss=0.250, test set      accuracy=19.770%
Epoch: 5, loss=0.235, test set      accuracy=30.140%
Epoch: 6, loss=0.223, test set      accuracy=39.490%
Epoch: 7, loss=0.213, test set      accuracy=46.720%
Epoch: 8, loss=0.205, test set      accuracy=51.860%
Epoch: 9, loss=0.198, test set      accuracy=53.810%
Epoch: 10, loss=0.192, test set      accuracy=57.330%

Training complete!
