# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Fall 2024</center>

<hr>

Notes: This notebook creates a modified Perceptron-style network with 10 outputs (rather than 2, as in the previous notebook for a binary classifier) to enable the classification of the full MNIST dataset. It also displays a more visually exciting confusion matrix resulting from the test dataset.

<pre>Created: 07/15/2024; Revised: 09/25/2024</pre>

In [None]:
import numpy as np
import struct
import random
import pandas as pd

import matplotlib.pyplot as plt

import torch
import torch.nn as nn

In [None]:
# Pick the fastest compute backend that PyTorch reports as available:
#   mps  -> Apple Silicon GPU (M-series Macs)
#   cuda -> Nvidia GPU (Compute Unified Device Architecture toolkit)
#   cpu  -> fall back to the general-purpose CPU
# The hasattr() guard keeps this working on PyTorch builds without an mps backend.
device = torch.device(
    'mps' if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)
print('Using device: {0}'.format(device))

In [None]:
# ** Important Variables **

# number of training iterations (each one is a full pass over the training set)
epochs = 100

# learning rate (step size handed to the optimizer)
learning_rate = 0.01

# Seed every random number generator this notebook imports so the network's
# randomly initialised weights -- and therefore the reported losses and
# accuracy -- are repeatable after "Restart Kernel -> Run All".
# NOTE: GPU backends may still show small run-to-run numerical differences.
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [None]:

# define the Perceptron as three fully connected layers:
# input (S), hidden (A), output (R)
class Perceptron(nn.Module):
    """Small feed-forward classifier: two ReLU-activated layers and a linear output layer.

    Args:
        input_dim: number of input features (one per pixel of a flattened image).
        hidden_dim: width of the first two layers. Defaults to 128.
        num_classes: number of output scores, one per class. Defaults to 10.
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=10):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)     # first layer (S): pixel data in, hidden_dim values out
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)    # hidden layer (A): same number of values in and out
        self.layer3 = nn.Linear(hidden_dim, num_classes)   # final layer (R): one output value per class

    # Forward pass with a non-linear activation. The Rectified Linear Unit (ReLU)
    # is applied to the *outputs* of the first two layers (it replaces negative
    # values with zero); it is not applied to the weights themselves. The final
    # layer's outputs are returned as raw, unnormalised class scores.
    def forward(self, inputs):
        """Return one raw score per class for each input vector.

        Args:
            inputs: tensor whose last dimension has input_dim features; may be a
                single vector or a batch of vectors.

        Returns:
            Tensor with the same leading dimensions and num_classes scores
            in the last dimension.
        """
        hidden = torch.relu(self.layer1(inputs))
        hidden = torch.relu(self.layer2(hidden))
        return self.layer3(hidden)

In [None]:
# MNIST data from
# http://yann.lecun.com/exdb/mnist/
#
# The files use the IDX format: a big-endian header (a magic number, the item
# count and, for images, the row and column counts) followed by one unsigned
# byte per pixel or label. The magic number is 2051 for image files and 2049
# for label files; checking it turns a wrong or still-compressed file into a
# clear error instead of a confusing reshape failure further down.

with open('../data/train-images-idx3-ubyte','rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    if magic != 2051:
        raise ValueError(
            f'train-images-idx3-ubyte: unexpected magic number {magic} '
            '(expected 2051); is the file still gzip-compressed?')
    nrows, ncols = struct.unpack(">II", f.read(8))
    # single bytes have no byte order, so a plain uint8 dtype is sufficient
    data = np.fromfile(f, dtype=np.uint8)
    data = data.reshape((size, nrows, ncols))

with open('../data/train-labels-idx1-ubyte','rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    if magic != 2049:
        raise ValueError(
            f'train-labels-idx1-ubyte: unexpected magic number {magic} '
            '(expected 2049); is the file still gzip-compressed?')
    labels = np.fromfile(f, dtype=np.uint8)

# every image needs exactly one label
if labels.shape[0] != data.shape[0]:
    raise ValueError(
        f'training set mismatch: {data.shape[0]} images but {labels.shape[0]} labels')

# display information about the dataset
print(f'training samples: {data.shape[0]}')

In [None]:
# This is the testing data. It is stored in the same IDX layout as the
# training files: a big-endian header followed by one unsigned byte per pixel
# or label. The magic number (2051 for images, 2049 for labels) is checked so
# that a wrong or still-compressed file fails with a clear message.
with open('../data/t10k-images-idx3-ubyte','rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    if magic != 2051:
        raise ValueError(
            f't10k-images-idx3-ubyte: unexpected magic number {magic} '
            '(expected 2051); is the file still gzip-compressed?')
    nrows, ncols = struct.unpack(">II", f.read(8))
    # single bytes have no byte order, so a plain uint8 dtype is sufficient
    test_data = np.fromfile(f, dtype=np.uint8)
    test_data = test_data.reshape((size, nrows, ncols))

with open('../data/t10k-labels-idx1-ubyte','rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    if magic != 2049:
        raise ValueError(
            f't10k-labels-idx1-ubyte: unexpected magic number {magic} '
            '(expected 2049); is the file still gzip-compressed?')
    test_labels = np.fromfile(f, dtype=np.uint8)

# every image needs exactly one label
if test_labels.shape[0] != test_data.shape[0]:
    raise ValueError(
        f'testing set mismatch: {test_data.shape[0]} images but {test_labels.shape[0]} labels')

# display information about the dataset
print(f'testing samples: {test_data.shape[0]}')

In [None]:
# display the first training image found for each class (digits 0-9)
# labels.tolist() is evaluated once here; its bound .index method is then
# reused for all ten lookups instead of rebuilding the list every time.
idxs = list(map(labels.tolist().index, range(10)))
fig = plt.figure(figsize=(10, 5))  # width, height in inches
for i, idx in enumerate(idxs):
    ax = fig.add_subplot(2, 5, i + 1)
    # each entry of data is already a 2-D uint8 image, so it can be shown as is
    ax.imshow(data[idx], cmap='gray')
    ax.set_title(f'label: {i}')   # idxs[i] is the first sample whose label is i
    ax.axis('off')                # pixel coordinates add nothing here
plt.show()

In [None]:
# reshape our training data: keep one row per image and flatten its pixels
# into a single feature vector. The sizes are taken from the array itself
# rather than hard-coded, so a subset or a different image size also works.
y = labels.tolist()
X = data.reshape(data.shape[0], -1)

In [None]:
# creating testing data: keep one row per image and flatten its pixels into a
# single feature vector. The sizes are taken from the array itself rather
# than hard-coded, so a subset or a different image size also works.
y_test = test_labels.tolist()
X_test = test_data.reshape(test_data.shape[0], -1)

In [None]:
# Build the network. The width of its input layer is read from the data:
# one input per column (feature) of the flattened training matrix.
print("Creating neural network...")
input_size = X.shape[1]
print("input layer size: {0}".format(input_size))
model = Perceptron(input_size)

In [None]:
# Convert the data and labels to Torch tensors:
# pixel values become 32-bit floats, class labels become 64-bit integers
# (the target type that CrossEntropyLoss expects for class indices).
training_data = torch.tensor(X, dtype=torch.float32)
labels = torch.tensor(y, dtype=torch.long)

test_labels = torch.tensor(y_test, dtype=torch.long)
testing_data = torch.tensor(X_test, dtype=torch.float32)

In [None]:
# the Adam optimizer adjusts the network's weights from their gradients,
# using learning_rate as its step size
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# cross-entropy loss: compares the network's class scores with the true class labels
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Move the network's parameters onto the device selected earlier (mps, cuda or cpu).
# Left as the cell's last expression so the notebook also displays the model's layers.
model.to(device)

In [None]:
# train the model
model.train()

# Copy the training inputs and labels to the compute device once. They do
# not change between epochs, so re-sending them inside the loop (and pulling
# the outputs back to the CPU to compute the loss) is wasted work.
train_inputs = training_data.to(device)
train_targets = labels.to(device)

# iterate through each of the training epochs; every epoch is one
# full-batch pass over the whole training set
for e in range(epochs):
    # clear the gradients left over from the previous epoch
    model.zero_grad()
    outputs = model(train_inputs)

    # supply the class scores and the true labels to CrossEntropyLoss
    loss_train = loss_fn(outputs, train_targets)
    loss_train.backward()

    # adjust weights
    optimizer.step()

    # report every tenth epoch, plus the last one so the final loss is visible
    if e % 10 == 0 or e == epochs - 1:
        print("Epoch: {0} Loss: {1:.4f}".format(e,loss_train.item()))

In [None]:
# Switch the network from training mode to evaluation (inference) mode before predicting.
# Left as the cell's last expression so the notebook also displays the model's layers.
model.eval()

In [None]:
# define a function to predict class from output
def predict(input_data):
    """Return the predicted class index for one sample or for a batch of samples.

    Uses the notebook-level `model` and `device`.

    Args:
        input_data: tensor of input features; either a single feature vector
            or a batch with one feature vector per row.

    Returns:
        Tensor of class indices: 0-dimensional for a single vector, or one
        index per row for a batch.
    """
    # inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(input_data.to(device))
    # take the arg-max over the class dimension (the last one); without `dim`
    # the scores of a whole batch would be flattened into one long vector
    pred = torch.argmax(outputs, dim=-1)
    return pred

In [None]:
# Predict entire testing dataset.
# scores collects one [predicted class, true class] pair per test sample.
scores = list()
# inference only: no_grad() avoids building an autograd graph for each sample
with torch.no_grad():
    # walk the samples and their labels together instead of indexing by position
    for sample_vector, true_label in zip(testing_data, test_labels):
        scores.append([predict(sample_vector), true_label])

In [None]:
# Summary report from Scikit-Learn
#
# precision = share of the predictions for a class that were correct:
#             true positive / (true positive + false positive)
# recall    = share of a class's true samples that were found:
#             true positive / (true positive + false negative)
# f1        = harmonic mean of precision and recall
# support   = number of test samples that truly belong to this class
#

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# each entry of scores is a [predicted class, true class] pair; keep the prediction as a plain int
pred = [int(predicted) for predicted, _actual in scores]
print("Final Accuracy: {0}".format(accuracy_score(test_labels, pred)))
print(classification_report(test_labels, pred))

## Visualizing Confusion Matrix

The confusion matrix tells us much more about our model and data than the accuracy report. It enables us to understand which classes of objects might be similar enough to each other to "confuse" the model. Reading the values along the diagonal (the correct, or true positive, classifications) gives a good indication of the classifier's performance, while the off-diagonal values show which classes are being mistaken for which. You can understand much about the relationship between your features and classes by reading and interpreting the confusion matrix.

In [None]:
# This is the matrix as a simple table.
# confusion_matrix is given the true labels first and the predictions second;
# by scikit-learn's documented convention each row is then a true class and
# each column a predicted class.
print(confusion_matrix(test_labels, pred))

In [None]:
# It looks much nicer as a visual with a heatmap and annotations.
import seaborn as sn
cm = confusion_matrix(test_labels, pred)
ax = sn.heatmap(cm,annot=True,cmap='Reds',fmt='g')
# Label the axes so the figure can be read on its own: scikit-learn puts the
# true class on the rows (y axis) and the predicted class on the columns (x axis).
ax.set(xlabel='Predicted digit', ylabel='True digit',
       title='MNIST test set confusion matrix')
plt.show()

In [None]:
# Here is how we can check a single vector
sample = 1032
# int() unwraps the 0-dimensional tensors so only the digit is printed,
# without the surrounding tensor(...) / device text
print(f'Label: {int(test_labels[sample])}')
print(f'Prediction: {int(predict(testing_data[sample]))}')

In [None]:
# argsort gives the *ranking* of the classes for this sample: the class indices
# ordered from the highest network output to the lowest (most likely class first).
# These are positions, not probabilities; to see probabilities, apply
# torch.softmax(..., dim=-1) to the model's outputs instead.
torch.argsort(model(testing_data[sample].to(device)),descending=True)