# <center>Critical AI</center>
<center>ENGL 54.41</center>
<center>Dartmouth College</center>
<center>Winter 2026</center>

<hr>

Notes: This notebook creates a more advanced Perceptron-style neural network using PyTorch. We'll now generate loss information that gives us insight into the learning process, and we will use an optimizer on this loss to adjust the weights of the network. This neural network has two outputs for binary classification of two digits from the MNIST dataset of handwritten digits.

<pre>Created: 07/15/2024; Revised: 01/12/2026</pre>

In [None]:
import numpy as np
import struct
import random
import pandas as pd
import math

import matplotlib.pyplot as plt

import torch
import torch.nn as nn       # this is enables us to create arbitrary neural networks

In [None]:
# Choose where the neural network's calculations will run, preferring an
# accelerator when one is present:
#   mps  -> Apple Silicon device (MX series of Macbooks)
#   cuda -> Nvidia GPU (Compute Unified Device Architecture toolkit)
#   cpu  -> no accelerator; use the general-purpose CPU

# older PyTorch builds may not have an mps backend at all, so test for the
# attribute before asking whether it is available
use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

if use_mps:
    device_name = 'mps'
elif torch.cuda.is_available():
    device_name = 'cuda'
else:
    device_name = 'cpu'

device = torch.device(device_name)
print('Using device: {0}'.format(device))

In [None]:
# This is the MNIST dataset from Yann LeCun
# http://yann.lecun.com/exdb/mnist/
#
# Each file begins with big-endian unsigned 32-bit header fields; everything
# after the header is read as unsigned bytes.

# images: the header is (magic, count, rows, cols), then the pixel bytes
with open('../data/train-images-idx3-ubyte','rb') as image_file:
    magic, size, nrows, ncols = struct.unpack(">IIII", image_file.read(16))
    pixels = np.fromfile(image_file, dtype=np.dtype(np.uint8).newbyteorder('>'))

# one (nrows x ncols) matrix per image
data = pixels.reshape((size, nrows, ncols))

# labels: the header is (magic, count), then one byte per label
with open('../data/train-labels-idx1-ubyte','rb') as label_file:
    magic, size = struct.unpack(">II", label_file.read(8))
    labels = np.fromfile(label_file, dtype=np.dtype(np.uint8).newbyteorder('>'))

# display information about the dataset
print(f'training samples: {data.shape[0]}')

In [None]:
# This is the testing data (stored separately from the training data).
# The files use the same layout as the training files: big-endian unsigned
# 32-bit header fields followed by unsigned bytes.

# images: the header is (magic, count, rows, cols), then the pixel bytes
with open('../data/t10k-images-idx3-ubyte','rb') as image_file:
    magic, size, nrows, ncols = struct.unpack(">IIII", image_file.read(16))
    test_pixels = np.fromfile(image_file, dtype=np.dtype(np.uint8).newbyteorder('>'))

# one (nrows x ncols) matrix per image
test_data = test_pixels.reshape((size, nrows, ncols))

# labels: the header is (magic, count), then one byte per label
with open('../data/t10k-labels-idx1-ubyte','rb') as label_file:
    magic, size = struct.unpack(">II", label_file.read(8))
    test_labels = np.fromfile(label_file, dtype=np.dtype(np.uint8).newbyteorder('>'))

# display information about the dataset
print(f'testing samples: {test_data.shape[0]}')

In [None]:
# display an image from training data from each class -- what do these look like?

# Convert the label array to a list once; doing it inside the comprehension
# would rebuild the whole list for every one of the ten digits.
label_list = labels.tolist()

# position of the first training image of each digit 0-9
idxs = [label_list.index(digit) for digit in range(10)]

fig = plt.figure(figsize=(10, 5))  # width, height in inches
for i, idx in enumerate(idxs):
    ax = fig.add_subplot(2, 5, i + 1)
    # data was already reshaped to (size, nrows, ncols) uint8 when it was
    # loaded, so each data[idx] is a 2-D image that can be shown directly
    ax.imshow(data[idx], cmap='gray')
    # idxs is ordered by digit, so position i is also the digit shown
    ax.set_title(str(i))
plt.show()

In [None]:
################################################################################
# These are the two image classes that we want to classify with the binary classifier
################################################################################
class_one = 0
class_two = 8

# Fail early on a bad choice. The dataset's classes are the digits 0-9, and the
# two choices must differ: the cells below label every class_one image 0 and
# every class_two image 1, so picking the same digit twice would put each image
# into the data once under each label.
assert class_one in range(10) and class_two in range(10), \
    "class_one and class_two must each be a digit from 0 to 9"
assert class_one != class_two, \
    "class_one and class_two must be two different digits"

In [None]:
# create training data and labels
# Select the images of each chosen digit once, then flatten every 28x28 matrix
# into a single 784-value row. class_one images get label 0, class_two label 1.
# NOTE: `labels` must be the NumPy label array read from the MNIST file here.
# A later cell rebinds `labels` to a tensor, so re-run the data-loading cell
# before re-running this one.
images_one = data[labels == class_one]
images_two = data[labels == class_two]

y = [0] * images_one.shape[0] + [1] * images_two.shape[0]
X = np.vstack((
    images_one.reshape(images_one.shape[0], 784),
    images_two.reshape(images_two.shape[0], 784),
))

In [None]:
# report how many training images are left after keeping only the two selected digits
n_selected_train = X.shape[0]
print(f'training samples: {n_selected_train}')

In [None]:
# creating testing data
# Same procedure as for the training data: select the images of each chosen
# digit once, then flatten every 28x28 matrix into a single 784-value row.
# NOTE: `test_labels` must be the NumPy label array read from the MNIST file
# here. A later cell rebinds `test_labels` to a tensor, so re-run the
# data-loading cell before re-running this one.
test_images_one = test_data[test_labels == class_one]
test_images_two = test_data[test_labels == class_two]

y_test = [0] * test_images_one.shape[0] + [1] * test_images_two.shape[0]
X_test = np.vstack((
    test_images_one.reshape(test_images_one.shape[0], 784),
    test_images_two.reshape(test_images_two.shape[0], 784),
))

In [None]:
# report how many testing images are left after keeping only the two selected digits
n_selected_test = X_test.shape[0]
print(f'testing samples: {n_selected_test}')

In [None]:
# ** Important Variables **
# Tune these two values, then re-run training to see how the loss curve changes.

# learning rate: handed to the optimizer as its `lr` setting
learning_rate = 0.01

# number of training iterations; each one runs the whole training set through
# the network and then adjusts the weights once
epochs = 30

In [None]:
# define the Perceptron as three layers:
# input, hidden, output
class Perceptron(nn.Module):
    """Three fully connected layers with ReLU after the first two.

    Args:
        input_dim: number of input values per sample (28x28 = 784 for the
            flattened images used in this notebook).
        hidden_dim: number of values produced by the first and second layers.
            Defaults to 128.
        output_dim: number of output scores, one per class. Defaults to 2 for
            binary classification.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        # first layer (S): takes the pixel data and outputs hidden_dim values
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        # hidden layer (A): takes hidden_dim values and outputs the same number
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        # final layer (R): takes hidden_dim values and outputs one score per class
        self.layer3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        """Push `inputs` through the three layers and return the class scores.

        This uses a non-linear activation, which is more complicated than the
        simple linear activation function used in the last notebook. The
        Rectified Linear Unit (ReLU) is applied to the *output* of the first
        and second layers (not to the weights): it replaces negative values
        with zero. The weights live inside the layers and are what the
        optimizer later updates.

        The final layer's scores are returned without an activation; the
        training cell hands them straight to the loss function.
        """
        hidden = torch.relu(self.layer1(inputs))
        hidden = torch.relu(self.layer2(hidden))
        return self.layer3(hidden)

In [None]:
# Build the network, sizing its input layer from the data itself: one input
# for every column of the training matrix (one column per pixel)
print("Creating neural network...")
input_size = X.shape[1]
print("input layer size: {0}".format(input_size))
model = Perceptron(input_size)

In [None]:
# move model to our special accelerator device
# (`device` was selected near the top of the notebook: mps, cuda, or cpu).
# Because this call is the last expression in the cell, the notebook displays
# the value it returns.
model.to(device)

In [None]:
# Convert to Torch tensor datatypes: the pixel arrays become float tensors and
# the 0/1 label lists become long (integer) tensors.
# NOTE: this rebinds `labels` and `test_labels`. Until now they held the full
# NumPy label arrays read from the MNIST files; from here on they hold the 0/1
# tensors for the two selected classes. The cells that select the two classes
# therefore cannot be re-run until the data-loading cells have been run again.
training_data, testing_data = torch.FloatTensor(X), torch.FloatTensor(X_test)
labels, test_labels = torch.LongTensor(y), torch.LongTensor(y_test)

In [None]:
# the Adam optimizer adjusts the model's weights using gradient optimization,
# with the learning rate chosen in the "Important Variables" cell
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# cross-entropy loss for classification tasks: compares the network's output
# scores with the correct class labels
loss_fn = nn.CrossEntropyLoss()

In [None]:
# train the model
model.train()

# Move the inputs and their labels to the device once, before the loop. They
# never change between epochs, so copying them inside the loop would repeat
# the same transfer on every iteration.
train_inputs = training_data.to(device)
train_targets = labels.to(device)

# loss value recorded at every epoch (plotted in the next section)
loss = []

# iterate through each of the training epochs
for e in range(epochs):
    # clear the gradients left over from the previous epoch
    model.zero_grad()

    # run the whole training set through the network
    outputs = model(train_inputs)

    # supply labels to CrossEntropyLoss; outputs and labels are on the same device
    loss_train = loss_fn(outputs, train_targets)

    # compute the gradient of the loss with respect to every weight
    loss_train.backward()

    # read the loss back as a plain Python number once per epoch
    loss_value = loss_train.item()
    loss.append(loss_value)

    # adjust weights
    optimizer.step()

    if e % 5 == 0:
        print("Epoch: {0} Loss: {1:.4f}".format(e,loss_value))

## Display training "loss"

Iterative machine learning seeks to minimize a *loss function*. That is to say, on each 
iteration the "loss" should generally get lower and lower. The loss is a single number that 
reports the difference between the model's predicted outputs and the correct labels for our 
training data. Ideal "convergence" would lower this number to 0, but in real-world situations
this is highly unlikely. We plot the *learning curve* to observe the training procedure
and tune our variables (epochs and learning rate).

In [None]:
# learning curve: one loss value per training epoch
fig, ax = plt.subplots()
ax.plot(loss)
ax.set_title("Training Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

In [None]:
# This places the model in evaluation state for testing
# (the counterpart of the model.train() call made before the training loop).
# Because this call is the last expression in the cell, the notebook displays
# the value it returns.
model.eval()

In [None]:
# define a function to predict class from output
def predict(input_data):
    """Return the index of the highest-scoring output for one sample or a batch.

    Args:
        input_data: tensor of flattened pixel values, either a single sample
            (one dimension) or a batch with one sample per row.

    Returns:
        A tensor of predicted class indices on `device`: zero-dimensional for
        a single sample, one entry per row for a batch.
    """
    # prediction does not need gradients, so do not record them
    with torch.no_grad():
        outputs = model(input_data.to(device))

    # Take the argmax along the last (class) dimension. Without `dim`, argmax
    # flattens its input, which is only correct for a single sample.
    pred = torch.argmax(outputs, dim=-1)
    return pred

In [None]:
# Predict entire testing dataset: run each test image through the network and
# keep the result as a [predicted class, actual class] pair
scores = [
    [predict(sample), target]
    for sample, target in zip(testing_data, test_labels)
]

In [None]:
# calculate confusion matrix
# Each entry of `scores` is [predicted, actual], using the 0/1 labels given to
# the two selected classes (0 = class_one, 1 = class_two). Label 1 is treated
# as the "positive" class, which matches the false-positive display below:
# an image whose actual label is 0 but was predicted as 1 is a false positive.

# convert the tensors to plain integers once instead of on every comparison
pairs = [(int(pred), int(actual)) for pred, actual in scores]

tn = sum(1 for pred, actual in pairs if actual == 0 and pred == 0) # true negative
fp = sum(1 for pred, actual in pairs if actual == 0 and pred == 1) # false positive
fn = sum(1 for pred, actual in pairs if actual == 1 and pred == 0) # false negative
tp = sum(1 for pred, actual in pairs if actual == 1 and pred == 1) # true positive

# calculate accuracy
accuracy = np.round(sum(1 for pred, actual in pairs if pred == actual) / len(pairs),3)

# display confusion matrix
# rows are the actual class; columns are predicted 0, then predicted 1
print(f'total model accuracy: {accuracy}')
print('Confusion Matrix:')
print("Class {0:2d} {1:10d} {2:5d}".format(0,tn,fp))
print("Class {0:2d} {1:10d} {2:5d}".format(1,fn,tp))

In [None]:
# let's display the false positives -- these are the images that
# belong to class 0 but were classified as class 1.
# (each entry of `scores` is [predicted, actual])

idxs_fp = [i for i,x in enumerate(scores) if x[1] == 0 and x[0] == 1]

if idxs_fp:
    # five images per row; size the figure here, in the same call that creates
    # it, so no separate empty figure is left behind
    rows = math.ceil(len(idxs_fp) / 5)
    fig, axes = plt.subplots(rows, 5, squeeze=False, figsize=(10, 2 * rows))  # width, height in inches
    for ax, item in zip(axes.flat, idxs_fp):
        ax.imshow(testing_data[item].reshape(28,28), cmap='gray')
    # hide the unused panels in the last row
    for ax in axes.flat[len(idxs_fp):]:
        ax.set_visible(False)
    plt.show()
else:
    # with nothing to show there would be zero rows, which plt.subplots rejects
    print('No false positives to display.')

## Try it!

Now select different pairs of numbers (changing variables above) and re-run the training and testing data through the network. What do you see in these results?

For more complexity: display the false negatives from the testing data rather than the false positives.