In [48]:
import torch # This is working with pytorch which is a deep learning library
import torchvision # This is mainly for working with image based stuff
import torchvision.transforms as transforms

# Define a transform to normalize the data
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Deciding whether or not we're going to be using the CPU or the GPU

# Load the training and test datasets
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

'''
What we did right here is important to conceptually understand before moving on. We are importing both training data and test data.
We defined "Transform" earlier which is how we are going to take these 28x28 images and convert them into tensors. We then normalize
the data so that the values are between -1 and 1. We then check if we have a GPU available and if we do, we use it. We then load the
training and test datasets. We then create a DataLoader for both the training and test datasets. The DataLoader is what we use to
actually load the data in batches. We can specify the batch size and whether or not we want to shuffle the data. We shuffle the data
for the training set but not for the test set.

The reason we shuffle the training data is because we want to make sure that the model doesn't learn the order of the data. If the
model learns the order of the data, it will not generalize well to new data. We don't shuffle the test data because we want to make
sure that the model is able to generalize to new data. If we shuffle the test data, we won't be able to evaluate the model's performance

Training data is used to "train" the model. This means it is used by the learning algo to learn the parameters or weights 
that will define the model. This training data typically consits of input and output pairs. This means it would have the input
data and the corresponding output data. The model is trained to learn the relationship between the input and output data.

It is important that the test data is not used during the training process. The test data is used to evaluate the model's performance
'''





In [49]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # FC = Fully connected layer
        self.fc1 = nn.Linear(28 * 28, 512)  # The images are 28 x 28
        self.fc2 = nn.Linear(512, 256) # 512 input features from previous layer and maps to 256 neurons
        self.fc3 = nn.Linear(256, 10) # This maps the 256 inputs it gets to the 10 outputs
        # It seems like the final layer is the output layer which goes to the total types of outputs

        # For something like true false, would I only have 2 outputs?
    def forward(self, x):
        # This is how the data flows through the layers. We take the 2D image which is 28 x 28 and flatten it to 1D
        x = x.view(-1, 28 * 28)  # Flatten the input

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net() # Making a new object of the "net" type with is a subset of the Net Super class
net.to(device) # Sending this to the right device ( adding it to my GPU if possible )


Net(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=10, bias=True)
)

In [53]:
import torch.optim as optim # We are using this to try and optimize the model

criterion = nn.CrossEntropyLoss() 
# This is the loss function. This is used to calculate the error of the model. The CrossEntropyLoss is used for classification problems
# basically, cross entropy loss is used for putting stuff into multiple different groups, which is something that we want, because
# in our case, we're trying to split images of handwritten digits into 10 different groups (0-9)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

'''
lr= learning rate. In this case, when we set the learning rate to 0.01, it controls the "step size" at each iteration while moving towards a minimum of he loss function
The step size is based on how fast or slow the model learns. A larger learning rate means larger updates to the weights while a smaller learning rate
means smaller updates. From ChatGPT, here are the pros and cons to both:
A large learning rate can make the training process faster because the model makes significant changes to the parameters in each step. However, if it's too large, it can cause the model to overshoot the optimal solution, potentially causing the loss to diverge.
A small learning rate ensures that the model makes incremental updates to the parameters, which can lead to more precise convergence. However, it can make the training process slow and might get stuck in local minima.
'''

'\nlr= learning rate. In this case, when we set the learning rate to 0.01, it controls the "step size" at each iteration while moving towards a minimum of he loss function\nThe step size is based on how fast or slow the model learns. A larger learning rate means larger updates to the weights while a smaller learning rate\nmeans smaller updates. From ChatGPT, here are the pros and cons to both:\n\n'

In [54]:
num_epochs = 5

'''



'''
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data

        inputs = inputs.to(device)
        labels = labels.to(device)
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:  # Print every 100 mini-batches
            print(f'Epoch [{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')


Epoch [1,   100] loss: 0.050
Epoch [1,   200] loss: 0.054
Epoch [1,   300] loss: 0.048
Epoch [1,   400] loss: 0.068
Epoch [1,   500] loss: 0.059
Epoch [1,   600] loss: 0.051
Epoch [1,   700] loss: 0.061
Epoch [1,   800] loss: 0.056


KeyboardInterrupt: 

In [52]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images = inputs.to(device)
        labels = labels.to(device)
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total} %')


Accuracy of the network on the 10000 test images: 100.0 %
