## A simple CNN in PyTorch
Following [Michael Li](https://towardsdatascience.com/build-a-fashion-mnist-cnn-pytorch-style-efb297e22582), who explains how to do all of this. See also the PyTorch tutorials, where Li probably got his information.

In [35]:
import torch
print(f"""Using Torch version {torch.__version__}.  
        CUDA is {'available' if torch.cuda.is_available() else 'not available'}. 
        MPS is {'available' if torch.backends.mps.is_available() else 'not available'}""")
# Choose the accelerator name: prefer Apple MPS, then CUDA. If neither backend is
# available fall back to the CPU, so that moving tensors/models to `gpu` never fails
# on a machine without an accelerator.
if torch.backends.mps.is_available():
    gpu = 'mps'
elif torch.cuda.is_available():
    gpu = 'cuda'
else:
    gpu = 'cpu'
cpu = 'cpu'
# Plotting libraries
import bokeh
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import Label
print(f"Using bokeh version {bokeh.__version__}.")

# numpy and pandas
import numpy as np
import pandas as pd
print(f"Using pandas version {pd.__version__}.")

# tqdm makes progress bars
import tqdm
# we use train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Using Torch version 2.0.0.  
        CUDA is not available. 
        MPS is available
Using bokeh version 3.1.0.
Using pandas version 2.0.0.


In [49]:
# All tensors and the model are placed on the accelerator name chosen in the setup cell.
device = gpu
# Route bokeh figures to the notebook output area.
output_notebook()

In [50]:
# Convolutional classifier for single-channel 28 x 28 images, built on nn.Module
class Network(torch.nn.Module):
    """Two convolution + max-pool stages followed by three fully connected layers.

    Shapes below are per sample for a 1 x 28 x 28 input (the batch dimension is omitted):

        conv1 (5 x 5, no padding)   1 x 28 x 28 -> 6 x 24 x 24
        max-pool (2 x 2, stride 2)  6 x 24 x 24 -> 6 x 12 x 12
        conv2 (5 x 5, no padding)   6 x 12 x 12 -> 12 x 8 x 8
        max-pool (2 x 2, stride 2)  12 x 8 x 8  -> 12 x 4 x 4
        fc1                         192 -> 120
        fc2                         120 -> 60
        out                         60  -> 10
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # fully connected layers; fc1 consumes the flattened 12 x 4 x 4 feature maps
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Map a batch of images to 10 raw scores per sample (no softmax applied)."""
        F = torch.nn.functional

        # stage 1: conv1 -> ReLU -> 2 x 2 max-pool
        stage1 = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)

        # stage 2: conv2 -> ReLU -> 2 x 2 max-pool
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)

        # flatten each sample's 12 x 4 x 4 feature maps into one row of 192 values
        flat = stage2.reshape(-1, 12*4*4)

        # fc1 (192 -> 120) and fc2 (120 -> 60), each followed by ReLU
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # output layer (60 -> 10); softmax is left out because the cross-entropy
        # loss used for training works directly on these raw scores
        return self.out(hidden)

In [51]:
# Pixel table: one row per image (later reshaped to 28 x 28).
# NOTE(review): read_csv treats the first line of each file as a header row by default;
# confirm both CSVs really have one, otherwise the first sample is silently dropped.
train_images = pd.read_csv("fmnist_data.csv").to_numpy()
# Labels: read the label column and expand it into dense one-hot rows.
train_labels = OneHotEncoder().fit_transform(pd.read_csv("fmnist_labels.csv").to_numpy()).toarray()




In [52]:
# Sanity check: (number of samples, pixels per image); 784 = 28 * 28, matching the reshape done later.
train_images.shape

(9999, 784)

In [53]:
# Loss function: cross-entropy on raw model scores; the float one-hot rows in Ytrain
# are passed to it as per-class target probabilities.
criterion = torch.nn.functional.cross_entropy
# Images: (N, 784) rows -> float32 tensor of shape (N, 1, 28, 28) on `device`.
Xtrain = torch.tensor(train_images, dtype=torch.float32, device=device).reshape(len(train_images), 1, 28, 28)
# Labels: one-hot rows as a float32 tensor on `device`, keeping their (N, classes) shape.
Ytrain = torch.tensor(train_labels, dtype=torch.float32, device=device).reshape(train_labels.shape)

In [54]:
def train(model, Xt, Yt):
    """Run one optimisation step on (Xt, Yt) and return the loss as a Python float.

    Uses the notebook-level globals ``optimizer`` and ``criterion``; both must be
    defined before this function is called. The returned value is the loss
    computed before the weights are updated.
    """
    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass and loss
    step_loss = criterion(model(Xt), Yt)

    # back-propagate, then let the optimizer update the weights
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

In [55]:
def training_loop(model, data, target,threshold=1e-6,max_iter=100000):
    """Run the training loop and return the list of per-step losses.

    Calls ``train(model, data, target)`` up to ``max_iter`` times, showing a tqdm
    progress bar, and stops early when either

    * the loss changes by less than ``threshold`` between two consecutive steps, or
    * the loss becomes NaN or infinite (further steps cannot recover; the
      non-finite value is kept as the last entry so the caller can see it).

    Args:
        model: model passed through to ``train``.
        data: inputs passed through to ``train``.
        target: targets passed through to ``train``.
        threshold: minimum absolute change in loss required to keep going.
        max_iter: upper bound on the number of training steps.

    Returns:
        list of float losses, one per executed step.
    """
    import math  # local import so the function does not depend on the notebook's import cell

    losses = []
    # Infinity as the starting value means the first step can never trigger the
    # convergence test, whatever the initial loss is.
    prior_loss = math.inf
    for _ in tqdm.tqdm(range(max_iter)):
        loss = train(model, data, target)
        losses.append(loss)
        if not math.isfinite(loss):
            # diverged: every later step would also be non-finite
            break
        if abs(loss - prior_loss) < threshold:
            break
        prior_loss = loss

    return losses
    

In [56]:
def plot_loss(losses):
    """Build a bokeh line chart of the loss history and return it (without showing it)."""
    steps = list(range(len(losses)))
    chart = figure(title="Loss over time", x_axis_label="Epoch", y_axis_label="Loss")
    chart.line(x=steps, y=losses)
    return chart

In [57]:
# Fresh network, moved to the selected device.
model = Network().to(device)
# `optimizer` must keep this exact global name: train() looks it up when it is called.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
# Full-batch training: every step uses the whole training set.
losses = training_loop(model, Xtrain, Ytrain, threshold=1e-6, max_iter=10000)
show(plot_loss(losses))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [32:47<00:00,  5.08it/s]


In [58]:
# Training-set accuracy: fraction of samples whose highest-scoring class matches the
# one-hot label. The forward pass runs under no_grad because nothing is back-propagated
# here, so no autograd graph needs to be kept for the whole dataset. softmax is skipped
# because it is monotonic and therefore does not change which class has the top score.
with torch.no_grad():
    predicted_classes = model(Xtrain).argmax(dim=1)
(predicted_classes == Ytrain.argmax(dim=1)).sum().item() / Xtrain.shape[0]

0.9994999499949995

In [61]:
# Inspect the trained weights as (name, Parameter) pairs; this prints every value, so the output is long.
list(model.named_parameters())

[('conv1.weight',
  Parameter containing:
  tensor([[[[-1.8695e-01, -1.7938e-01,  1.1258e-01,  2.2363e-01, -1.3567e-01],
            [ 1.1001e-01, -1.7596e-01, -1.6005e-01, -4.8153e-02, -2.1528e-01],
            [-2.4229e-01, -1.9769e-01, -8.9774e-02, -4.8138e-02,  1.7466e-01],
            [-7.2506e-02, -3.1235e-02, -8.4170e-02,  2.1055e-01,  1.6907e-01],
            [-4.5415e-02, -5.2372e-02, -2.2685e-01, -2.7263e-02,  2.1839e-01]]],
  
  
          [[[ 1.4079e-01, -5.3934e-02,  1.9264e-01,  1.2276e-01, -1.9391e-02],
            [-9.9190e-03, -1.1849e-01,  1.9455e-01,  1.8547e-01,  8.1716e-02],
            [-1.1359e-02, -1.8637e-01, -1.7495e-01,  1.5469e-01,  7.7679e-02],
            [-1.8651e-01,  5.4059e-02,  8.6442e-02, -1.0476e-01,  2.0535e-01],
            [ 1.1802e-01, -1.4401e-01,  4.3815e-02,  1.9103e-01,  1.6689e-01]]],
  
  
          [[[ 5.2712e-02,  1.4311e-01,  1.3769e-01,  1.7062e-01, -1.2337e-01],
            [ 8.0534e-02, -8.2513e-02, -1.8356e-02,  2.2462e-01, -1.9641e