In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torchvision.datasets as datasets
from tqdm.notebook import tqdm

<img src = "LeNet5_architecture.png">

In this implementation, we use the paper's original architecture. 

In [2]:
mnist = datasets.MNIST(root = "../data", download = True)

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
bs = 100

In [4]:
class MNISTTrainingDataset(torch.utils.data.Dataset):
    def __init__(self, data, targets, mean, std):
        super().__init__()
        train = data.reshape((-1,1,28,28))
        flat = train.flatten().to(torch.float32)

        self.train = ((train - mean)/ std).to(device)
        self.label = targets.to(torch.long).to(device)
        
    def __getitem__(self, i):
        return self.train[i], self.label[i]
    
    def __len__(self):
        return self.train.shape[0]

In [5]:
def train_test_datasets():
    #we have to use the training mean on the test set
    train = mnist.data[:50000].reshape((-1,1,28,28))
    flat = train.flatten().to(torch.float32)
    mean = flat.mean()
    std = flat.std()
    
    return MNISTTrainingDataset(mnist.data[:50000], mnist.targets[:50000], mean,std),\
        MNISTTrainingDataset(mnist.data[50000:], mnist.targets[50000:], mean, std)   

In [6]:
mnist_train, mnist_test = train_test_datasets()

In [7]:
train_dataloader = torch.utils.data.DataLoader(mnist_train, batch_size = bs, shuffle = True)
test_dataloader = torch.utils.data.DataLoader(mnist_train, batch_size = bs, shuffle = False)

In [8]:
class SubSampling(nn.Module):
    def __init__(self, kernel_size, stride = None):
        super().__init__()
        self.a = torch.tensor(0.001, requires_grad = True, dtype = torch.float32, device = device)
        self.b = torch.tensor(0.001, requires_grad = True, dtype = torch.float32, device = device)
        
        self.kernel_size = kernel_size
        self.stride = stride if stride else kernel_size
            
    def forward(self, x):
        with torch.no_grad():
            channels = x.shape[1]

        kernel = torch.ones(channels, channels, self.kernel_size, self.kernel_size).type(torch.float32).to(device)
        
        x = F.conv2d(x, kernel, stride = self.stride)
        return self.a * x + self.b 
    

In [9]:
class LeNetOriginal(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, padding = 2)
        self.subsampling1 = SubSampling(2 , 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.subsampling2 = SubSampling(2,2)
        self.conv3 = nn.Conv2d(16,120,5)
        self.f1 = nn.Linear(120, 84)
        self.f2 = nn.Linear(84,10)
        
    def forward(self, x):
        C1 = self.conv1(x)
        S2 = torch.sigmoid(self.subsampling1(C1))
        C3 = self.conv2(S2)
        S4 = torch.sigmoid(self.subsampling2(C3))
        C5 = torch.sigmoid(self.conv3(S4).squeeze())
        F6 = torch.sigmoid(self.f1(C5))
        return self.f2(F6)
        

In [10]:
lenet = LeNetOriginal().cuda()

In [17]:
loss = nn.CrossEntropyLoss()
optim = torch.optim.SGD(lenet.parameters(), lr=0.003)

In [12]:
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optim, lambda e: 0.95)

In [13]:
def get_accuracy():
    accumulated_accuracy = []
    for batch, truth in test_dataloader:
        weighted_rate = (lenet(batch).argmax(dim = 1) == truth).sum().cpu().numpy()
        accumulated_accuracy.append(weighted_rate)
    return sum(accumulated_accuracy)/bs/len(test_dataloader)

In [14]:
accuracy = []

In [18]:
#training
epochs = 10

for epoch in tqdm(range(epochs)):
    accuracy.append(get_accuracy())
    for batch, preds in train_dataloader:
        y_pred = lenet(batch)
        
        l = loss(y_pred, preds)
        
        l.backward()
        
        optim.step()
        
        optim.zero_grad()


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [19]:
accuracy

[0.09012,
 0.09936,
 0.11356000000000001,
 0.09976,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.10202,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001,
 0.11356000000000001]

We are stuck in a local minimum :(

In [None]:
mnist["test"]

In [26]:
foo.shape

torch.Size([100, 1, 28, 28])

In [28]:
foo=next(iter(train_dataloader))[0].type(torch.float32).cpu()

In [29]:
kernel = torch.tensor([[[0,0],[0,0]] for _ in range(5)]).unsqueeze(1).type(torch.float32)

In [30]:
kernel = torch.tensor([[[1,1],[1,1]] for _ in range(5)]).unsqueeze(1).type(torch.float32)

In [31]:
kernel.shape

torch.Size([5, 1, 2, 2])

In [32]:
F.conv2d(foo, kernel).shape

torch.Size([100, 5, 27, 27])

In [46]:
foo


tensor([[[[-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          ...,
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241]]],


        [[[-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          ...,
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241]]],


        [[[-0.4241, -0.4241, -0.4241,  ..., -0.4241, -0.4241, -0.4241],
          [-0.4241, -0.424