## Assignment 7 by Agajan Torayev (matr.: 3067341), Joan Plepi (matr.: 3056655)

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import math

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### Prepare Data

In [2]:
batch_size = 100

trainset = datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
testset = datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor(), download=True)

trainloader = torch.utils.data.DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(dataset=testset, batch_size=batch_size, shuffle=False)

### LSTM Model using nn.LSTM

In [3]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, bv=1.0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        # init biases for forget gate
        for names in self.lstm._all_weights:
            for name in filter(lambda n: "bias" in n,  names):
                bias = getattr(self.lstm, name)
                n = bias.size(0)
                start, end = n//4, n//2
                bias.data[start:end].fill_(bv)
    
    def forward(self, x):
        # initialize hidden and cell states to zeros
        h0 = torch.zeros(1, x.size(0), self.hidden_dim, device=device)
        c0 = torch.zeros(1, x.size(0), self.hidden_dim, device=device)
            
        out, (hn, cn) = self.lstm(x, (h0, c0))
        
        out = self.fc(out[:, -1, :])

        return out

### LSTM Model without nn.LSTM

In [4]:
"""This implementation is slower than nn.LSTM because of explicit weight vectors."""
class MyLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, bv=1.0):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        
        # input gate
        self.linear_ii = nn.Linear(input_dim, hidden_dim)
        self.linear_hi = nn.Linear(hidden_dim, hidden_dim)
        
        # forget gate
        self.linear_if = nn.Linear(input_dim, hidden_dim)
        self.linear_hf = nn.Linear(hidden_dim, hidden_dim)
        
        # cell gate
        self.linear_ig = nn.Linear(input_dim, hidden_dim)
        self.linear_hg = nn.Linear(hidden_dim, hidden_dim)
        
        # output gate
        self.linear_io = nn.Linear(input_dim, hidden_dim)
        self.linear_ho = nn.Linear(hidden_dim, hidden_dim)
        
        # initialize biases for forget gate differently
        self.linear_if.bias.data.fill_(bv)
        self.linear_hf.bias.data.fill_(bv)
        
        # Affine layer for output
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # initialize hidden and cell states to zeros
        h_t = torch.zeros(x.size(0), self.hidden_dim, device=device)
        c_t = torch.zeros(x.size(0), self.hidden_dim, device=device)
        
        seqlen = x.shape[1]
        for t in range(seqlen):
            xb_t = x[:, t, :]
            i_t = nn.Sigmoid()(self.linear_ii(xb_t) + self.linear_hi(h_t))
            f_t = nn.Sigmoid()(self.linear_if(xb_t) + self.linear_hf(h_t))
            g_t = nn.Tanh()(self.linear_ig(xb_t) + self.linear_hg(h_t))
            o_t = nn.Sigmoid()(self.linear_io(xb_t) + self.linear_ho(h_t))
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * nn.Tanh()(c_t)
        
        out = self.fc(h_t)
        return out

### Training

In [5]:
input_dim = 28
output_dim = 10
seq_len = 28 # for truncated BPTT

n_iters = 6000
num_epochs = 1

hidden_dim = 100
lr = 1e-1

model = MyLSTM(input_dim, hidden_dim, output_dim, bv=5.0)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(trainloader):
        images = images.view(-1, seq_len, input_dim).to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        iter += 1
        
        if iter % 10 == 0:
            with torch.no_grad():
                correct = 0
                total = 0

                for images, labels in testloader:
                    images = images.view(-1, seq_len, input_dim).to(device)
                    labels = labels.to(device)

                    outputs = model(images)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    
                    correct += (predicted.cpu() == labels.cpu()).sum()
                
                acc = 100 * correct / total
                
                print('Iteration: {}. Loss: {:.4f}. Accuracy: {:.2f}%'.format(iter, loss.item(), acc))

Iteration: 10. Loss: 2.2702. Accuracy: 25.00%
Iteration: 20. Loss: 2.0724. Accuracy: 29.00%
Iteration: 30. Loss: 2.0219. Accuracy: 29.00%
Iteration: 40. Loss: 2.0427. Accuracy: 38.00%
Iteration: 50. Loss: 1.8270. Accuracy: 38.00%
Iteration: 60. Loss: 1.7080. Accuracy: 46.00%
Iteration: 70. Loss: 1.7286. Accuracy: 43.00%
Iteration: 80. Loss: 1.7146. Accuracy: 40.00%
Iteration: 90. Loss: 1.6722. Accuracy: 49.00%
Iteration: 100. Loss: 1.4075. Accuracy: 39.00%
Iteration: 110. Loss: 1.4525. Accuracy: 52.00%
Iteration: 120. Loss: 1.2916. Accuracy: 59.00%
Iteration: 130. Loss: 1.4501. Accuracy: 56.00%
Iteration: 140. Loss: 1.3513. Accuracy: 62.00%
Iteration: 150. Loss: 1.2870. Accuracy: 54.00%
Iteration: 160. Loss: 1.0241. Accuracy: 57.00%
Iteration: 170. Loss: 1.2253. Accuracy: 56.00%
Iteration: 180. Loss: 1.1363. Accuracy: 67.00%
Iteration: 190. Loss: 1.1715. Accuracy: 63.00%
Iteration: 200. Loss: 0.9465. Accuracy: 67.00%
Iteration: 210. Loss: 1.0106. Accuracy: 65.00%
Iteration: 220. Loss: 