## Assignment 7 by Agajan Torayev (matr.: 3067341), Joan Plepi (matr.: 3056655)

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import math

# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [1]:
# http://pytorch.org/
# Environment bootstrap: pick the PyTorch 0.4.1 wheel that matches this
# interpreter and the locally visible CUDA runtime, install it, then import it.
# NOTE(review): the `!` lines are IPython shell escapes, so this cell only runs
# inside IPython/Jupyter, and the shell pipeline assumes a Linux host.
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel platform tag built from implementation abbreviation, version and ABI tag.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# List cached shared libraries, keep the cudart entries and rewrite a trailing
# ".<major>.<minor>" into "cu<major><minor>".
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Use the first detected CUDA tag only when an NVIDIA device node exists; otherwise 'cpu'.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# torch is pinned to 0.4.1 by the wheel URL; torchvision is installed unpinned.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

tcmalloc: large alloc 1073750016 bytes == 0x5818a000 @  0x7ff87ad272a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


### Prepare Data

In [4]:
batch_size = 100

# Both splits are stored under ./data (downloaded on first use) and share the
# same preprocessing, so a single ToTensor transform is reused.
to_tensor = transforms.ToTensor()

trainset = datasets.MNIST(
    root='./data',
    train=True,
    transform=to_tensor,
    download=True,
)
testset = datasets.MNIST(
    root='./data',
    train=False,
    transform=to_tensor,
    download=True,
)

# Training batches are reshuffled; test batches keep a fixed order.
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    shuffle=True,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=batch_size,
    shuffle=False,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


### LSTM Model using nn.LSTM

In [0]:
class LSTMModel(nn.Module):
    """Sequence classifier: a single-layer ``nn.LSTM`` followed by a linear read-out.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden and cell state.
        output_dim: number of scores produced by the final linear layer.
        bv: value written into the forget-gate slice of every LSTM bias
            vector. Both the input-hidden and the hidden-hidden bias receive
            it, so the forget gate's pre-activation offset is ``2 * bv``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, bv=1.0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # nn.LSTM packs each bias as four equal chunks ordered
        # (input, forget, cell, output); the forget gate is the second quarter.
        with torch.no_grad():
            for name, param in self.lstm.named_parameters():
                if "bias" in name:
                    n = param.size(0)
                    param[n // 4:n // 2].fill_(bv)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) computed from the LSTM output
            at the last time step.
        """
        # Zero initial states created on x's device and dtype, so the model
        # does not rely on a module-level ``device`` variable.
        h0 = x.new_zeros(1, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(1, x.size(0), self.hidden_dim)

        out, _ = self.lstm(x, (h0, c0))

        # Only the last time step feeds the classifier.
        return self.fc(out[:, -1, :])

### LSTM Model without nn.LSTM

In [0]:
class MyLSTM(nn.Module):
    """LSTM sequence classifier assembled from per-gate ``nn.Linear`` layers.

    The authors note that this implementation is slower than ``nn.LSTM``:
    every gate has its own pair of linear layers and the time steps are
    unrolled in a Python loop.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the hidden and cell state.
        output_dim: number of scores produced by the final linear layer.
        bv: value used to fill the biases of both forget-gate layers. The two
            layers are summed, so the forget gate's pre-activation offset is
            ``2 * bv``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, bv=1.0):
        super().__init__()

        self.hidden_dim = hidden_dim

        # input gate
        self.linear_ii = nn.Linear(input_dim, hidden_dim)
        self.linear_hi = nn.Linear(hidden_dim, hidden_dim)

        # forget gate
        self.linear_if = nn.Linear(input_dim, hidden_dim)
        self.linear_hf = nn.Linear(hidden_dim, hidden_dim)

        # cell gate
        self.linear_ig = nn.Linear(input_dim, hidden_dim)
        self.linear_hg = nn.Linear(hidden_dim, hidden_dim)

        # output gate
        self.linear_io = nn.Linear(input_dim, hidden_dim)
        self.linear_ho = nn.Linear(hidden_dim, hidden_dim)

        # The forget-gate biases get a dedicated constant instead of the
        # default nn.Linear initialisation.
        nn.init.constant_(self.linear_if.bias, bv)
        nn.init.constant_(self.linear_hf.bias, bv)

        # Affine layer for output
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) computed from the hidden state
            after the last time step.
        """
        batch_size, seqlen = x.size(0), x.size(1)

        # Zero initial states created on x's device and dtype, so the model
        # does not rely on a module-level ``device`` variable.
        h_t = x.new_zeros(batch_size, self.hidden_dim)
        c_t = x.new_zeros(batch_size, self.hidden_dim)

        for t in range(seqlen):
            x_t = x[:, t, :]
            i_t = torch.sigmoid(self.linear_ii(x_t) + self.linear_hi(h_t))
            f_t = torch.sigmoid(self.linear_if(x_t) + self.linear_hf(h_t))
            g_t = torch.tanh(self.linear_ig(x_t) + self.linear_hg(h_t))
            o_t = torch.sigmoid(self.linear_io(x_t) + self.linear_ho(h_t))
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)

        return self.fc(h_t)

### GRU Model without nn.GRU

In [0]:
class MyGRU(nn.Module):
    """GRU sequence classifier assembled from per-gate ``nn.Linear`` layers.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the hidden state.
        output_dim: number of scores produced by the final linear layer.
        bias: accepted for signature compatibility but not used by this
            implementation; all layers keep the default nn.Linear
            initialisation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, bias=1.0):
        super().__init__()
        self.hidden_dim = hidden_dim

        # reset gate
        self.linear_ir = nn.Linear(input_dim, hidden_dim)
        self.linear_hr = nn.Linear(hidden_dim, hidden_dim)

        # update gate
        self.linear_iz = nn.Linear(input_dim, hidden_dim)
        self.linear_hz = nn.Linear(hidden_dim, hidden_dim)

        # candidate state
        self.linear_in = nn.Linear(input_dim, hidden_dim)
        self.linear_hn = nn.Linear(hidden_dim, hidden_dim)

        # Affine layer for output
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) computed from the hidden state
            after the last time step.
        """
        # Zero initial state created on x's device and dtype, so the model
        # does not rely on a module-level ``device`` variable.
        h_t = x.new_zeros(x.size(0), self.hidden_dim)

        seqlen = x.size(1)

        for t in range(seqlen):
            x_t = x[:, t, :]
            r_t = torch.sigmoid(self.linear_ir(x_t) + self.linear_hr(h_t))
            z_t = torch.sigmoid(self.linear_iz(x_t) + self.linear_hz(h_t))
            # The reset gate scales the hidden contribution to the candidate.
            n_t = torch.tanh(self.linear_in(x_t) + r_t * self.linear_hn(h_t))
            # z_t interpolates between the candidate and the previous state.
            h_t = (1 - z_t) * n_t + z_t * h_t

        return self.fc(h_t)

### Training

In [46]:
# --- Hyper-parameters ---------------------------------------------------------
input_dim = 28    # features per time step (each image is viewed as seq_len x input_dim below)
output_dim = 10   # number of classes scored by the model
seq_len = 28      # time steps per sample; the whole sequence is back-propagated, nothing is truncated

n_iters = 6000    # NOTE(review): not read by the loop below; kept in case a later cell uses it
num_epochs = 1

hidden_dim = 100
lr = 1e-1
eval_every = 10   # run a full test-set evaluation every this many training iterations


def evaluate_accuracy(model, loader):
    """Return the accuracy, in percent, of ``model`` over all batches in ``loader``.

    Batches are reshaped to (batch, seq_len, input_dim) and moved to the
    module-level ``device``. Gradients are disabled, and the model's
    training/eval mode is restored before returning.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for test_images, test_labels in loader:
            test_images = test_images.view(-1, seq_len, input_dim).to(device)
            test_labels = test_labels.to(device)

            predicted = model(test_images).argmax(dim=1)
            total += test_labels.size(0)
            # Compare on the model's device; only the scalar count leaves it.
            correct += (predicted == test_labels).sum().item()
    model.train(was_training)
    return 100.0 * correct / total


# --- Model, loss, optimiser ---------------------------------------------------
# To train the GRU variant instead, use: MyGRU(input_dim, hidden_dim, output_dim)
model = MyLSTM(input_dim, hidden_dim, output_dim, bv=5.0)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# --- Training -----------------------------------------------------------------
# Named `iteration` rather than `iter` so the builtin stays usable in later cells.
iteration = 0
for epoch in range(num_epochs):
    for images, labels in trainloader:
        images = images.view(-1, seq_len, input_dim).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        iteration += 1

        if iteration % eval_every == 0:
            acc = evaluate_accuracy(model, testloader)
            # The reported loss is that of the most recent training batch.
            print('Iteration: {}. Loss: {:.4f}. Accuracy: {:.2f}%'.format(iteration, loss.item(), acc))

Iteration: 10. Loss: 2.2967. Accuracy: 11.00%
Iteration: 20. Loss: 2.3114. Accuracy: 11.00%
Iteration: 30. Loss: 2.3026. Accuracy: 11.00%
Iteration: 40. Loss: 2.2953. Accuracy: 15.00%
Iteration: 50. Loss: 2.3000. Accuracy: 15.00%
Iteration: 60. Loss: 2.2857. Accuracy: 14.00%
Iteration: 70. Loss: 2.2794. Accuracy: 14.00%
Iteration: 80. Loss: 2.2798. Accuracy: 17.00%
Iteration: 90. Loss: 2.2754. Accuracy: 20.00%
Iteration: 100. Loss: 2.2698. Accuracy: 21.00%
Iteration: 110. Loss: 2.2798. Accuracy: 22.00%
Iteration: 120. Loss: 2.2641. Accuracy: 23.00%
Iteration: 130. Loss: 2.2587. Accuracy: 23.00%
Iteration: 140. Loss: 2.2733. Accuracy: 24.00%
Iteration: 150. Loss: 2.2692. Accuracy: 23.00%
Iteration: 160. Loss: 2.2359. Accuracy: 20.00%
Iteration: 170. Loss: 2.2364. Accuracy: 21.00%
Iteration: 180. Loss: 2.2496. Accuracy: 22.00%
Iteration: 190. Loss: 2.2315. Accuracy: 23.00%
Iteration: 200. Loss: 2.1998. Accuracy: 24.00%
Iteration: 210. Loss: 2.2043. Accuracy: 24.00%
Iteration: 220. Loss: 