In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
import torch
import random
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tqdm.auto import tqdm

In [13]:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### For understanding how the input output works for LSTMs

In [14]:
embedding_size = 28
hidden_size = 1000
cell_size = 1000
batch_size = 100
sequence_length = 28
num_layers = 1
bidirectional = False

lstm = nn.LSTM(
            input_size = embedding_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            bidirectional = bidirectional,
            batch_first = True
        )

In [15]:
X = torch.randn((batch_size, sequence_length, embedding_size))
h_0 = torch.zeros(((int(bidirectional) + 1)*num_layers, batch_size, hidden_size))
c_0 = torch.zeros(((int(bidirectional) + 1)*num_layers, batch_size, hidden_size)) 

In [16]:
print(f"Input Vector size: - {X.size()}")
print(f"Initial Hidden State size: - {h_0.size()}")
print(f"Initial Cell State size: - {c_0.size()}")

Input Vector size: - torch.Size([100, 28, 28])
Initial Hidden State size: - torch.Size([1, 100, 1000])
Initial Cell State size: - torch.Size([1, 100, 1000])


In [17]:
outputs, (h_n, c_n) = lstm(X, (h_0, c_0))

In [18]:
print(f"Output size: - {outputs.size()}")
print(f"Final Hidden State size: - {h_n.size()}")
print(f"Final Cell State size: - {c_n.size()}")

Output size: - torch.Size([100, 28, 1000])
Final Hidden State size: - torch.Size([1, 100, 1000])
Final Cell State size: - torch.Size([1, 100, 1000])


#### Implementing the Architecture using Unidirectional and 1 Layer RNN

In [19]:
class LSTM(nn.Module):
    def __init__(self,batch_size, embedding_size, hidden_size, num_classes):
        super(LSTM, self).__init__()
        self.batch_size = batch_size
        self.embedding_size = embedding_size #aka input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        #defining layers
        self.lstm = nn.LSTM(
            input_size = self.embedding_size,
            hidden_size = self.hidden_size,
            num_layers = 1,
            batch_first = True
        )
        self.linear = nn.Linear(in_features = self.hidden_size,out_features = self.num_classes)
        
        #defining activation function
        self.tanh = nn.Tanh()
        
    def forward(self, X):
        #X.size() = (batch_size = self.batch_size, sequence_length = 28, input_size = 28)
        _,(X,_)  = self.lstm(X.squeeze(1))
        X = self.tanh(X)
        X = self.linear(X.squeeze(0))
        return X

#### Hyperparameters

In [20]:
in_channels = 1
num_classes = 10
learning_rate = 0.001
batch_size = 5000
num_epochs = 100

#### Loading Data (Can also load from digit recognizer but have to do some preprocessing)

In [21]:
#Downloading Data Set of MNIST
train_dataset = datasets.MNIST(root="/kaggle/working/", train = True, transform = transforms.ToTensor(), download = True)
test_dataset = datasets.MNIST(root="/kaggle/working/", train = False, transform = transforms.ToTensor(), download = True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /kaggle/working/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting /kaggle/working/MNIST/raw/train-images-idx3-ubyte.gz to /kaggle/working/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /kaggle/working/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting /kaggle/working/MNIST/raw/train-labels-idx1-ubyte.gz to /kaggle/working/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /kaggle/working/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting /kaggle/working/MNIST/raw/t10k-images-idx3-ubyte.gz to /kaggle/working/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /kaggle/working/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting /kaggle/working/MNIST/raw/t10k-labels-idx1-ubyte.gz to /kaggle/working/MNIST/raw



In [22]:
#Creating Generator a.k.a Dataloader
train_dataloader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = True)

#### Creating and Instance of model

In [23]:
model = LSTM(batch_size = batch_size, embedding_size = 28, hidden_size = 1000, num_classes =10)

In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
model = model.to(device)
for epoch in tqdm(range(num_epochs + 1)):
    epoch_loss = 0
    for batch in train_dataloader:
        batch[0] = batch[0].to(device)
        batch[1] = batch[1].to(device)
        inference = model.forward(batch[0])
        
        loss = criterion(inference, batch[1])
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        epoch_loss += loss.item()
    if epoch%10 == 0:
        print(f"epoch:- {epoch.__str__().zfill(3)} loss :- {epoch_loss/len(train_dataloader)}")

  0%|          | 0/101 [00:00<?, ?it/s]

epoch:- 000 loss :- 2.139083723227183
epoch:- 010 loss :- 0.28915897384285927
epoch:- 020 loss :- 0.08961907401680946
epoch:- 030 loss :- 0.044216784027715526
epoch:- 040 loss :- 0.01975997940947612
epoch:- 050 loss :- 0.008582081723337373
epoch:- 060 loss :- 0.009046989143826067
epoch:- 070 loss :- 0.0013831038668286055
epoch:- 080 loss :- 0.02923127884666125
epoch:- 090 loss :- 0.0007572966011745544
epoch:- 100 loss :- 0.00033069648149345693


In [25]:
def evaluate(model, dataloader):
    model.eval()
    with torch.no_grad():
        num_correct = 0
        total_example = 0
        for batch in dataloader:
            batch[0] = batch[0].to(device)
            batch[1] = batch[1].to(device)

            inference = model.forward(batch[0])
            #output of max is max_value for each example with index location of the max in the last 10 neuron (y_pred) so no need of using softmax activation
            _, y_pred = torch.max(inference, dim = 1)
            num_correct += (y_pred == batch[1]).sum()
            total_example += inference.shape[0]
            
    print(f"Accuracy:- {num_correct/total_example}")
            

In [26]:
# Evaluation on Test Dataset
evaluate(model, test_dataloader)

Accuracy:- 0.9866999983787537


In [27]:
# Evaluation on Train Dataset
evaluate(model, train_dataloader)

Accuracy:- 0.9999833703041077
