# Homework  Exercise  3

Full Name: Vasileios Lioutas

For this exercise, I had to develop two text classification models using Vanilla RNN and LSTM for the IMDB Movie Review Dataset.

### Step 0 - Install necessary libraries (Skip if already installed)

The following libraries are needed to be installed along with PyTorch in order to run the following notebook.

In [None]:
%%bash
pip install torchtext
pip install spacy
python -m spacy download en

### Step 1 - Load the necessary libraries

In [9]:
import os

import pandas as pd
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

torch.backends.cudnn.enabled=True # make sure we use CuDNN for faster code

SEED = 123

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

use_cuda = True
device = torch.device("cuda") if use_cuda else torch.device("cpu")

use_glove = True

### Step 2 - Load dataset

In order to load the dataset, I used the library "torchtext". Torchtext provides pre-built loaders for common NLP datasets (i.e. IMDB dataset) and gives you the flexibility to build a vocabulary of the word easily and with support of pre-trained vectors like Glove.

For my experiments, I chose to use also Spacy (one of the best advanced NLP libraries for Python) to perform the tokenization of the words. I've decided to work with the version of Glove that is contains 6 billion tokens and the 300D vectors. More information can be found here: https://nlp.stanford.edu/projects/glove/

In [2]:
def getData(batch_size=128, use_glove=False, device=torch.device("cpu")):

    # set up fields
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True, tokenize='spacy')
    LABEL = data.Field(sequential=False, dtype=torch.float32, unk_token=None, is_target=True)

    # make splits for data
    train, test = datasets.IMDB.splits(TEXT, LABEL)
    
    # build the vocabulary
    if use_glove:
        TEXT.build_vocab(train, max_size=25000, vectors=GloVe(name='6B', dim=300))
    else:
        TEXT.build_vocab(train)
        
    LABEL.build_vocab(train)

    # make iterator for splits
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), sort_key=lambda x: len(x.text), 
        batch_sizes=(batch_size, 300), sort_within_batch=True, device=device, repeat=False)

    return train_iter, test_iter, TEXT.vocab

Below we use the defined function to load the dataset. This will take some time the first time to download the dataset and the Glove vectors.

In [3]:
training_iterator,testing_iterator,vocabulary = getData(batch_size=16, device=device, use_glove=use_glove)

### Step 3 - Defining the model

Below I've created a function to help me build all the different RNN models that I'll use for my experiments. It supports stacked RNNs, bidirectional RNNs and dropout.

In [4]:
class RNN(nn.Module):
    def __init__(self, model='vanilla', vocab=None, hidden_size=128, dropout=0.5, num_layers=1, bidirectional=False):
        super(RNN, self).__init__()
        self.model = model
        self.vocab = vocab
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        self.emb_size = 300
        
        if self.vocab.vectors is None:
            self.emb = nn.Embedding(len(self.vocab), self.emb_size)
        else:
            self.emb = nn.Embedding(len(self.vocab), self.vocab.vectors.shape[1], padding_idx=1) # we define which id corresponds to pad id
            self.emb.weight.data.copy_(self.vocab.vectors) # copy the pretrained Glove vector as the initialization of the embeddings
            
        rnn_dropout = 0 if self.num_layers == 1 else self.dropout # PyTorch uses dropout only between 2 or more stacked RNN layers, thus if there's only one layer keep dropout to zero
        if model == 'vanilla':
            self.rnn = nn.RNN(input_size=self.emb_size, hidden_size=self.hidden_size, batch_first=True, num_layers=self.num_layers, dropout=rnn_dropout, bidirectional=self.bidirectional)
        else:
            self.rnn = nn.LSTM(input_size=self.emb_size, hidden_size=self.hidden_size, batch_first=True, num_layers=self.num_layers, dropout=rnn_dropout, bidirectional=self.bidirectional)
        
        h_size = self.hidden_size*2 if self.bidirectional else self.hidden_size # if we use bidirection RNN we get a twice the size of the hidden state size we defined
        self.out = nn.Linear(h_size, 1)
        
        self.dropout_layer = nn.Dropout(self.dropout)
        
    def forward(self, x, lengths):
        embed_x = self.dropout_layer(self.emb(x)) # pass the sequences through the embedding layer and through dropout
        
        # PyTorch allows for dynamic RNNs by packing the padded sequences
        packed_emb = nn.utils.rnn.pack_padded_sequence(embed_x, lengths, batch_first=True) 

        # now run through RNN
        output, hidden = self.rnn(packed_emb)

        # undo the packing operation
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
                        
        x = self.dropout_layer(output.mean(1)) # taking the average output representation of all times t in the sequence
        x = self.out(x.squeeze(1))
        return x.squeeze()

Below are the functions responsible for training and testing.

In [5]:
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    loss_sum = 0
    loss_count = 0
    for batch_idx, data in enumerate(train_loader):
        (x, x_lengths), y = data.text, data.label # extract the sequences and the true lengths of them alogn with the labels
        optimizer.zero_grad()
        output = model(x, x_lengths)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item()
        loss_count += 1
    return round(loss_sum/loss_count, 6)


def test(model, test_loader, criterion, epoch):
    model.eval()
    test_loss = 0
    acc = 0
    with torch.no_grad():
        for data in test_loader:
            (x, x_lengths), y = data.text, data.label
            output = model(x, x_lengths)
            test_loss += criterion(output, y).sum().item()
            
            rounded_preds = torch.round(torch.sigmoid(output))
            correct = rounded_preds.eq(y.view_as(rounded_preds)).float()
            acc += (correct.sum()/len(correct)).item()

    test_loss /= len(test_loader)
    acc /= len(test_loader)
    return round(test_loss, 6), 100 * round(acc, 4)

def experiment(model, epochs, training_iterator, testing_iterator, optimizer, criterion):
    best_acc = -1
    best_train_loss = -1
    best_test_loss = -1
    best_epoch = -1
    for epoch in range(1, epochs+1):
        train_loss = train(model, training_iterator, optimizer, criterion, epoch)
        test_loss, acc = test(model, testing_iterator, criterion, epoch)
        if acc >= best_acc: # used for find which epoch had the best accuracy
            best_acc = acc
            best_train_loss = train_loss
            best_test_loss = test_loss
            best_epoch = epoch
            
    return best_epoch, best_train_loss, best_test_loss, best_acc

### Step 4 - Running the experiments

Below I'm going to run all the experiments for the different hyperparameters combinations and save the results into a CVS file. I decided that the three different hyperparameters that I will explore would be the dropout rate, the number of stacked RNN layers and if the RNNs be bidirectional or not.

Note that due to computational resource, I couldn't run the experiments for 200 and 500 hidden states that include bidirectional model and 2 stacked RNN layers. 

All the experiments run for 5 epochs and with fixed 16 batch size (I couldn't test with different batch sizes due to computational resources). The learning rate was set to default according to the optimization algorithm (Adam). 

In [None]:
hidden_states = [20, 50, 100, 200, 500]
dropouts = [0, 0.2, 0.5]
num_layers = [1, 2]
bidirectional = [False, True]
rnn_types = ['vanilla', 'lstm']

epochs = 5

# We use the binary cross entropy error.
# This one includes the sigmoid that the logits have to go through before the computation of the loss.
# It's recommended this way instead of manually use the sigmoid in the model architecture 
# since PyTorch uses the log-sum-exp trick for better numerical stability.
criterion = nn.BCEWithLogitsLoss().to(device)

counter = 0

filename = "results.csv"

if os.path.exists(filename):
    os.remove(filename)

with open(filename, 'a+') as out:
   out.write("rnn_type,"+"hidden_state,"+"dropout,"+"num_layers,"+"bidirectional,"+"best_epoch,"+"train_loss,"+"test_loss,"+"accuracy"+'\n')

for rnn_type in rnn_types:
    for hd in hidden_states:
        for dp in dropouts:
            for nl in num_layers:
                for bd in bidirectional:
                    counter += 1
                    
                    model = RNN(model=rnn_type, vocab=vocabulary, hidden_size=hd, dropout=dp, num_layers=nl, bidirectional=bd).to(device)
                    optimizer = optim.Adam(model.parameters())
                    
                    best_epoch, best_train_loss, best_test_loss, best_acc = experiment(model, epochs, training_iterator, testing_iterator, optimizer, criterion)
                    
                    # used to empty gpu cache memory after each experiment
                    del model
                    torch.cuda.empty_cache()
                    
                    line = str(rnn_type)+","+str(hd)+","+str(dp)+","+str(nl)+","+str(bd)+","+str(best_epoch)+","+str(best_train_loss)+","+str(best_test_loss)+","+str(best_acc)
                    print(counter, line)

                    with open(filename, 'a+') as out:
                       out.write(line + '\n')

### Step 5 - Results

In [20]:
results = pd.read_csv("results.csv")

First, I report what is the best score between the best LSTM and vanilla RNN models. As you can see, LSTM has a much better accuracy than the vanilla RNN. An interesting result is that the LSTM model has achieved the best score after only one epoch and vanilla RNN had to run for 4 epochs and using 2 bidirectional RNN layers.

In [14]:
results.groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,112,lstm,500,0.2,1,False,1.0,0.330078,0.247589,90.29
vanilla,11,vanilla,20,0.5,2,True,4.0,0.189767,0.312822,88.41


Next, I created some tables to show in each hidden state, which model best.

In [21]:
results[results['hidden_state']==20].groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,71,lstm,20,0.5,2,True,3.0,0.195013,0.31564,88.68
vanilla,11,vanilla,20,0.5,2,True,4.0,0.189767,0.312822,88.41


In [22]:
results[results['hidden_state']==50].groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,79,lstm,50,0.2,2,True,1.0,0.401064,0.275062,89.03
vanilla,15,vanilla,50,0.0,2,True,1.0,0.404624,0.302325,87.91


In [23]:
results[results['hidden_state']==100].groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,87,lstm,100,0.0,2,True,1.0,0.360739,0.262898,89.72
vanilla,35,vanilla,100,0.5,2,True,4.0,0.173499,0.342928,87.45


In [24]:
results[results['hidden_state']==200].groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,104,lstm,200,0.5,1,False,1.0,0.355361,0.259895,89.69
vanilla,40,vanilla,200,0.2,1,False,4.0,0.166254,0.322346,87.22


In [25]:
results[results['hidden_state']==500].groupby('rnn_type').apply(lambda t: t[t.accuracy==t.accuracy.max()])

Unnamed: 0_level_0,Unnamed: 1_level_0,rnn_type,hidden_state,dropout,num_layers,bidirectional,best_epoch,train_loss,test_loss,accuracy
rnn_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lstm,112,lstm,500,0.2,1,False,1.0,0.330078,0.247589,90.29
vanilla,52,vanilla,500,0.2,1,False,3.0,0.181922,0.330636,86.82


It's easy to see that the best vanilla RNN models were 2-layer bidirectional models but with a hidden state of 200 and 500 there are enough parameters to fit a good model and the versions with 2-layers and bidirectional options for 200 and 500 probably overfitted the training set.

Also, as we increase the hidden state of the LSTM we notice that we gradually get better results.