# LSTM and GRU
## Kris Shen(ys852)
### 1. Preparing Data

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

# Seed PyTorch's CPU and CUDA generators so weight init and dropout repeat.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Review text is tokenized with spaCy; labels are stored as float tensors
# (the loss used later in this notebook compares float targets to logits).
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

train, test = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so `random_state=random.seed(SEED)` passed
# None and only worked through the seeding side effect. Seed explicitly,
# then hand the seeded generator state to `split`.
# NOTE: the name `train` is rebound to a function further down in this
# notebook, so re-running this cell afterwards changes what `train` means.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [2]:
# Build the token vocabulary from the training split only, capped at 25,000
# entries, and attach the pretrained "glove.6B.100d" vectors to it.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train)

In [3]:
BATCH_SIZE = 16

# One iterator per split, all with the same batch size.
# sort_key orders examples by token count — presumably so each batch holds
# reviews of similar length and needs less padding (confirm against the
# torchtext BucketIterator docs).
# repeat=False — presumably makes an iterator stop after one pass, so one
# `for batch in iterator` loop is one epoch (TODO confirm).
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

### 2.1. LSTM Model

In [4]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (stacked) LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when n_layers > 1) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists with more than one layer;
            # passing it for a single layer just triggers a warning.
            dropout=dropout if n_layers > 1 else 0,
        )
        # The classifier sees one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw scores of shape [batch size, output_dim].

        Args:
            x: token indices of shape [sent len, batch size].
        """
        embedded = self.dropout(self.embedding(x))
        # embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)
        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell   = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward
            # final states.
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only the top layer's final state is relevant.
            final = hidden[-1, :, :]
        # final = [batch size, hid dim * num directions]

        # No squeeze(0) here: it would drop the batch axis when batch size
        # is 1 and break callers that later do `.squeeze(1)`.
        return self.fc(self.dropout(final))

In [11]:
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100  # matches the "glove.6B.100d" vectors attached to TEXT.vocab
HIDDEN_DIM = 256  # hidden size per direction
OUTPUT_DIM = 1  # one raw score (logit) per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Positional order must match RNN.__init__'s parameter order.
lstm_model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [12]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the recorded output below is [25002, 100], i.e. the 25,000
# capped tokens plus two extra entries (presumably the unknown and padding
# tokens — TODO confirm), each with a 100-d vector.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [13]:
# Overwrite the embedding layer's initial weights in place with the pretrained vectors.
lstm_model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [14]:
import  torch.optim as optim
# Adam is used as the optimizer. It keeps running estimates of each
# parameter's gradient moments and scales every parameter's step individually.
# Only the parameters are passed, so Adam's default hyperparameters apply.
optimizer = optim.Adam(lstm_model.parameters())

In [16]:
# Binary cross-entropy on raw model outputs (logits); the sigmoid is applied
# inside the loss, so the model itself must not apply one.
criterion = nn.BCEWithLogitsLoss()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move both the model and the loss module to the chosen device.
lstm_model = lstm_model.to(device)
criterion = criterion.to(device)

In [17]:
import torch.nn.functional as F
#Calculate prediction accuracy
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits), one per example.
        y: target labels (0. or 1.) with the same shape as `preds`.

    Returns:
        A scalar tensor holding the fraction of correct predictions.
    """

    # Squash logits to probabilities, then round to the nearest class (0 or 1).
    # torch.sigmoid replaces the deprecated F.sigmoid alias.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # float so the division is not integer division
    acc = correct.sum() / len(correct)
    return acc

In [18]:
# This is used to train our model
def train(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`.

    Each batch must expose `.text` (model input) and `.label` (targets).
    Returns a tuple (mean batch loss, mean batch accuracy), both averaged
    over the number of batches in `iterator`.
    """
    model.train()  # training mode, so dropout is active

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; drop the trailing output dimension to match the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [19]:
#after the model is trained, we use this function to test our model
def evaluate(model, iterator, criterion):
    """Score `model` on every batch of `iterator` without updating it.

    Each batch must expose `.text` (model input) and `.label` (targets).
    Returns a tuple (mean batch loss, mean batch accuracy), both averaged
    over the number of batches in `iterator`.
    """
    model.eval()  # evaluation mode, so dropout is disabled

    loss_total = 0
    acc_total = 0

    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            # Drop the trailing output dimension to match the labels.
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    batch_count = len(iterator)
    return loss_total / batch_count, acc_total / batch_count

### 2.2. Train the model

In [20]:
N_EPOCHS = 5
import time
st=time.time()  # wall-clock start of the whole LSTM run
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(lstm_model, train_iterator, optimizer, criterion)# one pass over the training split, updating the weights
    valid_loss, valid_acc = evaluate(lstm_model, valid_iterator, criterion)# score on the validation split (no weight updates)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
ed=time.time()  # wall-clock end; ed - st covers training plus validation for all epochs

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.692, Train Acc: 53.06%, Val. Loss: 0.711, Val. Acc: 52.93%
Epoch: 02, Train Loss: 0.612, Train Acc: 64.62%, Val. Loss: 0.369, Val. Acc: 85.11%
Epoch: 03, Train Loss: 0.298, Train Acc: 88.25%, Val. Loss: 0.278, Val. Acc: 89.18%
Epoch: 04, Train Loss: 0.196, Train Acc: 92.68%, Val. Loss: 0.284, Val. Acc: 89.39%
Epoch: 05, Train Loss: 0.139, Train Acc: 95.14%, Val. Loss: 0.291, Val. Acc: 89.70%


In [21]:
print("The time taken for lstm model is:"+str(ed-st))# elapsed wall-clock seconds for the LSTM training loop above

The time taken for lstm model is:1099.7672159671783


In [22]:
# Score the trained LSTM on the held-out test split.
# (The original note claimed a "good improvement"; the baseline it compares
# against is not shown in this notebook.)
test_loss, test_acc = evaluate(lstm_model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.381, Test Acc: 85.91%


In [23]:
import spacy
nlp = spacy.load('en')
# The helper below tokenizes a sentence with spaCy, maps each token to its
# vocabulary index, and feeds the result to the model as a batch of one.
def predict_sentiment_lstm(sentence):
    """Return the LSTM model's sigmoid score for `sentence` as a Python float.

    The recorded sample runs in this notebook give values near 0 for a
    negative review and near 1 for a positive one.

    Args:
        sentence: raw text to score.

    Raises:
        ValueError: if `sentence` yields no tokens (the model cannot run on
            an empty sequence).
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [sent len] -> [sent len, batch size = 1]
    # Force evaluation mode so dropout never perturbs a prediction, and skip
    # gradient tracking since nothing is being trained here.
    lstm_model.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        prediction = torch.sigmoid(lstm_model(tensor))
    return prediction.item()

In [24]:
predict_sentiment_lstm("This film is terrible")# the score is close to 0, as expected for a negative review



0.0049955230206251144

In [25]:
predict_sentiment_lstm("This film is great")# the score is close to 1, as expected for a positive review



0.9857094287872314

### 3.1. GRU Model

In [26]:
import torch.nn as nn

class GRU(nn.Module):
    """GRU sentence classifier: embedding -> (stacked) GRU -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: GRU hidden size per direction.
        output_dim: number of values produced per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between GRU
            layers (only when n_layers > 1) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # A GRU is used as the recurrent layer.
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists with more than one layer;
            # passing it for a single layer just triggers a warning.
            dropout=dropout if n_layers > 1 else 0,
        )
        # The classifier sees one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return raw scores of shape [batch size, output_dim].

        Args:
            x: token indices of shape [sent len, batch size].
        """
        embedded = self.dropout(self.embedding(x))
        # embedded = [sent len, batch size, emb dim]

        # nn.GRU returns (output, hidden). Unlike nn.LSTM there is no cell
        # state, so `hidden` is a plain tensor rather than a (hidden, cell) pair.
        output, hidden = self.rnn(embedded)
        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward
            # final states.
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only the top layer's final state is relevant.
            final = hidden[-1, :, :]
        # final = [batch size, hid dim * num directions]

        # No squeeze(0) here: it would drop the batch axis when batch size
        # is 1 and break callers that later do `.squeeze(1)`.
        return self.fc(self.dropout(final))

In [27]:
# Same hyperparameters as the LSTM model, so the two runs are directly comparable.
gru_model = GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [28]:
gru_model.embedding.weight.data.copy_(pretrained_embeddings)# overwrite the embedding weights in place with the pretrained vectors


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1123,  0.3113,  0.3317,  ..., -0.4576,  0.6191,  0.5304],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [29]:
import torch.optim as optim
# Adam is used again, with a separate optimizer instance bound to the GRU's
# parameters. It keeps running estimates of each parameter's gradient moments
# and scales every parameter's step individually; default hyperparameters apply.
gru_optimizer = optim.Adam(gru_model.parameters())

In [30]:
# Move the GRU model to the selected device (GPU if CUDA is available, else CPU).
gru_model = gru_model.to(device)
# The criterion was already moved to `device` earlier in this notebook.
criterion = criterion.to(device)

### 3.2. Train the model

In [31]:
N_EPOCHS = 5
import time
st=time.time()  # wall-clock start of the whole GRU run (reuses the names st/ed from the LSTM run)
for epoch in range(N_EPOCHS):

    train_loss1, train_acc1 = train(gru_model, train_iterator, gru_optimizer, criterion)# one pass over the training split, updating the weights
    valid_loss1, valid_acc1 = evaluate(gru_model, valid_iterator, criterion)# score on the validation split (no weight updates)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss1:.3f}, Train Acc: {train_acc1*100:.2f}%, Val. Loss: {valid_loss1:.3f}, Val. Acc: {valid_acc1*100:.2f}%')
ed=time.time()  # wall-clock end; ed - st covers training plus validation for all epochs
print(ed-st)

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.548, Train Acc: 70.08%, Val. Loss: 0.350, Val. Acc: 87.50%
Epoch: 02, Train Loss: 0.259, Train Acc: 89.67%, Val. Loss: 0.247, Val. Acc: 89.92%
Epoch: 03, Train Loss: 0.178, Train Acc: 93.48%, Val. Loss: 0.244, Val. Acc: 90.20%
Epoch: 04, Train Loss: 0.120, Train Acc: 95.84%, Val. Loss: 0.270, Val. Acc: 89.60%
Epoch: 05, Train Loss: 0.084, Train Acc: 97.18%, Val. Loss: 0.329, Val. Acc: 89.28%
997.4182481765747


In [32]:
# st/ed were overwritten by the GRU training loop, so this reports the GRU
# run; the message previously said "lstm" by mistake.
print("The time taken for gru model is:"+str(ed-st))


The time taken for lstm model is:997.4182481765747


In [33]:
# Score the trained GRU on the held-out test split.
test_loss1, test_acc1 = evaluate(gru_model, test_iterator, criterion)

print(f'Test Loss: {test_loss1:.3f}, Test Acc: {test_acc1*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.384, Test Acc: 87.45%


In [34]:
# Same tokenize -> index -> score pipeline as predict_sentiment_lstm, but scored with gru_model.
def predict_sentiment_gru(sentence):
    """Return the GRU model's sigmoid score for `sentence` as a Python float.

    The recorded sample runs in this notebook give values near 0 for a
    negative review and near 1 for a positive one.

    Args:
        sentence: raw text to score.

    Raises:
        ValueError: if `sentence` yields no tokens (the model cannot run on
            an empty sequence).
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [sent len] -> [sent len, batch size = 1]
    # Force evaluation mode so dropout never perturbs a prediction, and skip
    # gradient tracking since nothing is being trained here.
    gru_model.eval()
    with torch.no_grad():
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        prediction = torch.sigmoid(gru_model(tensor))
    return prediction.item()

In [35]:
predict_sentiment_gru("This film is terrible")# the recorded score below is close to 0 (negative)



0.005685031414031982

In [36]:
predict_sentiment_gru("This film is great")# the recorded score below is close to 1 (positive)



0.9824297428131104

### 4. Conclusion

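-  GRU trains faster than LSTM: about 997 s versus about 1100 s for 5 epochs with identical hyperparameters.
-  GRU reaches a higher test accuracy than LSTM: 87.45% versus 85.91%. Each figure comes from a single run, so the gap should be confirmed over several seeds.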