In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

#  Seed PyTorch's CPU and CUDA random number generators with a fixed value and
#  define the torchtext Fields for the review text and its label.
#  Python's own `random` module is not seeded in this cell.
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
#  tokenize text with spacy
TEXT = data.Field(tokenize='spacy')
#  labels are requested as float tensors
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

In [2]:
#  Load the IMDB train/test splits, then carve a validation set out of the
#  training data.
#  NOTE: the name `train` is rebound to a function in a later cell, so this
#  cell and the vocab/iterator cells must run before that definition.
train, test = datasets.IMDB.splits(TEXT, LABEL)

#  `random.seed()` returns None, so passing its result as `random_state` left
#  the argument effectively unset. Seed the generator first and hand its state
#  over explicitly so the split is reproducible by construction.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [3]:
#  Build the text vocabulary from the training split only (max_size=25000)
#  and attach the pre-trained "glove.6B.100d" word vectors to it.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
#  Build the label vocabulary from the same training split.
LABEL.build_vocab(train)

In [4]:
#  Create one iterator per split, each yielding batches of BATCH_SIZE examples.
#  `sort_key` is the token count of each review; it is what the BucketIterator
#  uses to batch reviews of similar length together.
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

In [5]:
import torch.nn as nn

#  vocab_size: length of TEXT.vocab
#  embedding_dim: dimension of the dense word vector after embedding
#  hidden_dim: dimension of hidden states
#  output_dim: dimension of output class 
#  n_layers: number of layers
#  bidirectional: add an additional layer that processes values from last to first
#  dropout: used to avoid overfitting

class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word vector.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of output units.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, the LSTM also reads the sequence last-to-first.
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier input is the concatenation of one final hidden state
        # per direction, so its width depends on `bidirectional`.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout has no effect on a single-layer LSTM; passing 0
        # in that case avoids a spurious warning without changing behaviour.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token indices of shape [sent len, batch size] to scores.

        Returns:
            Tensor of shape [batch size, output_dim] holding raw
            (pre-sigmoid) scores. The batch axis is always kept, even for a
            batch of one.
        """
        # x = [sent len, batch size]
        embedded = self.dropout(self.embedding(x))
        # embedded = [sent len, batch size, emb dim]

        _, (hidden, _) = self.rnn(embedded)
        # hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction: the last entry is the top layer's state.
            final = hidden[-1, :, :]
        final = self.dropout(final)
        # final = [batch size, hid dim * num directions]

        # No squeeze(0) here: it would drop the batch axis when batch size is 1
        # and make the caller's later `.squeeze(1)` fail.
        return self.fc(final)

In [6]:
#  Hyper-parameters of the LSTM classifier.
INPUT_DIM = len(TEXT.vocab)   #  one embedding row per vocabulary entry
EMBEDDING_DIM = 100           #  width of the glove.6B.100d vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1                #  a single score per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

#  Instantiate the network, naming every argument explicitly.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [7]:
#  Fetch the pre-trained vectors attached to the vocabulary and print their
#  shape (rows = vocabulary entries, columns = embedding dimension).
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [8]:
#  Overwrite the embedding layer's weights in place with the pre-trained
#  vectors. `copy_` returns the updated tensor, which the notebook displays.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [9]:
#  Create an Adam optimizer over all model parameters, using Adam's default
#  settings (no learning rate or other options are passed).
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [10]:
#  Use binary cross-entropy on raw logits (the sigmoid is applied inside the
#  loss) and move both the model and the loss to the GPU when one is available,
#  otherwise to the CPU.
criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)

In [11]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw (pre-sigmoid) model scores.
        y: target labels (0. or 1.), compared elementwise against `preds`.

    Returns:
        A tensor holding the fraction of entries whose rounded sigmoid score
        equals the label.
    """

    # Squash the scores to (0, 1) and round to the closest integer (0 or 1).
    # torch.sigmoid is used because F.sigmoid is deprecated.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    return correct.sum() / len(correct)

In [12]:
#  method to train all batches in iterator
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over every batch in `iterator`.

    Puts `model` in training mode, performs one optimizer step per batch and
    returns `(mean loss, mean accuracy)`, each averaged over the number of
    batches.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Drop the trailing output dimension so scores line up with the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [13]:
#  method to evaluate. Faster than train since no need to update parameters
def evaluate(model, iterator, criterion):
    """Score `model` on every batch in `iterator` without updating it.

    Puts `model` in evaluation mode, disables gradient tracking and returns
    `(mean loss, mean accuracy)`, each averaged over the number of batches.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # No parameter updates happen here, so gradient tracking is switched off.
    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text).squeeze(1)
            loss_sum += criterion(scores, batch.label).item()
            acc_sum += binary_accuracy(scores, batch.label).item()

    return loss_sum / len(iterator), acc_sum / len(iterator)

In [14]:
#  Train the LSTM model for N_EPOCHS epochs, evaluating on the validation
#  split after each epoch, then print the total wall-clock time in seconds.
import time
start = time.time()
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

end = time.time()
print("time used:", end - start)

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.676, Train Acc: 58.68%, Val. Loss: 0.669, Val. Acc: 62.38%
Epoch: 02, Train Loss: 0.649, Train Acc: 61.41%, Val. Loss: 0.697, Val. Acc: 50.85%
Epoch: 03, Train Loss: 0.663, Train Acc: 61.18%, Val. Loss: 0.664, Val. Acc: 60.17%
Epoch: 04, Train Loss: 0.621, Train Acc: 65.00%, Val. Loss: 0.496, Val. Acc: 79.08%
Epoch: 05, Train Loss: 0.361, Train Acc: 84.99%, Val. Loss: 0.314, Val. Acc: 87.96%
time used: 1562.354117155075


In [15]:
#  Evaluate the trained LSTM model on the held-out test split and print its
#  loss and accuracy.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.349, Test Acc: 85.91%


In [16]:
#  Load the spaCy English pipeline; its tokenizer is used below to split a
#  raw sentence into tokens before prediction.
import spacy
nlp = spacy.load('en')

def predict_sentiment(sentence):
    """Return the LSTM model's sigmoid score (a float in [0, 1]) for `sentence`.

    The sentence is tokenized with spaCy, mapped to vocabulary indices and
    passed through the global `model` as a batch of one.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Shape [sent len, 1]: a batch containing only this sentence.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # Do not rely on an earlier cell having left the model in eval mode:
    # disable dropout explicitly, then restore whatever mode was active.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for a prediction.
        with torch.no_grad():
            # torch.sigmoid is used because F.sigmoid is deprecated.
            prediction = torch.sigmoid(model(tensor))
    finally:
        model.train(was_training)
    return prediction.item()

In [17]:
#  Score a negative example sentence with the LSTM model.
predict_sentiment("Today is terrible")



0.34874266386032104

In [18]:
#  Score a positive example sentence with the LSTM model.
predict_sentiment("Today is great")



0.8716537952423096

# GRU

In [19]:
class RNN1(nn.Module):   #  class for GRU
    """GRU sentence classifier: embedding -> (bi)GRU -> dropout -> linear.

    Same layout as the LSTM classifier, except that the recurrent layer is an
    `nn.GRU`, which returns only a hidden state (there is no cell state).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word vector.
        hidden_dim: size of the GRU hidden state per direction.
        output_dim: number of output units.
        n_layers: number of stacked GRU layers.
        bidirectional: if True, the GRU also reads the sequence last-to-first.
        dropout: dropout probability applied to the embeddings, between
            stacked GRU layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier input is the concatenation of one final hidden state
        # per direction, so its width depends on `bidirectional`.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout has no effect on a single-layer GRU; passing 0
        # in that case avoids a spurious warning without changing behaviour.
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token indices of shape [sent len, batch size] to scores.

        Returns:
            Tensor of shape [batch size, output_dim] holding raw
            (pre-sigmoid) scores. The batch axis is always kept, even for a
            batch of one.
        """
        # x = [sent len, batch size]
        embedded = self.dropout(self.embedding(x))
        # embedded = [sent len, batch size, emb dim]

        _, hidden = self.rnn(embedded)
        # hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction: the last entry is the top layer's state.
            final = hidden[-1, :, :]
        final = self.dropout(final)
        # final = [batch size, hid dim * num directions]

        # No squeeze(0) here: it would drop the batch axis when batch size is 1
        # and make the caller's later `.squeeze(1)` fail.
        return self.fc(final)

In [20]:
#  Instantiate the GRU classifier with the same hyper-parameters as the LSTM
#  one, naming every argument explicitly.
model1 = RNN1(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [21]:
#  Overwrite the GRU model's embedding weights in place with the same
#  pre-trained vectors. `copy_` returns the updated tensor, which is displayed.
model1.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4096, -0.5753,  0.1126,  ...,  0.4092,  0.1856,  0.1066],
        [ 0.2110, -0.2472,  0.6508,  ..., -0.1627,  0.4507, -1.1627],
        [-0.2379, -0.1095,  0.4314,  ...,  0.6665,  0.3200,  0.8872]])

In [22]:
#  Create a separate Adam optimizer (default settings) over the GRU model's
#  parameters.
optimizer1 = optim.Adam(model1.parameters())

In [23]:
#  Use binary cross-entropy on raw logits and move the GRU model and the loss
#  to the GPU when one is available, otherwise to the CPU.
#  `criterion` and `device` are re-created here with the same values the LSTM
#  section already assigned to them.
criterion = nn.BCEWithLogitsLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model1 = model1.to(device)
criterion = criterion.to(device)

In [24]:
#  method to train all batches in iterator 
def train1(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Switches the model to training mode, takes one optimizer step per batch,
    and returns a `(mean loss, mean accuracy)` pair averaged over the batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # Start every step from zeroed gradients.
        optimizer.zero_grad()

        # Remove dimension 1 so the scores line up with the labels.
        outputs = model(batch.text).squeeze(1)

        step_loss = criterion(outputs, batch.label)
        step_acc = binary_accuracy(outputs, batch.label)

        # Back-propagate and apply the parameter update.
        step_loss.backward()
        optimizer.step()

        loss_total += step_loss.item()
        acc_total += step_acc.item()

    batch_count = len(iterator)
    return loss_total / batch_count, acc_total / batch_count

In [25]:
#  Train the GRU model for N_EPOCHS epochs, evaluating on the validation split
#  after each epoch, then print the total wall-clock time in seconds.
#  The CUDA cache is emptied before the timer starts.
torch.cuda.empty_cache()
import time
start = time.time()
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train1(model1, train_iterator, optimizer1, criterion)
    valid_loss, valid_acc = evaluate(model1, valid_iterator, criterion)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

end = time.time()
print("time used:", end - start)

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.594, Train Acc: 66.59%, Val. Loss: 0.379, Val. Acc: 83.71%
Epoch: 02, Train Loss: 0.341, Train Acc: 85.75%, Val. Loss: 0.272, Val. Acc: 89.71%
Epoch: 03, Train Loss: 0.231, Train Acc: 91.15%, Val. Loss: 0.254, Val. Acc: 89.91%
Epoch: 04, Train Loss: 0.158, Train Acc: 94.11%, Val. Loss: 0.259, Val. Acc: 90.05%
Epoch: 05, Train Loss: 0.122, Train Acc: 95.62%, Val. Loss: 0.276, Val. Acc: 90.13%
time used: 1362.5740976333618


In [26]:
#  Evaluate the trained GRU model on the held-out test split and print its
#  loss and accuracy.
test_loss, test_acc = evaluate(model1, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.335, Test Acc: 87.99%


In [27]:
#  define a function to predict sentiment of given sentence
def predict_sentiment1(sentence):
    """Return the GRU model's sigmoid score (a float in [0, 1]) for `sentence`.

    The sentence is tokenized with spaCy, mapped to vocabulary indices and
    passed through the global `model1` as a batch of one.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # Shape [sent len, 1]: a batch containing only this sentence.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # Do not rely on an earlier cell having left the model in eval mode:
    # disable dropout explicitly, then restore whatever mode was active.
    was_training = model1.training
    model1.eval()
    try:
        # No gradients are needed for a prediction.
        with torch.no_grad():
            # torch.sigmoid is used because F.sigmoid is deprecated.
            prediction = torch.sigmoid(model1(tensor))
    finally:
        model1.train(was_training)
    return prediction.item()

In [28]:
#  Score a negative example sentence with the GRU model.
predict_sentiment1("Today is terrible")



0.02056674100458622

In [29]:
#  Score a positive example sentence with the GRU model.
predict_sentiment1("Today is great")



0.9418495893478394

## Conclusion
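We can see that the GRU trains faster than the LSTM in this run (about 1363 s versus 1562 s for five epochs), and it also reaches a higher test accuracy (87.99% versus 85.91%).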