In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

# Fix the random seeds, declare how the text and label columns are processed,
# and split IMDB into training, validation and test sets.

SEED = 5270

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# TEXT describes the review body; reviews are tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')

# LABEL describes the sentiment label, stored as a float tensor.
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

train, test = datasets.IMDB.splits(TEXT, LABEL)

# `random_state` expects a state object as returned by random.getstate().
# The previous `random_state=random.seed(SEED)` evaluated to None (random.seed
# returns None) and only seeded the split through its side effect, so seed
# Python's RNG explicitly and hand over the resulting state.
# NOTE: a later cell rebinds the name `train` to the training function, so the
# vocabulary and iterator cells must run before that definition.
random.seed(SEED)
train, valid = train.split(random_state=random.getstate())

In [2]:
# Build the vocabularies and attach pre-trained word vectors.
# GloVe is the algorithm that produced the vectors; they were trained on
# corpora of billions of tokens, so words that occur in similar contexts lie
# close together in the vector space. Passing `vectors` to build_vocab names
# the vector set to use (it is downloaded when needed); these vectors later
# replace the randomly initialised embedding weights.
# The text vocabulary is capped at the 25000 most frequent training tokens.

LABEL.build_vocab(train)
TEXT.build_vocab(
    train,
    vectors="glove.6B.100d",
    max_size=25000,
)

In [3]:
# Build one iterator per split. Examples are keyed by review length so that
# reviews of similar length can be grouped into the same bucket/batch.

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test),
    repeat=False,
    sort_key=lambda example: len(example.text),
    batch_size=BATCH_SIZE,
)

In [4]:
#nn models

In [5]:
import torch.nn as nn

class LSTM(nn.Module):
    """Multi-layer, optionally bidirectional LSTM classifier over token indices.

    Args:
        vocab_size: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each dense word-embedding vector.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: size of the final linear layer's output.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True, the sequence is also read in reverse and the
            classifier input is twice as wide (doubled, not squared).
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is given for a single layer, so disable it there.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token indices to logits.

        Args:
            x: LongTensor of shape [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim]. The batch axis is always
            kept, including for a batch of one, so callers can `.squeeze(1)`.
        """

        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(x))

        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell   = [num layers * num directions, batch size, hid dim]
        output, (hidden, cell) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        hidden = self.dropout(hidden)

        # No squeeze(0) here: it would drop the batch axis when batch size is 1.
        return self.fc(hidden)

In [6]:
import torch.nn as nn


# The GRU model takes the same constructor parameters as the LSTM model. The only difference is that nn.GRU returns just a hidden state: a GRU has no separate cell state.


class GRU(nn.Module):
    """Multi-layer, optionally bidirectional GRU classifier over token indices.

    Args:
        vocab_size: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each dense word-embedding vector.
        hidden_dim: size of the GRU hidden state per direction.
        output_dim: size of the final linear layer's output.
        n_layers: number of stacked GRU layers.
        bidirectional: if True, the sequence is also read in reverse and the
            classifier input is twice as wide.
        dropout: dropout probability applied to the embeddings, between
            stacked GRU layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.GRU only applies dropout between stacked layers and warns when a
        # non-zero value is given for a single layer, so disable it there.
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token indices to logits.

        Args:
            x: LongTensor of shape [sent len, batch size].

        Returns:
            Tensor of shape [batch size, output_dim]. The batch axis is always
            kept, including for a batch of one, so callers can `.squeeze(1)`.
        """

        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(x))

        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]
        # (a GRU returns no cell state)
        output, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        hidden = self.dropout(hidden)

        # No squeeze(0) here: it would drop the batch axis when batch size is 1.
        return self.fc(hidden)

In [7]:
# Hyper-parameters shared by both models.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100          # same width as the "glove.6B.100d" vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1               # a single logit per review
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Both models are built from the same settings; the LSTM is constructed first.
model_LSTM = LSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)
model_GRU = GRU(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [8]:
# Fetch the pre-trained vectors attached to the vocabulary and check that
# their shape is correct for the embedding layers they will be copied into.
pretrained_embeddings = TEXT.vocab.vectors

# Fail early (instead of only printing) if the vectors do not match the
# [vocab size, embedding dim] layout the models were built with.
assert pretrained_embeddings.shape == (INPUT_DIM, EMBEDDING_DIM), pretrained_embeddings.shape

print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [9]:
# Overwrite the LSTM's embedding weights in place with the pre-trained vectors.
# The call is the cell's last expression, so its result (the weight matrix) is echoed below.
model_LSTM.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1897, -0.0174,  0.6258,  ..., -0.3503,  0.0343,  0.8224],
        [-0.2630,  0.1020,  1.2268,  ...,  0.3066, -0.8744,  0.9514],
        [ 0.0010, -0.1038,  0.4295,  ..., -0.4974, -0.8185,  0.7238]])

In [10]:
# Overwrite the GRU's embedding weights in place with the pre-trained vectors.
# The call is the cell's last expression, so its result (the weight matrix) is echoed below.
model_GRU.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.1897, -0.0174,  0.6258,  ..., -0.3503,  0.0343,  0.8224],
        [-0.2630,  0.1020,  1.2268,  ...,  0.3066, -0.8744,  0.9514],
        [ 0.0010, -0.1038,  0.4295,  ..., -0.4974, -0.8185,  0.7238]])

In [11]:
# Create one Adam optimizer per model. Adam updates each model's parameters
# (its weights), not its hyperparameters; no optimizer settings are overridden here.


import torch.optim as optim

optimizer_LSTM = optim.Adam(model_LSTM.parameters())
optimizer_GRU = optim.Adam(model_GRU.parameters())

In [12]:
# Pick the compute device first: the GPU when one is available, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The models are judged with binary cross-entropy computed on raw logits;
# the loss module is created and moved to the chosen device in one step.
criterion = nn.BCEWithLogitsLoss().to(device)

# Move both models to the same device as the loss.
model_LSTM = model_LSTM.to(device)
model_GRU = model_GRU.to(device)

In [13]:
import torch.nn.functional as F

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw model outputs (logits); must have the same shape as ``y``.
        y: target labels (0 or 1) as a tensor.

    Returns:
        0-dim float tensor holding the fraction of correct predictions.

    Raises:
        ValueError: if ``preds`` and ``y`` differ in shape. Comparing them
            would otherwise broadcast silently and report a wrong accuracy.
    """
    if preds.shape != y.shape:
        raise ValueError(
            f"preds and y must have the same shape, got {tuple(preds.shape)} and {tuple(y.shape)}"
        )

    # Squash logits to (0, 1) and round to the closest integer (0 or 1).
    # torch.sigmoid is the supported spelling of the deprecated F.sigmoid.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # float so the mean is a fraction
    return correct.mean()

In [14]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return (mean batch loss, mean batch accuracy).

    For every batch the gradients are reset, the batch text is pushed through
    the model (forward pass), the loss and accuracy are computed against the
    batch labels, the loss is back-propagated, and the optimizer applies one
    parameter update. Both returned values are averages over the number of
    batches, i.e. ``len(iterator)``.
    """
    # Training mode (enables dropout in the models used in this notebook).
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # Drop the trailing output dimension so logits line up with the labels.
        logits = model(batch.text).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [15]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating any parameters.

    Returns (mean batch loss, mean batch accuracy), both averaged over the
    number of batches, i.e. ``len(iterator)``.
    """
    # Evaluation mode, and no gradient tracking for the whole pass.
    model.eval()

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # Drop the trailing output dimension so logits line up with the labels.
            logits = model(batch.text).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [16]:
#run for LSTM model

In [17]:
N_EPOCHS = 5

# One epoch is one full pass of all training examples through the network,
# followed here by a pass over the validation set.
for epoch in range(N_EPOCHS):

    train_loss_LSTM, train_acc_LSTM = train(model_LSTM, train_iterator, optimizer_LSTM, criterion)
    valid_loss_LSTM, valid_acc_LSTM = evaluate(model_LSTM, valid_iterator, criterion)

    # Hand cached, unused GPU memory back between epochs.
    torch.cuda.empty_cache()

    print(
        f'Epoch: {epoch+1:02}, '
        f'Train Loss: {train_loss_LSTM:.3f}, Train Acc: {train_acc_LSTM*100:.2f}%, '
        f'Val. Loss: {valid_loss_LSTM:.3f}, Val. Acc: {valid_acc_LSTM*100:.2f}%'
    )

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.678, Train Acc: 57.07%, Val. Loss: 0.644, Val. Acc: 63.67%
Epoch: 02, Train Loss: 0.673, Train Acc: 56.20%, Val. Loss: 0.673, Val. Acc: 68.47%
Epoch: 03, Train Loss: 0.657, Train Acc: 60.08%, Val. Loss: 0.587, Val. Acc: 70.52%
Epoch: 04, Train Loss: 0.572, Train Acc: 70.45%, Val. Loss: 0.564, Val. Acc: 72.85%
Epoch: 05, Train Loss: 0.463, Train Acc: 78.96%, Val. Loss: 0.420, Val. Acc: 81.47%


In [18]:
# Score the trained LSTM on the held-out test set and report loss / accuracy.

test_loss_LSTM, test_acc_LSTM = evaluate(model_LSTM, test_iterator, criterion)
print(
    f'Test Loss: {test_loss_LSTM:.3f}, '
    f'Test Acc: {test_acc_LSTM*100:.2f}%'
)
torch.cuda.empty_cache()

  return Variable(arr, volatile=not train)


Test Loss: 0.456, Test Acc: 79.16%


In [19]:
#run for GRU model

In [20]:
N_EPOCHS = 5

# Same schedule as for the LSTM: train for one epoch, then validate.
for epoch in range(N_EPOCHS):

    train_loss_GRU, train_acc_GRU = train(model_GRU, train_iterator, optimizer_GRU, criterion)
    valid_loss_GRU, valid_acc_GRU = evaluate(model_GRU, valid_iterator, criterion)

    # Hand cached, unused GPU memory back between epochs.
    torch.cuda.empty_cache()

    print(
        f'Epoch: {epoch+1:02}, '
        f'Train Loss: {train_loss_GRU:.3f}, Train Acc: {train_acc_GRU*100:.2f}%, '
        f'Val. Loss: {valid_loss_GRU:.3f}, Val. Acc: {valid_acc_GRU*100:.2f}%'
    )

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.638, Train Acc: 61.98%, Val. Loss: 0.408, Val. Acc: 82.39%
Epoch: 02, Train Loss: 0.362, Train Acc: 84.73%, Val. Loss: 0.299, Val. Acc: 89.12%
Epoch: 03, Train Loss: 0.233, Train Acc: 90.85%, Val. Loss: 0.242, Val. Acc: 90.42%
Epoch: 04, Train Loss: 0.162, Train Acc: 93.81%, Val. Loss: 0.257, Val. Acc: 89.92%
Epoch: 05, Train Loss: 0.115, Train Acc: 95.94%, Val. Loss: 0.266, Val. Acc: 90.36%


In [21]:
# Score the trained GRU on the held-out test set and report loss / accuracy.
test_loss_GRU, test_acc_GRU = evaluate(model_GRU, test_iterator, criterion)
print(
    f'Test Loss: {test_loss_GRU:.3f}, '
    f'Test Acc: {test_acc_GRU*100:.2f}%'
)
torch.cuda.empty_cache()

  return Variable(arr, volatile=not train)


Test Loss: 0.332, Test Acc: 87.45%


In [None]:
# In this run the GRU performs better than the LSTM (87.45% vs. 79.16% test accuracy).

In [22]:
import spacy
nlp = spacy.load('en')


def _predict_sentiment(model, sentence):
    """Return the sigmoid score (between 0 and 1) `model` assigns to `sentence`.

    Raises:
        ValueError: if `sentence` produces no tokens.
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        raise ValueError("sentence must contain at least one token")

    # Map tokens to vocabulary indices.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape [sent len, 1]: a batch holding this single sentence.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # Inference must not depend on an earlier evaluate() call having left the
    # model in eval mode, so switch dropout off explicitly and skip autograd.
    model.eval()
    with torch.no_grad():
        # torch.sigmoid is the supported spelling of the deprecated F.sigmoid.
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()


def predict_sentiment_LSTM(sentence):
    """Score `sentence` with the trained LSTM model; returns a float between 0 and 1."""
    return _predict_sentiment(model_LSTM, sentence)


def predict_sentiment_GRU(sentence):
    """Score `sentence` with the trained GRU model; returns a float between 0 and 1."""
    return _predict_sentiment(model_GRU, sentence)



In [23]:
# Sanity check of the LSTM on a clearly positive sentence; the recorded score below is close to 1.
predict_sentiment_LSTM("This film is great")



0.874844491481781

In [24]:
# Sanity check of the GRU on the same positive sentence; the recorded score below is close to 1.
predict_sentiment_GRU("This film is great")



0.9878643155097961

In [25]:
# Sanity check of the LSTM on a clearly negative sentence; the recorded score below is close to 0.
predict_sentiment_LSTM("This film is terrible")



0.13526645302772522

In [26]:
# Sanity check of the GRU on the same negative sentence; the recorded score below is close to 0.
predict_sentiment_GRU("This film is terrible")



0.02730211615562439