# HW5
### Yiyang Wen (yw892)

## 1. Preparing Data

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Field decides how our data should be prossessed. The 'TEXT' 
# field handles the review and the "LABEL" field handles the sentiment.
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(tensor_type=torch.FloatTensor)

# Split the IMDB data into trainin set, validation set, test set
# The default split is a 70/30 split.
train, test = datasets.IMDB.splits(TEXT, LABEL)
train, valid = train.split(random_state=random.seed(SEED))

In [2]:
# The first update, is the addition of pre-trained word embeddings. 
# These vectors have been trained on corpuses of billions of tokens. 
# Now, instead of having our word embeddings initialized randomly, 
# they are initialized with these pre-trained vectors, 
# where words that appear in similar contexts appear nearby in this vector space.
# The first step to using these is to specify the vectors and download them, 
# which is passed as an argument to build_vocab. 
# The glove is the algorithm used to calculate the vectors,
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
LABEL.build_vocab(train)

In [3]:
BATCH_SIZE = 64

# This step we create the iterators. "BucketIterator" firstly sorts the example
# by the length of the sentences, then partitions them into buckets.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, valid, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text), 
    repeat=False)

## 2. Build the Model

In [4]:
import torch.nn as nn

#In the "__init__" we define the layers of the module. 
# The layers include embedding layer, RNN, and a linear layer.
# The embedding layer is used to transform our vectors, which denote vocabularies, into a dense embedding vector.
# The RNN layer is our RNN which takes in ourse dense vector and the previous hidden state to calculate the next hidden state.
# The linear layer takes the final hidden state and feeds it through a fully connected layer, transforming it to the correct output dimension.
# Implementing bidirectionality and adding additional layers are done by passing values for the "num_layers" and "bidirectional" arguments for the LSTM/GRU.
# Dropout is implemented by initializing an nn.Dropout layer (the argument is the probability of dropout for each neuron)
# and using it within the forward method after each layer we want to apply to dropout to.

class RNN_LSTM(nn.Module):
    

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    # The "forward" method is called when we feed examples into the model.
    # The input batch is to get embedded. Then it is fed into the RNN model
    # Then we get the return of RNN layer. "Output" is the concatenation of the hidden state from every step, whereas "hidden" is the final hidden state.
    def forward(self, x):
        
        #x = [sentence length, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sentence length, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))
    
    
class RNN_GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):

        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]22222
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid. dim]
        #cell = [num layers * num directions, batch size, hid. dim]
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden [batch size, hid. dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [5]:
### Create the instances of model_LSTM and model_GRU  
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
model_LSTM = RNN_LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
model_GRU = RNN_GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

## 3.Train the Model

In [6]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [7]:
#Define an optimizer. This is the algorithm we use to update the parameters 
#For optimization we use Adam algorithm.
import torch.optim as optim
optimizer_LSTM = optim.Adam(model_LSTM.parameters())
optimizer_GRU = optim.Adam(model_GRU.parameters())

In [8]:
#Define the loss function, which is "binary cross entropy with logits"
criterion = nn.BCEWithLogitsLoss()
#If PyTorch detects a GPU, we can place the model and the criterion on the GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_LSTM = model_LSTM.to(device)
model_GRU = model_GRU.to(device)
criterion = criterion.to(device)

In [9]:
import torch.nn.functional as F
# This function is to calculate the binary accuracy, return a round number
def binary_accuracy(preds, y):
    rounded_preds = torch.round(F.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [10]:
# This function is to train our model
# The 'train' function iterates over all examples, a batch at a time.
# For each batch we first intialize the gradients
# Then we feed the bactch of sentences "batch.text" into the model
# Then calculate the loss and accuracy
# Then calculate the current gradient 
# Then update the parameters using optimizer alorithm
# Finally return the results we need such as loss and accuracy.     

def train(model, iterator, optimizer, criterion):    
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [11]:
# This function is test our model, 
# with a few modifications as we don't want to update the parameters.

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

## 4. LSTM

In [12]:
N_EPOCHS = 5
# Train the LSTM model through multiple epochs    
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model_LSTM, train_iterator, optimizer_LSTM, criterion)
    valid_loss, valid_acc = evaluate(model_LSTM, valid_iterator, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.687, Train Acc: 54.12%, Val. Loss: 0.673, Val. Acc: 57.69%
Epoch: 02, Train Loss: 0.665, Train Acc: 59.20%, Val. Loss: 0.646, Val. Acc: 62.81%
Epoch: 03, Train Loss: 0.674, Train Acc: 55.94%, Val. Loss: 0.680, Val. Acc: 54.38%
Epoch: 04, Train Loss: 0.685, Train Acc: 54.40%, Val. Loss: 0.671, Val. Acc: 58.26%
Epoch: 05, Train Loss: 0.627, Train Acc: 65.72%, Val. Loss: 0.561, Val. Acc: 69.71%


In [13]:
# Calculate the accuracy of the model in the test set.
test_loss, test_acc = evaluate(model_LSTM, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.561, Test Acc: 69.86%


## 5. GRU

In [12]:
N_EPOCHS = 5
# Train GRU model
# An epoch being a complete pass through all examples in the split.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model_GRU, train_iterator, optimizer_GRU, criterion)
    valid_loss, valid_acc = evaluate(model_GRU, valid_iterator, criterion)
    torch.cuda.empty_cache()
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Epoch: 01, Train Loss: 0.696, Train Acc: 54.12%, Val. Loss: 0.677, Val. Acc: 60.93%
Epoch: 02, Train Loss: 0.689, Train Acc: 54.78%, Val. Loss: 0.711, Val. Acc: 50.85%
Epoch: 03, Train Loss: 0.687, Train Acc: 54.11%, Val. Loss: 0.636, Val. Acc: 65.23%
Epoch: 04, Train Loss: 0.553, Train Acc: 72.00%, Val. Loss: 0.422, Val. Acc: 81.47%
Epoch: 05, Train Loss: 0.403, Train Acc: 82.16%, Val. Loss: 0.375, Val. Acc: 84.49%


In [13]:
# Calculate the accuracy of the model in the test set.
test_loss, test_acc = evaluate(model_GRU, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

  return Variable(arr, volatile=not train)


Test Loss: 0.401, Test Acc: 83.65%


## 6. Conclusion

It's easy to know that GRU has the better performance because of the lower loss and hiher accuracy in test set.