In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from torchtext.data import Field, LabelField, BucketIterator

import spacy
import numpy as np

import random
import math
import time

# Use one fixed seed for every random number generator this notebook touches
# (Python's `random`, NumPy, PyTorch CPU and PyTorch CUDA) so runs are repeatable.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to restrict itself to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# # load the spacy model for English
# spacy_en = spacy.load('en_core_web_sm')

# # tokenizer function, can be passed to TorchText 
# def tokenize_en(text):
#     """
#     Tokenizes English text from a string into a list of tokens
#     """
#     return [tok.text for tok in spacy_en.tokenizer(text)]

In [19]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be
# read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Text field: spaCy tokenisation; include_lengths=True makes every batch carry
# a (token_ids, lengths) pair instead of the token ids alone.
TEXT = Field(tokenize='spacy', include_lengths=True)

# Label field stored as float so it can be fed straight to a float-valued loss.
# (An explicit non-sequential Field without pad/unk tokens was tried earlier;
# LabelField is used instead.)
LABEL = LabelField(dtype=torch.float)

# Read the three CSV splits. The first column ("idx") is mapped to None and is
# therefore dropped; the header row is skipped.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='/content/drive/My Drive/Colab Notebooks/NLP/',
    train='sst_train.csv',
    validation='sst_dev.csv',
    test='sst_test.csv',
    format='csv',
    fields=[("idx", None), ("text", TEXT), ("label", LABEL)],
    skip_header=True,
)

# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data, min_freq=1)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(f"LABEL vocabulary frequency: {LABEL.vocab.freqs}")

# Show one raw example as a sanity check of the parsed columns.
print(vars(train_data.examples[0]))

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Unique tokens in TEXT vocabulary: 13824
Unique tokens in LABEL vocabulary: 2
LABEL vocabulary frequency: Counter({'1': 3610, '0': 3310})
{'text': ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal'], 'label': '1'}
Number of training examples: 6920
Number of validation examples: 872
Number of testing examples: 1821


In [21]:
# Number of examples per batch for all three splits.
BATCH_SIZE = 64

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One bucketing iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    # Examples inside each batch are ordered by `sort_key` (sentence length).
    # The model packs batches with pack_padded_sequence, which by default
    # expects sequences ordered by decreasing length.
    sort_within_batch = True,
    sort_key = lambda x: len(x.text),
    device = device)

In [22]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (bi)LSTM -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced per sentence.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sentence in both directions.
        dropout: dropout probability, applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token; its embedding is not
            updated during training.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bool(bidirectional), 
                           # Inter-layer dropout only exists for stacked LSTMs;
                           # passing it with a single layer just raises a warning.
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """Return one row of raw scores (logits) per sentence.

        Args:
            text: token ids, shape [sent len, batch size].
            text_lengths: unpadded length of every sentence in the batch
                (tensor or list), ordered by decreasing length.

        Returns:
            Tensor of shape [batch size, output dim].
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        # pack_padded_sequence requires a tensor of lengths to live on the CPU,
        # even when the batch itself is on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        
        #pack sequence so the LSTM skips the padded positions
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        # Only the final hidden state is used; per-step outputs are discarded.
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.rnn.bidirectional:
            # Last layer: forward direction is at index -2, backward at -1.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Single direction: the last entry is the top layer's final state.
            final_hidden = hidden[-1,:,:]
        
        final_hidden = self.dropout(final_hidden)
                
        #final_hidden = [batch size, hid dim * num directions]
            
        return self.fc(final_hidden)

In [23]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)        # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1                     # a single logit per sentence
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]   # vocabulary index of the padding token

# Keyword arguments keep each constant visibly tied to the parameter it feeds.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [24]:
def count_parameters(model):
    """Return the total number of scalar weights in `model` that require gradients."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report how many weights will be updated during training.
print(f'The model has {count_parameters(model)} trainable parameters')

The model has 3693057 trainable parameters


In [25]:
import torch.optim as optim

# Adam over every model parameter, using the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [26]:
# Binary cross-entropy computed on raw logits: the model's forward() returns the
# output of its linear layer without a sigmoid, so the loss applies it internally.
criterion = nn.BCEWithLogitsLoss()

# Put the model and the loss on the device selected earlier (GPU if available).
model = model.to(device)
criterion = criterion.to(device)

In [27]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 0.8 for 8 out of 10), not a count.

    `preds` are raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    predicted_labels = torch.round(probabilities)

    # 1.0 where prediction and target agree, 0.0 elsewhere.
    hits = torch.eq(predicted_labels, y).float()

    return hits.sum() / len(hits)

In [28]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator` and update `model` in place.

    Each batch must expose `.text` as a (token_ids, lengths) pair and `.label`
    as the targets. Returns `(mean_loss, mean_accuracy)`, where both means are
    taken over batches (every batch weighs the same regardless of its size).
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # The model emits [batch size, 1]; drop the trailing axis to match the targets.
        logits = model(tokens, lengths).squeeze(1)

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [29]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any weights.

    Each batch must expose `.text` as a (token_ids, lengths) pair and `.label`
    as the targets. Returns `(mean_loss, mean_accuracy)`, where both means are
    taken over batches.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label

            # The model emits [batch size, 1]; drop the trailing axis to match the targets.
            logits = model(tokens, lengths).squeeze(1)

            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [30]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a `(minutes, seconds)` tuple of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [31]:
N_EPOCHS = 5

# Lowest validation loss seen so far; the first epoch always beats infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    # NOTE(review): the test split is scored every epoch for reporting only;
    # checkpoint selection below looks at the validation loss alone.
    test_loss, test_acc = evaluate(model, test_iterator, criterion)

    end_time = time.time()

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bi_directional_model.pt')

    # The timed span covers training plus both evaluation passes.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch + 1} Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"Train Loss: {round(train_loss, 4)} Train Acc: {round(train_acc * 100, 4)}")
    print(f"Val. Loss : {round(valid_loss, 4)} Val.  Acc: {round(valid_acc * 100, 4)}")
    print(f"Test Loss : {round(test_loss, 4)} Test  Acc: {round(test_acc * 100, 4)}")
    print("\n")

Epoch: 1 Epoch Time: 1m 8s
Train Loss: 0.6822 Train Acc: 56.1783
Val. Loss : 0.6583 Val.  Acc: 59.3304
Test Loss : 0.6647 Test  Acc: 58.0224


Epoch: 2 Epoch Time: 1m 7s
Train Loss: 0.654 Train Acc: 62.113
Val. Loss : 0.6334 Val.  Acc: 64.375
Test Loss : 0.6465 Test  Acc: 62.2473


Epoch: 3 Epoch Time: 1m 7s
Train Loss: 0.6171 Train Acc: 66.0694
Val. Loss : 0.5911 Val.  Acc: 67.6116
Test Loss : 0.6021 Test  Acc: 67.1076


Epoch: 4 Epoch Time: 1m 7s
Train Loss: 0.5743 Train Acc: 70.3985
Val. Loss : 0.589 Val.  Acc: 70.2455
Test Loss : 0.6134 Test  Acc: 66.6004


Epoch: 5 Epoch Time: 1m 7s
Train Loss: 0.5248 Train Acc: 73.5378
Val. Loss : 0.5832 Val.  Acc: 71.5179
Test Loss : 0.5904 Test  Acc: 71.6223


