# Data Preparation

In [0]:
import torch
from torchtext import data, datasets
import random
import numpy as np

# Single seed value shared by every random number generator seeded below.
SEED = 1992

# Seed Python's `random`, NumPy and PyTorch from the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True


In [0]:
# !pip install transformers
from transformers import BertTokenizer

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [4]:
# Tokenize a mixed-case sample sentence to inspect the tokenizer's output.
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?') 
print(tokens) # tokens come out lowercased: the checkpoint is bert-base-uncased

# Map each token to its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]


In [5]:
# String forms of the tokenizer's special tokens (classification, separator,
# padding, unknown), printed for inspection.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [6]:
# Maximum input size the tokenizer records for the 'bert-base-uncased' checkpoint.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [0]:
def tokenize_and_cut(sentence):
    """Tokenize `sentence` and keep at most ``max_input_length - 2`` tokens.

    Two positions are held back from the maximum length; the TEXT field that
    uses this function is configured with an init token and an eos token.

    Args:
        sentence: raw text to tokenize.

    Returns:
        The (possibly truncated) list of tokens produced by `tokenizer`.
    """
    return tokenizer.tokenize(sentence)[:max_input_length - 2]

In [0]:
# Text field: tokens are produced by `tokenize_and_cut` and converted to ids by
# the BERT tokenizer in the preprocessing step, so torchtext is told not to
# build its own vocabulary (use_vocab = False). The special tokens are given
# as ids from the same tokenizer.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = tokenizer.cls_token_id,
                  eos_token = tokenizer.sep_token_id,
                  pad_token = tokenizer.pad_token_id,
                  unk_token = tokenizer.unk_token_id)
# Label field with float dtype.
LABEL = data.LabelField(dtype = torch.float)

In [0]:
# Load the IMDB train/test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Further split the original test set 50/50 into validation and test sets.
# `random.seed()` returns None, so passing its return value as `random_state`
# handed over no state at all and only worked through the reseeding side
# effect. Seed explicitly, then pass the actual generator state.
random.seed(SEED)
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.getstate())

In [10]:
# Report the training set size and the size of the test set left after the split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 12500


In [11]:
# Show the attributes of the first training example (its 'text' holds token ids).
print(vars(train_data.examples[0]))

{'text': [2006, 4678, 6642, 1037, 2143, 3310, 2247, 2008, 2038, 1996, 2373, 2000, 7818, 1996, 2568, 1010, 4010, 1996, 2540, 1998, 3543, 1996, 2200, 3969, 1012, 1000, 10223, 1000, 2003, 2107, 1037, 2143, 1012, 1045, 2288, 1000, 10223, 1000, 2013, 2026, 2564, 2040, 2288, 2009, 2013, 1037, 11429, 2040, 2003, 1999, 1996, 2143, 2449, 1012, 2016, 3427, 2009, 2005, 1037, 2117, 2051, 2007, 2033, 1012, 2057, 2020, 2119, 4372, 2705, 7941, 3709, 1012, 2014, 2004, 2065, 2005, 1996, 2034, 2051, 2153, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1000, 10223, 1000, 2003, 1037, 8687, 3538, 2881, 2000, 4604, 2017, 2067, 2000, 1996, 2617, 2012, 2029, 2035, 1997, 2115, 16547, 2318, 2635, 2173, 1012, 2009, 2515, 2023, 2096, 2108, 21660, 2135, 14036, 1012, 25626, 12385, 1005, 1055, 3772, 1998, 15732, 2004, 1037, 2472, 2079, 2025, 2292, 2017, 2298, 2185, 2013, 1996, 3898, 1012, 2002, 14030, 1037, 2839, 2029, 4487, 10286, 5244, 2007, 1037, 12883, 16291, 2066, 1010, 24646, 19567, 12660, 1010, 2021, 2

In [12]:
# Convert the first example's ids back to tokens to check the encoding by eye.
print(tokenizer.convert_ids_to_tokens(vars(train_data.examples[0])['text']))

['on', 'rare', 'occasions', 'a', 'film', 'comes', 'along', 'that', 'has', 'the', 'power', 'to', 'expand', 'the', 'mind', ',', 'warm', 'the', 'heart', 'and', 'touch', 'the', 'very', 'soul', '.', '"', 'lou', '"', 'is', 'such', 'a', 'film', '.', 'i', 'got', '"', 'lou', '"', 'from', 'my', 'wife', 'who', 'got', 'it', 'from', 'a', 'neighbor', 'who', 'is', 'in', 'the', 'film', 'business', '.', 'she', 'watched', 'it', 'for', 'a', 'second', 'time', 'with', 'me', '.', 'we', 'were', 'both', 'en', '##th', '##ral', '##led', '.', 'her', 'as', 'if', 'for', 'the', 'first', 'time', 'again', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', '"', 'lou', '"', 'is', 'a', 'magical', 'piece', 'designed', 'to', 'send', 'you', 'back', 'to', 'the', 'moment', 'at', 'which', 'all', 'of', 'your', 'dramas', 'started', 'taking', 'place', '.', 'it', 'does', 'this', 'while', 'being', 'relentless', '##ly', 'entertaining', '.', 'bret', 'carr', "'", 's', 'acting', 'and', 'pacing', 'as', 'a', 'director', 'do', 'not', 'let', 

In [13]:
# Report the size of each of the three splits.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of validation examples: 12500
Number of testing examples: 12500


In [14]:
# Build the label vocabulary from the training set only, then inspect it.
LABEL.build_vocab(train_data)
vars(LABEL.vocab)

{'freqs': Counter({'neg': 12500, 'pos': 12500}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [15]:
# List the attribute names stored on the label vocabulary.
print(vars(LABEL.vocab).keys())

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# Use CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One BucketIterator per split, all sharing the same batch size and device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                           batch_size = BATCH_SIZE, device = device)

In [0]:
from transformers import BertModel

# Load the pretrained 'bert-base-uncased' model; it is passed to BERTGRU below.
bert = BertModel.from_pretrained('bert-base-uncased')

# Model Architecture

In [0]:
import torch.nn as nn

class BERTGRU(nn.Module):
    """Classifier that feeds BERT token embeddings through a GRU and a linear head.

    Args:
        bert: callable module; ``bert(text)[0]`` must be the per-token embedding
            tensor, and ``bert.config.to_dict()['hidden_size']`` its width.
        hidden_dim: hidden size of the GRU.
        output_dim: number of outputs of the final linear layer.
        num_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final hidden state, and
            inside the GRU when it has at least two layers.
    """

    def __init__(self, bert, hidden_dim, output_dim,
                 num_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert
        emb_dim = bert.config.to_dict()['hidden_size']

        # Dropout is only passed to the GRU when it is stacked (>= 2 layers).
        gru_dropout = dropout if num_layers >= 2 else 0
        self.rnn = nn.GRU(emb_dim,
                          hidden_dim,
                          num_layers=num_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=gru_dropout)

        # A bidirectional GRU yields two final hidden states that get concatenated.
        fc_in_features = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(fc_in_features, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map a batch of token ids to one row of outputs per example.

        Args:
            text: tensor of shape (batch_size, sentence_length).

        Returns:
            Tensor of shape (batch_size, output_dim).
        """
        # BERT is used purely as a feature extractor: no gradients flow into it.
        with torch.no_grad():
            embedded = self.bert(text)[0]  # (batch_size, sentence_length, emb_dim)

        # hidden: (num_directions * num_layers, batch_size, hidden_dim)
        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Join the last two hidden-state slices along the feature axis.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        # (batch_size, hidden_dim * num_directions) -> (batch_size, output_dim)
        return self.fc(self.dropout(final_state))

In [0]:
# Hyperparameters for the GRU head placed on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
NUM_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

# Build the model around the pretrained BERT and move it to the chosen device.
model = BERTGRU(bert, HIDDEN_DIM, OUTPUT_DIM,
            NUM_LAYERS, BIDIRECTIONAL, DROPOUT)
model = model.to(device)

In [20]:
def count_parameters(model):
    """Return the total element count of `model`'s parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# NOTE: this runs before the BERT parameters are frozen in the next cell, so the
# reported count still includes BERT's weights.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [0]:
# Freeze every parameter whose name starts with 'bert' so only the remaining
# layers are updated during training.
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

# Define loss, metric and optimizer

In [0]:
import torch.optim as optim

# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Binary cross-entropy computed directly on logits; moved to the same device as the model.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

def binary_accuracy(preds, label):
    """Return the fraction of predictions that match `label`.

    Args:
        preds: logits; they are passed through a sigmoid and rounded before
            being compared with the labels.
        label: target tensor compared element-wise with the rounded predictions.

    Returns:
        A tensor holding the number of matches divided by ``len`` of the
        comparison result.
    """
    hits = (torch.sigmoid(preds).round() == label).float()
    return hits.sum() / len(hits)

# Train Model

In [0]:
import time

def train_model(model, iterator, optimizer, criterion):
    """Run one training epoch over `iterator`, updating `model` via `optimizer`.

    Args:
        model: module called on ``batch.text``; its output is squeezed on dim 1.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(pred, batch.label)``.

    Returns:
        Tuple of (loss averaged over batches, accuracy averaged over batches,
        elapsed seconds for the epoch).
    """
    started_at = time.time()
    model.train()  # training mode, so dropout is active
    num_batches = len(iterator)
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        running_acc += acc.item()

    elapsed = time.time() - started_at

    return running_loss / num_batches, running_acc / num_batches, elapsed

In [0]:
from datetime import datetime

def evaluate_model(model, iterator, criterion):
    """Evaluate `model` on every batch of `iterator` without updating it.

    Args:
        model: module called on ``batch.text``; its output is squeezed on dim 1.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss called as ``criterion(pred, batch.label)``.

    Returns:
        Tuple of (loss averaged over batches, accuracy averaged over batches,
        elapsed seconds for the pass).
    """
    start_time = time.time()
    epoch_loss = 0
    epoch_acc = 0
    num_batches = len(iterator)
    model.eval()  # eval mode, so dropout is disabled

    # No backward pass happens during evaluation, so disable gradient tracking:
    # otherwise an autograd graph is built for every batch, costing memory and
    # time without changing the computed loss or accuracy.
    with torch.no_grad():
        for batch in iterator:
            pred = model(batch.text).squeeze(1)  # forward pass
            loss = criterion(pred, batch.label)
            acc = binary_accuracy(pred, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    end_time = time.time()
    epoch_time_taken = end_time - start_time

    return epoch_loss / num_batches, epoch_acc / num_batches, epoch_time_taken

In [0]:
import time

def convert_time(time_taken):
    """Format a duration in seconds as ``'<m> min and <s> sec'``.

    Args:
        time_taken: duration in seconds (int or float).

    Returns:
        String with whole minutes and the remaining whole seconds, both
        truncated with ``int``.
    """
    minutes = int(time_taken / 60)
    seconds = int(time_taken - (minutes * 60))
    return f'{minutes} min and {seconds} sec'

In [0]:
# Upper bound on the number of training epochs.
MAX_EPOCHS = 10

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_loss = float('inf')
for epoch in range(MAX_EPOCHS):
  # One full pass over the training set, then one over the validation set.
  train_loss, train_acc, train_time = train_model(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc, valid_time = evaluate_model(model, valid_iterator, criterion)
  print(f'Epoch {epoch} took {convert_time(train_time)} for training and {convert_time(valid_time)} for validation')

  # Save the weights whenever the validation loss improves, overwriting 'model.pt'.
  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'model.pt')

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 0 took 20 min and 2 sec for training and 4 min and 14 sec for validation
	Train Loss: 0.437 | Train Acc: 78.47%
	 Val. Loss: 0.246 |  Val. Acc: 89.88%
Epoch 1 took 20 min and 8 sec for training and 4 min and 15 sec for validation
	Train Loss: 0.258 | Train Acc: 89.68%
	 Val. Loss: 0.225 |  Val. Acc: 91.06%
Epoch 2 took 20 min and 11 sec for training and 4 min and 15 sec for validation
	Train Loss: 0.221 | Train Acc: 91.29%
	 Val. Loss: 0.202 |  Val. Acc: 92.02%


# Inference on test set

In [0]:
# model.load_state_dict(torch.load('model.pt'))
# test_loss, test_acc, test_time = evaluate_model(model, test_iterator, criterion)
# print(f'Epoch {epoch} took {convert_time(test_time)} for test')
# print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')