# Data Preparation

In [0]:
import torch
from torchtext import data, datasets
import random

SEED = 1992

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [0]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
print(vars(train_data.examples[0]))

{'text': ['Chalk', 'this', 'one', 'up', 'in', 'the', 'win', 'column', ',', 'this', 'was', 'a', 'superb', 'movie', '.', 'The', 'acting', 'performances', 'were', 'great', 'and', 'the', 'script', 'was', 'equally', 'great.<br', '/><br', '/>Helen', 'Hunt', 'was', 'magnificent', 'as', 'the', 'Riverside', 'police', 'officer', 'Gina', 'Pulasky', '.', 'Gina', 'was', 'a', 'complex', 'character', '.', 'She', 'was', 'a', 'rookie', 'cop', 'with', 'the', 'Riverside', 'Police', 'Dept', '.', 'She', 'ended', 'up', 'in', 'an', 'affair', 'with', 'a', 'coworker', 'that', 'she', 'knew', 'had', 'a', 'wife', 'and', 'kids', ',', 'all', 'the', 'while', 'she', 'took', 'on', 'the', 'dangerous', 'task', 'of', 'going', 'undercover', 'to', 'catch', 'a', 'serial', 'killer', '.', '<', 'br', '/><br', '/>Jeff', 'Fahey', '(', 'the', 'Ray', 'Liotta', 'look', 'alike', ')', 'did', 'a', 'bang', 'up', 'job', 'as', 'the', 'confused', ',', 'often', 'stammering', ',', 'police', 'officer', 'that', 'had', 'an', 'affair', 'with', 

In [0]:
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.seed(SEED)) #Further split test set into validation and test set

In [6]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of validation examples: 12500
Number of testing examples: 12500


In [0]:
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [8]:
vars(LABEL.vocab)

{'freqs': Counter({'neg': 12500, 'pos': 12500}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [9]:
print(f'There are {len(TEXT.vocab)} unique tokens in TEXT vocabulary')
print(f'There are {len(LABEL.vocab)} unique tokens in LABEL vocabulary')

There are 25002 unique tokens in TEXT vocabulary
There are 2 unique tokens in LABEL vocabulary


In [10]:
print(vars(TEXT.vocab).keys())
print(vars(LABEL.vocab).keys())

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])
dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                           batch_size = BATCH_SIZE,
                                                                           device = device)

# Model Architecture

In [0]:
import torch.nn as nn

class baseline_rnn(nn.Module):
  def __init__(self, input_dim, emb_dim, hidden_dim, output_dim):
    super().__init__() # init from super class in nn.module
    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.rnn = nn.RNN(emb_dim, hidden_dim, num_layers = 1, nonlinearity = 'relu', bias=True, bidirectional=False)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
    # text has dimension of (sentence_length, batch_size)
    embedding_output = self.embedding(text) 
    # embedding_output has dimension of (sentence_length, batch_size, emb_dim)
    rnn_output, rnn_hidden = self.rnn(embedding_output) 
    # rnn_output has dimension of (sentence_length, batch_size, num_directions * hidden_dim) where num_directions = 2 if bidirectional RNN
    # rnn_hidden has dimension of (num_directions * num_layers, batch_size, hidden_dim) where num_directions = 2 if bidirectional RNN and num_layers (if it is stacked RNN)
    rnn_hidden = rnn_hidden.squeeze(0)
    # rnn_hidden has dimension of (batch_size, hidden_dim) after squeezeing
    fc_output = self.fc(rnn_output)
    # fc_output has dimension of (batch_size, output_dim) after fc layer

    return fc_output

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMB_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = baseline_rnn(INPUT_DIM, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,299,329 trainable parameters


# Define loss, metric and optimizer

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

def binary_accuracy(preds, label):

    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == label).float() 
    acc = correct.sum() / len(correct)
    return acc

# Train Model

In [0]:
from datetime import datetime

def train_model(model, iterator, optimizer, criterion):
  # start of new epoch 
  start_time = datetime.now()
  epoch_loss = 0
  epoch_acc = 0
  model.train() # set model to train mdoe to enable dropout and batch norm

  for batch in iterator:
    curr_batch_size = vars(batch)['batch_size']
    optimizer.zero_grad() # set optimizer grad to 0 first
    pred = model(batch.text).squeeze(1) # forward prop
    loss = criterion(pred, batch.label) # calculate loss 
    acc = binary_accuracy(pred, label) # calculate metric
    loss.backward() # backward prop
    optimizer.step() # updates optimizer parameters
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  end_time = datetime.now()
  epoch_time_taken = end_time - start_time

  return epoch_loss / curr_batch_size, epoch_acc / curr_batch_size, epoch_time_taken

In [0]:
from datetime import datetime

def evaluate_model(model, iterator, criterion):
  # start of new epoch 
  start_time = datetime.now()
  epoch_loss = 0
  epoch_acc = 0
  model.eval() # set model to eval mode

  for batch in iterator:
    curr_batch_size = vars(batch)['batch_size']
    pred = model(batch.text).squeeze(1) # forward prop
    loss = criterion(pred, batch.label) # calculate loss 
    acc = binary_accuracy(pred, label) # calculate metric
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  end_time = datetime.now()
  epoch_time_taken = end_time - start_time

  return epoch_loss / curr_batch_size, epoch_acc / curr_batch_size, epoch_time_taken

In [0]:
def convert_time(time_taken):
  return f'{time_taken.days} days, {time_taken.min} minutes and {time_taken.seconds} seconds'

In [19]:
MAX_EPOCHS = 10

best_loss = float('inf')
for epoch in range(MAX_EPOCHS):
  train_loss, train_acc, train_time = train_model(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc, valid_time = evaluate_model(model, valid_iterator, criterion)
  print(f'Epoch {epoch} took {convert_time(train_time)} for training and {convert_time(valid_time)} for validation')

  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'baseline_model.pt')

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: ignored