# Data Preparation

In [0]:
def generate_bigrams(x):
    """Append every distinct bigram of the token list `x` to `x` and return it.

    A bigram is two adjacent tokens joined by a single space. Each distinct
    bigram is appended once, in order of first occurrence.

    Args:
        x: list of string tokens. It is modified in place.

    Returns:
        The same list object, now holding the original tokens followed by the
        bigram strings. Lists with fewer than two tokens are returned unchanged.
    """
    # dict.fromkeys de-duplicates like a set but keeps first-occurrence order,
    # so the appended order no longer depends on string-hash randomization.
    # The bigrams are collected before anything is appended to `x`.
    bigrams = dict.fromkeys(zip(x, x[1:]))
    x.extend(' '.join(bigram) for bigram in bigrams)
    return x

In [0]:
import torch
from torchtext import data, datasets
import random

# Single seed reused for the PyTorch RNG here and for the validation/test split further down.
SEED = 1992

torch.manual_seed(SEED)
# Request deterministic cuDNN behaviour so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

# TEXT: review text is tokenized with spaCy, then generate_bigrams is applied to each token list.
TEXT = data.Field(tokenize = 'spacy', preprocessing = generate_bigrams)
# LABEL: sentiment label, stored as a float tensor (it is fed to BCEWithLogitsLoss later on).
LABEL = data.LabelField(dtype = torch.float)

In [4]:
# Load the IMDB train and test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 10.6MB/s]


In [5]:
# Sanity check: size of each split as loaded.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [6]:
# Show the attributes of the first training example as a dict.
print(vars(train_data.examples[0]))

{'text': ['Chalk', 'this', 'one', 'up', 'in', 'the', 'win', 'column', ',', 'this', 'was', 'a', 'superb', 'movie', '.', 'The', 'acting', 'performances', 'were', 'great', 'and', 'the', 'script', 'was', 'equally', 'great.<br', '/><br', '/>Helen', 'Hunt', 'was', 'magnificent', 'as', 'the', 'Riverside', 'police', 'officer', 'Gina', 'Pulasky', '.', 'Gina', 'was', 'a', 'complex', 'character', '.', 'She', 'was', 'a', 'rookie', 'cop', 'with', 'the', 'Riverside', 'Police', 'Dept', '.', 'She', 'ended', 'up', 'in', 'an', 'affair', 'with', 'a', 'coworker', 'that', 'she', 'knew', 'had', 'a', 'wife', 'and', 'kids', ',', 'all', 'the', 'while', 'she', 'took', 'on', 'the', 'dangerous', 'task', 'of', 'going', 'undercover', 'to', 'catch', 'a', 'serial', 'killer', '.', '<', 'br', '/><br', '/>Jeff', 'Fahey', '(', 'the', 'Ray', 'Liotta', 'look', 'alike', ')', 'did', 'a', 'bang', 'up', 'job', 'as', 'the', 'confused', ',', 'often', 'stammering', ',', 'police', 'officer', 'that', 'had', 'an', 'affair', 'with', 

In [0]:
# Further split the original test set 50/50 into a validation set and a test set.
# `random.seed()` returns None, so the original call passed random_state=None.
# Seed the global RNG first, then hand its state to split() explicitly.
# NOTE: this cell rebinds `test_data`; re-running it without re-running the
# IMDB load cell would split the already-halved test set again.
random.seed(SEED)
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.getstate())

In [8]:
# Sanity check: split sizes after carving the validation set out of the test set.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of validation examples: 12500
Number of testing examples: 12500


In [9]:
# Upper bound passed to build_vocab as max_size for the TEXT vocabulary.
MAX_VOCAB_SIZE = 25_000
# Build both vocabularies from the training split only.
# TEXT also attaches the "glove.6B.100d" pretrained vectors; unk_init
# (torch.Tensor.normal_) is the initializer given for tokens that lack one.
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]
100%|█████████▉| 399338/400000 [00:22<00:00, 16411.89it/s]

In [10]:
# Inspect the label vocabulary (frequencies and the index <-> string mappings).
vars(LABEL.vocab)

{'freqs': Counter({'neg': 12500, 'pos': 12500}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [11]:
# Report the number of entries in each vocabulary.
print(f'There are {len(TEXT.vocab)} unique tokens in TEXT vocabulary')
print(f'There are {len(LABEL.vocab)} unique tokens in LABEL vocabulary')

There are 25002 unique tokens in TEXT vocabulary
There are 2 unique tokens in LABEL vocabulary


In [12]:
# List the attribute names stored on each vocabulary object.
print(vars(TEXT.vocab).keys())
print(vars(LABEL.vocab).keys())

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])
dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split; batches are created on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                           batch_size = BATCH_SIZE,
                                                                           device = device)

# Model Architecture

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class fast_text(nn.Module):
  """FastText-style classifier: embed tokens, average over the sequence, apply one linear layer.

  Args:
    input_dim: number of rows in the embedding table (vocabulary size).
    emb_dim: width of each embedding vector.
    output_dim: number of outputs of the final linear layer.
    dropout: dropout probability applied to the embedded tokens.
    pad_idx: index of the padding token, forwarded to nn.Embedding as padding_idx.
  """

  def __init__(self, input_dim, emb_dim, output_dim, dropout, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
    self.fc = nn.Linear(emb_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Map token indices of shape (sentence_length, batch_size) to logits of shape (batch_size, output_dim)."""
    # (sentence_length, batch_size) -> (sentence_length, batch_size, emb_dim)
    embedded = self.dropout(self.embedding(text))
    # -> (batch_size, sentence_length, emb_dim)
    embedded = embedded.permute(1, 0, 2)
    # Average over the whole sentence axis -> (batch_size, 1, emb_dim)
    pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1))
    # Drop only the pooled axis. A bare squeeze() would also remove the batch
    # axis when batch_size == 1 and change the output rank.
    pooled = pooled.squeeze(1)
    # (batch_size, emb_dim) -> (batch_size, output_dim)
    return self.fc(pooled)

In [0]:
# Hyperparameters for the classifier.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMB_DIM = 100  # matches the width of the "glove.6B.100d" vectors loaded into TEXT.vocab
OUTPUT_DIM = 1  # a single logit per review
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # vocabulary index of the padding token

# Instantiate the model and move it to the selected device.
model = fast_text(INPUT_DIM, EMB_DIM, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = model.to(device)

In [26]:
def count_parameters(model):
    """Return the total element count over the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,301 trainable parameters


In [27]:
# Replace the embedding layer's initial weights with the vectors attached to TEXT.vocab.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
# copy_ writes into the existing weight tensor in place.
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([25002, 100])


tensor([[-0.3358, -0.0554,  0.3294,  ...,  0.8827,  0.0913, -2.2403],
        [-0.0548, -0.7422,  0.8423,  ..., -0.6081,  1.1196, -1.0543],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3432, -0.4694,  1.3178,  ..., -0.9612,  0.5312,  0.8897],
        [-0.0935, -1.1532,  0.1853,  ...,  0.8103, -0.8533, -1.4943],
        [-1.5623, -1.3385,  0.2712,  ...,  0.4855, -1.3892, -1.7497]],
       device='cuda:0')

In [28]:
# Zero the embedding rows of the unknown and padding tokens so they start
# at zero rather than at the values copied in from the pretrained table.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMB_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMB_DIM)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.3432, -0.4694,  1.3178,  ..., -0.9612,  0.5312,  0.8897],
        [-0.0935, -1.1532,  0.1853,  ...,  0.8103, -0.8533, -1.4943],
        [-1.5623, -1.3385,  0.2712,  ...,  0.4855, -1.3892, -1.7497]],
       device='cuda:0')


# Define loss, metric and optimizer

In [0]:
import torch.optim as optim

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Binary cross-entropy on raw logits; the model's forward pass applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

def binary_accuracy(preds, label):
    """Return the fraction of predictions that equal `label`.

    `preds` are raw logits; they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with `label`.
    """
    predicted_classes = torch.sigmoid(preds).round()
    hits = predicted_classes.eq(label).float()
    return hits.sum() / len(hits)

# Train Model

In [0]:
import time

def train_model(model, iterator, optimizer, criterion):
  """Run one training epoch over `iterator`.

  Returns:
    (mean per-batch loss, mean per-batch accuracy, elapsed seconds). The means
    are taken over the number of batches, i.e. len(iterator).
  """
  tic = time.time()
  model.train()  # train mode, so dropout is active
  num_batches = len(iterator)
  batch_losses = []
  batch_accs = []

  for batch in iterator:
    optimizer.zero_grad()  # clear gradients left over from the previous step
    logits = model(batch.text).squeeze(1)  # forward pass
    batch_loss = criterion(logits, batch.label)
    batch_acc = binary_accuracy(logits, batch.label)
    batch_loss.backward()  # backward pass
    optimizer.step()  # update the model parameters
    batch_losses.append(batch_loss.item())
    batch_accs.append(batch_acc.item())

  elapsed = time.time() - tic

  return sum(batch_losses) / num_batches, sum(batch_accs) / num_batches, elapsed

In [0]:
from datetime import datetime

def evaluate_model(model, iterator, criterion):
  """Evaluate `model` on every batch of `iterator` without updating it.

  Returns:
    (mean per-batch loss, mean per-batch accuracy, elapsed seconds). The means
    are taken over the number of batches, i.e. len(iterator).
  """
  start_time = time.time()
  epoch_loss = 0
  epoch_acc = 0
  num_batches = len(iterator)
  model.eval()  # eval mode, so dropout is disabled

  # No gradients are needed for evaluation; skipping graph construction
  # saves memory and time.
  with torch.no_grad():
    for batch in iterator:
      pred = model(batch.text).squeeze(1)  # forward pass
      loss = criterion(pred, batch.label)
      acc = binary_accuracy(pred, batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  end_time = time.time()
  epoch_time_taken = end_time - start_time

  return epoch_loss / num_batches, epoch_acc / num_batches, epoch_time_taken

In [0]:
import time

def convert_time(time_taken):
    """Format a duration in seconds as '<m> min and <s> sec', truncating to whole units."""
    mins = int(time_taken / 60)
    secs = int(time_taken - 60 * mins)
    return f'{mins} min and {secs} sec'

In [33]:
# Number of passes over the training set.
MAX_EPOCHS = 10

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_loss = float('inf')
for epoch in range(MAX_EPOCHS):
  # One pass over the training data, then a pass over the validation data.
  train_loss, train_acc, train_time = train_model(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc, valid_time = evaluate_model(model, valid_iterator, criterion)
  print(f'Epoch {epoch} took {convert_time(train_time)} for training and {convert_time(valid_time)} for validation')

  # Checkpoint only when validation loss improves, so 'model.pt' always holds the best weights so far.
  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'model.pt')

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 0 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.685 | Train Acc: 58.61%
	 Val. Loss: 0.610 |  Val. Acc: 69.69%
Epoch 1 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.627 | Train Acc: 74.87%
	 Val. Loss: 0.464 |  Val. Acc: 77.98%
Epoch 2 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.535 | Train Acc: 81.68%
	 Val. Loss: 0.392 |  Val. Acc: 82.99%
Epoch 3 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.451 | Train Acc: 85.55%
	 Val. Loss: 0.381 |  Val. Acc: 85.39%
Epoch 4 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.391 | Train Acc: 87.53%
	 Val. Loss: 0.390 |  Val. Acc: 86.65%
Epoch 5 took 0 min and 15 sec for training and 0 min and 2 sec for validation
	Train Loss: 0.349 | Train Acc: 88.89%
	 Val. Loss: 0.407 |  Val. Acc: 87.30%
Epoch 6 took 0 min and 15 sec for training and 0 min and 2 sec f

# Inference on test set

In [35]:
# Restore the checkpoint saved at the lowest validation loss, mapping its tensors
# onto the current device so it also loads on a machine without the original GPU.
model.load_state_dict(torch.load('model.pt', map_location = device))
test_loss, test_acc, test_time = evaluate_model(model, test_iterator, criterion)
# The test pass is not a training epoch, so the message no longer prints the
# leftover `epoch` variable from the training loop.
print(f'Test evaluation took {convert_time(test_time)}')
print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')

Epoch 9 took 0 min and 2 sec for test
	 Test. Loss: 0.392 |  Test. Acc: 84.92%


# User Input

In [0]:
import spacy
# spaCy English pipeline; predict_sentiment uses only its tokenizer.
# NOTE(review): the 'en' shortcut name may not be accepted by newer spaCy releases — confirm against the installed version.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Return the model's sigmoid output for a raw sentence, a float between 0 and 1.

    The sentence is tokenized with spaCy, extended with bigrams by
    generate_bigrams, and mapped to indices through TEXT.vocab. Given the
    label vocabulary built earlier ('neg' -> 0, 'pos' -> 1), values near 1
    mean positive.

    Args:
        model: the classifier to query.
        sentence: raw text, e.g. 'i love this film so much'.

    Raises:
        ValueError: if the sentence yields no tokens.
    """
    model.eval()
    tokens = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    if not tokens:
        # A zero-length sequence cannot be average-pooled; fail with a clear message.
        raise ValueError('sentence must contain at least one token')
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    # (sentence_length,) -> (sentence_length, 1): a batch holding one sentence
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    with torch.no_grad():  # inference only, no autograd graph needed
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [37]:
# Quick check on a clearly positive sentence.
predict_sentiment(model, "This film is amazing")

1.0

In [38]:
# Negation probe: the recorded score is close to 0 (negative) even though the sentence is mildly positive.
# NOTE(review): presumably the averaged embeddings are dominated by 'awful' — not verified here.
predict_sentiment(model, "This film is not awful")

3.6655790609074757e-05