# Data Preparation

In [0]:
import torch
from torchtext import data, datasets
import random

SEED = 1992

# Seed every RNG the notebook relies on. The dataset split below is driven by
# Python's `random` module, so seed it here next to torch rather than relying
# on a later cell to do it as a side effect.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels

# Reviews are tokenized with spaCy; batch_first=True makes batches come out as
# (batch_size, sentence_length), which is the layout the CNN's forward pass expects.
TEXT = data.Field(tokenize = 'spacy', batch_first=True)
# Labels are stored as floats because the loss used later (BCEWithLogitsLoss)
# takes float targets.
LABEL = data.LabelField(dtype = torch.float)

In [2]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve the official test set into equal validation and test halves.
# `split` expects an RNG state (what `random.getstate()` returns), not the
# return value of `random.seed()`, which is always None. Seed first, then pass
# the resulting state so the split is explicitly reproducible.
random.seed(SEED)
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.getstate())

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 21.4MB/s]


In [6]:
# Sanity-check the split sizes: the validation and test sets are the two halves
# of the original IMDB test split.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of validation examples: 12500
Number of testing examples: 12500


In [7]:
# Peek at one tokenized training example. Note that raw HTML line breaks
# survive tokenization (tokens such as '/><br' appear in the text).
print(vars(train_data.examples[0]))

{'text': ['Loosely', 'based', 'on', 'the', 'James', 'J', 'Corbett', 'biography', '"', 'The', 'Roar', 'Of', 'The', 'Crowd', '"', ',', 'Gentleman', 'Jim', 'is', 'a', 'wonderfully', 'breezy', 'picture', 'that', 'perfectly', 'encapsulates', 'not', 'only', 'the', 'rise', 'of', 'the', 'pugilistic', 'prancer', 'that', 'was', 'Corbett', ',', 'but', 'also', 'the', 'wind', 'of', 'change', 'as', 'regards', 'the', 'sport', 'of', 'boxing', 'circa', 'the', '1890s.<br', '/><br', '/>The', 'story', 'follows', 'Corbett', '{', 'a', 'perfectly', 'casted', 'Errol', 'Flynn', '}', 'from', 'his', 'humble', 'beginnings', 'as', 'a', 'bank', 'teller', 'in', 'San', 'Fransico', ',', 'thru', 'to', 'a', 'chance', 'fight', 'with', 'an', 'ex', 'boxing', 'champion', 'that', 'eventually', 'leads', 'to', 'him', 'fighting', 'the', 'fearsome', 'heavyweight', 'champion', 'of', 'the', 'world', ',', 'John', 'L', 'Sullivan', '{', 'beefcake', 'personified', 'delightfully', 'by', 'Ward', 'Bond', '}', '.', 'Not', 'all', 'the', 'f

In [8]:
# Cap the vocabulary at the most frequent training tokens and attach 100-d
# GloVe vectors to it (the width must match EMB_DIM used for the model).
# The vocabulary is built from the training split only, so no validation/test
# tokens leak in.
MAX_VOCAB_SIZE = 25_000
# unk_init is the initializer torchtext applies to vocabulary tokens that have
# no pretrained GloVe vector (here: draw them from a normal distribution).
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                           
100%|█████████▉| 399597/400000 [00:15<00:00, 26159.56it/s]

In [9]:
# Inspect the label vocabulary: 'neg' is encoded as 0 and 'pos' as 1, so a
# sigmoid output close to 1 means "positive".
vars(LABEL.vocab)

{'freqs': Counter({'neg': 12500, 'pos': 12500}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [10]:
# TEXT ends up with MAX_VOCAB_SIZE + 2 entries; the two extras are presumably
# the <unk> and <pad> special tokens (their indices are looked up later).
print(f'There are {len(TEXT.vocab)} unique tokens in TEXT vocabulary')
print(f'There are {len(LABEL.vocab)} unique tokens in LABEL vocabulary')

There are 25002 unique tokens in TEXT vocabulary
There are 2 unique tokens in LABEL vocabulary


In [11]:
# List the attributes each vocab object exposes (frequencies, index<->token
# mappings and the attached vectors).
print(vars(TEXT.vocab).keys())
print(vars(LABEL.vocab).keys())

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])
dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
BATCH_SIZE = 64

# Use the GPU when one is available; batches are created directly on `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split, returned in the same order as the datasets are passed.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                           batch_size = BATCH_SIZE,
                                                                           device = device)

# Model Architecture

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
  """Convolutional sentence classifier over word embeddings.

  Token ids are embedded, passed through one convolution per filter size
  (each kernel spans the full embedding width), ReLU-activated, max-pooled
  over the sentence dimension, concatenated, regularised with dropout and
  projected by a linear layer.

  Args:
    input_dim: vocabulary size (number of embedding rows).
    emb_dim: embedding width.
    num_filters: number of output channels of every convolution.
    filter_sizes: sequence of kernel heights, measured in tokens.
    output_dim: size of the final linear projection.
    dropout: dropout probability applied to the pooled features.
    pad_idx: index of the padding token. It is passed to the embedding as
      `padding_idx` and is also used to right-pad inputs that are shorter
      than the largest filter.
  """

  def __init__(self, input_dim, emb_dim, num_filters,
               filter_sizes, output_dim, dropout, pad_idx):

    super().__init__()
    # Plain attributes (not parameters/buffers), so state_dict keys are unaffected.
    self.pad_idx = pad_idx
    self.min_len = max(filter_sizes, default = 1)

    self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)

    self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1,
                                out_channels = num_filters,
                                kernel_size = (fs, emb_dim))
                                for fs in filter_sizes
                                ])

    self.fc = nn.Linear(num_filters * len(filter_sizes) , output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    """Map token ids of shape (batch_size, sentence_length) to scores of
    shape (batch_size, output_dim). No sigmoid is applied here."""
    # A convolution cannot slide over a sentence shorter than its kernel, so
    # right-pad short batches with the padding token up to the largest filter.
    shortfall = self.min_len - text.shape[1]
    if shortfall > 0 and self.pad_idx is not None:
      padding = text.new_full((text.shape[0], shortfall), self.pad_idx)
      text = torch.cat([text, padding], dim = 1)

    embedding_output = self.embedding(text)
    # (batch_size, sentence_length, emb_dim)
    embedding_output = embedding_output.unsqueeze(1)
    # (batch_size, 1, sentence_length, emb_dim): a single input channel for Conv2d

    convs_output = [F.relu(conv(embedding_output).squeeze(3)) for conv in self.convs]
    # each tensor: (batch_size, num_filters, sentence_length - filter_size + 1)

    # Max-over-time pooling: the kernel covers the whole remaining length.
    pools_output = [F.max_pool1d(conv_output, kernel_size = conv_output.shape[-1]).squeeze(2) for conv_output in convs_output]
    # each tensor: (batch_size, num_filters)

    concat = torch.cat(pools_output, dim = 1)
    concat = self.dropout(concat)
    # (batch_size, num_filters * len(filter_sizes))

    fc_output = self.fc(concat)
    # (batch_size, output_dim)

    return fc_output

In [0]:
# Model hyperparameters. EMB_DIM has to equal the width of the pretrained
# vectors copied into the embedding layer afterwards.
INPUT_DIM = len(TEXT.vocab)
EMB_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Build the network and place it on the selected device in one step.
model = CNN(
    input_dim = INPUT_DIM,
    emb_dim = EMB_DIM,
    num_filters = NUM_FILTERS,
    filter_sizes = FILTER_SIZES,
    output_dim = OUTPUT_DIM,
    dropout = DROPOUT,
    pad_idx = PAD_IDX,
).to(device)

In [47]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters with `requires_grad` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size; most of it is the vocabulary-sized embedding matrix.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,620,801 trainable parameters


In [48]:
# Replace the randomly initialised embedding matrix with the pretrained vectors
# attached to TEXT.vocab; the shape must be (len(TEXT.vocab), EMB_DIM).
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
# copy_ writes in place; being the cell's last expression, its return value
# (the updated weight tensor) is what gets echoed as the cell output.
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([25002, 100])


tensor([[-0.3358, -0.0554,  0.3294,  ...,  0.8827,  0.0913, -2.2403],
        [-0.0548, -0.7422,  0.8423,  ..., -0.6081,  1.1196, -1.0543],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5826, -0.1010,  1.1440,  ...,  0.1084,  0.8885, -0.2234],
        [-0.6728,  1.0572,  0.2273,  ..., -0.4935, -1.3922, -1.3847],
        [ 0.1171, -1.1903, -1.0085,  ..., -1.1771,  1.2799,  0.5319]],
       device='cuda:0')

In [49]:
# The <unk> and <pad> rows carry no pretrained meaning, so reset both of them
# to all-zero vectors after the pretrained weights were copied in.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
embedding_weights = model.embedding.weight.data
embedding_weights[UNK_IDX].zero_()
embedding_weights[PAD_IDX].zero_()
print(embedding_weights)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.5826, -0.1010,  1.1440,  ...,  0.1084,  0.8885, -0.2234],
        [-0.6728,  1.0572,  0.2273,  ..., -0.4935, -1.3922, -1.3847],
        [ 0.1171, -1.1903, -1.0085,  ..., -1.1771,  1.2799,  0.5319]],
       device='cuda:0')


# Define loss, metric and optimizer

In [0]:
import torch.optim as optim

# Loss works on raw logits (sigmoid is folded into the loss), which is why the
# model's forward pass does not apply a sigmoid itself.
criterion = nn.BCEWithLogitsLoss().to(device)
# Adam over every model parameter, embeddings included.
optimizer = optim.Adam(model.parameters(), lr=0.001)

def binary_accuracy(preds, label):
    """Fraction of predictions that match `label`.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with `label`. The result is a
    0-dim tensor (number of matches divided by the size of the first dimension).
    """
    predicted_classes = preds.sigmoid().round()
    hits = predicted_classes.eq(label).float()
    return hits.sum() / len(hits)

# Train Model

In [0]:
import time

def train_model(model, iterator, optimizer, criterion):
  """Run one training epoch over `iterator`.

  Every batch must expose `.text` and `.label`. Returns a tuple
  (mean loss, mean accuracy, elapsed seconds), where both means are plain
  averages of the per-batch values over the number of batches.
  """
  epoch_start = time.time()
  model.train() # training mode, so dropout is active
  num_batches = len(iterator)
  running_loss, running_acc = 0, 0

  for batch in iterator:
    optimizer.zero_grad() # drop gradients left over from the previous step
    logits = model(batch.text).squeeze(1) # forward pass -> (batch_size,)
    batch_loss = criterion(logits, batch.label)
    batch_acc = binary_accuracy(logits, batch.label)
    batch_loss.backward() # back-propagate
    optimizer.step() # apply the parameter update
    running_loss += batch_loss.item()
    running_acc += batch_acc.item()

  elapsed = time.time() - epoch_start

  return running_loss / num_batches, running_acc / num_batches, elapsed

In [0]:
from datetime import datetime

def evaluate_model(model, iterator, criterion):
  """Evaluate `model` on every batch of `iterator` without updating it.

  Every batch must expose `.text` and `.label`. Returns a tuple
  (mean loss, mean accuracy, elapsed seconds), where both means are plain
  averages of the per-batch values over the number of batches.
  """
  start_time = time.time()
  epoch_loss = 0
  epoch_acc = 0
  num_batches = len(iterator)
  model.eval() # evaluation mode, so dropout is disabled

  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time and leaves the returned numbers unchanged.
  with torch.no_grad():
    for batch in iterator:
      pred = model(batch.text).squeeze(1) # forward pass -> (batch_size,)
      loss = criterion(pred, batch.label)
      acc = binary_accuracy(pred, batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()

  end_time = time.time()
  epoch_time_taken = end_time - start_time

  return epoch_loss / num_batches, epoch_acc / num_batches, epoch_time_taken

In [0]:
import time

def convert_time(time_taken):
    """Format a duration in seconds as '<m> min and <s> sec' (both truncated to whole numbers)."""
    minutes = int(time_taken / 60)
    seconds = int(time_taken - 60 * minutes)
    return f'{minutes} min and {seconds} sec'

In [54]:
MAX_EPOCHS = 10

# Train for a fixed number of epochs and checkpoint on validation loss, so the
# weights saved to 'model.pt' are the ones from the best epoch rather than the
# last one (validation loss starts rising again after a few epochs while the
# training loss keeps falling).
best_loss = float('inf')
for epoch in range(MAX_EPOCHS):
  train_loss, train_acc, train_time = train_model(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc, valid_time = evaluate_model(model, valid_iterator, criterion)
  print(f'Epoch {epoch} took {convert_time(train_time)} for training and {convert_time(valid_time)} for validation')

  # Keep only the checkpoint with the lowest validation loss seen so far.
  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'model.pt')

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 0 took 0 min and 25 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.615 | Train Acc: 64.69%
	 Val. Loss: 0.428 |  Val. Acc: 81.08%
Epoch 1 took 0 min and 24 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.369 | Train Acc: 83.66%
	 Val. Loss: 0.336 |  Val. Acc: 85.34%
Epoch 2 took 0 min and 24 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.266 | Train Acc: 89.24%
	 Val. Loss: 0.330 |  Val. Acc: 85.84%
Epoch 3 took 0 min and 24 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.190 | Train Acc: 92.70%
	 Val. Loss: 0.328 |  Val. Acc: 86.39%
Epoch 4 took 0 min and 25 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.135 | Train Acc: 95.14%
	 Val. Loss: 0.365 |  Val. Acc: 86.03%
Epoch 5 took 0 min and 24 sec for training and 0 min and 1 sec for validation
	Train Loss: 0.097 | Train Acc: 96.59%
	 Val. Loss: 0.402 |  Val. Acc: 85.83%
Epoch 6 took 0 min and 25 sec for training and 0 min and 1 sec f

# Inference on test set

In [55]:
# Restore the checkpoint with the lowest validation loss before touching the
# test set. map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load('model.pt', map_location = device))
test_loss, test_acc, test_time = evaluate_model(model, test_iterator, criterion)
# The message deliberately does not mention an epoch number: the loaded weights
# belong to the best epoch, not to whatever value the training loop variable last held.
print(f'Test evaluation took {convert_time(test_time)}')
print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')

Epoch 9 took 0 min and 1 sec for test
	 Test. Loss: 0.316 |  Test. Acc: 86.88%


# User Input

In [0]:
import spacy
# spaCy pipeline used only for its tokenizer when scoring free-form input.
nlp = spacy.load('en')

def predict_sentiment(model, sentence, min_len = 5):
    """Score a single raw sentence with the trained model.

    Args:
        model: trained classifier that maps a (1, length) tensor of token ids to one logit.
        sentence: raw text, e.g. 'i love this film so much'.
        min_len: minimum number of tokens fed to the model; shorter inputs are
            right-padded. It should be at least the largest convolution filter size.

    Returns:
        The sigmoid of the model output as a Python float (close to 1 means
        positive, close to 0 means negative).
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        # Pad with the field's own pad token so it always agrees with the vocabulary.
        tokenized += [TEXT.pad_token] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0) # add the batch dimension -> (1, length)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [63]:
# Smoke test on a clearly positive sentence; the score should be close to 1.
predict_sentiment(model, "This film is amazing")

0.9917160868644714

In [65]:
# Smoke test on a clearly negative sentence; the score should fall below 0.5.
predict_sentiment(model, "This film is truly terrible")

0.39685243368148804