# Data Preparation

In [0]:
import torch
from torchtext import data, datasets
import random

SEED = 1992

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy', batch_first=True)
LABEL = data.LabelField()

In [2]:
train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.seed(SEED)) #Further split test set into validation and test set

downloading train_5500.label


train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 868kB/s]


downloading TREC_10.label


TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 297kB/s]


In [3]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 5452
Number of validation examples: 250
Number of testing examples: 250


In [4]:
print(vars(train_data.examples[0]))

{'text': ['How', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'Russia', '?'], 'label': 'DESC'}


In [5]:
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 399287/400000 [00:22<00:00, 18263.59it/s]

In [6]:
vars(LABEL.vocab)

{'freqs': Counter({'ABBR': 86,
          'DESC': 1162,
          'ENTY': 1250,
          'HUM': 1223,
          'LOC': 835,
          'NUM': 896}),
 'itos': ['ENTY', 'HUM', 'DESC', 'NUM', 'LOC', 'ABBR'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'ABBR': 5, 'DESC': 2, 'ENTY': 0, 'HUM': 1, 'LOC': 4, 'NUM': 3}),
 'vectors': None}

In [7]:
print(f'There are {len(TEXT.vocab)} unique tokens in TEXT vocabulary')
print(f'There are {len(LABEL.vocab)} unique tokens in LABEL vocabulary')

There are 9343 unique tokens in TEXT vocabulary
There are 6 unique tokens in LABEL vocabulary


In [8]:
print(vars(TEXT.vocab).keys())
print(vars(LABEL.vocab).keys())

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])
dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
                                                                           batch_size = BATCH_SIZE,
                                                                           device = device)

# Model Architecture

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
  def __init__(self, input_dim, emb_dim, num_filters,
               filter_sizes, output_dim, dropout, pad_idx):
    
    super().__init__() # init from super class in nn.module
    self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)

    self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1,
                                out_channels = num_filters,
                                kernel_size = (fs, emb_dim))
                                for fs in filter_sizes
                                ])

    self.fc = nn.Linear(num_filters * len(filter_sizes) , output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, text):
    # text has dimension of (batch_size, sentence_length)
    embedding_output = self.embedding(text)
    # embedding_output has dimension of (batch_size, sentence_length, emb_dim)
    embedding_output = embedding_output.unsqueeze(1)
    # embedding_output has dimension of (batch_size, 1, sentence_length, emb_dim)

    convs_output = [F.relu(conv(embedding_output).squeeze(3)) for conv in self.convs]
    # Each of the tensor in convs_output has dimension of (batch_size, num_filters, sent_len - filter_sizes[N] + 1)

    pools_output = [F.max_pool1d(conv_output, kernel_size = conv_output.shape[-1]).squeeze(2) for conv_output in convs_output]
    # Each of the tensor in pools_output has dimension of (batch_size, num_filters) after squeezing

    concat = torch.cat(pools_output, dim = 1)
    concat = self.dropout(concat)
    # concat has dimension of (batch_size, num_filters * len(filter_sizes))

    fc_output = self.fc(concat)
    # fc_output has dimension of (batch_size, output_dim) after fc layer

    return fc_output

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMB_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMB_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model = model.to(device)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,026,406 trainable parameters


In [14]:
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([9343, 100])


tensor([[-0.3358, -0.0554,  0.3294,  ...,  0.8827,  0.0913, -2.2403],
        [-0.0548, -0.7422,  0.8423,  ..., -0.6081,  1.1196, -1.0543],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.2906,  0.3217,  0.2419,  ..., -0.9444, -0.3790,  0.6196],
        [-1.1760,  0.5552,  0.9607,  ...,  1.1343,  0.5196, -0.3769]],
       device='cuda:0')

In [15]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMB_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMB_DIM)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.2906,  0.3217,  0.2419,  ..., -0.9444, -0.3790,  0.6196],
        [-1.1760,  0.5552,  0.9607,  ...,  1.1343,  0.5196, -0.3769]],
       device='cuda:0')


# Define loss, metric and optimizer

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

# Train Model

In [0]:
import time

def train_model(model, iterator, optimizer, criterion):
  # start of new epoch 
  start_time = time.time()
  epoch_loss = 0
  epoch_acc = 0
  model.train() # set model to train mdoe to enable dropout and batch norm
  batch_size = len(iterator)

  for batch in iterator:
    optimizer.zero_grad() # set optimizer grad to 0 first
    pred = model(batch.text) # forward prop
    loss = criterion(pred, batch.label) # calculate loss 
    acc = categorical_accuracy(pred, batch.label) # calculate metric
    loss.backward() # backward prop
    optimizer.step() # updates optimizer parameters
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  end_time = time.time()
  epoch_time_taken = end_time - start_time

  return epoch_loss / batch_size, epoch_acc / batch_size, epoch_time_taken

In [0]:
from datetime import datetime

def evaluate_model(model, iterator, criterion):
  # start of new epoch 
  start_time = time.time()
  epoch_loss = 0
  epoch_acc = 0
  batch_size = len(iterator)
  model.eval() # set model to eval mode

  for batch in iterator:
    pred = model(batch.text) # forward prop
    loss = criterion(pred, batch.label) # calculate loss 
    acc = categorical_accuracy(pred, batch.label) # calculate metric
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  end_time = time.time()
  epoch_time_taken = end_time - start_time

  return epoch_loss / batch_size, epoch_acc / batch_size, epoch_time_taken

In [0]:
import time

def convert_time(time_taken):
    elapsed_mins = int(time_taken / 60)
    elapsed_secs = int(time_taken - (elapsed_mins * 60))
    output = f'{elapsed_mins} min and {elapsed_secs} sec'
    return output

In [20]:
MAX_EPOCHS = 10

best_loss = float('inf')
for epoch in range(MAX_EPOCHS):
  train_loss, train_acc, train_time = train_model(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc, valid_time = evaluate_model(model, valid_iterator, criterion)
  print(f'Epoch {epoch} took {convert_time(train_time)} for training and {convert_time(valid_time)} for validation')

  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'model.pt')

  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 0 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 1.199 | Train Acc: 53.40%
	 Val. Loss: 0.787 |  Val. Acc: 72.43%
Epoch 1 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 0.765 | Train Acc: 72.58%
	 Val. Loss: 0.599 |  Val. Acc: 79.39%
Epoch 2 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 0.550 | Train Acc: 80.79%
	 Val. Loss: 0.467 |  Val. Acc: 85.88%
Epoch 3 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 0.395 | Train Acc: 86.88%
	 Val. Loss: 0.403 |  Val. Acc: 87.14%
Epoch 4 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 0.301 | Train Acc: 89.81%
	 Val. Loss: 0.374 |  Val. Acc: 88.78%
Epoch 5 took 0 min and 0 sec for training and 0 min and 0 sec for validation
	Train Loss: 0.228 | Train Acc: 93.04%
	 Val. Loss: 0.353 |  Val. Acc: 88.78%
Epoch 6 took 0 min and 0 sec for training and 0 min and 0 sec for vali

# Inference on test set

In [21]:
model.load_state_dict(torch.load('model.pt'))
test_loss, test_acc, test_time = evaluate_model(model, test_iterator, criterion)
print(f'Epoch {epoch} took {convert_time(test_time)} for test')
print(f'\t Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')

Epoch 9 took 0 min and 0 sec for test
	 Test. Loss: 0.278 |  Test. Acc: 91.08%


# User Input

In [0]:
import spacy
nlp = spacy.load('en')

def predict_class(model, sentence, min_len = 4):

    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    pred = model(tensor)
    prediction = pred.argmax(dim=1)
    return prediction.item()

In [37]:
pred_class = predict_class(model, "Who is Justin Bieber?")
print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')

Predicted class is: 1 = HUM


In [36]:
pred_class = predict_class(model, "How many apples does she have?")
print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')

Predicted class is: 3 = NUM


In [32]:
pred_class = predict_class(model, "What continent is Singapore in?")
print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')

Predicted class is: 4 = LOC


In [33]:
pred_class = predict_class(model, "What does LOL stand for?")
print(f'Predicted class is: {pred_class} = {LABEL.vocab.itos[pred_class]}')

Predicted class is: 5 = ABBR
