In [1]:
!pip install portalocker
!pip install torchmetrics

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.0


Fill in the code below with the appropriate logic to make this notebook work. You will pull the GLOVE embeddings and then create a MLP text classifier for the AG_NEWS dataset. There are many uttilities used here. You need to create a vocabulary, allow the model to update the GLOVE embeddings with gradients or not. Additionally, you might not use the GLOVE embeddings. GLOVE is a pretrained set of embeddings much like Word2Vec. Each word has a unique embedding but the objective function is different for GLOVE.

In [2]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe
from tqdm import tqdm

torch.autograd.set_detect_anomaly(True)

FILL = '_FILL_'

### Information
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html

### Constants

In [3]:
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Set to 300 since GLOVE embeddings are dimension 300
EMBED_DIM = 300
LR = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE

In [4]:
DEVICE

'cuda'

### Get the tokenizer
- Use the WordLevel tokenizer.


In [5]:
# Get the basic english tokenizer
basic_english_tokenizer =get_tokenizer("basic_english")

In [6]:
basic_english_tokenizer("This is some text ...")

['this', 'is', 'some', 'text', '.', '.', '.']

In [7]:
# Needed later
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary

In [8]:
# This takes a (x, y) pair from data_iter and returns tokenized list of words
# For every sentence, it should yield a tokenized set of string
# "a b c d" -> ["a", "b", "c", "d"]
# Use TOKENIZER here
def yield_tokens(data_iter):
  tokens =[]
  for _, text in data_iter:
        tokens=tokens +[TOKENIZER(text)]
  return tokens

In [9]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
# Build the vocabulary from the above iterator
# Use special symbols '<pad>' and '<unk>'
VOCAB = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=('<pad>', '<unk>'),
    min_freq = 15
)
# Make the default index the same as that of the '<unk>'
rare=VOCAB['<unk>']
VOCAB.set_default_index(rare)

In [10]:
print(len(VOCAB))

16338


### Get GLOVE embeddings
GLOVE is a pretrained set of word vectors that comes with torchtext; it is roughly 2G of data.

In [11]:
# This will take a bit of time
GLOVE = GloVe()

.vector_cache/glove.840B.300d.zip: 2.18GB [06:54, 5.25MB/s]                            
100%|█████████▉| 2196016/2196017 [05:59<00:00, 6106.42it/s]


In [12]:
# How many word vectors are there and what is the shape of GLOVE.vectors?
# What is the dimension of each vector?
len(GLOVE), GLOVE.vectors.shape

(2196017, torch.Size([2196017, 300]))

### Helper functions

In [13]:
# For text, return the tokens for each word
# This might work like "The man walks" -> ["the", "man", "walks"] -> [17, 123, 5]
def text_pipeline(text):
  text_ =TOKENIZER(text)
  itos=VOCAB(text_)
  return itos

# Return the label as an integer between 0 and num_classes - 1 inclusive
def label_pipeline(label):
    label=label -1
    return int(label)

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [14]:
# What does this do?
# For each batch, return a tensor of labels and a tensor of ints representing the words in associated sentences
def collate_batch(batch):
    label_list, text_list = [], []
    # For each label and text in a batch, transform to a tensor of labels
    # And a tensor of tokens for each word
    # You need text_pipeline here and
    for (_label, _text) in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))

        # Return a list of ints
        # This basically returns a tensor of the token ids and then puts them into text_list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text.clone().detach())

    # Make a tensor from the label_list
    label_tensor = torch.tensor(label_list, dtype=torch.int64)
    # Pad the sequence list and return a tensor
    text_tensor = pad_sequence(text_list, batch_first=True)

    return label_tensor.to(DEVICE), text_tensor.to(DEVICE)

### Get the data

In [15]:
# Pull the train_iter as above
#train_iter = to_map_style_dataset(train_iter)
train_iter = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# Get the number of classes
# Loop over the train_iter and get the total integer number
num_class = len(set([label for (label, _) in train_iter])) # <-- this doesn't decompose the list right
num_class = 4
# What are the classes?
print(f"The number of classes is {num_class} ...")

The number of classes is 4 ...


In [16]:
for yb, xb in train_iter:
    print(yb.shape, xb.shape)
    print(xb)
    break

torch.Size([64]) torch.Size([64, 77])
tensor([[ 5792,  2495,  1977,  ...,     0,     0,     0],
        [  287, 10383,    71,  ...,     0,     0,     0],
        [  371,    77,  3335,  ...,     0,     0,     0],
        ...,
        [    1,     5,   383,  ...,     0,     0,     0],
        [   89,     4,   176,  ...,     0,     0,     0],
        [   13,     3,    51,  ...,     0,     0,     0]], device='cuda:0')


### Set up the model

The goal of this problem is to construct a MLP neural classifier for text classification.

In [23]:
# Fill in the comments below
class MLPTextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        num_class,
        embed_dim = 300,
        # If this is true, we will use the pretrained GLOVE embeddings
        use_pretrained = True,
        # If this is true, we will allow gradient updates for the GLOVE embeddings
        # If  this is false, the GLOVE embeddings are static
        fine_tune_embeddings = True
    ):

        super(MLPTextClassificationModel, self).__init__()

        # Set to a nn.Embedding layer with vocab_size words and embed_dim dimension per word
        # Set the padding index to PADDING_IDX
        # This layer is like a linear layer but without the bias
        # Essentially, it is a Matrix sich that A * [2, 3] pulls out two vectors
        # The vectors we get are for the words with tokens 2 and 3 specifically
        self.embedding = nn.Embedding(vocab_size, embed_dim,padding_idx=PADDING_IDX)

        if use_pretrained:
          # If this is true, set the requires_grad to False and load the GLOVE embeddings per token
          self.embedding.weight.requires_grad = False
          for i in range(vocab_size):
            # Look up the token id for the token i
            token = VOCAB.lookup_token(i)

            # Set the ith row of the embeddings weight matrix to the GLOVE vector for this token
            self.embedding.weight[i, :] = GLOVE[token]

          # Set the requires_grad field to True
          # Had we not made it False before, we might not be able to change the Embedding layer as above
          # See what happens if interested
          self.embedding.weight.requires_grad = True

        # If this is off, make the embedding weights be constant and without gradients
        if not fine_tune_embeddings:
          # Set the requires_grad field to False
          self.embedding.weight.requires_grad = False

        # Make a linear layer going from embed_dim to dimension 100
        self.linear1 = nn.Linear(embed_dim, 100)
        # Make a fc layer going from 100 to num_class
        self.fc = nn.Linear(100, num_class)

    # B = batch_size, L = sequence length, D = vector dimension
    def forward(self, text):
        # B X L X D
        embedded = self.embedding(text)

        # B X D - Make embedded be the average of word embedding across a sentence
        embedded = embedded.mean(axis=1)

        # Pass through ReLU
        embedded = torch.nn.functional.relu(embedded)

        # B X 100 Pass through linear1
        embedded =  self.linear1(embedded)

        # Pass through ReLU
        embedded = torch.nn.functional.relu(embedded)

        # B X num_classes Pass through fc to get
        embedded = self.fc(embedded)

        # B X num_classes Take the LogSoftmax and return this
        out =torch.nn.LogSoftmax(dim=1)(embedded)

        return out

### Set up the model

In [24]:
# Either use the GLOVE embeddings to initialize the model or don't
USE_PRETRAINED = True
# If this is off, your model should do worse as the GLOVE embeddings will not be modified by gradient updates
FINE_TUNE_EMBEDDINGS = True

# Define the model such that it takes in the log softmax
# Hint: search the PyTorch webpage
criterion = nn.NLLLoss()

# Instantiate the model
model = MLPTextClassificationModel(
    len(VOCAB),
    num_class,
    EMBED_DIM,
    use_pretrained=USE_PRETRAINED,
    fine_tune_embeddings=FINE_TUNE_EMBEDDINGS
).to(DEVICE)

# Define an SGD optimizer acting on the parameters of the model
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Define scheduler to be a learning rate scheduler of type StepLR with gamma = 0.1
# Learning rate schedulers lower the learning rate for you via some rule
# They can help ensure you don't blow up the optimization as you go
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

### Set up the data

In [19]:
# Get the iterators for train and test data
train_iter, test_iter = DATASETS[DATASET](root=DATA_DIR, split="train"),DATASETS[DATASET](root=DATA_DIR, split="test")
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

# Define data loaders on the train, validation, and test data
# Set collate_fn as the collate_batch above
# Each batch of raw data is sent through collate_batch to get the data we need (the tensors)
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

### Train the model

In [25]:
def train(dataloader, model, optimizer, criterion, epoch):
    # Put model in train mode
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100

    for idx, (label, text) in enumerate(dataloader):
        # Set the gradients to zero
        optimizer.zero_grad()

        # Get the LogSoftmax predictions
        y_preds = model(text)

        # Get the loss
        loss =  criterion(y_preds, label)

        # Do back propagation
        loss.backward()

        # Clip the gradients to 0.1 so they don't get too large
        # Look up clip_grad_norm_
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Do an optimization step
        optimizer.step()

        # Get the total accuracy
        n=label.size(0)
        _, prediction = torch.max(y_preds.data, 1)
        total_acc += (label==prediction).sum().item()
        total_count += n
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(epoch, idx, len(dataloader), total_acc / total_count)
            )
            total_acc, total_count = 0, 0

In [21]:
def evaluate(dataloader, model):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
      for idx, (label, text) in enumerate(dataloader):
        # Get the predicted labels
        predited_label = model(text)
        _, predited_label = torch.max(predited_label.data, 1)
        # Get the accuracy
        n = label.size(0)
        total_acc += (label==predited_label).sum().item()
        # Update the total count
        total_count += n
    # Return the accuracy
    return total_acc / total_count

In [22]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

| epoch   1 |   100/ 1782 batches | accuracy    0.508
| epoch   1 |   200/ 1782 batches | accuracy    0.656
| epoch   1 |   300/ 1782 batches | accuracy    0.663
| epoch   1 |   400/ 1782 batches | accuracy    0.667
| epoch   1 |   500/ 1782 batches | accuracy    0.672
| epoch   1 |   600/ 1782 batches | accuracy    0.691
| epoch   1 |   700/ 1782 batches | accuracy    0.720
| epoch   1 |   800/ 1782 batches | accuracy    0.751
| epoch   1 |   900/ 1782 batches | accuracy    0.768
| epoch   1 |  1000/ 1782 batches | accuracy    0.776
| epoch   1 |  1100/ 1782 batches | accuracy    0.803
| epoch   1 |  1200/ 1782 batches | accuracy    0.805
| epoch   1 |  1300/ 1782 batches | accuracy    0.810
| epoch   1 |  1400/ 1782 batches | accuracy    0.813
| epoch   1 |  1500/ 1782 batches | accuracy    0.825
| epoch   1 |  1600/ 1782 batches | accuracy    0.821
| epoch   1 |  1700/ 1782 batches | accuracy    0.838
-----------------------------------------------------------
| end of epoch   1 | t