<a href="https://colab.research.google.com/github/ajdillhoff/CSE6363/blob/main/imdb-rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torchtext
import spacy
import torch
import torch.nn as nn
import torchtext.transforms as T
import torch.optim as optim
import pytorch_lightning as pl
import torch.nn.functional as F

# Dependencies for a fresh environment (e.g. Colab); uncomment and run once
# if they are not already installed.
# !pip install torchtext==0.12.0
# !pip install torchdata
# !pip install pytorch-lightning

# Show the installed torchtext version (the install hint above pins 0.12.0).
print(torchtext.__version__)

0.12.0


In [231]:
def binary_accuracy(preds, y):
    """Return the fraction of correct binary predictions in a batch.

    For example, 8 correct predictions out of 10 yields 0.8, NOT 8.

    Args:
        preds: Tensor of raw (pre-sigmoid) logits. Any shape is accepted,
            including a 0-d tensor for a single example.
        y: Tensor of targets (0. or 1.) with the same shape as ``preds``.

    Returns:
        A 0-d float tensor holding the accuracy over all elements.
    """
    # Squash the logits to probabilities, then round to the nearest class.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # float so the division is exact
    # Divide by the total element count rather than len(): len() raises on
    # 0-d tensors and only counts the first dimension of multi-dim input.
    return correct.sum() / correct.numel()


class RNN(pl.LightningModule):
    """LSTM-based binary text classifier packaged as a LightningModule.

    The network is embedding -> single-layer LSTM -> linear head. The head
    emits raw logits; the sigmoid is applied inside ``BCEWithLogitsLoss``
    (and inside ``binary_accuracy``).

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits per example.
        train_datapipe: Dataset/datapipe served by ``train_dataloader``.
        val_datapipe: Dataset/datapipe served by ``val_dataloader``.
        test_datapipe: Dataset/datapipe served by ``test_dataloader``.
        batch_size: Stored on the module as ``self.batch_size``. The
            dataloaders below do not use it because they are created with
            ``batch_size=None`` (automatic batching disabled).
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 train_datapipe, val_datapipe, test_datapipe, batch_size=32):
        super().__init__()

        # Required since our input represents each word as an index into
        # the vocabulary. Index 1 is treated as padding: its embedding is
        # kept at zero and receives no gradient.
        # NOTE(review): this must match the padding value used when the
        # token ids are batched -- confirm against the data pipeline.
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=1)
        # Single-layer, unidirectional LSTM that takes batch-first input.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # LightningModule attributes
        self.lr = 1e-3
        self.batch_size = batch_size
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Datasets
        self.train_datapipe = train_datapipe
        self.val_datapipe = val_datapipe
        self.test_datapipe = test_datapipe

    def forward(self, text):
        """Return logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids; assumed to be
                ``(batch, seq_len)`` given ``batch_first=True``.

        Returns:
            Tensor of logits with ``output_dim`` as its last dimension.
        """
        embedded = self.embedding(text)
        _, (hidden, _) = self.rnn(embedded)

        # ``hidden`` is (num_layers, batch, hidden_dim); take the last
        # layer's final state as the sequence summary.
        # NOTE(review): sequences are not packed, so for padded inputs this
        # state is computed after the trailing padding positions.
        return self.fc(hidden[-1])

    def _shared_step(self, batch):
        """Compute loss and accuracy for one batch (train and val share this)."""
        # Use the module's own device rather than a hard-coded .cuda() so the
        # step also runs on CPU or other accelerators.
        token_ids = batch["token_ids"].to(self.device)
        target = torch.as_tensor(batch["target"], dtype=torch.float,
                                 device=self.device)
        # Drop only the trailing logit dimension. A bare squeeze() would also
        # collapse the batch dimension for a batch of one and break the loss.
        logits = self(token_ids).squeeze(-1)
        loss = self.loss_fn(logits, target)
        acc = binary_accuracy(logits, target)

        return loss, acc

    def training_step(self, batch, batch_idx):
        """Run one training batch, log ``train_loss``/``train_acc``, return the loss."""
        loss, acc = self._shared_step(batch)

        self.log("train_loss", loss)
        self.log("train_acc", acc)

        return loss

    def validation_step(self, batch, batch_idx):
        """Run one validation batch and log ``val_loss``/``val_acc``."""
        loss, acc = self._shared_step(batch)

        self.log("val_loss", loss)
        self.log("val_acc", acc)

    def _make_loader(self, datapipe, shuffle):
        """Wrap ``datapipe`` in a DataLoader with the settings shared by all splits."""
        # batch_size=None disables automatic batching, so each item yielded by
        # the datapipe is passed through as one batch.
        # NOTE(review): with num_workers > 0 an iterable datapipe may be
        # replicated in every worker unless it is sharded -- confirm.
        return torch.utils.data.DataLoader(datapipe,
                                           batch_size=None,
                                           num_workers=8,
                                           shuffle=shuffle)

    def train_dataloader(self):
        """Return the (shuffled) training DataLoader."""
        return self._make_loader(self.train_datapipe, shuffle=True)

    def val_dataloader(self):
        """Return the validation DataLoader."""
        return self._make_loader(self.val_datapipe, shuffle=False)

    def test_dataloader(self):
        """Return the test DataLoader."""
        return self._make_loader(self.test_datapipe, shuffle=False)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate ``self.lr``."""
        return optim.Adam(self.parameters(), lr=self.lr)

Our model works with numerical input. So, we'll need to convert each word into its integer index in the vocabulary of our dataset; the model's embedding layer then maps each index to a dense vector.

Luckily, `torchtext` makes this simple by providing `build_vocab_from_iterator`. All we need to do is supply the `datapipe` iterator and it builds a vocabulary for us.

In [216]:
# spaCy-backed tokenizer; needs the `en_core_web_sm` spaCy model installed.
tokenizer = torchtext.data.utils.get_tokenizer("spacy", language="en_core_web_sm")
# Upper bound on the vocabulary size (special tokens count towards it).
max_tokens = 25000

def make_vocabulary():
    """Build the vocabulary from the tokenized SST2 training split.

    The special tokens are reserved first, in this order, so that their
    indices line up with the ids used elsewhere in this notebook:
    ``<bos>`` = 0, ``<pad>`` = 1, ``<eos>`` = 2 (the text transform adds
    0/2 as BOS/EOS and pads with 1; the model's embedding uses
    ``padding_idx=1``). Reserving only ``<unk>`` would place real words at
    indices 1 and 2 and make them indistinguishable from padding and EOS.

    Returns:
        A torchtext ``Vocab`` whose default index (used for out-of-vocabulary
        tokens) is the index of ``<unk>``.
    """
    train_dataset = torchtext.datasets.SST2(split="train")
    # Each example is indexed as x[0] = sentence text; only the text is needed.
    train_datapipe = train_dataset.map(lambda x: tokenizer(x[0]))
    specials = ["<bos>", "<pad>", "<eos>", "<unk>"]
    v = torchtext.vocab.build_vocab_from_iterator(train_datapipe, specials=specials, max_tokens=max_tokens)
    # Look the index up instead of hard-coding it so it tracks `specials`.
    v.set_default_index(v["<unk>"])

    return v

In [217]:
# Build the vocabulary once; `v` feeds the text transform (VocabTransform)
# and its length sets the model's embedding table size.
v = make_vocabulary()

To finish preparing the data, the labels must be numeric as well. For datasets with string labels such as `pos` and `neg`, this can be done with `LabelToIndex`; the code below passes the dataset's labels through unchanged.

With the transforms in place, we can pass the `datapipe` to a PyTorch `DataLoader` object for use during training.

In [218]:
# Ids the transform inserts for begin-of-sequence, padding and end-of-sequence.
# NOTE(review): these are only meaningful if the vocabulary reserves indices
# 0-2 for special tokens -- confirm against make_vocabulary().
bos_idx = 0
padding_idx = 1
eos_idx = 2
max_seq_len = 256

# tokens -> vocabulary ids -> truncate (leaving room for BOS and EOS)
# -> prepend BOS -> append EOS -> tensor padded with `padding_idx`.
text_transform = T.Sequential(
    T.VocabTransform(v),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
    T.ToTensor(padding_value=padding_idx),
)

batch_size = 32


def _tokenize_example(example):
    """Tokenize the text (element 0) of an example and keep its label (element 1)."""
    return (tokenizer(example[0]), example[1])


def _encode_batch(columns):
    """Turn a columnar batch into the dict layout the model's steps read."""
    return {"token_ids": text_transform(columns["text"]), "target": columns["label"]}


def _as_model_batches(datapipe):
    """Tokenize, batch, pivot to columns and encode a raw SST2 datapipe."""
    batched = datapipe.map(_tokenize_example).batch(batch_size)
    return batched.rows2columnar(["text", "label"]).map(_encode_batch)


train_datapipe = _as_model_batches(torchtext.datasets.SST2(split='train'))
val_datapipe = _as_model_batches(torchtext.datasets.SST2(split='dev'))

Create our model

In [232]:
# NOTE(review): `test_datapipe` is not defined in any cell shown here --
# confirm it exists in the session before running this cell.
model = RNN(
    input_dim=len(v),
    embedding_dim=100,
    hidden_dim=256,
    output_dim=1,
    train_datapipe=train_datapipe,
    val_datapipe=val_datapipe,
    test_datapipe=test_datapipe,
)

In [233]:
from pytorch_lightning.callbacks import ModelCheckpoint

# Checkpoint on the logged "val_loss" metric, treating lower values as better.
checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min")

# Train on the GPU for five epochs; the model supplies its own dataloaders.
trainer = pl.Trainer(
    accelerator="gpu",
    callbacks=[checkpoint_callback],
    max_epochs=5,
)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params
------------------------------------------------
0 | embedding | Embedding         | 1.4 M 
1 | rnn       | LSTM              | 366 K 
2 | fc        | Linear            | 257   
3 | loss_fn   | BCEWithLogitsLoss | 0     
------------------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.023     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]