In [1]:
import pytorch_lightning as pl
import torch 
import torch.nn as nn

from torchmetrics import Accuracy

torch.set_float32_matmul_precision('medium')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchtext.datasets import IMDB

import re
from collections import Counter, OrderedDict

class IMDBDataModule(pl.LightningDataModule):
    def __init__(self, data_path='./IMDB/') -> None:
        super().__init__()
        self.data_path = data_path

    def tokenizer(self, text):
        text = re.sub('<[^>]*>', '', text)
        emoticons = re.findall(
        '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()
        )

        text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
        tokenized = text.split()

        return tokenized
    

    def collate_batch(self, batch):
        label_list, text_list, lengths = [], [], []
        for _label, _text in batch:
            label_list.append(self.label_pipeline(_label))
            processed_text = torch.tensor(self.text_pipeline(_text),
            dtype=torch.int64)
            text_list.append(processed_text)
            lengths.append(processed_text.size(0))

        label_list = torch.tensor(label_list)
        lengths = torch.tensor(lengths)
        padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
        return padded_text_list, label_list, lengths

    def prepare_data(self):
        pass

    def setup(self, stage: str):
        ## Step 1: create the datasets
        train_dataset = IMDB(split='train')

        self.train, self.val = random_split(
            list(train_dataset), [20000, 5000], 
            generator=torch.Generator().manual_seed(1)
        )

        self.test = IMDB(split='test')


        ## Step 2: find unique tokens (words)
        token_counts = Counter()
        for label, line in train_dataset:
            tokens = self.tokenizer(line)
            token_counts.update(tokens)


        ## Step 3: encoding each unique token into integers
        from torchtext.vocab import vocab
        sorted_by_freq_tuples = sorted(
        token_counts.items(), key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(sorted_by_freq_tuples)
        self.vocab = vocab(ordered_dict)
        self.vocab.insert_token("<pad>", 0)
        self.vocab.insert_token("<unk>", 1)
        self.vocab.set_default_index(1)
        self.vocab_size = len(self.vocab)


        ## Step 3-A: define the functions for transformation
        self.text_pipeline = \
            lambda x: [self.vocab[token] for token in self.tokenizer(x)]
        self.label_pipeline = lambda x: 1. if x == 1 else 0.
    


    def train_dataloader(self):
        return DataLoader(self.train, batch_size=32, shuffle=False, collate_fn=self.collate_batch, num_workers=31)
    
    def val_dataloader(self):
        return DataLoader(self.val, batch_size=32, shuffle=False, collate_fn=self.collate_batch, num_workers=31)
    
    def test_dataloader(self):
        return DataLoader(self.test, batch_size=32, shuffle=False, collate_fn=self.collate_batch, num_workers=31)


In [3]:
from typing import Any


class RNNNetwork(pl.LightningModule):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size,
                fc_hidden_size):
        super().__init__()

        self.train_acc = Accuracy(task="binary")
        self.valid_acc   = Accuracy(task="binary")
        self.test_acc  = Accuracy(task="binary")


        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim=embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

        

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
                 out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
                )
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
    
    def training_step(self, batch, batch_idx):
        text_batch, label_batch, lengths = batch
        pred = self(text_batch, lengths)[:, 0]
        loss = nn.functional.binary_cross_entropy(pred, label_batch)
        self.log("train loss: ",loss, prog_bar=True)
        
        pred = (pred >= 0.5).float()
        self.train_acc.update(pred, label_batch)
        return loss
    
    def on_train_epoch_end(self):
        self.log("train acc :", self.train_acc.compute())

    def validation_step(self, batch, batch_idx):
        text_batch, label_batch, lengths = batch
        pred = self(text_batch, lengths)[:, 0]
        loss = nn.functional.binary_cross_entropy(pred, label_batch)

        pred = (pred >= 0.5).float()
        self.valid_acc.update(pred, label_batch)
        self.log("valid_loss", loss, prog_bar=True)
        self.log("valid_acc", self.valid_acc.compute(), prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        text_batch, label_batch, lengths = batch
        pred = self(text_batch, lengths)[:, 0]
        loss = nn.functional.binary_cross_entropy(pred, label_batch)

        pred = (pred >= 0.5).float()
        self.test_acc.update(pred, label_batch)
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", self.test_acc.compute(), prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr = 0.001)
        return optimizer



        

In [4]:
torch.manual_seed(1)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)
imdb_dm = IMDBDataModule()
imdb_dm.setup("train")
imdbClassifier = RNNNetwork(imdb_dm.vocab_size, embed_dim,
rnn_hidden_size, fc_hidden_size)

trainer = pl.Trainer(max_epochs=20, accelerator="auto", enable_checkpointing=True)

trainer.fit(model=imdbClassifier, datamodule=imdb_dm)
            # ,ckpt_path='/home/tej/Documents/Courses/Learning/ML_With_PyTorch_Scikit_Practice/Chapter13/lightning_logs/version_0/checkpoints/epoch=49-step=43000.ckpt')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /media/tej/fmSpin/Courses/Learning/ML_With_PyTorch_Scikit_Practice/Chapter15/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | train_acc | BinaryAccuracy | 0     
1 | valid_acc | BinaryAccuracy | 0     
2 | test_acc  | BinaryAccuracy | 0     
3 | embedding | Embedding      | 1.5 M 
4 | rnn       | LSTM           | 22.0 K
5 | fc1       | Linear         | 4.2 K 
6 | relu      | ReLU           | 0     
7 | fc2       | Linear         | 65    
8 | sigmoid   | Sigmoid        | 0     
---------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
6.183     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 625/625 [00:14<00:00, 43.85it/s, v_num=0, train loss: =0.0106, valid_loss=0.697, valid_acc=0.832]  

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 625/625 [00:14<00:00, 43.71it/s, v_num=0, train loss: =0.0106, valid_loss=0.697, valid_acc=0.832]


In [5]:
# %load_ext tensorboard
# %tensorboard --logdir ./Chapter13/lightning_logs/

In [6]:
# filepath = "./Chapter15/Deployment/model.onnx"
# input_sample = torch.tensor([[[1, 2, 3, 4]], [4]])
# imdbClassifier.to_onnx(filepath, input_sample, export_params=True)

In [7]:
imdbClassifier

RNNNetwork(
  (train_acc): BinaryAccuracy()
  (valid_acc): BinaryAccuracy()
  (test_acc): BinaryAccuracy()
  (embedding): Embedding(75979, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)