## Download dataset

In [6]:
!python -m pip install pytorch_lightning torch transformers datasets sklearn
from datasets import load_dataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback, ModelCheckpoint, EarlyStopping
import torch
from torch import nn
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score
from pytorch_lightning.loggers import WandbLogger

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting scikit-learn
  Downloading scikit_learn-1.1.2-cp38-cp38-macosx_10_9_x86_64.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl (34.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.2/34.2 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.0.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created

In [2]:
# Load the CoLA configuration of the GLUE dataset and grab its training split.
cola_dataset = load_dataset("glue", "cola")
train_dataset = cola_dataset["train"]

# Show the whole dataset object first, then the first training example.
print(cola_dataset)
print(train_dataset[0])

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.", 'label': 1, 'idx': 0}


### DataModule

Pytorch lightning의 경우 Pytorch의 Dataloader와 유사한 DataModule을 사용한다.

< 정의해야 하는 method >
* prepare_data
* setup
* train_dataloader, val_dataloader, test_dataloader -> return DataLoader
  
< DataModule 안에서 수행되는 작업 >
* Download / tokenize / process
* Clean and save to disk
* Load inside Dataset
* Apply transforms (rotate, tokenize, etc…)
* Wrap inside a DataLoader (Pytorch)

In [5]:
class DataModule(pl.LightningDataModule):
    """LightningDataModule serving the GLUE CoLA train and validation splits.

    Each sentence is tokenized with the tokenizer belonging to ``model_name``
    and padded/truncated to a fixed ``max_length``.

    Args:
        model_name: Hugging Face identifier used to load the tokenizer.
        batch_size: Batch size used by both dataloaders.
        max_length: Sequence length every sentence is padded or truncated to.
    """

    def __init__(
        self,
        model_name="google/bert_uncased_L-2_H-128_A-2",
        batch_size=32,
        max_length=256,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Populated by ``setup``; ``None`` means "not prepared yet".
        self.train_data = None
        self.val_data = None

    def prepare_data(self):
        """Trigger the dataset download without storing any state.

        Lightning calls this hook on a single process only, so attributes
        assigned here would be missing on the other processes of a distributed
        run. The splits are therefore (re)loaded in ``setup``.
        """
        load_dataset("glue", "cola")

    def tokenize_data(self, example):
        """Tokenize the ``"sentence"`` field of an example or batch of examples.

        Args:
            example: Mapping with a ``"sentence"`` entry (a string, or a list
                of strings when called through a batched ``map``).

        Returns:
            The tokenizer output, padded/truncated to ``self.max_length``.
        """
        return self.tokenizer(
            example["sentence"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )

    def _tokenize_split(self, split):
        """Tokenize one dataset split and expose the model inputs as tensors."""
        tokenized = split.map(self.tokenize_data, batched=True)
        tokenized.set_format(
            type="torch", columns=["input_ids", "attention_mask", "label"]
        )
        return tokenized

    def setup(self, stage=None):
        """Load and tokenize the train/validation splits.

        Args:
            stage: Lightning stage name. The splits are built for ``"fit"``,
                ``"validate"`` and ``None``; other stages are a no-op.
        """
        if stage not in (None, "fit", "validate"):
            return
        # ``setup`` may run more than once (e.g. fit followed by validate);
        # skip the work when the splits are already tokenized.
        if self.train_data is not None and self.val_data is not None:
            return

        cola_dataset = load_dataset("glue", "cola")
        self.train_data = self._tokenize_split(cola_dataset["train"])
        self.val_data = self._tokenize_split(cola_dataset["validation"])

    def train_dataloader(self):
        """Return a shuffled DataLoader over the training split."""
        return torch.utils.data.DataLoader(
            self.train_data, batch_size=self.batch_size, shuffle=True
        )

    def val_dataloader(self):
        """Return an unshuffled DataLoader over the validation split."""
        return torch.utils.data.DataLoader(
            self.val_data, batch_size=self.batch_size, shuffle=False
        )

### Build model

Pytorch에서 model을 만들 때 상속받았던 `torch.nn.Module`과 마찬가지로 Pytorch-lightning은 `pl.LightningModule`을 상속받는다. forward만 정의해 주면 되었던 때와는 다르게, 몇 가지 method를 추가로 정의해 주어야 한다. ([Document](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html))

< 정의 해야 할 methods >
- forward -> return inference
- training_step -> return loss
- validation_step
- test_step (optional)
- configure_optimizers -> return optimizer

In [None]:
class ColaModel(pl.LightningModule):
    """Pretrained encoder plus a linear head for two-class CoLA classification.

    Args:
        model_name: Hugging Face identifier of the pretrained encoder.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, model_name="google/bert_uncased_L-2_H-128_A-2", lr=1e-2):
        super().__init__()
        # Records ``model_name`` and ``lr`` under ``self.hparams``.
        self.save_hyperparameters()

        self.num_classes = 2
        self.bert = AutoModel.from_pretrained(model_name)
        self.W = nn.Linear(self.bert.config.hidden_size, self.num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized sentences."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first sequence position feeds the classifier.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.W(first_token_state)

    def _logits_and_loss(self, batch):
        """Run the forward pass on ``batch`` and compute its cross-entropy loss."""
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        return logits, F.cross_entropy(logits, batch["label"])

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        _, loss = self._logits_and_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch."""
        logits, loss = self._logits_and_loss(batch)
        # Predicted class = index of the largest logit in each row.
        predicted = logits.max(dim=1).indices
        accuracy = torch.tensor(
            accuracy_score(predicted.cpu(), batch["label"].cpu())
        )
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", accuracy, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at the saved ``lr``."""
        learning_rate = self.hparams["lr"]
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

### Make Trainer

DataModule과 Pytorch-lightning model은 Trainer를 이용해서 학습을 진행하게 된다(Tensorflow의 Session과 비슷한 접근 방법).  
< Trainer 가 사용할 수 있는 options 예시 >
- logging
- gradient accumulation
- half precision training
- distributed computing
  
< Loggers >
- TensorboardLogger
- WandbLogger

< Callbacks >
[Documents](https://pytorch-lightning.readthedocs.io/en/latest/extensions/callbacks.html)

In [None]:
# Data pipeline and model, both with their default hyperparameters.
cola_data = DataModule()
cola_model = ColaModel()

# Keep the checkpoint with the lowest validation loss and stop early when the
# validation loss has not improved for 3 consecutive validation runs.
checkpoint_callbacks = [
    ModelCheckpoint(dirpath="./models", monitor="val_loss", mode="min"),
    EarlyStopping(monitor="val_loss", patience=3, verbose=True, mode="min"),
]

trainer = pl.Trainer(
    # ``gpus=`` is deprecated (removed in Lightning 2.0); select the hardware
    # through ``accelerator``/``devices`` instead: one GPU if CUDA is
    # available, otherwise the CPU.
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    max_epochs=1,
    fast_dev_run=False,  # True: one training batch + one validation batch, for debugging
    logger=pl.loggers.TensorBoardLogger("logs/", name="cola", version=1),  # directory: logs/cola
    # Alternative logger:
    # logger=pl.loggers.WandbLogger(name='cola', project='pytorchlightning')
    callbacks=checkpoint_callbacks,
)
trainer.fit(cola_model, cola_data)

### Inference module

MLOps는 모델의 Training과 Inference의 모듈을 분리한다. 서버에서 학습이 진행되는 동안에도 모델을 freeze 하고 버전 관리를 하며 debugging 할 수 있어야 하기 때문이다.  

< 정의 해야 할 methods >
- predict

< Inference 내부에서 수행되는 작업 >
- Load the trained model
- Get the input
- Convert the input in the required format
- Get the predictions

In [None]:
class ColaPredictor:
    """Run single-sentence acceptability inference with a trained ``ColaModel``.

    Args:
        model_path: Path to the Lightning checkpoint to load.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        # Load the trained model and put it in frozen evaluation mode.
        self.model = ColaModel.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()

        # Tokenize with the same pretrained model the checkpoint was trained
        # with; fall back to DataModule's default when the checkpoint carries
        # no ``model_name`` hyperparameter.
        model_name = self.model.hparams.get("model_name")
        if model_name:
            self.processor = DataModule(model_name=model_name)
        else:
            self.processor = DataModule()

        # Applied to a single 1-D logits vector, hence dim=0.
        self.softmax = torch.nn.Softmax(dim=0)
        self.labels = ["unacceptable", "acceptable"]
        # Backward-compatible alias for the original misspelled attribute.
        self.lables = self.labels

    def predict(self, text):
        """Score one sentence against every label.

        Args:
            text: The sentence to classify.

        Returns:
            A list with one ``{"label": ..., "score": ...}`` dict per label,
            in the order of ``self.labels``; scores are softmax probabilities.
        """
        processed = self.processor.tokenize_data({"sentence": text})
        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            logits = self.model(
                # Wrap in a list to add the batch dimension (batch of one).
                torch.tensor([processed["input_ids"]]),
                torch.tensor([processed["attention_mask"]]),
            )
        scores = self.softmax(logits[0]).tolist()
        return [
            {"label": label, "score": score}
            for score, label in zip(scores, self.labels)
        ]