# RobertaForSequenceClassification

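- Baseline: fine-tune `roberta-base` with a single-output regression head, evaluated with 5-fold stratified cross-validation.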

In [None]:
import re
import pathlib

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import KFold, StratifiedKFold
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaConfig

In [None]:
# Directory containing the raw input CSV files, relative to this notebook.
src_dir = pathlib.Path("..") / "data" / "raw"

## Split Fold

In [None]:
# Load the training table; only the "excerpt" and "target" columns are used below.
data = pd.read_csv(src_dir / "train.csv")

# KFold
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold
# The regression target is binned so that the folds can be stratified on the bin id.
num_bins = int(np.floor(1 + np.log2(len(data))))  # ref: https://www.kaggle.com/abhishek/step-1-create-folds
target_bins = pd.cut(data["target"], bins=num_bins, labels=False)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
for n_fold, (train_idx, valid_idx) in enumerate(cv.split(data, target_bins)):

    # cv.split yields positional indices, so select rows by position (iloc)
    # rather than by label; this stays correct even if the frame's index is
    # not the default 0..N-1 range.
    train = data.iloc[train_idx][["excerpt", "target"]]
    valid = data.iloc[valid_idx][["excerpt", "target"]]

    fold_dump_dir = pathlib.Path(f"../data/split/fold_{n_fold}/")
    # parents=True: also create "../data/split" on a fresh checkout instead of
    # failing with FileNotFoundError.
    fold_dump_dir.mkdir(parents=True, exist_ok=True)

    train.to_pickle(fold_dump_dir / "train.pkl")
    valid.to_pickle(fold_dump_dir / "valid.pkl")

    # Quick sanity check of the split: fold sizes and mean target per fold.
    print("Fold:", n_fold)
    print(f"\tTrain Target Average: {train.target.mean():.04f}" + f"\tTrain Size={train.shape[0]}")
    print(f"\tValid Target Average: {valid.target.mean():.04f}" + f"\tValid Size={valid.shape[0]}")

## Define Dataset & Dataloader

In [None]:
import pathlib
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset


class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    Args:
        data: DataFrame with an "excerpt" (text) and a "target" (numeric) column.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``; its result must provide
            "input_ids" and "attention_mask" tensors.
        max_len: Length every excerpt is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len=256):
        # Kept 2-D (N, 1) on purpose so each item's target has shape (1,).
        self.target = data[["target"]].to_numpy()
        # 1-D array of raw strings. Selecting with [["excerpt"]] would make
        # every element a one-item array whose str() is "['...']", i.e. the
        # tokenizer would see brackets, quotes and escaped newlines.
        self.excerpt = data["excerpt"].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Return ``{"inputs": {"input_ids", "attention_mask"}, "target"}`` for row ``idx``."""
        text = str(self.excerpt[idx])
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        target = self.target[idx]

        return {
            "inputs": {
                # The tokenizer output is passed through unchanged apart from
                # the cast to int64.
                "input_ids": inputs["input_ids"].to(torch.long),
                "attention_mask": inputs["attention_mask"].to(torch.long),
                # "token_type_ids": inputs["token_type_ids"].to(torch.long),
            },
            "target": torch.tensor(target, dtype=torch.float32),
        }


class CommonLitDataModule(pl.LightningDataModule):
    """Lightning data module serving one cross-validation fold.

    Reads ``train.pkl`` / ``valid.pkl`` from ``data_dir`` and wraps them in
    ``CommonLitDataset`` loaders. The test loader serves the validation split.
    """

    def __init__(self, data_dir: str, tokenizer, batch_size: int = 32):
        super().__init__()
        self.data_dir = pathlib.Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        """Load both pickled splits of the fold; ``stage`` is not used."""
        fold_dir = self.data_dir
        self.train = pd.read_pickle(fold_dir / "train.pkl")
        self.valid = pd.read_pickle(fold_dir / "valid.pkl")

    def _make_loader(self, frame, training: bool):
        """Build a DataLoader over ``frame``; shuffle and drop the last partial batch only when training."""
        loader_options = {
            "batch_size": self.batch_size,
            "num_workers": 4,
            "pin_memory": True,
            "shuffle": training,
            "drop_last": training,
        }
        return DataLoader(CommonLitDataset(frame, self.tokenizer), **loader_options)

    def train_dataloader(self):
        return self._make_loader(self.train, training=True)

    def val_dataloader(self):
        return self._make_loader(self.valid, training=False)

    def test_dataloader(self):
        # Same data as val_dataloader: the fold's validation split.
        return self._make_loader(self.valid, training=False)

## Define Model Architecture

In [None]:
import pytorch_lightning as pl
import pytorch_warmup as warmup
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertConfig,
    BertModel,
    RobertaModel,
    RobertaForSequenceClassification,
    XLMRobertaModel,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)


class RMSELoss(nn.Module):
    """Root-mean-squared error between two tensors, compared element-wise after flattening."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return sqrt(MSE(yhat, y)) as a scalar tensor; both inputs are flattened and cast to float32."""
        predicted = yhat.view(-1).to(torch.float32)
        expected = y.view(-1).to(torch.float32)
        return torch.sqrt(self.mse(predicted, expected))
    

class CommonLitRoBERTaModel(nn.Module):
    """RoBERTa sequence-classification model configured with a single regression output.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained`` for
            both the config and the weights. Defaults to "roberta-base".
    """

    def __init__(self, model_name: str = "roberta-base"):
        super().__init__()
        configuration = RobertaConfig.from_pretrained(model_name, num_labels=1, problem_type="regression")
        self.roberta = RobertaForSequenceClassification.from_pretrained(model_name, config=configuration)

    def forward(self, batch):
        """Run the wrapped model on a batch and return its output object.

        Args:
            batch: Dict with ``batch["inputs"]["input_ids"]``,
                ``batch["inputs"]["attention_mask"]`` and ``batch["target"]``.

        Returns:
            Whatever ``self.roberta`` returns when called with labels.
        """
        input_ids = batch["inputs"]["input_ids"]
        attention_mask = batch["inputs"]["attention_mask"]
        # Collapse every leading dimension into the batch axis. The sequence
        # length is read from the tensor rather than hard-coded to 256, so a
        # different max_len does not silently mis-shape the batch.
        seq_len = input_ids.size(-1)
        input_ids = input_ids.view(-1, seq_len)
        attention_mask = attention_mask.view(-1, seq_len)
        labels = batch["target"].view(-1, 1)
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
    
class CommonLitModel(pl.LightningModule):
    """Lightning wrapper that trains ``CommonLitRoBERTaModel`` and reports RMSE.

    Args:
        num_epoch: Number of epochs the LR schedule is planned for.
        train_dataloader_len: Number of batches per training epoch; together
            with ``num_epoch`` it gives the total number of scheduler steps.
        lr: Peak learning rate for AdamW.
    """

    def __init__(
        self,
        num_epoch: int = 20,
        train_dataloader_len: int = 20,
        lr: float = 5e-5,
    ):
        super(CommonLitModel, self).__init__()
        self.lr = lr
        self.num_epoch = num_epoch
        self.train_dataloader_len = train_dataloader_len

        self.model = CommonLitRoBERTaModel()
        self.loss_fn = nn.MSELoss()
        self.eval_fn = RMSELoss()

    def forward(self, batch):
        z = self.model(batch)
        return z

    def configure_optimizers(self):
        """Return AdamW plus a linear warmup/decay schedule stepped every batch."""
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=5e-2,
        )
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=2,
            num_training_steps=self.train_dataloader_len * self.num_epoch,
        )
        # num_training_steps counts optimizer steps (batches x epochs), so the
        # scheduler has to advance once per step. A bare scheduler would be
        # stepped once per epoch and the decay would effectively never happen.
        scheduler_config = {"scheduler": lr_scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler_config]

    def shared_step(self, batch):
        """Run the model on ``batch`` and return ``(training loss, RMSE metric)``."""
        z = self(batch)
        # Flatten both sides so MSELoss compares element-wise and can never
        # broadcast, e.g. (B, 1) against (B,).
        logits = z.logits.view(-1)
        target = batch["target"].view(-1)
        # NOTE(review): z.loss is the loss computed inside the wrapped model
        # from the same labels; presumably also an MSE given
        # problem_type="regression", which would make this sum about 2x MSE.
        # Confirm that is intended. mean() is a no-op on a scalar and reduces
        # a per-device vector if the output was gathered.
        loss = self.loss_fn(logits, target) + z.loss.mean()
        metric = self.eval_fn(logits, target)
        return loss, metric

    def training_step(self, batch, batch_idx):
        loss, _ = self.shared_step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        # "val_loss" is the per-batch RMSE; early stopping and checkpointing
        # in the training cell monitor this key.
        _, metric = self.shared_step(batch)
        self.log("val_loss", metric, prog_bar=True)
        return {"val_loss": metric}

    def test_step(self, batch, batch_idx):
        _, metric = self.shared_step(batch)
        return {"test_loss": metric}

    def test_step_end(self, outputs):
        return outputs

    def test_epoch_end(self, outputs):
        """Log the mean of all per-batch test RMSE values as "test_rmse"."""
        # reshape(-1) turns 0-dim scalars into 1-element vectors; torch.cat
        # refuses zero-dimensional tensors.
        loss = torch.cat([out["test_loss"].reshape(-1) for out in outputs], dim=0)
        self.log("test_rmse", torch.mean(loss))

## Train Model

In [None]:
# Pretrained checkpoint name; the tokenizer loaded from it is passed to every
# fold's CommonLitDataModule in the training cell below.
model_path = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)

In [None]:
# Upper bound on epochs per fold; early stopping may end a fold sooner.
num_epoch = 100

# Path of the best checkpoint (lowest val_loss) of each fold, in fold order.
best_checkpoints = []
for n_fold in range(5):
    datamodule = CommonLitDataModule(f"../data/split/fold_{n_fold}/", tokenizer, 32)
    # Called explicitly so train_dataloader() can be used below to size the LR schedule.
    datamodule.setup()

    # Logger
    tb_logger = TensorBoardLogger(
        save_dir="../tb_logs",
        name="RobertaForSequenceClassification",
    )
    # Callbacks
    lr_monitor = LearningRateMonitor(logging_interval="step")
    early_stop = EarlyStopping(
        mode="min",
        patience=10,
        verbose=False,
        monitor="val_loss",
        min_delta=0.01,
    )
    # The val_loss embedded in the file name is parsed later by calc_average_loss.
    checkpoint = ModelCheckpoint(
        filename="{epoch:02d}-{loss:.4f}-{val_loss:.4f}",
        monitor="val_loss",
        mode="min",
    )

    train_dataloader_len = len(datamodule.train_dataloader())
    model = CommonLitModel(
        num_epoch=num_epoch,
        train_dataloader_len=train_dataloader_len,
    )
    trainer = Trainer(
        max_epochs=num_epoch,
        gpus=1,
        accelerator="dp",
        logger=tb_logger,
        callbacks=[lr_monitor, early_stop, checkpoint]
    )
    # The keyword argument on this line was garbled (a syntax error); the data
    # module is passed the same way as in the trainer.test call below.
    trainer.fit(model=model, datamodule=datamodule)
    trainer.test(model=model, datamodule=datamodule)

    best_checkpoints.append(checkpoint.best_model_path)
    # Release cached GPU memory before the next fold builds a fresh model.
    torch.cuda.empty_cache()

## CV Score

In [None]:
def calc_average_loss(ckeckpoints):
    """Average the validation losses encoded in checkpoint file names.

    Args:
        ckeckpoints: Iterable of checkpoint paths, each containing a
            ``val_loss=<decimal>`` fragment.

    Returns:
        The mean of the parsed values, as computed by ``np.mean``.

    Raises:
        IndexError: If a path contains no ``val_loss=<decimal>`` fragment.
    """
    val_loss_pattern = re.compile(r"val_loss=(\d+\.\d+)")
    # Only the first occurrence in each path is used.
    per_fold_losses = [float(val_loss_pattern.findall(path)[0]) for path in ckeckpoints]
    return np.mean(per_fold_losses)

In [None]:
# Cross-validation score: mean of the best val_loss of each fold, parsed from
# the checkpoint file names collected in best_checkpoints.
avg_valid_loss = calc_average_loss(best_checkpoints)
print("Average Validation Loss:", avg_valid_loss)