# RoBERTa Classification

## Experiment Histories

In [1]:
import pathlib
import re
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import transformers
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary
from transformers import (
    AdamW,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)


In [2]:
def seed_everything(seed: int = 42):
    np.random.seed(seed)
    pl.seed_everything(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        
seed_everything(422)

Global seed set to 422


In [3]:
src_dir = pathlib.Path("../data/raw")

## Binning

In [4]:
data = pd.read_csv(src_dir / "train.csv")

num_bin = 40
target_label, target_bins = pd.cut(data["target"], bins=num_bin, labels=False, retbins=True)

In [5]:
data['target_label']= target_label

## Split Fold

In [6]:
# KFold
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold
num_bins = int(np.floor(1 + np.log2(len(data))))  # ref: https://www.kaggle.com/abhishek/step-1-create-folds
bins = pd.cut(data["target"], bins=num_bins, labels=False)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
for n_fold, (train_idx, valid_idx) in enumerate(cv.split(data, bins)):

    train = data.loc[train_idx, ["excerpt", "target", "target_label"]]
    valid = data.loc[valid_idx, ["excerpt", "target", "target_label"]]

    fold_dump_dir = pathlib.Path(f"../data/split/fold_{n_fold}/")
    fold_dump_dir.mkdir(exist_ok=True)

    train.to_pickle(fold_dump_dir / "train.pkl")
    valid.to_pickle(fold_dump_dir / "valid.pkl")

    print("Fold:", n_fold)
    print(f"\tTrain Target Average: {train.target.mean():.04f}" + f"\tTrain Size={train.shape[0]}")
    print(f"\tValid Target Average: {valid.target.mean():.04f}" + f"\tValid Size={valid.shape[0]}")

Fold: 0
	Train Target Average: -0.9589	Train Size=2267
	Valid Target Average: -0.9609	Valid Size=567
Fold: 1
	Train Target Average: -0.9581	Train Size=2267
	Valid Target Average: -0.9641	Valid Size=567
Fold: 2
	Train Target Average: -0.9583	Train Size=2267
	Valid Target Average: -0.9632	Valid Size=567
Fold: 3
	Train Target Average: -0.9623	Train Size=2267
	Valid Target Average: -0.9473	Valid Size=567
Fold: 4
	Train Target Average: -0.9589	Train Size=2268
	Valid Target Average: -0.9611	Valid Size=566


In [7]:
num_splits = cv.get_n_splits()
num_splits

5

## Define Dataset & Dataloader

In [8]:
class CommonLitDataset(Dataset):
    def __init__(self, data, tokenizer, max_len: int = 256, is_test: bool = False):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.excerpt = data[["excerpt"]].to_numpy()
        
        if is_test:
            self.target = np.zeros(len(data))
            self.target_label = np.zeros(len(data)).astype(np.int8)
        else:
            self.target = data["target"].to_numpy()
            self.target_label = data["target_label"].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        text = str(self.excerpt[idx])
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_token_type_ids=True,
        )
        target = self.target[idx]
        target_label = self.target_label[idx]

        return {
            "inputs": {
                "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
                "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            },
            "target": torch.tensor(target, dtype=torch.float32),
            "target_label": torch.tensor(target_label, dtype=torch.long),
        }


class CommonLitDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str, tokenizer, batch_size: int = 32):
        super(CommonLitDataModule, self).__init__()
        self.data_dir = pathlib.Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        self.train = pd.read_pickle(self.data_dir / "train.pkl")
        self.valid = pd.read_pickle(self.data_dir / "valid.pkl")

    def train_dataloader(self):
        dataset = CommonLitDataset(self.train, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        dataset = CommonLitDataset(self.valid, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

## Define Model Arch

In [9]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat, y))
        return loss


class CommonLitRoBERTaModel(nn.Module):
    def __init__(
            self, 
            out_dim: int = 20,
    ):
        super(CommonLitRoBERTaModel, self).__init__()
        self.out_dim = out_dim
        
        self.config = RobertaConfig.from_pretrained("roberta-base")
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.layers = nn.Sequential(
            nn.LayerNorm(768),
            nn.Dropout(0.5),
            nn.Linear(768, out_dim)
        )
        # Initialize Weights
        self.layers.apply(self._init_weights)

    def forward(self, batch):
        outputs = self.roberta(**batch["inputs"])
        pooler_output = outputs.pooler_output
        hidden_state_avg = outputs.last_hidden_state[:, -4:].mean(dim=(1,2)).view(-1, 1)
        hidden_state_sum = outputs.last_hidden_state[:, -4:].sum(dim=(1,2)).view(-1, 1)
        
        # x = torch.cat((pooler_output, hidden_state_avg, hidden_state_sum), dim=1)
        x = self.layers(pooler_output)
        return x
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    
class CommonLitModel(pl.LightningModule):
    def __init__(
        self,
        out_dim: int,
        target_bins: np.ndarray,
        lr: float = 5e-5,
        num_epoch: int = 10,
    ):
        super(CommonLitModel, self).__init__()
        self.save_hyperparameters()
        self.target_bins = torch.tensor(target_bins, dtype=torch.float32, device='cuda')

        self.roberta = CommonLitRoBERTaModel(out_dim=out_dim)
        self.loss_fn = nn.CrossEntropyLoss()  # nn.MSELoss()
        self.eval_fn = RMSELoss()

    def forward(self, batch):
        z = self.roberta(batch)
        return z

    def configure_optimizers(self):
        optimizer_grouped_parameters = self._get_optimizer_params(self)
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,  # self.parameters()
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=1e-2,
        )
        
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=self.hparams.num_epoch,
        )
        return [optimizer], [lr_scheduler]
    
    def _get_optimizer_params(self, model):
        # differential learning rate and weight decay
        param_optimizer = list(model.named_parameters())
        learning_rate = self.hparams.lr
        no_decay = ["bias", "gamma", "beta"]
        group1 = ["layer.0.", "layer.1.", "layer.2.", "layer.3."]
        group2 = ["layer.4.", "layer.5.", "layer.6.", "layer.7."]
        group3 = ["layer.8.", "layer.9.", "layer.10.", "layer.11."]
        group_all = [
            "layer.0.",
            "layer.1.",
            "layer.2.",
            "layer.3.",
            "layer.4.",
            "layer.5.",
            "layer.6.",
            "layer.7.",
            "layer.8.",
            "layer.9.",
            "layer.10.",
            "layer.11.",
        ]
        optimizer_parameters = [
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay)
                    and not any(nd in n for nd in group_all)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate / 2.6,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate / 1.5,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay)
                    and not any(nd in n for nd in group_all)
                ],
                "weight_decay_rate": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate / 2.6,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate / 1.5,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate,
            },
            {
                "params": [p for n, p in model.named_parameters() if "roberta" not in n],
                "lr": 1e-5,
                "momentum": 0.99,
            },
        ]
        return optimizer_parameters


    def shared_step(self, batch):
        z = self(batch)
        loss = self.loss_fn(z, batch["target_label"])
        
        z_reg = self.target_bins[z.argmax(dim=1)]
        metric = self.eval_fn(z_reg, batch["target"])
        return z, loss, metric

    def training_step(self, batch, batch_idx):
        z, loss, metric = self.shared_step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        z, loss, metric = self.shared_step(batch)
        
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_metric", metric, prog_bar=True)
        return {"val_loss": loss, "val_metric": metric}

In [10]:
model = CommonLitRoBERTaModel()

# print(model)
print(summary(model, depth=3, verbose=0))

Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
|    └─RobertaPooler: 2-3                --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Sequential: 1-2                        --
|    └─LayerNorm: 2-4                    1,536
|    └─Dropout: 2-5                      --
|    └─Linear: 2-6                       15,380
Total params: 124,662,548
Trainable params: 124,662,548
Non-trainable params: 0


## Train Model

In [11]:
%%time

num_epoch = 10
batch_size = 32

model_path = "roberta-base" 
tokenizer = RobertaTokenizer.from_pretrained(model_path)

best_checkpoints = []
for n_fold in range(num_splits):
    datamodule = CommonLitDataModule(f"../data/split/fold_{n_fold}/", tokenizer, batch_size)
    datamodule.setup()

    # Logger
    tb_logger = TensorBoardLogger(
        save_dir="../tb_logs",
        name="Baseline",
    )
    # Callbacks
    lr_monitor = LearningRateMonitor(logging_interval="step")
    early_stop = EarlyStopping(
        mode="min",
        patience=5,
        verbose=False,
        monitor="val_loss",
        min_delta=0.01,
    )
    checkpoint = ModelCheckpoint(
        filename="{epoch:02d}-{loss:.4f}-{val_loss:.4f}-{val_metric:.4f}",
        monitor="val_loss",
        save_top_k=3,
        mode="min",
    )

    model = CommonLitModel(
        out_dim=num_bin,
        target_bins=target_bins,
        lr=3e-5,
        num_epoch=num_epoch,
    )
    trainer = Trainer(
        accelerator="dp",
        gpus=1,
        logger=tb_logger,
        callbacks=[lr_monitor, checkpoint],
        max_epochs=num_epoch,
        stochastic_weight_avg=True,
    )
    trainer.fit(model=model, datamodule=datamodule)
    trainer.test(model=model, datamodule=datamodule, ckpt_path='best')
    
    best_checkpoints.append(checkpoint.best_model_path)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  if value == "???":
  if value == "???":
  special_value = value is None or value == "???"

  | Name    | Type                  | Params
--------------------------------------------------
0 | roberta | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | CrossEntropyLoss      | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.712   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7f3d9c8563a0> for <torch.optim.swa_utils.SWALR object at 0x7f3e2103a820>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  if value == "???":
  if value == "???":
  special_value = value is None or value == "???"

  | Name    | Type                  | Params
--------------------------------------------------
0 | roberta | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | CrossEntropyLoss      | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.712   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7f3e214c52e0> for <torch.optim.swa_utils.SWALR object at 0x7f3e2141ce80>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  if value == "???":
  if value == "???":
  special_value = value is None or value == "???"

  | Name    | Type                  | Params
--------------------------------------------------
0 | roberta | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | CrossEntropyLoss      | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.712   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7f3da5235bb0> for <torch.optim.swa_utils.SWALR object at 0x7f3e213e8580>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  if value == "???":
  if value == "???":
  special_value = value is None or value == "???"

  | Name    | Type                  | Params
--------------------------------------------------
0 | roberta | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | CrossEntropyLoss      | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.712   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7f3e21d05670> for <torch.optim.swa_utils.SWALR object at 0x7f3d9c87c5b0>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  if value == "???":
  if value == "???":
  special_value = value is None or value == "???"

  | Name    | Type                  | Params
--------------------------------------------------
0 | roberta | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | CrossEntropyLoss      | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.712   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7f3d9c9fec40> for <torch.optim.swa_utils.SWALR object at 0x7f3e21c5a9d0>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


CPU times: user 25min 59s, sys: 4min 49s, total: 30min 49s
Wall time: 31min 29s


## CV Score

In [12]:
best_checkpoints

['../tb_logs/Baseline/version_8/checkpoints/epoch=03-loss=0.0000-val_loss=3.0207-val_metric=0.6094.ckpt',
 '../tb_logs/Baseline/version_9/checkpoints/epoch=02-loss=0.0000-val_loss=3.0405-val_metric=0.6174.ckpt',
 '../tb_logs/Baseline/version_10/checkpoints/epoch=03-loss=0.0000-val_loss=3.0848-val_metric=0.6049.ckpt',
 '../tb_logs/Baseline/version_11/checkpoints/epoch=01-loss=0.0000-val_loss=2.9501-val_metric=0.6045.ckpt',
 '../tb_logs/Baseline/version_12/checkpoints/epoch=02-loss=0.0000-val_loss=3.0309-val_metric=0.6078.ckpt']

In [13]:
def calc_average_loss(ckeckpoints):
    metrics = []
    for ckpt in ckeckpoints:
        metric = float(re.findall(r"val_metric=(\d+\.\d+)", ckpt)[0])
        metrics.append(metric)
        
    return np.mean(metrics)

In [14]:
avg_valid_loss = calc_average_loss(best_checkpoints)
print("Average Validation Loss:", avg_valid_loss)

Average Validation Loss: 0.6088
