# RoBERTa

## Experiment Histories

### 2021-05-23

- Baseline: 
- Add LayerNorm: 
- Above and Init Weights: 

### 2021-05-22

CV戦略を変更、あと若干のモデル構成の変更（Linear層を追加）

- Baseline: 0.5152
- Change CV Strategy: 0.50776


`outputs.pooler_output` or `outputs.last_hidden_state[:, 0, :]`

- Baseline: 0.50776
- outputs.last_hidden_state[:, 0, :]: 0.517820


### 2021-05-21

Baseline（Dropout層無し）を設定して、有りの場合と比較

- Baseline: 0.56846
- dropout 0.3: 0.57654
- dropout 0.5: 0.5221, 0.52694

In [1]:
import pathlib
import re
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import transformers
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary
from transformers import (
    AdamW,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)


In [2]:
def seed_everything(seed: int = 42):
    """Seed the RNGs used in this notebook and request deterministic cuDNN.

    Args:
        seed: Value applied to NumPy, PyTorch (CPU and, when available, all
            CUDA devices) and passed to ``pl.seed_everything``.
    """
    np.random.seed(seed)
    pl.seed_everything(seed)
    torch.manual_seed(seed)

    # Benchmark mode lets cuDNN autotune and pick kernels per run, which can
    # differ between runs; disable it together with enabling deterministic mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


seed_everything()

Global seed set to 42


In [3]:
# Relative path of the raw data directory; train.csv is read from it below.
src_dir = pathlib.Path("../data/raw")

## Split Fold

In [4]:
data = pd.read_csv(src_dir / "train.csv")

# Sturges' rule (1 + log2(n)) gives the number of bins used to discretise the
# continuous target so that it can drive a stratified split.
# ref: https://www.kaggle.com/abhishek/step-1-create-folds
num_bins = int(np.floor(1 + np.log2(len(data))))
target_bins = pd.cut(data["target"], bins=num_bins, labels=False)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for n_fold, (train_idx, valid_idx) in enumerate(cv.split(data, target_bins)):

    # cv.split yields positional indices, so rows are selected by position;
    # this stays correct even if `data` does not carry a default RangeIndex.
    train = data.iloc[train_idx][["excerpt", "target"]]
    valid = data.iloc[valid_idx][["excerpt", "target"]]

    fold_dump_dir = pathlib.Path(f"../data/split/fold_{n_fold}/")
    # parents=True: the intermediate "../data/split" directory may not exist yet.
    fold_dump_dir.mkdir(parents=True, exist_ok=True)

    train.to_pickle(fold_dump_dir / "train.pkl")
    valid.to_pickle(fold_dump_dir / "valid.pkl")

    # Report the per-fold target means as a quick check of the stratification.
    print("Fold:", n_fold)
    print(f"\tTrain Target Average: {train.target.mean():.06f}")
    print(f"\tValid Target Average: {valid.target.mean():.06f}")

Fold: 0
	Train Target Average: -0.958924
	Valid Target Average: -0.960897
Fold: 1
	Train Target Average: -0.958125
	Valid Target Average: -0.964091
Fold: 2
	Train Target Average: -0.958339
	Valid Target Average: -0.963237
Fold: 3
	Train Target Average: -0.962328
	Valid Target Average: -0.947285
Fold: 4
	Train Target Average: -0.958878
	Valid Target Average: -0.961087


In [5]:
# Number of folds configured on the splitter; the bare name displays it as the cell result.
num_splits = cv.get_n_splits()
num_splits

## Define Dataset & Dataloader

In [6]:
class CommonLitDataset(Dataset):
    """Dataset yielding tokenized excerpts together with their regression target.

    Args:
        data: Table with an ``excerpt`` column (text) and a ``target`` column.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True)`` that returns a mapping
            containing ``input_ids`` and ``attention_mask``.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len=256):
        # Kept 2-D, shape (N, 1), so every item's target has shape (1,).
        self.target = data[["target"]].to_numpy()
        # Selected as a 1-D column so that indexing returns the excerpt itself.
        # With a 2-D (N, 1) array, str() on a row produces the array repr
        # ("['...']" with escaped newlines) and that is what would be tokenized.
        self.excerpt = data["excerpt"].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Tokenize excerpt ``idx`` and return it with its target.

        Returns:
            ``{"inputs": {"input_ids", "attention_mask"}, "target"}`` where the
            inputs are ``torch.long`` tensors and the target is ``float32``.
        """
        text = str(self.excerpt[idx])
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        target = self.target[idx]

        return {
            "inputs": {
                "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
                # "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            },
            "target": torch.tensor(target, dtype=torch.float32),
        }


class CommonLitDataModule(pl.LightningDataModule):
    """Data module serving one pre-split fold stored as pickled DataFrames.

    Args:
        data_dir: Directory containing ``train.pkl`` and ``valid.pkl``.
        tokenizer: Tokenizer handed to every ``CommonLitDataset``.
        batch_size: Batch size used by all dataloaders.
        num_workers: Worker processes per dataloader.
        max_len: Sequence length forwarded to ``CommonLitDataset``.
    """

    def __init__(
        self,
        data_dir: str,
        tokenizer,
        batch_size: int = 32,
        num_workers: int = 4,
        max_len: int = 256,
    ):
        super(CommonLitDataModule, self).__init__()
        self.data_dir = pathlib.Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.max_len = max_len

    def setup(self, stage: Optional[str] = None):
        """Load the train / validation frames of this fold from ``data_dir``."""
        self.train = pd.read_pickle(self.data_dir / "train.pkl")
        self.valid = pd.read_pickle(self.data_dir / "valid.pkl")

    def _make_loader(self, frame, *, training: bool):
        """Wrap ``frame`` in a dataset and build its DataLoader.

        Only the training loader shuffles and drops the last incomplete batch.
        """
        dataset = CommonLitDataset(frame, self.tokenizer, max_len=self.max_len)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            shuffle=training,
            drop_last=training,
        )

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.train, training=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order, all samples)."""
        return self._make_loader(self.valid, training=False)

    def test_dataloader(self):
        """Return the test loader; it serves the validation split of the fold."""
        return self._make_loader(self.valid, training=False)

## Define Model Arch

In [7]:
class RMSELoss(nn.Module):
    """Root-mean-squared error, computed as the square root of ``nn.MSELoss``."""

    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y))``."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()


class CommonLitRoBERTaModel(nn.Module):
    """``roberta-base`` encoder followed by a small regression head.

    The head receives the encoder's pooler output concatenated with two scalar
    statistics of ``last_hidden_state`` and produces one value per sample.
    """

    def __init__(self):
        super().__init__()
        self.config = RobertaConfig.from_pretrained("roberta-base")
        self.roberta = RobertaModel.from_pretrained("roberta-base")

        # 768 pooled features plus the two scalar statistics built in forward().
        num_features = 768 + 2
        self.layers = nn.Sequential(
            # Experiment toggle (disabled): nn.LayerNorm(768 + 2),
            nn.Dropout(0.5),
            nn.Linear(num_features, 770),
            nn.Dropout(0.3),
            nn.Linear(770, 1),
        )
        # Experiment toggle (disabled): re-initialise the head weights.
        # self.layers.apply(self._init_weights)

    def forward(self, batch):
        """Encode ``batch["inputs"]`` and return the head output.

        Args:
            batch: Mapping whose ``"inputs"`` entry holds the keyword arguments
                for the RoBERTa encoder.

        Returns:
            Tensor produced by the regression head, one row per sample.
        """
        encoded = self.roberta(**batch["inputs"])
        pooled = encoded.pooler_output

        # NOTE(review): ``[:, -4:]`` slices dim 1 of last_hidden_state, which is
        # presumably the token axis - i.e. the last four positions, likely
        # padding when inputs are padded to max length. Confirm whether the last
        # four encoder layers were intended instead.
        tail = encoded.last_hidden_state[:, -4:]
        tail_mean = tail.mean(dim=(1, 2)).view(-1, 1)
        tail_sum = tail.sum(dim=(1, 2)).view(-1, 1)

        features = torch.cat((pooled, tail_mean, tail_sum), dim=1)
        return self.layers(features)

    def _init_weights(self, module):
        """Initialise ``module`` in place using ``config.initializer_range``.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1 / bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    
class CommonLitModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes ``CommonLitRoBERTaModel``.

    Optimises MSE and additionally reports RMSE for validation and test.

    Args:
        lr: Learning rate handed to AdamW (start of the cosine schedule).
        num_epoch: Number of epochs the cosine schedule is stretched over.
    """

    def __init__(
        self,
        lr: float = 1e-4,
        num_epoch: int = 20,
    ):
        super(CommonLitModel, self).__init__()
        self.save_hyperparameters()

        self.model = CommonLitRoBERTaModel()
        self.loss_fn = nn.MSELoss()
        self.eval_fn = RMSELoss()

    def forward(self, batch):
        """Run the wrapped network on ``batch`` and return its predictions."""
        z = self.model(batch)
        return z

    def configure_optimizers(self):
        """Build AdamW and a warmup-free cosine schedule over all training steps."""
        train_dataloader_len = len(self.trainer.datamodule.train_dataloader())
        total_step_num = train_dataloader_len * self.hparams.num_epoch

        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=0,
        )
        lr_scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_step_num,
        )
        # The schedule length is counted in optimizer steps, so the scheduler
        # has to advance every step. Lightning's default interval is "epoch",
        # which would leave the learning rate almost flat over the whole run.
        scheduler_config = {"scheduler": lr_scheduler, "interval": "step"}
        return [optimizer], [scheduler_config]

    def shared_step(self, batch):
        """Return ``(predictions, MSE loss, RMSE metric)`` for one batch.

        NOTE: the metric is the RMSE of this batch only; an average of
        per-batch RMSE values is generally not the RMSE of the full split.
        """
        z = self(batch)
        loss = self.loss_fn(z, batch["target"])
        metric = self.eval_fn(z, batch["target"])
        return z, loss, metric

    def training_step(self, batch, batch_idx):
        """Log and return the training loss of one batch."""
        z, loss, metric = self.shared_step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Log validation loss and metric; both also go to the progress bar."""
        z, loss, metric = self.shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_metric", metric, prog_bar=True)
        return {"val_loss": loss, "val_metric": metric}

    def test_step(self, batch, batch_idx):
        """Return test loss and metric of one batch for epoch-end aggregation."""
        z, loss, metric = self.shared_step(batch)
        return {"test_loss": loss, "test_metric": metric}

    def test_step_end(self, outputs):
        """Pass the step outputs through unchanged."""
        return outputs

    def test_epoch_end(self, outputs):
        """Average the per-batch test loss / metric and log each of them."""
        # reshape(-1) makes this work both for 0-dim scalars (torch.cat rejects
        # those) and for per-device vectors gathered by a parallel backend.
        loss = torch.cat([out["test_loss"].reshape(-1) for out in outputs], dim=0)
        metric = torch.cat([out["test_metric"].reshape(-1) for out in outputs], dim=0)
        # self.log takes a single (name, value) pair per call.
        self.log("test_loss", torch.mean(loss))
        self.log("test_metric", torch.mean(metric))

In [8]:
# Instantiate the bare network (not the Lightning wrapper) to inspect its layers.
model = CommonLitRoBERTaModel()

# print(model)
# Print torchinfo's layer / parameter-count table, expanded down to depth 3.
print(summary(model, depth=3, verbose=0))

Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
|    └─RobertaPooler: 2-3                --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Sequential: 1-2                        --
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       593,670
|    └─Dropout: 2-6                      --
|    └─Linear: 2-7                       771
Total params: 125,240,073
Trainable params: 125,240,073
Non-trainable params: 0


## Train Model

In [10]:
# Tokenizer for the same "roberta-base" checkpoint the model is built from.
# from_pretrained needs network access or a local cache of the files.
model_path = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_path)

ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.

In [None]:
num_epoch = 10

# Train one model per fold and remember the path of each fold's best checkpoint.
best_checkpoints = []
for n_fold in range(num_splits):  # same fold count that was used to write the splits
    datamodule = CommonLitDataModule(f"../data/split/fold_{n_fold}/", tokenizer, 32)
    datamodule.setup()

    # Logger
    tb_logger = TensorBoardLogger(
        save_dir="./tb_logs",
        name="Baseline",
    )
    # Callbacks
    lr_monitor = LearningRateMonitor(logging_interval="step")
    # NOTE(review): patience equals the epoch budget (num_epoch = 10), so early
    # stopping cannot trigger within a run - confirm this is intended.
    early_stop = EarlyStopping(
        mode="min",
        patience=10,
        verbose=False,
        monitor="val_loss",
        min_delta=0.01,
    )
    # The training loss is logged under "train_loss"; a "{loss}" placeholder
    # names a metric that is never logged and would not show the real value.
    checkpoint = ModelCheckpoint(
        filename="{epoch:02d}-{train_loss:.4f}-{val_loss:.4f}-{val_metric:.4f}",
        monitor="val_loss",
        mode="min",
    )

    model = CommonLitModel(
        lr=5e-5,
        num_epoch=num_epoch,
    )
    trainer = Trainer(
        max_epochs=num_epoch,
        gpus=1,
        accelerator="dp",
        logger=tb_logger,
        callbacks=[lr_monitor, early_stop, checkpoint],
    )
    trainer.fit(model=model, datamodule=datamodule)
    # trainer.test(model=model, datamodule=datamodule, ckpt_path=checkpoint.best_model_path)

    # The file name embeds val_metric, which the CV-score cell parses later.
    best_checkpoints.append(checkpoint.best_model_path)

## CV Score

In [None]:
def calc_average_loss(ckeckpoints):
    """Average the ``val_metric`` values embedded in checkpoint file names.

    Args:
        ckeckpoints: Iterable of checkpoint paths, each containing a
            ``val_metric=<float>`` fragment.

    Returns:
        Mean of the parsed values, as returned by ``np.mean``.

    Raises:
        IndexError: If a path contains no ``val_metric=<float>`` fragment.
    """
    metric_pattern = re.compile(r"val_metric=(\d+\.\d+)")
    # Only the first occurrence in each path is used.
    scores = [float(metric_pattern.findall(path)[0]) for path in ckeckpoints]
    return np.mean(scores)

In [None]:
# NOTE: despite the label, this is the mean of the `val_metric` (RMSE) values
# parsed from the best checkpoint file names, not of `val_loss` (MSE).
avg_valid_loss = calc_average_loss(best_checkpoints)
print("Average Validation Loss:", avg_valid_loss)