# ~~BERT Large~~

学習がおそすぎるし、精度が大きく変わるわけでもなさそうなので却下

- Baseline: 
- dropout 0.3: 
- dropout 0.5: 

In [1]:
import re
import pathlib

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from sklearn.model_selection import KFold, StratifiedKFold
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

In [2]:
src_dir = pathlib.Path("../data/raw")

## Split Fold

In [3]:
data = pd.read_csv(src_dir / "train.csv")
target_bins = pd.cut(data["target"], bins=10, labels=False)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for n_fold, (train_idx, valid_idx) in enumerate(cv.split(data, target_bins)):

    train = data.loc[train_idx, ["excerpt", "target"]]
    valid = data.loc[valid_idx, ["excerpt", "target"]]

    fold_dump_dir = pathlib.Path(f"../data/split/fold_{n_fold}/")
    fold_dump_dir.mkdir(exist_ok=True)

    train.to_pickle(fold_dump_dir / "train.pkl")
    valid.to_pickle(fold_dump_dir / "valid.pkl")

    print("Fold:", n_fold)
    print(f"\tTrain Target Average: {train.target.mean():.06f}")
    print(f"\tValid Target Average: {valid.target.mean():.06f}")

Fold: 0
	Train Target Average: -0.959512
	Valid Target Average: -0.958545
Fold: 1
	Train Target Average: -0.959406
	Valid Target Average: -0.958969
Fold: 2
	Train Target Average: -0.960382
	Valid Target Average: -0.955068
Fold: 3
	Train Target Average: -0.958762
	Valid Target Average: -0.961544
Fold: 4
	Train Target Average: -0.958532
	Valid Target Average: -0.962472


## Define Dataset & Dataloader

In [4]:
import pathlib
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset


class CommonLitDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.target = data[["target"]].to_numpy()
        self.excerpt = data[["excerpt"]].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        text = str(self.excerpt[idx])
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        target = self.target[idx]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float32),
        }


class CommonLitDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str, tokenizer, batch_size: int = 32):
        super(CommonLitDataModule, self).__init__()
        self.data_dir = pathlib.Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        self.train = pd.read_pickle(self.data_dir / "train.pkl")
        self.valid = pd.read_pickle(self.data_dir / "valid.pkl")

    def train_dataloader(self):
        dataset = CommonLitDataset(self.train, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        dataset = CommonLitDataset(self.valid, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

    def test_dataloader(self):
        dataset = CommonLitDataset(self.valid, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

## Define Model Arch

In [5]:
import pytorch_lightning as pl
import pytorch_warmup as warmup
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertConfig,
    BertModel,
    RobertaModel,
    XLMRobertaModel,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)


class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat, y) + self.eps)
        return loss
    

class CommonLitBertModel(nn.Module):
    def __init__(self):
        super(CommonLitBertModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-large-uncased")
        self.layers = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(1024, 1)
        )

    def forward(self, batch):
        ids, mask, token_type_ids = (
            batch["input_ids"],
            batch["attention_mask"],
            batch["token_type_ids"],
        )
        _, output = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        output = self.layers(output)
        return output
    
    
class CommonLitModel(pl.LightningModule):
    def __init__(
        self,
        num_epoch: int = 20,
        train_dataloader_len: int = 20,
        lr: float = 3e-6,
    ):
        super(CommonLitModel, self).__init__()
        self.lr = lr
        self.num_epoch = num_epoch
        self.train_dataloader_len = train_dataloader_len

        self.model = CommonLitBertModel()
        self.loss_fn = RMSELoss()

    def forward(self, batch):
        z = self.model(batch)
        return z

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=5e-2,
        )
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=2,
            num_training_steps=self.train_dataloader_len * self.num_epoch,
        )
        return [optimizer], [lr_scheduler]

    def shared_step(self, batch):
        z = self(batch)
        loss = self.loss_fn(z, batch["target"])
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.shared_step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self.shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        loss = self.shared_step(batch)
        return {"test_loss": loss}

    def test_step_end(self, outputs):
        return outputs

    def test_epoch_end(self, outputs):
        loss = torch.cat([out["test_loss"] for out in outputs], dim=0)
        self.log("test_rmse", torch.mean(loss))

## Train Model

In [6]:
model_path = "bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
num_epoch = 100

best_checkpoints = []
for n_fold in range(5):
    datamodule = CommonLitDataModule(f"../data/split/fold_{n_fold}/", tokenizer, 8)
    datamodule.setup()

    # Logger
    tb_logger = TensorBoardLogger(
        save_dir="../tb_logs",
        name="Baseline",
    )
    # Callbacks
    lr_monitor = LearningRateMonitor(logging_interval="step")
    early_stop = EarlyStopping(
        mode="min",
        patience=10,
        verbose=False,
        monitor="val_loss",
        min_delta=0.01,
    )
    checkpoint = ModelCheckpoint(
        filename="{epoch:02d}-{loss:.4f}-{val_loss:.4f}",
        monitor="val_loss",
        mode="min",
    )

    train_dataloader_len = len(datamodule.train_dataloader())
    model = CommonLitModel(
        num_epoch=num_epoch,
        train_dataloader_len=train_dataloader_len,
    )
    trainer = Trainer(
        max_epochs=num_epoch,
        gpus=1,
        accelerator="dp",
        logger=tb_logger,
        callbacks=[lr_monitor, early_stop, checkpoint]
    )
    trainer.fit(model=model, datamodule=datamodule)
    trainer.test(model=model, datamodule=datamodule)
    
    best_checkpoints.append(checkpoint.best_model_path)
    torch.cuda.empty_cache()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | CommonLitBertModel | 335 M 
1 | loss_fn | RMSELoss           | 0     
-----------------------------------------------
335 M     Trainable params
0         Non-trainable params
335 M     Total params
1,340.572 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_rmse': 0.6486238241195679}
--------------------------------------------------------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | CommonLitBertModel | 335 M 
1 | loss_fn | RMSELoss           | 0     
-----------------------------------------------
335 M     Trainable params
0         Non-trainable params
335 M     Total params
1,340.572 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_rmse': 0.5799907445907593}
--------------------------------------------------------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | model   | CommonLitBertModel | 335 M 
1 | loss_fn | RMSELoss           | 0     
-----------------------------------------------
335 M     Trainable params
0         Non-trainable params
335 M     Total params
1,340.572 Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

## CV Score

In [None]:
def calc_average_loss(ckeckpoints):
    metrics = []
    for ckpt in ckeckpoints:
        metric = float(re.findall(r"val_loss=(\d+\.\d+)", ckpt)[0])
        metrics.append(metric)
        
    return np.mean(metrics)

In [None]:
avg_valid_loss = calc_average_loss(best_checkpoints)
print("Average Validation Loss:", avg_valid_loss)