# RoBERTa with pre-trianed model

## Experiment Histories

2021-05-27

- Baseline: 0.51776
- further pretrained model
    - `../data/extend/roberta_itpt`: 0.51748
    - `../data/extend/pretrained_roberta`: 0.52324
        - further-5epoch-pretraining
- lr scheduler (pretrained_roberta で検証)
    - get_cosine_schedule_with_warmup: 0.52324
    - get_linear_schedule_with_warmup: 0.5281

In [1]:
import pathlib
import re
from typing import Optional

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import transformers
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary
from transformers import (
    AdamW,
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)

In [2]:
def seed_everything(seed: int = 42):
    np.random.seed(seed)
    pl.seed_everything(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        
seed_everything(422)

Global seed set to 422


In [3]:
src_dir = pathlib.Path("../data/raw")

## Split Fold

In [4]:
data = pd.read_csv(src_dir / "train.csv")

# KFold
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

# StratifiedKFold
num_bins = int(np.floor(1 + np.log2(len(data))))  # ref: https://www.kaggle.com/abhishek/step-1-create-folds
target_bins = pd.cut(data["target"], bins=num_bins, labels=False)

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
for n_fold, (train_idx, valid_idx) in enumerate(cv.split(data, target_bins)):

    train = data.loc[train_idx, ["excerpt", "target"]]
    valid = data.loc[valid_idx, ["excerpt", "target"]]

    fold_dump_dir = pathlib.Path(f"../data/split/fold_{n_fold}/")
    fold_dump_dir.mkdir(exist_ok=True)

    train.to_pickle(fold_dump_dir / "train.pkl")
    valid.to_pickle(fold_dump_dir / "valid.pkl")

    print("Fold:", n_fold)
    print(f"\tTrain Target Average: {train.target.mean():.04f}" + f"\tTrain Size={train.shape[0]}")
    print(f"\tValid Target Average: {valid.target.mean():.04f}" + f"\tValid Size={valid.shape[0]}")

Fold: 0
	Train Target Average: -0.9644	Train Size=2267
	Valid Target Average: -0.9390	Valid Size=567
Fold: 1
	Train Target Average: -0.9559	Train Size=2267
	Valid Target Average: -0.9728	Valid Size=567
Fold: 2
	Train Target Average: -0.9540	Train Size=2267
	Valid Target Average: -0.9806	Valid Size=567
Fold: 3
	Train Target Average: -0.9681	Train Size=2267
	Valid Target Average: -0.9244	Valid Size=567
Fold: 4
	Train Target Average: -0.9542	Train Size=2268
	Valid Target Average: -0.9798	Valid Size=566


In [5]:
num_splits = cv.get_n_splits()
num_splits

5

## Define Dataset & Dataloader

In [6]:
class CommonLitDataset(Dataset):
    def __init__(
        self, 
        data: pd.DataFrame, 
        tokenizer: transformers.PreTrainedTokenizer, 
        max_len: int = 256,
        is_test: bool = False,
    ):
        self.target = data[["target"]].to_numpy()
        self.excerpt = data[["excerpt"]].to_numpy()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self, idx):
        text = str(self.excerpt[idx])
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        target = self.target[idx]

        return {
            "inputs": {
                "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
                "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            },
            "target": torch.tensor(target, dtype=torch.float32),
        }


class CommonLitDataModule(pl.LightningDataModule):
    def __init__(
        self,
        data_dir: str, 
        tokenizer: transformers.PreTrainedTokenizer,
        batch_size: int = 32
    ):
        super(CommonLitDataModule, self).__init__()
        self.data_dir = pathlib.Path(data_dir)
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage: Optional[str] = None):
        self.train = pd.read_pickle(self.data_dir / "train.pkl")
        self.valid = pd.read_pickle(self.data_dir / "valid.pkl")

    def train_dataloader(self):
        dataset = CommonLitDataset(self.train, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        dataset = CommonLitDataset(self.valid, self.tokenizer)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=4,
            pin_memory=True,
            shuffle=False,
            drop_last=False,
        )

## Define Model Arch

In [7]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat.to(torch.float32), y.to(torch.float32)))
        return loss


class CommonLitRoBERTaModel(nn.Module):
    def __init__(
        self,
        config_name_or_path: str,
        model_name_or_path: str,
        num_labels: int = 1,
        output_hidden_states: bool = False,
    ):
        super(CommonLitRoBERTaModel, self).__init__()
        self.config = AutoConfig.from_pretrained(config_name_or_path)
        self.config.update({'num_labels': num_labels})
        
        self.roberta = AutoModel.from_pretrained(
            model_name_or_path,
            output_hidden_states=output_hidden_states
        )
        self.layers = nn.Sequential(
            nn.LayerNorm(768),
            nn.Dropout(0.5),
            nn.Linear(768, 1)
        )
        # Initialize Weights
        self.layers.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, batch):
        outputs = self.roberta(**batch["inputs"])
        pooler_output = outputs.pooler_output
        hidden_state_avg = outputs.last_hidden_state[:, -4:].mean(dim=(1,2)).view(-1, 1)
        hidden_state_sum = outputs.last_hidden_state[:, -4:].sum(dim=(1,2)).view(-1, 1)

        # x = torch.cat((pooler_output, hidden_state_avg, hidden_state_sum), dim=1)
        x = self.layers(pooler_output)
        return x
    
    
class CommonLitModel(pl.LightningModule):
    def __init__(
        self,
        lr: float = 5e-5,
        num_epoch: int = 10,
        lr_scheduler: str = "linear",
        lr_interval: str = "epoch",
        lr_warmup_step: int = 0,
    ):
        super(CommonLitModel, self).__init__()
        self.save_hyperparameters()

        self.model = CommonLitRoBERTaModel(
            config_name_or_path="../data/extend/roberta_itpt",
            model_name_or_path="../data/extend/roberta_itpt",
        )
        self.loss_fn = nn.MSELoss()
        self.eval_fn = RMSELoss()

    def forward(self, batch):
        z = self.model(batch)
        return z

    def configure_optimizers(self):
        optimizer_grouped_parameters = self._get_optimizer_params(self.model)
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,  # self.parameters()
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=1e-2,
        )
        
        if self.hparams.lr_scheduler == "linear":
            # Linear scheduler
            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=0,
                num_training_steps=self.hparams.num_epoch,
            )
        elif self.hparams.lr_scheduler == "cosine":
            # Cosine scheduler
            lr_scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.lr_warmup_step,
                num_training_steps=self.hparams.num_epoch,
            )
        else:
            # Linear scheduler
            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.lr_warmup_step,
                num_training_steps=self.hparams.num_epoch * self.train_dataloader_len,
            )
        
        lr_dict = {
            "scheduler": lr_scheduler,
            "interval": self.hparams.lr_interval,  # step or epoch
            "strict": True,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_dict}
    
    def _get_optimizer_params(self, model):
        # differential learning rate and weight decay
        param_optimizer = list(model.named_parameters())
        learning_rate = self.hparams.lr
        no_decay = ["bias", "gamma", "beta"]
        group1 = ["layer.0.", "layer.1.", "layer.2.", "layer.3."]
        group2 = ["layer.4.", "layer.5.", "layer.6.", "layer.7."]
        group3 = ["layer.8.", "layer.9.", "layer.10.", "layer.11."]
        group_all = [
            "layer.0.",
            "layer.1.",
            "layer.2.",
            "layer.3.",
            "layer.4.",
            "layer.5.",
            "layer.6.",
            "layer.7.",
            "layer.8.",
            "layer.9.",
            "layer.10.",
            "layer.11.",
        ]
        optimizer_parameters = [
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay)
                    and not any(nd in n for nd in group_all)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate / 2.6,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)
                ],
                "weight_decay_rate": 0.01,
                "lr": learning_rate * 2.6,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay)
                    and not any(nd in n for nd in group_all)
                ],
                "weight_decay_rate": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate / 2.6,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate,
            },
            {
                "params": [
                    p
                    for n, p in model.roberta.named_parameters()
                    if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)
                ],
                "weight_decay_rate": 0.0,
                "lr": learning_rate * 2.6,
            },
            {
                "params": [p for n, p in model.named_parameters() if "roberta" not in n],
                "lr": 1e-5,
                "momentum": 0.99,
            },
        ]
        return optimizer_parameters


    def shared_step(self, batch):
        z = self(batch)
        loss = self.loss_fn(z, batch["target"])
        metric = self.eval_fn(z, batch["target"])
        return z, loss, metric

    def training_step(self, batch, batch_idx):
        z, loss, metric = self.shared_step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        z, loss, metric = self.shared_step(batch)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_metric", metric, prog_bar=True)
        return {"val_loss": loss, "val_metric": metric}

In [8]:
model = CommonLitRoBERTaModel(
    config_name_or_path="../data/extend/roberta_itpt",
    model_name_or_path="../data/extend/roberta_itpt",
)

# print(model)
print(summary(model, depth=3, verbose=0))
del model

Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/roberta_itpt and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
|    └─RobertaPooler: 2-3                --
|    |    └─Linear: 3-7                  590,592
|    |    └─Tanh: 3-8                    --
├─Sequential: 1-2                        --
|    └─LayerNorm: 2-4                    1,536
|    └─Dropout: 2-5                      --
|    └─Linear: 2-6                       769
Total params: 124,647,937
Trainable params: 124,647,937
Non-trainable params: 0


## Train Model

In [9]:
model_path = "roberta-base" 
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [10]:
%%time

num_epoch = 10
batch_size = 8

best_checkpoints = []
for n_fold in range(num_splits):
    datamodule = CommonLitDataModule(f"../data/split/fold_{n_fold}/", tokenizer, batch_size)
    datamodule.setup()

    # Logger
    tb_logger = TensorBoardLogger(
        save_dir="../tb_logs",
        name="Baseline",
    )
    # Callbacks
    lr_monitor = LearningRateMonitor(logging_interval="step")
    early_stop = EarlyStopping(
        mode="min",
        patience=5,
        verbose=False,
        monitor="val_loss",
        min_delta=0.01,
    )
    checkpoint = ModelCheckpoint(
        filename="{epoch:02d}-{loss:.4f}-{val_loss:.4f}-{val_metric:.4f}",
        monitor="val_loss",
        save_top_k=3,
        mode="min",
    )

    model = CommonLitModel(
        lr=5e-5,
        num_epoch=num_epoch,
        lr_scheduler="cosine",
        lr_interval="epoch",
        lr_warmup_step=0,
    )
    trainer = Trainer(
        accelerator="dp",
        gpus=1,
        logger=tb_logger,
        callbacks=[lr_monitor, checkpoint],
        max_epochs=num_epoch,
        stochastic_weight_avg=True,
    )
    trainer.fit(model=model, datamodule=datamodule)
    trainer.test(model=model, datamodule=datamodule, ckpt_path='best')
    
    print(f"{n_fold}-Fold Best Checkpoint:\n", checkpoint.best_model_path)
    best_checkpoints.append(checkpoint.best_model_path)

Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/pretrained_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                  | Params
--------------------------------------------------
0 | model   | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | MSELoss               | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.592   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7fc36e9f18b0> for <torch.optim.swa_utils.SWALR object at 0x7fc388b36ac0>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


0-Fold Best Checkpoint:
 ../tb_logs/Baseline/version_5/checkpoints/epoch=06-loss=0.0000-val_loss=0.2757-val_metric=0.5101.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/pretrained_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                  | Params
--------------------------------------------------
0 | model   | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | MSELoss               | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.592   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7fc3f8584250> for <torch.optim.swa_utils.SWALR object at 0x7fc36d9c8fa0>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


1-Fold Best Checkpoint:
 ../tb_logs/Baseline/version_6/checkpoints/epoch=03-loss=0.0000-val_loss=0.2717-val_metric=0.5081.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/pretrained_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                  | Params
--------------------------------------------------
0 | model   | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | MSELoss               | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.592   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7fc3f8841fd0> for <torch.optim.swa_utils.SWALR object at 0x7fc36f201cd0>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


2-Fold Best Checkpoint:
 ../tb_logs/Baseline/version_7/checkpoints/epoch=04-loss=0.0000-val_loss=0.2844-val_metric=0.5187.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/pretrained_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                  | Params
--------------------------------------------------
0 | model   | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | MSELoss               | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.592   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7fc3f8282850> for <torch.optim.swa_utils.SWALR object at 0x7fc3f3777280>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


3-Fold Best Checkpoint:
 ../tb_logs/Baseline/version_8/checkpoints/epoch=05-loss=0.0000-val_loss=0.3002-val_metric=0.5331.ckpt


Some weights of RobertaModel were not initialized from the model checkpoint at ../data/extend/pretrained_roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                  | Params
--------------------------------------------------
0 | model   | CommonLitRoBERTaModel | 124 M 
1 | loss_fn | MSELoss               | 0     
2 | eval_fn | RMSELoss              | 0     
--------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.592   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 422


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Swapping scheduler <torch.optim.lr_scheduler.LambdaLR object at 0x7fc3f330b7c0> for <torch.optim.swa_utils.SWALR object at 0x7fc36f282910>


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


4-Fold Best Checkpoint:
 ../tb_logs/Baseline/version_9/checkpoints/epoch=04-loss=0.0000-val_loss=0.3107-val_metric=0.5462.ckpt
CPU times: user 37min 5s, sys: 3min 44s, total: 40min 50s
Wall time: 40min 9s


## CV Score

In [11]:
best_checkpoints

['../tb_logs/Baseline/version_5/checkpoints/epoch=06-loss=0.0000-val_loss=0.2757-val_metric=0.5101.ckpt',
 '../tb_logs/Baseline/version_6/checkpoints/epoch=03-loss=0.0000-val_loss=0.2717-val_metric=0.5081.ckpt',
 '../tb_logs/Baseline/version_7/checkpoints/epoch=04-loss=0.0000-val_loss=0.2844-val_metric=0.5187.ckpt',
 '../tb_logs/Baseline/version_8/checkpoints/epoch=05-loss=0.0000-val_loss=0.3002-val_metric=0.5331.ckpt',
 '../tb_logs/Baseline/version_9/checkpoints/epoch=04-loss=0.0000-val_loss=0.3107-val_metric=0.5462.ckpt']

In [12]:
def calc_average_loss(ckeckpoints):
    metrics = []
    for ckpt in ckeckpoints:
        metric = float(re.findall(r"val_metric=(\d+\.\d+)", ckpt)[0])
        metrics.append(metric)
        
    return np.mean(metrics)

In [13]:
avg_valid_loss = calc_average_loss(best_checkpoints)
print("Average Validation Loss:", avg_valid_loss)

Average Validation Loss: 0.52324
