# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/fb3-deberta-v3-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    reinit_layers = 1
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhey1[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.0 requires protobuf==3.20.0, but you have protobuf 3.19.4 which is incompatible.
allennlp 2.10.0 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()
config = AutoConfig.from_pretrained(CFG.model)


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


# CV split

In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [8]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1429


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        
        # reinit layers
        if cfg.reinit_layers > 0:
            print(f'Reinitializing Last {cfg.reinit_layers} Layers ...')
            encoder_temp = getattr(self.model, "encoder")
            for layer in encoder_temp.layer[-cfg.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, nn.Linear):
                        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                        if module.bias is not None:
                            module.bias.data.zero_()
                    elif isinstance(module, nn.Embedding):
                        module.weight.data.normal_(mean=0.0, std=config.initializer_range)
                        if module.padding_idx is not None:
                            module.weight.data[module.padding_idx].zero_()
                    elif isinstance(module, nn.LayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
        print('Layer Reinitialized!')
        
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [13]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [14]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [15]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0,lr_mult=0.9):
        layer_names = []
        for idx, (name, param) in enumerate(model.named_parameters()):
            layer_names.append(name)
        layer_names.reverse()

        # learning rate
        lr = encoder_lr

        # placeholder
        parameters = []
        # parameter group name
        try:
            prev_group_name = int(name.split('.')[3])
        except:
            prev_group_name = -1  #name.split('.')[0]

        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]        
        # store params & learning rates
        for idx, name in enumerate(layer_names):

            # parameter group name
            try:
                cur_group_name = int(name.split('.')[3])
            except:
                cur_group_name = -1  #name.split('.')[0]

            # update learning rate
            if cur_group_name != prev_group_name:
                lr *= lr_mult
            prev_group_name = cur_group_name

            # display info
            # print(f'{idx}: lr = {lr:.6f}, {name} x {cur_group_name}')

            # append layer parameters
            if not any(nd in name for nd in no_decay):
                parameters += [{'params': [p for n, p in model.named_parameters() if n == name and p.requires_grad],
                                'lr':  lr, 'weight_decay': weight_decay}]
            elif any(nd in name for nd in no_decay): #Layer norm params
                parameters += [{'params': [p for n, p in model.named_parameters() if n == name and p.requires_grad],
                                'lr':  lr, 'weight_decay': 0.0}]        
            elif "model" not in name:
                parameters += [{'params': [p for n, p in model.named_parameters() if n == name and p.requires_grad],
                                'lr':  lr, 'weight_decay': weight_decay}]        
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [16]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reinitializing Last 1 Layers ...
Layer Reinitialized!
Epoch: [1][0/366] Elapsed 0m 2s (remain 15m 17s) Loss: 2.2886(2.2886) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 36s (remain 9m 57s) Loss: 0.1703(0.9910) Grad: 188338.1406  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 16s (remain 10m 4s) Loss: 0.1422(0.5992) Grad: 143402.4844  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 52s (remain 9m 23s) Loss: 0.1320(0.4529) Grad: 174048.4688  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 2m 26s (remain 8m 33s) Loss: 0.2080(0.3770) Grad: 319731.9062  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 3m 0s (remain 7m 52s) Loss: 0.1455(0.3282) Grad: 148032.4219  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 3m 40s (remain 7m 25s) Loss: 0.1373(0.2928) Grad: 200140.5312  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 4m 12s (remain 6m 43s) Loss: 0.1834(0.2695) Grad: 104585.3750  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 4m 47s (remain 6m 6s) Loss: 0.2395(0.2537) Grad: 279361.4375  LR: 0.0

Epoch 1 - avg_train_loss: 0.1831  avg_val_loss: 0.1126  time: 735s
Epoch 1 - Score: 0.4751  Scores: [0.503620027783237, 0.4534327693567662, 0.43362699256823645, 0.46541305920331333, 0.5240475326592121, 0.4704324782546755]
Epoch 1 - Save Best Score: 0.4751 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0967(0.1126) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.1096(0.1126) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 11m 42s) Loss: 0.1094(0.1094) Grad: 148147.2031  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 43s (remain 11m 54s) Loss: 0.0838(0.1045) Grad: 164608.1719  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 1m 18s (remain 10m 21s) Loss: 0.1215(0.1094) Grad: 195126.5156  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 59s (remain 9m 57s) Loss: 0.0990(0.1061) Grad: 245292.2344  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 2m 34s (remain 9m 2s) Loss: 0.0922(0.1039) Grad: 91456.3828  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 3m 6s (remain 8m 9s) Loss: 0.0726(0.1034) Grad: 71178.4375  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 3m 42s (remain 7m 29s) Loss: 0.1281(0.1053) Grad: 99850.0625  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 4m 16s (remain 6m 49s) Loss: 0.1411(0.1058) Grad: 248581.2031  LR: 0.00001466  
Epoch: [2][160

Epoch 2 - avg_train_loss: 0.1031  avg_val_loss: 0.1029  time: 734s
Epoch 2 - Score: 0.4541  Scores: [0.4837323580183268, 0.44139436983109576, 0.42109138678901464, 0.4565394280470815, 0.48187416848923026, 0.4399863667066082]
Epoch 2 - Save Best Score: 0.4541 Model


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0822(0.1030) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.0804(0.1029) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 10m 25s) Loss: 0.0994(0.0994) Grad: 118333.6484  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 38s (remain 10m 37s) Loss: 0.0803(0.1005) Grad: 135119.9844  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 1m 19s (remain 10m 32s) Loss: 0.1483(0.1021) Grad: 235026.5469  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 1m 52s (remain 9m 24s) Loss: 0.1207(0.1005) Grad: 136052.3594  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 2m 27s (remain 8m 38s) Loss: 0.1167(0.0995) Grad: 159058.3438  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 3m 5s (remain 8m 6s) Loss: 0.0689(0.0985) Grad: 80476.9688  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 3m 37s (remain 7m 20s) Loss: 0.0885(0.0972) Grad: 113331.7969  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 4m 13s (remain 6m 45s) Loss: 0.1127(0.0964) Grad: 115151.1953  LR: 0.00000704  
Epoch: [3][

Epoch 3 - avg_train_loss: 0.0956  avg_val_loss: 0.1063  time: 736s
Epoch 3 - Score: 0.4618  Scores: [0.48579339512646713, 0.4487238504213605, 0.4430417661913269, 0.47371738514102807, 0.4750776624171608, 0.4443033485827198]


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0899(0.1062) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.1205(0.1063) 
Epoch: [4][0/366] Elapsed 0m 3s (remain 21m 25s) Loss: 0.0865(0.0865) Grad: 285932.0625  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 36s (remain 9m 55s) Loss: 0.0715(0.0840) Grad: 71074.5547  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 1m 11s (remain 9m 25s) Loss: 0.0938(0.0876) Grad: 130141.9844  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 51s (remain 9m 15s) Loss: 0.0702(0.0875) Grad: 127858.2266  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 2m 28s (remain 8m 42s) Loss: 0.0975(0.0887) Grad: 126583.5312  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 3m 1s (remain 7m 55s) Loss: 0.1229(0.0883) Grad: 309300.1875  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 3m 40s (remain 7m 26s) Loss: 0.0810(0.0886) Grad: 159098.6406  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 4m 19s (remain 6m 53s) Loss: 0.1029(0.0890) Grad: 177637.3438  LR: 0.00000116  
Epoch: [4][1

Epoch 4 - avg_train_loss: 0.0893  avg_val_loss: 0.1015  time: 731s
Epoch 4 - Score: 0.4509  Scores: [0.47992946050794794, 0.4421196846420046, 0.4141958948267202, 0.457162092378581, 0.4716538193930461, 0.4405118507482787]
Epoch 4 - Save Best Score: 0.4509 Model


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0799(0.1015) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.0936(0.1015) 


Score: 0.4509  Scores: [0.47992946050794794, 0.4421196846420046, 0.4141958948267202, 0.457162092378581, 0.4716538193930461, 0.4405118507482787]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size"

Reinitializing Last 1 Layers ...
Layer Reinitialized!
Epoch: [1][0/366] Elapsed 0m 1s (remain 9m 42s) Loss: 2.8378(2.8378) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 37s (remain 10m 17s) Loss: 0.3083(1.0650) Grad: 94976.9297  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 12s (remain 9m 32s) Loss: 0.2039(0.6368) Grad: 161957.9688  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 49s (remain 9m 5s) Loss: 0.1815(0.4821) Grad: 200394.1875  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 2m 22s (remain 8m 21s) Loss: 0.1328(0.3948) Grad: 87830.0312  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 3m 0s (remain 7m 53s) Loss: 0.1208(0.3419) Grad: 70824.3828  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 3m 30s (remain 7m 6s) Loss: 0.1360(0.3057) Grad: 75274.1953  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 4m 5s (remain 6m 31s) Loss: 0.1483(0.2803) Grad: 111842.2500  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 4m 46s (remain 6m 4s) Loss: 0.0794(0.2606) Grad: 105763.0312  LR: 0.00001941

Epoch 1 - avg_train_loss: 0.1812  avg_val_loss: 0.1094  time: 730s
Epoch 1 - Score: 0.4689  Scores: [0.5136554636892466, 0.46051301388362414, 0.4334824776888305, 0.4636727771892551, 0.4819716255130323, 0.45987438063680464]
Epoch 1 - Save Best Score: 0.4689 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0933(0.1094) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0800(0.1094) 
Epoch: [2][0/366] Elapsed 0m 2s (remain 14m 57s) Loss: 0.1010(0.1010) Grad: 100578.7656  LR: 0.00001707  
Epoch: [2][20/366] Elapsed 0m 45s (remain 12m 28s) Loss: 0.1431(0.1143) Grad: 199585.5156  LR: 0.00001676  
Epoch: [2][40/366] Elapsed 1m 16s (remain 10m 8s) Loss: 0.1414(0.1086) Grad: 235914.7188  LR: 0.00001644  
Epoch: [2][60/366] Elapsed 1m 56s (remain 9m 41s) Loss: 0.1237(0.1044) Grad: 199194.0000  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 2m 32s (remain 8m 56s) Loss: 0.1132(0.1021) Grad: 187149.7656  LR: 0.00001576  
Epoch: [2][100/366] Elapsed 3m 7s (remain 8m 12s) Loss: 0.1059(0.1002) Grad: 183541.7188  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 3m 42s (remain 7m 29s) Loss: 0.1208(0.1002) Grad: 216178.4375  LR: 0.00001504  
Epoch: [2][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.1095(0.1010) Grad: 145689.9375  LR: 0.00001466  
Epoch: [2]

Epoch 2 - avg_train_loss: 0.1009  avg_val_loss: 0.1073  time: 740s
Epoch 2 - Score: 0.4637  Scores: [0.5277631882612858, 0.44413260325571846, 0.42940028040134515, 0.4530804393790051, 0.4755611065178225, 0.4521164256657545]
Epoch 2 - Save Best Score: 0.4637 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0982(0.1073) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0674(0.1073) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 9m 1s) Loss: 0.0645(0.0645) Grad: 115056.7969  LR: 0.00001001  
Epoch: [3][20/366] Elapsed 0m 37s (remain 10m 15s) Loss: 0.0780(0.0937) Grad: 114374.9141  LR: 0.00000958  
Epoch: [3][40/366] Elapsed 1m 16s (remain 10m 8s) Loss: 0.0769(0.0953) Grad: 213525.4219  LR: 0.00000916  
Epoch: [3][60/366] Elapsed 1m 53s (remain 9m 29s) Loss: 0.0658(0.0946) Grad: 110841.8672  LR: 0.00000873  
Epoch: [3][80/366] Elapsed 2m 30s (remain 8m 50s) Loss: 0.0749(0.0959) Grad: 90863.2656  LR: 0.00000831  
Epoch: [3][100/366] Elapsed 3m 12s (remain 8m 25s) Loss: 0.0824(0.0948) Grad: 71144.1875  LR: 0.00000789  
Epoch: [3][120/366] Elapsed 3m 46s (remain 7m 39s) Loss: 0.1274(0.0937) Grad: 166183.0000  LR: 0.00000747  
Epoch: [3][140/366] Elapsed 4m 20s (remain 6m 55s) Loss: 0.1170(0.0927) Grad: 92681.9766  LR: 0.00000706  
Epoch: [3][160

Epoch 3 - avg_train_loss: 0.0920  avg_val_loss: 0.1041  time: 731s
Epoch 3 - Score: 0.4569  Scores: [0.49264350563899373, 0.44346257232816894, 0.4202844990961219, 0.4650518588778538, 0.46961955399954874, 0.45005979057408263]
Epoch 3 - Save Best Score: 0.4569 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0890(0.1041) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0482(0.1041) 
Epoch: [4][0/366] Elapsed 0m 2s (remain 16m 33s) Loss: 0.1175(0.1175) Grad: 240400.6875  LR: 0.00000295  
Epoch: [4][20/366] Elapsed 0m 40s (remain 10m 58s) Loss: 0.1002(0.0863) Grad: 263496.6562  LR: 0.00000265  
Epoch: [4][40/366] Elapsed 1m 13s (remain 9m 46s) Loss: 0.1224(0.0839) Grad: 247872.9531  LR: 0.00000237  
Epoch: [4][60/366] Elapsed 1m 51s (remain 9m 15s) Loss: 0.0785(0.0855) Grad: 122290.5000  LR: 0.00000210  
Epoch: [4][80/366] Elapsed 2m 23s (remain 8m 23s) Loss: 0.0568(0.0874) Grad: 115114.5469  LR: 0.00000184  
Epoch: [4][100/366] Elapsed 3m 5s (remain 8m 6s) Loss: 0.0730(0.0883) Grad: 131550.4219  LR: 0.00000160  
Epoch: [4][120/366] Elapsed 3m 39s (remain 7m 23s) Loss: 0.1171(0.0882) Grad: 115412.0156  LR: 0.00000138  
Epoch: [4][140/366] Elapsed 4m 15s (remain 6m 47s) Loss: 0.0720(0.0881) Grad: 129206.2188  LR: 0.00000117  
Epoch: [4][

Epoch 4 - avg_train_loss: 0.0863  avg_val_loss: 0.1032  time: 734s
Epoch 4 - Score: 0.4550  Scores: [0.49292344496659357, 0.44402668597815426, 0.4181500326972267, 0.4530167134943818, 0.47050935032207203, 0.45132679845975665]
Epoch 4 - Save Best Score: 0.4550 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0862(0.1033) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0535(0.1032) 


Score: 0.4550  Scores: [0.49292344496659357, 0.44402668597815426, 0.4181500326972267, 0.4530167134943818, 0.47050935032207203, 0.45132679845975665]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_s

Reinitializing Last 1 Layers ...
Layer Reinitialized!
Epoch: [1][0/366] Elapsed 0m 3s (remain 22m 35s) Loss: 2.2105(2.2105) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 48s (remain 13m 12s) Loss: 0.3592(0.9287) Grad: 283426.0000  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 32s (remain 12m 11s) Loss: 0.1657(0.5791) Grad: 106476.7109  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 2m 5s (remain 10m 26s) Loss: 0.1397(0.4446) Grad: 121759.2188  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 2m 42s (remain 9m 31s) Loss: 0.1069(0.3680) Grad: 131693.2188  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 3m 13s (remain 8m 28s) Loss: 0.1423(0.3190) Grad: 125379.6172  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 3m 48s (remain 7m 41s) Loss: 0.1898(0.2898) Grad: 195692.3125  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 4m 24s (remain 7m 2s) Loss: 0.1302(0.2680) Grad: 99771.4062  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 5m 2s (remain 6m 24s) Loss: 0.1297(0.2517) Grad: 146395.2188  LR: 0.

Epoch 1 - avg_train_loss: 0.1777  avg_val_loss: 0.1153  time: 746s
Epoch 1 - Score: 0.4815  Scores: [0.5161268454414688, 0.46986575690256327, 0.44571709823040734, 0.4854897817052876, 0.4882941594424227, 0.48348910730279404]
Epoch 1 - Save Best Score: 0.4815 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 37s) Loss: 0.0862(0.0862) Grad: 119960.0938  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 36s (remain 9m 56s) Loss: 0.0432(0.1099) Grad: 76572.0547  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 1m 9s (remain 9m 10s) Loss: 0.1455(0.1086) Grad: 282507.2188  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 47s (remain 8m 59s) Loss: 0.0698(0.1030) Grad: 131290.0000  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 2m 25s (remain 8m 31s) Loss: 0.1296(0.1045) Grad: 286621.9688  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 3m 3s (remain 8m 1s) Loss: 0.0888(0.1045) Grad: 149277.4844  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 3m 36s (remain 7m 18s) Loss: 0.0664(0.1046) Grad: 65646.7109  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 4m 17s (remain 6m 51s) Loss: 0.0735(0.1028) Grad: 214362.0781  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 4m 56s (remain 6m 18s) Loss: 0.0956(0.1033) Grad: 217877.2188  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 5m 30s (rema

Epoch 2 - avg_train_loss: 0.1022  avg_val_loss: 0.1106  time: 748s
Epoch 2 - Score: 0.4716  Scores: [0.4967710605551424, 0.45153340564391875, 0.4522404702708659, 0.4815858078025248, 0.48232527202099895, 0.465138868756276]
Epoch 2 - Save Best Score: 0.4716 Model


Epoch: [3][0/366] Elapsed 0m 2s (remain 15m 27s) Loss: 0.0992(0.0992) Grad: 99984.9922  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 35s (remain 9m 49s) Loss: 0.0649(0.0936) Grad: 94401.4141  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 1m 13s (remain 9m 39s) Loss: 0.0937(0.0929) Grad: 188734.5625  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 1m 50s (remain 9m 13s) Loss: 0.0841(0.0951) Grad: 102239.2188  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 2m 30s (remain 8m 51s) Loss: 0.0806(0.0964) Grad: 199550.4219  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 3m 9s (remain 8m 16s) Loss: 0.0568(0.0960) Grad: 72049.5156  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 3m 45s (remain 7m 36s) Loss: 0.0801(0.0951) Grad: 130766.7656  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 4m 22s (remain 6m 59s) Loss: 0.0738(0.0935) Grad: 61447.8008  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 4m 56s (remain 6m 16s) Loss: 0.0959(0.0938) Grad: 87413.6641  LR: 0.00000664  
Epoch: [3][180/366] Elapsed 5m 34s (rema

Epoch 3 - avg_train_loss: 0.0930  avg_val_loss: 0.1108  time: 756s
Epoch 3 - Score: 0.4715  Scores: [0.4970324789162114, 0.4537582450967015, 0.42212365214926056, 0.4656315879034119, 0.5221653249351008, 0.4682470246555042]
Epoch 3 - Save Best Score: 0.4715 Model


Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 5s) Loss: 0.0735(0.0735) Grad: 181715.0156  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 38s (remain 10m 28s) Loss: 0.0660(0.0795) Grad: 194285.6875  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 1m 14s (remain 9m 49s) Loss: 0.0662(0.0863) Grad: 111149.8125  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 55s (remain 9m 38s) Loss: 0.0976(0.0860) Grad: 220849.1406  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 2m 29s (remain 8m 46s) Loss: 0.0721(0.0851) Grad: 116148.5547  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 3m 6s (remain 8m 10s) Loss: 0.0411(0.0847) Grad: 56815.6133  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 3m 44s (remain 7m 35s) Loss: 0.0984(0.0851) Grad: 238829.3750  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 4m 25s (remain 7m 3s) Loss: 0.0817(0.0853) Grad: 128107.1875  LR: 0.00000116  
Epoch: [4][160/366] Elapsed 5m 1s (remain 6m 23s) Loss: 0.0841(0.0853) Grad: 72554.2266  LR: 0.00000097  
Epoch: [4][180/366] Elapsed 5m 39s (rema

Epoch 4 - avg_train_loss: 0.0867  avg_val_loss: 0.1059  time: 749s
Epoch 4 - Score: 0.4613  Scores: [0.4872288795732708, 0.4498834190426982, 0.41911049935328265, 0.46496448833482495, 0.4833694971432972, 0.4632257691736791]
Epoch 4 - Save Best Score: 0.4613 Model
Score: 0.4613  Scores: [0.4872288795732708, 0.4498834190426982, 0.41911049935328265, 0.46496448833482495, 0.4833694971432972, 0.4632257691736791]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hi

Reinitializing Last 1 Layers ...
Layer Reinitialized!
Epoch: [1][0/366] Elapsed 0m 3s (remain 20m 5s) Loss: 2.4968(2.4968) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 38s (remain 10m 30s) Loss: 0.1727(0.9680) Grad: 179664.6562  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 1m 12s (remain 9m 38s) Loss: 0.1498(0.5853) Grad: 207381.9688  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 51s (remain 9m 16s) Loss: 0.1447(0.4363) Grad: 167849.0000  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 2m 24s (remain 8m 28s) Loss: 0.1769(0.3640) Grad: 143315.2812  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 3m 3s (remain 8m 2s) Loss: 0.1607(0.3162) Grad: 226746.9219  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 3m 35s (remain 7m 16s) Loss: 0.0572(0.2865) Grad: 37003.6133  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 4m 19s (remain 6m 54s) Loss: 0.1036(0.2656) Grad: 58487.1680  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 4m 55s (remain 6m 15s) Loss: 0.1030(0.2522) Grad: 127264.9219  LR: 0.000

Epoch 1 - avg_train_loss: 0.1795  avg_val_loss: 0.1131  time: 741s
Epoch 1 - Score: 0.4762  Scores: [0.5122088892424358, 0.4847542764967922, 0.4208525907085739, 0.44843820761568104, 0.5350575804593929, 0.45575900459604046]
Epoch 1 - Save Best Score: 0.4762 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 9m 14s) Loss: 0.0912(0.0912) Grad: 184868.1562  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 38s (remain 10m 31s) Loss: 0.0967(0.1039) Grad: 150534.3906  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 1m 15s (remain 10m 0s) Loss: 0.1001(0.0987) Grad: 85532.8281  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 48s (remain 9m 2s) Loss: 0.1163(0.1020) Grad: 84629.4219  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 2m 26s (remain 8m 34s) Loss: 0.0685(0.1025) Grad: 134756.8594  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 2m 59s (remain 7m 50s) Loss: 0.0985(0.1050) Grad: 131024.1406  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 3m 36s (remain 7m 18s) Loss: 0.0920(0.1053) Grad: 165367.7812  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 4m 6s (remain 6m 33s) Loss: 0.0716(0.1039) Grad: 174985.9062  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 4m 49s (remain 6m 8s) Loss: 0.0825(0.1056) Grad: 70146.0156  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 5m 34s (rema

Epoch 2 - avg_train_loss: 0.1041  avg_val_loss: 0.1077  time: 747s
Epoch 2 - Score: 0.4649  Scores: [0.49529140818015516, 0.45837616986940444, 0.4185973917495409, 0.47430006364152655, 0.4961319562169069, 0.44691987313005893]
Epoch 2 - Save Best Score: 0.4649 Model


Epoch: [3][0/366] Elapsed 0m 3s (remain 21m 33s) Loss: 0.1346(0.1346) Grad: 201676.1406  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 42s (remain 11m 39s) Loss: 0.0768(0.1044) Grad: 179396.5312  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 1m 17s (remain 10m 16s) Loss: 0.0800(0.1093) Grad: 112882.7500  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 2m 0s (remain 10m 4s) Loss: 0.1057(0.1051) Grad: 221228.7969  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 2m 45s (remain 9m 41s) Loss: 0.1341(0.1031) Grad: 149452.8438  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 3m 16s (remain 8m 35s) Loss: 0.0992(0.1017) Grad: 165213.1250  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 3m 55s (remain 7m 56s) Loss: 0.1318(0.1004) Grad: 348367.7812  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 4m 30s (remain 7m 10s) Loss: 0.1517(0.0992) Grad: 189641.7188  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 5m 12s (remain 6m 37s) Loss: 0.0881(0.0993) Grad: 239374.5000  LR: 0.00000664  
Epoch: [3][180/366] Elapsed 5m 52

Epoch 3 - avg_train_loss: 0.0959  avg_val_loss: 0.1032  time: 752s
Epoch 3 - Score: 0.4551  Scores: [0.49043400727104636, 0.46045175218920587, 0.4207488407831289, 0.4404111357647465, 0.4722011327048638, 0.44616811065376943]
Epoch 3 - Save Best Score: 0.4551 Model


Epoch: [4][0/366] Elapsed 0m 2s (remain 15m 54s) Loss: 0.0828(0.0828) Grad: 136228.1094  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 42s (remain 11m 30s) Loss: 0.1076(0.0918) Grad: 90556.5547  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 1m 16s (remain 10m 8s) Loss: 0.0647(0.0913) Grad: 114294.0625  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 50s (remain 9m 11s) Loss: 0.0802(0.0908) Grad: 124541.1641  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 2m 33s (remain 9m 0s) Loss: 0.0853(0.0884) Grad: 114738.2500  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 3m 7s (remain 8m 12s) Loss: 0.0709(0.0886) Grad: 133933.2656  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 3m 43s (remain 7m 31s) Loss: 0.1076(0.0882) Grad: 122740.7031  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 4m 17s (remain 6m 51s) Loss: 0.1021(0.0891) Grad: 158048.5625  LR: 0.00000116  
Epoch: [4][160/366] Elapsed 4m 57s (remain 6m 19s) Loss: 0.0638(0.0896) Grad: 100328.3828  LR: 0.00000097  
Epoch: [4][180/366] Elapsed 5m 36s (

Epoch 4 - avg_train_loss: 0.0892  avg_val_loss: 0.1002  time: 745s
Epoch 4 - Score: 0.4483  Scores: [0.4893826923568244, 0.4411177891271667, 0.4146058696097001, 0.437256527634917, 0.4658727954746083, 0.4415600303566593]
Epoch 4 - Save Best Score: 0.4483 Model
Score: 0.4483  Scores: [0.4893826923568244, 0.4411177891271667, 0.4146058696097001, 0.437256527634917, 0.4658727954746083, 0.4415600303566593]
Score: 0.4539  Scores: [0.48738784906163807, 0.4442999426694501, 0.41652068867012526, 0.4532126292711556, 0.47289592104689243, 0.4492488257705533]


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▂▁▁
[fold0] avg_val_loss,█▂▄▁
[fold0] epoch,▁▃▆█
[fold0] loss,█▃▃▃▄▄▃▅▃▄▁▃▂▃▂▄▂▃▃▃▁▃▂▁▄▃▂▂▂▂▃▄▁▂▂▃▂▂▃▁
[fold0] lr,███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,█▂▄▁
[fold1] avg_train_loss,█▂▁▁
[fold1] avg_val_loss,█▆▂▁
[fold1] epoch,▁▃▆█
[fold1] loss,█▆▂▅▅▃▇▇▅▃▄▃▂▅▂▂▃▃▃▄▂▃▂▂▄▄▂▁▃▃▂▃▂▁▂▂▃▁▁▂

0,1
[fold0] avg_train_loss,0.08932
[fold0] avg_val_loss,0.1015
[fold0] epoch,4.0
[fold0] loss,0.06735
[fold0] lr,0.0
[fold0] score,0.45093
[fold1] avg_train_loss,0.08634
[fold1] avg_val_loss,0.10323
[fold1] epoch,4.0
[fold1] loss,0.04493
