# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-6
    decoder_lr=2e-6
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.0 requires protobuf==3.20.0, but you have protobuf 3.19.4 which is incompatible.
allennlp 2.10.0 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


# CV split

In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [8]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1429


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [13]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [14]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [15]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [16]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/366] Elapsed 0m 2s (remain 16m 31s) Loss: 2.3540(2.3540) Grad: inf  LR: 0.00000200  
Epoch: [1][20/366] Elapsed 0m 39s (remain 10m 42s) Loss: 2.4410(2.4707) Grad: 210881.6094  LR: 0.00000200  
Epoch: [1][40/366] Elapsed 1m 15s (remain 9m 59s) Loss: 1.6179(2.1935) Grad: 229282.8438  LR: 0.00000200  
Epoch: [1][60/366] Elapsed 1m 48s (remain 9m 1s) Loss: 1.0150(1.9030) Grad: 188309.2031  LR: 0.00000199  
Epoch: [1][80/366] Elapsed 2m 25s (remain 8m 30s) Loss: 0.8703(1.6714) Grad: 200546.7812  LR: 0.00000198  
Epoch: [1][100/366] Elapsed 3m 8s (remain 8m 13s) Loss: 0.4267(1.4869) Grad: 174231.0156  LR: 0.00000198  
Epoch: [1][120/366] Elapsed 3m 44s (remain 7m 34s) Loss: 0.1530(1.3111) Grad: 67362.2344  LR: 0.00000197  
Epoch: [1][140/366] Elapsed 4m 22s (remain 6m 59s) Loss: 0.2565(1.1587) Grad: 102279.3047  LR: 0.00000195  
Epoch: [1][160/366] Elapsed 4m 58s (remain 6m 20s) Loss: 0.1528(1.0373) Grad: 65552.1953  LR: 0.00000194  
Epoch: [1][180/366] Elapsed 5m 35s (remain 5m

Epoch 1 - avg_train_loss: 0.5365  avg_val_loss: 0.1198  time: 741s
Epoch 1 - Score: 0.4913  Scores: [0.5160428758391893, 0.46966657211152635, 0.4587258119168911, 0.49497493867963627, 0.5203757940370904, 0.4882185053151892]
Epoch 1 - Save Best Score: 0.4913 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.1148(0.1198) 
EVAL: [61/62] Elapsed 1m 17s (remain 0m 0s) Loss: 0.1151(0.1198) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 32s) Loss: 0.1088(0.1088) Grad: 198902.9375  LR: 0.00000171  
Epoch: [2][20/366] Elapsed 0m 30s (remain 8m 22s) Loss: 0.1534(0.1204) Grad: 318750.6875  LR: 0.00000168  
Epoch: [2][40/366] Elapsed 1m 8s (remain 9m 6s) Loss: 0.0959(0.1228) Grad: 185980.3125  LR: 0.00000164  
Epoch: [2][60/366] Elapsed 1m 52s (remain 9m 21s) Loss: 0.1138(0.1232) Grad: 96927.2188  LR: 0.00000161  
Epoch: [2][80/366] Elapsed 2m 30s (remain 8m 50s) Loss: 0.0768(0.1207) Grad: 126474.0078  LR: 0.00000158  
Epoch: [2][100/366] Elapsed 3m 6s (remain 8m 8s) Loss: 0.1856(0.1207) Grad: 468002.4375  LR: 0.00000154  
Epoch: [2][120/366] Elapsed 3m 43s (remain 7m 32s) Loss: 0.1178(0.1196) Grad: 148243.8750  LR: 0.00000150  
Epoch: [2][140/366] Elapsed 4m 18s (remain 6m 52s) Loss: 0.0836(0.1201) Grad: 61064.2539  LR: 0.00000147  
Epoch: [2][160/36

Epoch 2 - avg_train_loss: 0.1177  avg_val_loss: 0.1127  time: 734s
Epoch 2 - Score: 0.4759  Scores: [0.5056748216048906, 0.4601916277589313, 0.4427872553499749, 0.4767721411486046, 0.50502274412977, 0.4651536768938691]
Epoch 2 - Save Best Score: 0.4759 Model


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0987(0.1128) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.0874(0.1127) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 9m 43s) Loss: 0.1098(0.1098) Grad: 331493.3125  LR: 0.00000100  
Epoch: [3][20/366] Elapsed 0m 32s (remain 8m 55s) Loss: 0.0748(0.1061) Grad: 204331.1250  LR: 0.00000096  
Epoch: [3][40/366] Elapsed 1m 9s (remain 9m 7s) Loss: 0.0901(0.1119) Grad: 132153.1406  LR: 0.00000091  
Epoch: [3][60/366] Elapsed 1m 49s (remain 9m 9s) Loss: 0.0922(0.1091) Grad: 187915.2656  LR: 0.00000087  
Epoch: [3][80/366] Elapsed 2m 23s (remain 8m 26s) Loss: 0.1000(0.1081) Grad: 86415.3047  LR: 0.00000083  
Epoch: [3][100/366] Elapsed 3m 2s (remain 7m 59s) Loss: 0.0728(0.1093) Grad: 135105.4688  LR: 0.00000079  
Epoch: [3][120/366] Elapsed 3m 41s (remain 7m 28s) Loss: 0.1303(0.1102) Grad: 134884.1719  LR: 0.00000075  
Epoch: [3][140/366] Elapsed 4m 15s (remain 6m 47s) Loss: 0.0614(0.1105) Grad: 51758.2656  LR: 0.00000070  
Epoch: [3][160/36

Epoch 3 - avg_train_loss: 0.1125  avg_val_loss: 0.1106  time: 738s
Epoch 3 - Score: 0.4714  Scores: [0.5062279341030695, 0.45772241198338937, 0.4378294346633954, 0.47306250455097076, 0.49188698795296565, 0.46149583359719365]
Epoch 3 - Save Best Score: 0.4714 Model


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0958(0.1107) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.0845(0.1106) 
Epoch: [4][0/366] Elapsed 0m 2s (remain 16m 45s) Loss: 0.1482(0.1482) Grad: 856779.4375  LR: 0.00000029  
Epoch: [4][20/366] Elapsed 0m 38s (remain 10m 25s) Loss: 0.0657(0.1127) Grad: 178816.0312  LR: 0.00000026  
Epoch: [4][40/366] Elapsed 1m 12s (remain 9m 32s) Loss: 0.0907(0.1105) Grad: 216709.1250  LR: 0.00000024  
Epoch: [4][60/366] Elapsed 1m 46s (remain 8m 53s) Loss: 0.0873(0.1074) Grad: 134784.1719  LR: 0.00000021  
Epoch: [4][80/366] Elapsed 2m 23s (remain 8m 24s) Loss: 0.2067(0.1107) Grad: 126094.7891  LR: 0.00000018  
Epoch: [4][100/366] Elapsed 3m 1s (remain 7m 57s) Loss: 0.1613(0.1148) Grad: 118804.3672  LR: 0.00000016  
Epoch: [4][120/366] Elapsed 3m 37s (remain 7m 20s) Loss: 0.1332(0.1158) Grad: 175244.7188  LR: 0.00000014  
Epoch: [4][140/366] Elapsed 4m 9s (remain 6m 38s) Loss: 0.1253(0.1137) Grad: 109183.4453  LR: 0.00000012  
Epoch: [4][

Epoch 4 - avg_train_loss: 0.1103  avg_val_loss: 0.1100  time: 734s
Epoch 4 - Score: 0.4698  Scores: [0.5037966407704993, 0.4566682599455463, 0.4361674326535523, 0.4712786010423545, 0.4923128459439782, 0.4585839033149411]
Epoch 4 - Save Best Score: 0.4698 Model


EVAL: [60/62] Elapsed 1m 15s (remain 0m 1s) Loss: 0.0965(0.1100) 
EVAL: [61/62] Elapsed 1m 15s (remain 0m 0s) Loss: 0.0861(0.1100) 


Score: 0.4698  Scores: [0.5037966407704993, 0.4566682599455463, 0.4361674326535523, 0.4712786010423545, 0.4923128459439782, 0.4585839033149411]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size"

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 51s) Loss: 2.4673(2.4673) Grad: inf  LR: 0.00000200  
Epoch: [1][20/366] Elapsed 0m 40s (remain 11m 7s) Loss: 2.6771(2.4490) Grad: 208033.7656  LR: 0.00000200  
Epoch: [1][40/366] Elapsed 1m 19s (remain 10m 28s) Loss: 1.9251(2.2793) Grad: 191333.3438  LR: 0.00000200  
Epoch: [1][60/366] Elapsed 1m 58s (remain 9m 54s) Loss: 1.8001(2.1355) Grad: 211684.1250  LR: 0.00000199  
Epoch: [1][80/366] Elapsed 2m 37s (remain 9m 15s) Loss: 1.5446(1.9756) Grad: 213223.4844  LR: 0.00000198  
Epoch: [1][100/366] Elapsed 3m 17s (remain 8m 37s) Loss: 0.8141(1.7984) Grad: 168313.4062  LR: 0.00000198  
Epoch: [1][120/366] Elapsed 3m 52s (remain 7m 51s) Loss: 0.3506(1.5949) Grad: 125938.4297  LR: 0.00000197  
Epoch: [1][140/366] Elapsed 4m 27s (remain 7m 6s) Loss: 0.5223(1.4140) Grad: 126056.4766  LR: 0.00000195  
Epoch: [1][160/366] Elapsed 5m 2s (remain 6m 25s) Loss: 0.1285(1.2611) Grad: 68568.9297  LR: 0.00000194  
Epoch: [1][180/366] Elapsed 5m 35s (remain 5m

Epoch 1 - avg_train_loss: 0.6302  avg_val_loss: 0.1238  time: 734s
Epoch 1 - Score: 0.4998  Scores: [0.5265858560831784, 0.4874423431830648, 0.45957221621549954, 0.4929690178196832, 0.5328610088202572, 0.49942339740200015]
Epoch 1 - Save Best Score: 0.4998 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0998(0.1239) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0644(0.1238) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 19s) Loss: 0.1339(0.1339) Grad: 197101.7500  LR: 0.00000171  
Epoch: [2][20/366] Elapsed 0m 34s (remain 9m 28s) Loss: 0.0968(0.1148) Grad: 128067.2891  LR: 0.00000168  
Epoch: [2][40/366] Elapsed 1m 9s (remain 9m 9s) Loss: 0.0834(0.1161) Grad: 96708.8281  LR: 0.00000164  
Epoch: [2][60/366] Elapsed 1m 42s (remain 8m 31s) Loss: 0.1236(0.1177) Grad: 64102.1680  LR: 0.00000161  
Epoch: [2][80/366] Elapsed 2m 12s (remain 7m 45s) Loss: 0.1004(0.1168) Grad: 100185.1094  LR: 0.00000158  
Epoch: [2][100/366] Elapsed 2m 48s (remain 7m 22s) Loss: 0.1578(0.1170) Grad: 352049.2188  LR: 0.00000154  
Epoch: [2][120/366] Elapsed 3m 26s (remain 6m 58s) Loss: 0.0770(0.1163) Grad: 126444.1328  LR: 0.00000150  
Epoch: [2][140/366] Elapsed 4m 4s (remain 6m 30s) Loss: 0.1224(0.1169) Grad: 130569.8984  LR: 0.00000147  
Epoch: [2][160/3

Epoch 2 - avg_train_loss: 0.1145  avg_val_loss: 0.1167  time: 731s
Epoch 2 - Score: 0.4846  Scores: [0.5158513981520022, 0.4741416650384128, 0.44580504272583854, 0.4746377023174107, 0.5110696988103004, 0.4859629080309698]
Epoch 2 - Save Best Score: 0.4846 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0934(0.1167) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0553(0.1167) 
Epoch: [3][0/366] Elapsed 0m 2s (remain 14m 9s) Loss: 0.0992(0.0992) Grad: 651711.5625  LR: 0.00000100  
Epoch: [3][20/366] Elapsed 0m 34s (remain 9m 28s) Loss: 0.1328(0.1114) Grad: 165091.8750  LR: 0.00000096  
Epoch: [3][40/366] Elapsed 1m 14s (remain 9m 49s) Loss: 0.0624(0.1095) Grad: 153633.2344  LR: 0.00000092  
Epoch: [3][60/366] Elapsed 1m 44s (remain 8m 41s) Loss: 0.0994(0.1098) Grad: 93150.4922  LR: 0.00000087  
Epoch: [3][80/366] Elapsed 2m 27s (remain 8m 37s) Loss: 0.1345(0.1099) Grad: 233772.5156  LR: 0.00000083  
Epoch: [3][100/366] Elapsed 3m 0s (remain 7m 54s) Loss: 0.0867(0.1082) Grad: 76402.5000  LR: 0.00000079  
Epoch: [3][120/366] Elapsed 3m 38s (remain 7m 23s) Loss: 0.0841(0.1100) Grad: 105008.1719  LR: 0.00000075  
Epoch: [3][140/366] Elapsed 4m 15s (remain 6m 47s) Loss: 0.0820(0.1104) Grad: 184380.7344  LR: 0.00000071  
Epoch: [3][160

Epoch 3 - avg_train_loss: 0.1097  avg_val_loss: 0.1153  time: 733s
Epoch 3 - Score: 0.4815  Scores: [0.513352403875166, 0.4693122269251891, 0.44472581829722413, 0.4731792991433848, 0.5074360794667362, 0.4809848674555522]
Epoch 3 - Save Best Score: 0.4815 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0923(0.1154) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0534(0.1153) 
Epoch: [4][0/366] Elapsed 0m 4s (remain 25m 53s) Loss: 0.1028(0.1028) Grad: 525452.6875  LR: 0.00000029  
Epoch: [4][20/366] Elapsed 0m 37s (remain 10m 18s) Loss: 0.1142(0.1059) Grad: 200076.9219  LR: 0.00000027  
Epoch: [4][40/366] Elapsed 1m 12s (remain 9m 32s) Loss: 0.1329(0.1146) Grad: 196481.3125  LR: 0.00000024  
Epoch: [4][60/366] Elapsed 1m 45s (remain 8m 49s) Loss: 0.1163(0.1118) Grad: 156775.0312  LR: 0.00000021  
Epoch: [4][80/366] Elapsed 2m 17s (remain 8m 2s) Loss: 0.0938(0.1111) Grad: 203756.0469  LR: 0.00000018  
Epoch: [4][100/366] Elapsed 2m 49s (remain 7m 25s) Loss: 0.0798(0.1088) Grad: 136908.8594  LR: 0.00000016  
Epoch: [4][120/366] Elapsed 3m 22s (remain 6m 49s) Loss: 0.1604(0.1101) Grad: 175813.1094  LR: 0.00000014  
Epoch: [4][140/366] Elapsed 3m 55s (remain 6m 15s) Loss: 0.0896(0.1086) Grad: 171529.8438  LR: 0.00000012  
Epoch: [4]

Epoch 4 - avg_train_loss: 0.1075  avg_val_loss: 0.1142  time: 732s
Epoch 4 - Score: 0.4791  Scores: [0.509075675960559, 0.46743497779847737, 0.44048092785044407, 0.4709832805006723, 0.5062707956671719, 0.4804685675466562]
Epoch 4 - Save Best Score: 0.4791 Model


EVAL: [60/62] Elapsed 1m 16s (remain 0m 1s) Loss: 0.0906(0.1142) 
EVAL: [61/62] Elapsed 1m 16s (remain 0m 0s) Loss: 0.0523(0.1142) 


Score: 0.4791  Scores: [0.509075675960559, 0.46743497779847737, 0.44048092785044407, 0.4709832805006723, 0.5062707956671719, 0.4804685675466562]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size

Epoch: [1][0/366] Elapsed 0m 1s (remain 9m 29s) Loss: 2.5660(2.5660) Grad: inf  LR: 0.00000200  
Epoch: [1][20/366] Elapsed 0m 35s (remain 9m 48s) Loss: 2.1778(2.2565) Grad: 196323.1406  LR: 0.00000200  
Epoch: [1][40/366] Elapsed 1m 12s (remain 9m 36s) Loss: 1.7401(2.1855) Grad: 182837.6875  LR: 0.00000200  
Epoch: [1][60/366] Elapsed 1m 51s (remain 9m 19s) Loss: 1.9118(2.0467) Grad: 192521.5156  LR: 0.00000199  
Epoch: [1][80/366] Elapsed 2m 34s (remain 9m 4s) Loss: 1.6462(1.9530) Grad: 208433.6250  LR: 0.00000198  
Epoch: [1][100/366] Elapsed 3m 12s (remain 8m 24s) Loss: 1.1056(1.8317) Grad: 205684.6094  LR: 0.00000198  
Epoch: [1][120/366] Elapsed 3m 47s (remain 7m 40s) Loss: 0.9153(1.6922) Grad: 205061.8281  LR: 0.00000197  
Epoch: [1][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.3932(1.5368) Grad: 153688.5000  LR: 0.00000195  
Epoch: [1][160/366] Elapsed 5m 1s (remain 6m 23s) Loss: 0.3710(1.3836) Grad: 94676.0703  LR: 0.00000194  
Epoch: [1][180/366] Elapsed 5m 38s (remain 5m 

Epoch 1 - avg_train_loss: 0.6915  avg_val_loss: 0.1383  time: 751s
Epoch 1 - Score: 0.5296  Scores: [0.5713208209726497, 0.5127378685112699, 0.467572509374277, 0.5331909092089339, 0.5519218433973553, 0.540828179739606]
Epoch 1 - Save Best Score: 0.5296 Model


Epoch: [2][0/366] Elapsed 0m 2s (remain 12m 29s) Loss: 0.1058(0.1058) Grad: 684573.0625  LR: 0.00000171  
Epoch: [2][20/366] Elapsed 0m 34s (remain 9m 28s) Loss: 0.1255(0.1320) Grad: 309545.9375  LR: 0.00000168  
Epoch: [2][40/366] Elapsed 1m 13s (remain 9m 39s) Loss: 0.1695(0.1359) Grad: 118496.6875  LR: 0.00000164  
Epoch: [2][60/366] Elapsed 1m 53s (remain 9m 27s) Loss: 0.1204(0.1287) Grad: 378406.7188  LR: 0.00000161  
Epoch: [2][80/366] Elapsed 2m 29s (remain 8m 44s) Loss: 0.1120(0.1244) Grad: 184124.4844  LR: 0.00000158  
Epoch: [2][100/366] Elapsed 3m 3s (remain 8m 2s) Loss: 0.2067(0.1256) Grad: 548655.2500  LR: 0.00000154  
Epoch: [2][120/366] Elapsed 3m 39s (remain 7m 24s) Loss: 0.1247(0.1254) Grad: 293756.3125  LR: 0.00000150  
Epoch: [2][140/366] Elapsed 4m 17s (remain 6m 50s) Loss: 0.1269(0.1252) Grad: 138655.0625  LR: 0.00000147  
Epoch: [2][160/366] Elapsed 5m 0s (remain 6m 22s) Loss: 0.1048(0.1240) Grad: 134609.3750  LR: 0.00000143  
Epoch: [2][180/366] Elapsed 5m 45s (r

Epoch 2 - avg_train_loss: 0.1202  avg_val_loss: 0.1209  time: 756s
Epoch 2 - Score: 0.4936  Scores: [0.5199350303025345, 0.4837847346004962, 0.4465011457885917, 0.49659039126924687, 0.524593779529064, 0.4903858096552251]
Epoch 2 - Save Best Score: 0.4936 Model


Epoch: [3][0/366] Elapsed 0m 2s (remain 12m 24s) Loss: 0.0893(0.0893) Grad: 382834.0938  LR: 0.00000100  
Epoch: [3][20/366] Elapsed 0m 43s (remain 11m 50s) Loss: 0.1545(0.1093) Grad: 528947.8125  LR: 0.00000096  
Epoch: [3][40/366] Elapsed 1m 17s (remain 10m 13s) Loss: 0.1251(0.1104) Grad: 171613.0312  LR: 0.00000091  
Epoch: [3][60/366] Elapsed 1m 51s (remain 9m 15s) Loss: 0.1147(0.1100) Grad: 448756.3750  LR: 0.00000087  
Epoch: [3][80/366] Elapsed 2m 24s (remain 8m 26s) Loss: 0.1108(0.1107) Grad: 102004.3047  LR: 0.00000083  
Epoch: [3][100/366] Elapsed 3m 6s (remain 8m 8s) Loss: 0.1452(0.1100) Grad: 191752.0781  LR: 0.00000079  
Epoch: [3][120/366] Elapsed 3m 42s (remain 7m 31s) Loss: 0.0477(0.1105) Grad: 54551.8945  LR: 0.00000075  
Epoch: [3][140/366] Elapsed 4m 21s (remain 6m 58s) Loss: 0.0767(0.1106) Grad: 143876.7344  LR: 0.00000070  
Epoch: [3][160/366] Elapsed 4m 55s (remain 6m 16s) Loss: 0.0902(0.1104) Grad: 131174.5000  LR: 0.00000066  
Epoch: [3][180/366] Elapsed 5m 39s 

Epoch 3 - avg_train_loss: 0.1142  avg_val_loss: 0.1172  time: 756s
Epoch 3 - Score: 0.4855  Scores: [0.519652281630926, 0.4701794792918083, 0.4399527089679855, 0.4830146320753997, 0.5118062722709903, 0.4882286255579081]
Epoch 3 - Save Best Score: 0.4855 Model


Epoch: [4][0/366] Elapsed 0m 2s (remain 12m 36s) Loss: 0.1582(0.1582) Grad: 774954.0625  LR: 0.00000029  
Epoch: [4][20/366] Elapsed 0m 38s (remain 10m 28s) Loss: 0.1107(0.1154) Grad: 122579.5625  LR: 0.00000026  
Epoch: [4][40/366] Elapsed 1m 13s (remain 9m 42s) Loss: 0.0832(0.1143) Grad: 143149.1719  LR: 0.00000024  
Epoch: [4][60/366] Elapsed 1m 49s (remain 9m 9s) Loss: 0.0954(0.1167) Grad: 102103.2109  LR: 0.00000021  
Epoch: [4][80/366] Elapsed 2m 36s (remain 9m 9s) Loss: 0.1133(0.1150) Grad: 137881.7656  LR: 0.00000018  
Epoch: [4][100/366] Elapsed 3m 9s (remain 8m 17s) Loss: 0.0843(0.1135) Grad: 233286.3281  LR: 0.00000016  
Epoch: [4][120/366] Elapsed 3m 45s (remain 7m 36s) Loss: 0.1385(0.1130) Grad: 289323.1875  LR: 0.00000014  
Epoch: [4][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.0976(0.1132) Grad: 115167.3672  LR: 0.00000012  
Epoch: [4][160/366] Elapsed 4m 56s (remain 6m 17s) Loss: 0.1297(0.1129) Grad: 184870.0781  LR: 0.00000010  
Epoch: [4][180/366] Elapsed 5m 34s (

Epoch 4 - avg_train_loss: 0.1113  avg_val_loss: 0.1162  time: 753s
Epoch 4 - Score: 0.4836  Scores: [0.5110269002762201, 0.469430231168048, 0.4406354461229707, 0.4829816173985039, 0.5106105954863105, 0.4871739566450226]
Epoch 4 - Save Best Score: 0.4836 Model
Score: 0.4836  Scores: [0.5110269002762201, 0.469430231168048, 0.4406354461229707, 0.4829816173985039, 0.5106105954863105, 0.4871739566450226]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_a

Epoch: [1][0/366] Elapsed 0m 1s (remain 11m 35s) Loss: 2.5190(2.5190) Grad: inf  LR: 0.00000200  
Epoch: [1][20/366] Elapsed 0m 32s (remain 8m 57s) Loss: 2.6066(2.4028) Grad: 252014.5156  LR: 0.00000200  
Epoch: [1][40/366] Elapsed 1m 7s (remain 8m 56s) Loss: 1.8418(2.1684) Grad: 211705.9375  LR: 0.00000200  
Epoch: [1][60/366] Elapsed 1m 40s (remain 8m 23s) Loss: 1.2413(1.9769) Grad: 205490.2031  LR: 0.00000199  
Epoch: [1][80/366] Elapsed 2m 23s (remain 8m 23s) Loss: 0.9882(1.7920) Grad: 211641.2812  LR: 0.00000198  
Epoch: [1][100/366] Elapsed 3m 2s (remain 7m 59s) Loss: 0.8747(1.6246) Grad: 218529.2344  LR: 0.00000198  
Epoch: [1][120/366] Elapsed 3m 41s (remain 7m 29s) Loss: 0.2047(1.4484) Grad: 94276.6562  LR: 0.00000197  
Epoch: [1][140/366] Elapsed 4m 22s (remain 6m 58s) Loss: 0.1020(1.2813) Grad: 63383.6641  LR: 0.00000195  
Epoch: [1][160/366] Elapsed 5m 5s (remain 6m 28s) Loss: 0.3851(1.1472) Grad: 184920.3750  LR: 0.00000194  
Epoch: [1][180/366] Elapsed 5m 40s (remain 5m 4

Epoch 1 - avg_train_loss: 0.5819  avg_val_loss: 0.1194  time: 753s
Epoch 1 - Score: 0.4910  Scores: [0.5191993221997714, 0.47231217360849787, 0.4651598097126536, 0.46782415478315637, 0.5258894283625659, 0.49589749370667024]
Epoch 1 - Save Best Score: 0.4910 Model


Epoch: [2][0/366] Elapsed 0m 2s (remain 13m 57s) Loss: 0.2020(0.2020) Grad: 277706.3438  LR: 0.00000171  
Epoch: [2][20/366] Elapsed 0m 39s (remain 10m 55s) Loss: 0.0818(0.1236) Grad: 213877.1094  LR: 0.00000168  
Epoch: [2][40/366] Elapsed 1m 11s (remain 9m 28s) Loss: 0.1487(0.1214) Grad: 323971.5000  LR: 0.00000164  
Epoch: [2][60/366] Elapsed 1m 52s (remain 9m 21s) Loss: 0.1297(0.1207) Grad: 364051.9375  LR: 0.00000161  
Epoch: [2][80/366] Elapsed 2m 28s (remain 8m 41s) Loss: 0.2114(0.1187) Grad: 627533.1875  LR: 0.00000158  
Epoch: [2][100/366] Elapsed 3m 4s (remain 8m 4s) Loss: 0.1672(0.1171) Grad: 477850.3750  LR: 0.00000154  
Epoch: [2][120/366] Elapsed 3m 43s (remain 7m 32s) Loss: 0.1218(0.1181) Grad: 250529.3125  LR: 0.00000150  
Epoch: [2][140/366] Elapsed 4m 22s (remain 6m 59s) Loss: 0.1185(0.1184) Grad: 93023.6172  LR: 0.00000147  
Epoch: [2][160/366] Elapsed 5m 1s (remain 6m 23s) Loss: 0.1397(0.1181) Grad: 234949.0625  LR: 0.00000143  
Epoch: [2][180/366] Elapsed 5m 41s (r

Epoch 2 - avg_train_loss: 0.1167  avg_val_loss: 0.1138  time: 750s
Epoch 2 - Score: 0.4790  Scores: [0.5148316604229566, 0.46851708328246355, 0.443210152580716, 0.46311826062791156, 0.510297934114837, 0.47427082583898833]
Epoch 2 - Save Best Score: 0.4790 Model


Epoch: [3][0/366] Elapsed 0m 2s (remain 12m 13s) Loss: 0.1472(0.1472) Grad: 883918.0000  LR: 0.00000100  
Epoch: [3][20/366] Elapsed 0m 43s (remain 11m 53s) Loss: 0.0672(0.1170) Grad: 136942.0781  LR: 0.00000096  
Epoch: [3][40/366] Elapsed 1m 27s (remain 11m 36s) Loss: 0.0905(0.1117) Grad: 133106.8906  LR: 0.00000091  
Epoch: [3][60/366] Elapsed 1m 59s (remain 9m 57s) Loss: 0.1902(0.1158) Grad: 212896.7500  LR: 0.00000087  
Epoch: [3][80/366] Elapsed 2m 35s (remain 9m 6s) Loss: 0.1151(0.1165) Grad: 117215.6719  LR: 0.00000083  
Epoch: [3][100/366] Elapsed 3m 15s (remain 8m 33s) Loss: 0.1514(0.1147) Grad: 104723.4062  LR: 0.00000079  
Epoch: [3][120/366] Elapsed 3m 51s (remain 7m 48s) Loss: 0.0896(0.1117) Grad: 174786.0625  LR: 0.00000075  
Epoch: [3][140/366] Elapsed 4m 41s (remain 7m 28s) Loss: 0.0742(0.1108) Grad: 157947.6406  LR: 0.00000070  
Epoch: [3][160/366] Elapsed 5m 20s (remain 6m 47s) Loss: 0.1062(0.1114) Grad: 224404.1250  LR: 0.00000066  
Epoch: [3][180/366] Elapsed 5m 57

Epoch 3 - avg_train_loss: 0.1126  avg_val_loss: 0.1124  time: 754s
Epoch 3 - Score: 0.4758  Scores: [0.5069939670130691, 0.4639398856544539, 0.4366745940348175, 0.46177905113192413, 0.5108536707038592, 0.4743143260093919]
Epoch 3 - Save Best Score: 0.4758 Model


Epoch: [4][0/366] Elapsed 0m 1s (remain 12m 6s) Loss: 0.1029(0.1029) Grad: 488576.8125  LR: 0.00000029  
Epoch: [4][20/366] Elapsed 0m 40s (remain 11m 8s) Loss: 0.1870(0.1113) Grad: 178556.9219  LR: 0.00000026  
Epoch: [4][40/366] Elapsed 1m 15s (remain 9m 57s) Loss: 0.0545(0.1113) Grad: 70623.3828  LR: 0.00000024  
Epoch: [4][60/366] Elapsed 1m 53s (remain 9m 26s) Loss: 0.0932(0.1098) Grad: 127307.6250  LR: 0.00000021  
Epoch: [4][80/366] Elapsed 2m 30s (remain 8m 49s) Loss: 0.1188(0.1098) Grad: 119827.0078  LR: 0.00000018  
Epoch: [4][100/366] Elapsed 3m 3s (remain 8m 1s) Loss: 0.0966(0.1099) Grad: 265806.1562  LR: 0.00000016  
Epoch: [4][120/366] Elapsed 3m 38s (remain 7m 22s) Loss: 0.1138(0.1097) Grad: 252493.2188  LR: 0.00000014  
Epoch: [4][140/366] Elapsed 4m 11s (remain 6m 41s) Loss: 0.0900(0.1105) Grad: 161573.8594  LR: 0.00000012  
Epoch: [4][160/366] Elapsed 4m 52s (remain 6m 12s) Loss: 0.1866(0.1113) Grad: 270773.1875  LR: 0.00000010  
Epoch: [4][180/366] Elapsed 5m 34s (re

Epoch 4 - avg_train_loss: 0.1096  avg_val_loss: 0.1103  time: 744s
Epoch 4 - Score: 0.4713  Scores: [0.5033316747245699, 0.46088991035205407, 0.4346185744772727, 0.4584999701648458, 0.5026815720471564, 0.46754424998073413]
Epoch 4 - Save Best Score: 0.4713 Model
Score: 0.4713  Scores: [0.5033316747245699, 0.46088991035205407, 0.4346185744772727, 0.4584999701648458, 0.5026815720471564, 0.46754424998073413]
Score: 0.4760  Scores: [0.5068180199938503, 0.4636329339892587, 0.4379829149039594, 0.4710154633066133, 0.5030135779766984, 0.4735711673244099]


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▁▁▁
[fold0] avg_val_loss,█▃▁▁
[fold0] epoch,▁▃▆█
[fold0] loss,█▅▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] lr,███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,█▃▂▁
[fold1] avg_train_loss,█▁▁▁
[fold1] avg_val_loss,█▃▂▁
[fold1] epoch,▁▃▆█
[fold1] loss,█▇▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
[fold0] avg_train_loss,0.11034
[fold0] avg_val_loss,0.10995
[fold0] epoch,4.0
[fold0] loss,0.10744
[fold0] lr,0.0
[fold0] score,0.4698
[fold1] avg_train_loss,0.10746
[fold1] avg_val_loss,0.11419
[fold1] epoch,4.0
[fold1] loss,0.0664
