In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [1]:
!nvidia-smi

Thu Sep 29 09:59:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:08.0 Off |                    0 |
| N/A   27C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-large"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=5e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=4
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('pip uninstall -y tokenizers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
# os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [6]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    
    y_preds=np.array(y_preds)
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    #label2score={0:1,1:1.5,2:2,3:2.5,4:3,5:3.5,6:4,7:4.5,8:5}
    #scores=[[label2score[y_trues[j][i].item()] for j in range(6)] for i in range(y_trues.shape[1])]
    
    #scores=torch.tensor(scores,dtype=float)
    
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [8]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
submission = pd.read_csv('../sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [9]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    391
1    391
2    391
3    391
4    391
5    391
6    391
7    391
8    391
9    392
dtype: int64

In [10]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1429


In [12]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values
        self.score2label={1:0,1.5:1,2:2,2.5:3,
                          3:4,3.5:5,4:6,4.5:7,5:8}
        

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label0 = torch.tensor(self.score2label[self.labels[item][0]])
        label1 = torch.tensor(self.score2label[self.labels[item][1]])
        label2 = torch.tensor(self.score2label[self.labels[item][2]])
        label3 = torch.tensor(self.score2label[self.labels[item][3]])
        label4 = torch.tensor(self.score2label[self.labels[item][4]])
        label5 = torch.tensor(self.score2label[self.labels[item][5]])
        return inputs,label0, label1,label2,label3,label4,label5
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [13]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        
        self.fc0 = nn.Linear(self.config.hidden_size, 9)
        self.fc1 = nn.Linear(self.config.hidden_size, 9)
        self.fc2 = nn.Linear(self.config.hidden_size, 9)
        self.fc3 = nn.Linear(self.config.hidden_size, 9)
        self.fc4 = nn.Linear(self.config.hidden_size, 9)
        self.fc5 = nn.Linear(self.config.hidden_size, 9)
        
        self._init_weights(self.fc0)
        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
        self._init_weights(self.fc3)
        self._init_weights(self.fc4)
        self._init_weights(self.fc5)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        
        output0 = self.fc0(feature)
        output1 = self.fc1(feature)
        output2 = self.fc2(feature)
        output3 = self.fc3(feature)
        output4 = self.fc4(feature)
        output5 = self.fc5(feature)
        
        
        output=[output0,output1,output2,output3,output4,output5]
        
        return output

In [14]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [15]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels0,labels1,labels2,labels3,labels4,labels5) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels0 = labels0.to(device)
        labels1 = labels1.to(device)
        labels2 = labels2.to(device)
        labels3 = labels3.to(device)
        labels4 = labels4.to(device)
        labels5 = labels5.to(device)
        batch_size = labels0.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss_0 = criterion(y_preds[0], labels0)
            loss_1 = criterion(y_preds[1], labels1)
            loss_2 = criterion(y_preds[2], labels2)
            loss_3 = criterion(y_preds[3], labels3)
            loss_4 = criterion(y_preds[4], labels4)
            loss_5 = criterion(y_preds[5], labels5)
            
            loss=(loss_0+loss_1+loss_2+loss_3+loss_4+loss_5)/6
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    label2score = {v:k for k,v in {1:0,1.5:1,2:2,2.5:3,
                          3:4,3.5:5,4:6,4.5:7,5:8}.items()}
    losses = AverageMeter()
    model.eval()
    predictions = []
    start = end = time.time()
    for step, (inputs,  labels0,labels1,labels2,labels3,labels4,labels5) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels0 = labels0.to(device)
        labels1 = labels1.to(device)
        labels2 = labels2.to(device)
        labels3 = labels3.to(device)
        labels4 = labels4.to(device)
        labels5 = labels5.to(device)
        batch_size = labels0.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss_0 = criterion(y_preds[0], labels0)
            loss_1 = criterion(y_preds[1], labels1)
            loss_2 = criterion(y_preds[2], labels2)
            loss_3 = criterion(y_preds[3], labels3)
            loss_4 = criterion(y_preds[4], labels4)
            loss_5 = criterion(y_preds[5], labels5)
            
            loss=(loss_0+loss_1+loss_2+loss_3+loss_4+loss_5)/6
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        
        y_preds=torch.cat(y_preds,dim=1).reshape([-1,6,9])
        
        y_preds=torch.argmax(y_preds,dim=2).cpu().numpy()
    
        y_preds=[[label2score[x] for x in y] for y in y_preds]
        predictions+=y_preds
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    #predictions = np.concatenate(preds)
    
    return losses.avg, predictions

In [16]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels0,labels1,labels2,labels3,labels4,labels5) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels0 = labels0.to(device)
        labels1 = labels1.to(device)
        labels2 = labels2.to(device)
        labels3 = labels3.to(device)
        labels4 = labels4.to(device)
        labels5 = labels5.to(device)
        batch_size = labels0.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss_0 = criterion(y_preds[0], labels0)
            loss_1 = criterion(y_preds[1], labels1)
            loss_2 = criterion(y_preds[2], labels2)
            loss_3 = criterion(y_preds[3], labels3)
            loss_4 = criterion(y_preds[4], labels4)
            loss_5 = criterion(y_preds[5], labels5)
            
            loss=(loss_0+loss_1+loss_2+loss_3+loss_4+loss_5)/6
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    label2score = {v:k for k,v in {1:0,1.5:1,2:2,2.5:3,
                          3:4,3.5:5,4:6,4.5:7,5:8}.items()}
    losses = AverageMeter()
    model.eval()
    predictions = []
    start = end = time.time()
    for step, (inputs,  labels0,labels1,labels2,labels3,labels4,labels5) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels0 = labels0.to(device)
        labels1 = labels1.to(device)
        labels2 = labels2.to(device)
        labels3 = labels3.to(device)
        labels4 = labels4.to(device)
        labels5 = labels5.to(device)
        batch_size = labels0.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss_0 = criterion(y_preds[0], labels0)
            loss_1 = criterion(y_preds[1], labels1)
            loss_2 = criterion(y_preds[2], labels2)
            loss_3 = criterion(y_preds[3], labels3)
            loss_4 = criterion(y_preds[4], labels4)
            loss_5 = criterion(y_preds[5], labels5)
            
            loss=(loss_0+loss_1+loss_2+loss_3+loss_4+loss_5)/6
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        
        y_preds=torch.cat(y_preds,dim=1).reshape([-1,6,9])
        
        y_preds=torch.argmax(y_preds,dim=2).cpu().numpy()
    
        y_preds=[[label2score[x] for x in y] for y in y_preds]
        predictions+=y_preds
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    #predictions = np.concatenate(preds)
    
    return losses.avg, predictions

In [17]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    #criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    criterion = nn.CrossEntropyLoss(reduction='mean')
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        #print(valid_labels)
        #print(predictions)
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [18]:


def get_result(oof_df):
    labels = oof_df[CFG.target_cols].values
    preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
    score, scores = get_score(labels, preds)
    LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

if CFG.train:
    oof_df = pd.DataFrame()
    for fold in range(6, CFG.n_fold):
        if fold in CFG.trn_fold:
            _oof_df = train_loop(train, fold)
            oof_df = pd.concat([oof_df, _oof_df])
            LOGGER.info(f"========== fold: {fold} result ==========")
            get_result(_oof_df)
    oof_df = oof_df.reset_index(drop=True)
    LOGGER.info(f"========== CV ==========")
    get_result(oof_df)
    oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

if CFG.wandb:
    wandb.finish()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Mo

Epoch: [1][0/880] Elapsed 0m 3s (remain 45m 11s) Loss: 2.2467(2.2467) Grad: inf  LR: 0.00002000  
Epoch: [1][20/880] Elapsed 0m 45s (remain 30m 58s) Loss: 1.7687(1.9939) Grad: 102374.2031  LR: 0.00002000  
Epoch: [1][40/880] Elapsed 1m 34s (remain 32m 7s) Loss: 1.5127(1.8869) Grad: 108722.8203  LR: 0.00001999  
Epoch: [1][60/880] Elapsed 2m 22s (remain 31m 59s) Loss: 1.4727(1.7938) Grad: 117950.7266  LR: 0.00001999  
Epoch: [1][80/880] Elapsed 3m 10s (remain 31m 21s) Loss: 1.7227(1.7477) Grad: 121807.5078  LR: 0.00001997  
Epoch: [1][100/880] Elapsed 3m 55s (remain 30m 17s) Loss: 1.4935(1.6997) Grad: 207197.7031  LR: 0.00001996  
Epoch: [1][120/880] Elapsed 4m 45s (remain 29m 49s) Loss: 1.5196(1.6703) Grad: 160495.7812  LR: 0.00001994  
Epoch: [1][140/880] Elapsed 5m 32s (remain 29m 2s) Loss: 3.0056(1.6478) Grad: 204519.3281  LR: 0.00001992  
Epoch: [1][160/880] Elapsed 6m 23s (remain 28m 33s) Loss: 1.5546(1.6365) Grad: 131829.5469  LR: 0.00001990  
Epoch: [1][180/880] Elapsed 7m 12s (

Epoch 1 - avg_train_loss: 1.4602  avg_val_loss: 1.3837  time: 2167s
Epoch 1 - Score: 0.5172  Scores: [0.5452656031743751, 0.5218989525032992, 0.507614651355829, 0.531609533071195, 0.506984465325952, 0.48966296671472614]
Epoch 1 - Save Best Score: 0.5172 Model


EVAL: [48/49] Elapsed 1m 14s (remain 0m 0s) Loss: 1.1736(1.3837) 
Epoch: [2][0/880] Elapsed 0m 2s (remain 31m 47s) Loss: 1.4607(1.4607) Grad: 468829.0000  LR: 0.00001706  
Epoch: [2][20/880] Elapsed 0m 48s (remain 33m 17s) Loss: 1.3615(1.2917) Grad: 371829.4375  LR: 0.00001694  
Epoch: [2][40/880] Elapsed 1m 33s (remain 31m 44s) Loss: 1.5761(1.2771) Grad: 536649.6250  LR: 0.00001681  
Epoch: [2][60/880] Elapsed 2m 20s (remain 31m 20s) Loss: 1.3906(1.2588) Grad: 451200.0938  LR: 0.00001668  
Epoch: [2][80/880] Elapsed 3m 12s (remain 31m 36s) Loss: 1.2259(1.2571) Grad: 333426.9375  LR: 0.00001654  
Epoch: [2][100/880] Elapsed 3m 58s (remain 30m 40s) Loss: 1.4719(1.2528) Grad: 379628.4375  LR: 0.00001641  
Epoch: [2][120/880] Elapsed 4m 48s (remain 30m 11s) Loss: 1.1913(1.2500) Grad: 211797.6719  LR: 0.00001627  
Epoch: [2][140/880] Elapsed 5m 35s (remain 29m 20s) Loss: 1.2846(1.2498) Grad: 228954.0781  LR: 0.00001613  
Epoch: [2][160/880] Elapsed 6m 26s (remain 28m 46s) Loss: 1.2131(1.25

Epoch 2 - avg_train_loss: 1.2507  avg_val_loss: 1.3708  time: 2159s
Epoch 2 - Score: 0.5036  Scores: [0.5231226328525249, 0.48176465904087157, 0.4993602044724246, 0.5113794593124069, 0.506984465325952, 0.49871958816702133]
Epoch 2 - Save Best Score: 0.5036 Model


EVAL: [48/49] Elapsed 1m 13s (remain 0m 0s) Loss: 1.0768(1.3708) 
Epoch: [3][0/880] Elapsed 0m 2s (remain 32m 53s) Loss: 1.2064(1.2064) Grad: 434334.6562  LR: 0.00000999  
Epoch: [3][20/880] Elapsed 0m 46s (remain 31m 25s) Loss: 1.0815(1.1575) Grad: 439819.8750  LR: 0.00000981  
Epoch: [3][40/880] Elapsed 1m 25s (remain 29m 17s) Loss: 1.1295(1.1395) Grad: 500323.6562  LR: 0.00000963  
Epoch: [3][60/880] Elapsed 2m 15s (remain 30m 22s) Loss: 0.9930(1.1226) Grad: 402704.3750  LR: 0.00000946  
Epoch: [3][80/880] Elapsed 3m 1s (remain 29m 54s) Loss: 0.9714(1.1427) Grad: 369382.5000  LR: 0.00000928  
Epoch: [3][100/880] Elapsed 3m 43s (remain 28m 43s) Loss: 1.1596(1.1417) Grad: 487998.4375  LR: 0.00000910  
Epoch: [3][120/880] Elapsed 4m 33s (remain 28m 33s) Loss: 1.0179(1.1415) Grad: 454749.2500  LR: 0.00000892  
Epoch: [3][140/880] Elapsed 5m 20s (remain 28m 2s) Loss: 1.1969(1.1365) Grad: 612790.9375  LR: 0.00000874  
Epoch: [3][160/880] Elapsed 6m 16s (remain 28m 2s) Loss: 1.2213(1.1344)

Epoch 3 - avg_train_loss: 1.1143  avg_val_loss: 1.3948  time: 2171s
Epoch 3 - Score: 0.5085  Scores: [0.5358025746270562, 0.48507125007266594, 0.47776654295295456, 0.5273831551418434, 0.5194429438032705, 0.5057217374241736]


EVAL: [48/49] Elapsed 1m 13s (remain 0m 0s) Loss: 1.0830(1.3948) 
Epoch: [4][0/880] Elapsed 0m 2s (remain 33m 24s) Loss: 0.8143(0.8143) Grad: 381840.8750  LR: 0.00000292  
Epoch: [4][20/880] Elapsed 0m 48s (remain 33m 21s) Loss: 1.0018(1.0558) Grad: 404046.0625  LR: 0.00000280  
Epoch: [4][40/880] Elapsed 1m 33s (remain 31m 59s) Loss: 1.0649(1.0577) Grad: 375336.0000  LR: 0.00000267  
Epoch: [4][60/880] Elapsed 2m 19s (remain 31m 13s) Loss: 0.8655(1.0592) Grad: 449598.5000  LR: 0.00000255  
Epoch: [4][80/880] Elapsed 3m 0s (remain 29m 41s) Loss: 1.0056(1.0674) Grad: 521294.1875  LR: 0.00000244  
Epoch: [4][100/880] Elapsed 3m 48s (remain 29m 18s) Loss: 0.9467(1.0652) Grad: 435921.4062  LR: 0.00000232  
Epoch: [4][120/880] Elapsed 4m 28s (remain 28m 1s) Loss: 1.0262(1.0578) Grad: 378585.0000  LR: 0.00000221  
Epoch: [4][140/880] Elapsed 5m 11s (remain 27m 11s) Loss: 1.1349(1.0501) Grad: 474161.4375  LR: 0.00000210  
Epoch: [4][160/880] Elapsed 6m 2s (remain 26m 57s) Loss: 1.1878(1.0478)

Epoch 4 - avg_train_loss: 1.0433  avg_val_loss: 1.3942  time: 2175s
Epoch 4 - Score: 0.5064  Scores: [0.5358025746270562, 0.4877003817961311, 0.47776654295295456, 0.5231226328525249, 0.506984465325952, 0.506984465325952]


EVAL: [48/49] Elapsed 1m 13s (remain 0m 0s) Loss: 1.0351(1.3942) 


Score: 0.5036  Scores: [0.5231226328525249, 0.48176465904087157, 0.4993602044724246, 0.5113794593124069, 0.506984465325952, 0.49871958816702133]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_s

Epoch: [1][0/880] Elapsed 0m 1s (remain 27m 31s) Loss: 2.2548(2.2548) Grad: inf  LR: 0.00002000  
Epoch: [1][20/880] Elapsed 0m 55s (remain 37m 38s) Loss: 1.8821(1.9693) Grad: 113506.8828  LR: 0.00002000  
Epoch: [1][40/880] Elapsed 1m 41s (remain 34m 41s) Loss: 1.3934(1.8338) Grad: 140276.2031  LR: 0.00001999  
Epoch: [1][60/880] Elapsed 2m 26s (remain 32m 51s) Loss: 1.4784(1.7744) Grad: 150371.7656  LR: 0.00001999  
Epoch: [1][80/880] Elapsed 3m 14s (remain 32m 0s) Loss: 1.5229(1.7322) Grad: 116519.5469  LR: 0.00001997  
Epoch: [1][100/880] Elapsed 4m 4s (remain 31m 29s) Loss: 1.8180(1.7179) Grad: 167477.6562  LR: 0.00001996  
Epoch: [1][120/880] Elapsed 4m 52s (remain 30m 37s) Loss: 1.2023(1.6757) Grad: 80562.0625  LR: 0.00001994  
Epoch: [1][140/880] Elapsed 5m 40s (remain 29m 44s) Loss: 1.6025(1.6587) Grad: 217416.8594  LR: 0.00001992  
Epoch: [1][160/880] Elapsed 6m 29s (remain 28m 58s) Loss: 1.2735(1.6423) Grad: 104496.2734  LR: 0.00001990  
Epoch: [1][180/880] Elapsed 7m 26s (r

Epoch 1 - avg_train_loss: 1.4613  avg_val_loss: 1.3507  time: 2178s
Epoch 1 - Score: 0.5035  Scores: [0.5405547969364888, 0.5031867754087855, 0.4837513258571414, 0.5144957554275266, 0.48242779033760647, 0.496792782811615]
Epoch 1 - Save Best Score: 0.5035 Model


EVAL: [48/49] Elapsed 1m 12s (remain 0m 0s) Loss: 1.2281(1.3507) 
Epoch: [2][0/880] Elapsed 0m 4s (remain 67m 38s) Loss: 1.1944(1.1944) Grad: 384504.0625  LR: 0.00001706  
Epoch: [2][20/880] Elapsed 0m 49s (remain 33m 31s) Loss: 1.6016(1.3260) Grad: 286855.6562  LR: 0.00001694  
Epoch: [2][40/880] Elapsed 1m 39s (remain 34m 2s) Loss: 1.1992(1.3305) Grad: 268427.8438  LR: 0.00001681  
Epoch: [2][60/880] Elapsed 2m 25s (remain 32m 29s) Loss: 1.3475(1.3207) Grad: 204196.0000  LR: 0.00001668  
Epoch: [2][80/880] Elapsed 3m 21s (remain 33m 3s) Loss: 1.5837(1.3080) Grad: 571009.6250  LR: 0.00001654  
Epoch: [2][100/880] Elapsed 4m 5s (remain 31m 35s) Loss: 1.1847(1.2967) Grad: 198410.1406  LR: 0.00001641  
Epoch: [2][120/880] Elapsed 4m 48s (remain 30m 11s) Loss: 1.1997(1.2842) Grad: 181210.8125  LR: 0.00001627  
Epoch: [2][140/880] Elapsed 5m 38s (remain 29m 36s) Loss: 1.1527(1.2770) Grad: 215679.8438  LR: 0.00001613  
Epoch: [2][160/880] Elapsed 6m 23s (remain 28m 32s) Loss: 1.5699(1.2776)

Epoch 2 - avg_train_loss: 1.2665  avg_val_loss: 1.3447  time: 2176s
Epoch 2 - Score: 0.4988  Scores: [0.549354422326719, 0.4993602044724246, 0.43134832916584803, 0.5163565056438392, 0.5012771412886455, 0.49485847520875453]
Epoch 2 - Save Best Score: 0.4988 Model


EVAL: [48/49] Elapsed 1m 11s (remain 0m 0s) Loss: 1.1188(1.3447) 
Epoch: [3][0/880] Elapsed 0m 2s (remain 36m 56s) Loss: 1.1081(1.1081) Grad: 539608.1875  LR: 0.00000999  
Epoch: [3][20/880] Elapsed 0m 46s (remain 31m 25s) Loss: 1.1517(1.1097) Grad: 476653.0938  LR: 0.00000981  
Epoch: [3][40/880] Elapsed 1m 41s (remain 34m 35s) Loss: 1.3932(1.1660) Grad: 507740.9375  LR: 0.00000963  
Epoch: [3][60/880] Elapsed 2m 37s (remain 35m 16s) Loss: 0.9797(1.1511) Grad: 399716.9062  LR: 0.00000946  
Epoch: [3][80/880] Elapsed 3m 27s (remain 34m 10s) Loss: 1.0121(1.1450) Grad: 438963.9688  LR: 0.00000928  
Epoch: [3][100/880] Elapsed 4m 17s (remain 33m 3s) Loss: 1.3456(1.1292) Grad: 824260.0000  LR: 0.00000910  
Epoch: [3][120/880] Elapsed 5m 2s (remain 31m 38s) Loss: 1.0826(1.1289) Grad: 378132.1562  LR: 0.00000892  
Epoch: [3][140/880] Elapsed 5m 55s (remain 31m 1s) Loss: 1.1698(1.1263) Grad: 437824.5000  LR: 0.00000874  
Epoch: [3][160/880] Elapsed 6m 44s (remain 30m 7s) Loss: 1.0264(1.1271) 

Epoch 3 - avg_train_loss: 1.1083  avg_val_loss: 1.3379  time: 2194s
Epoch 3 - Score: 0.4815  Scores: [0.5025510369674241, 0.4662524041201569, 0.4372373160976031, 0.500638977896506, 0.5, 0.48242779033760647]
Epoch 3 - Save Best Score: 0.4815 Model


EVAL: [48/49] Elapsed 1m 11s (remain 0m 0s) Loss: 1.1520(1.3379) 
Epoch: [4][0/880] Elapsed 0m 2s (remain 32m 30s) Loss: 0.9455(0.9455) Grad: 331953.8438  LR: 0.00000292  
Epoch: [4][20/880] Elapsed 0m 50s (remain 34m 23s) Loss: 0.8126(1.0548) Grad: 363798.5000  LR: 0.00000280  
Epoch: [4][40/880] Elapsed 1m 42s (remain 34m 52s) Loss: 1.0223(1.0377) Grad: 342713.5312  LR: 0.00000267  
Epoch: [4][60/880] Elapsed 2m 31s (remain 33m 52s) Loss: 0.9925(1.0353) Grad: 445444.9375  LR: 0.00000255  
Epoch: [4][80/880] Elapsed 3m 17s (remain 32m 32s) Loss: 1.0964(1.0368) Grad: 508424.6250  LR: 0.00000244  
Epoch: [4][100/880] Elapsed 4m 12s (remain 32m 24s) Loss: 0.9932(1.0328) Grad: 477732.3750  LR: 0.00000232  
Epoch: [4][120/880] Elapsed 4m 55s (remain 30m 54s) Loss: 0.9807(1.0306) Grad: 389626.9688  LR: 0.00000221  
Epoch: [4][140/880] Elapsed 5m 47s (remain 30m 20s) Loss: 1.1666(1.0288) Grad: 521687.5625  LR: 0.00000210  
Epoch: [4][160/880] Elapsed 6m 32s (remain 29m 11s) Loss: 0.9559(1.02

Epoch 4 - avg_train_loss: 1.0132  avg_val_loss: 1.3496  time: 2183s
Epoch 4 - Score: 0.4802  Scores: [0.49161771686915634, 0.46004002939364014, 0.43942533968355785, 0.5019144932832433, 0.496792782811615, 0.49161771686915634]
Epoch 4 - Save Best Score: 0.4802 Model


EVAL: [48/49] Elapsed 1m 11s (remain 0m 0s) Loss: 1.1420(1.3496) 


Score: 0.4802  Scores: [0.49161771686915634, 0.46004002939364014, 0.43942533968355785, 0.5019144932832433, 0.496792782811615, 0.49161771686915634]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab

Epoch: [1][0/880] Elapsed 0m 3s (remain 48m 21s) Loss: 2.2211(2.2211) Grad: inf  LR: 0.00002000  
Epoch: [1][20/880] Elapsed 0m 52s (remain 35m 35s) Loss: 1.5274(1.9497) Grad: 110629.3281  LR: 0.00002000  
Epoch: [1][40/880] Elapsed 1m 38s (remain 33m 35s) Loss: 1.8122(1.8650) Grad: 125074.3047  LR: 0.00001999  
Epoch: [1][60/880] Elapsed 2m 34s (remain 34m 28s) Loss: 1.9077(1.7935) Grad: 122157.9219  LR: 0.00001999  
Epoch: [1][80/880] Elapsed 3m 23s (remain 33m 31s) Loss: 1.3543(1.7411) Grad: 153706.3594  LR: 0.00001997  
Epoch: [1][100/880] Elapsed 4m 6s (remain 31m 37s) Loss: 1.2976(1.6867) Grad: 101130.1797  LR: 0.00001996  
Epoch: [1][120/880] Elapsed 4m 59s (remain 31m 17s) Loss: 1.2916(1.6655) Grad: 186630.5469  LR: 0.00001994  
Epoch: [1][140/880] Elapsed 5m 42s (remain 29m 53s) Loss: 1.4255(1.6386) Grad: 142278.6562  LR: 0.00001992  
Epoch: [1][160/880] Elapsed 6m 32s (remain 29m 14s) Loss: 1.6381(1.6180) Grad: 110599.7500  LR: 0.00001990  
Epoch: [1][180/880] Elapsed 7m 10s 

Epoch 1 - avg_train_loss: 1.4495  avg_val_loss: 1.4400  time: 2165s
Epoch 1 - Score: 0.5294  Scores: [0.5637152969907603, 0.5206723962776636, 0.4791029554738042, 0.5522564732768778, 0.547605809649741, 0.5132515075227978]
Epoch 1 - Save Best Score: 0.5294 Model


EVAL: [48/49] Elapsed 1m 16s (remain 0m 0s) Loss: 1.2207(1.4400) 
Epoch: [2][0/880] Elapsed 0m 2s (remain 35m 40s) Loss: 1.2215(1.2215) Grad: 276979.4375  LR: 0.00001706  
Epoch: [2][20/880] Elapsed 0m 50s (remain 34m 36s) Loss: 1.4461(1.3371) Grad: 300715.4375  LR: 0.00001694  
Epoch: [2][40/880] Elapsed 1m 47s (remain 36m 36s) Loss: 1.3851(1.3255) Grad: 199553.1406  LR: 0.00001681  
Epoch: [2][60/880] Elapsed 2m 37s (remain 35m 10s) Loss: 1.1914(1.3064) Grad: 188385.5156  LR: 0.00001668  
Epoch: [2][80/880] Elapsed 3m 18s (remain 32m 39s) Loss: 1.4098(1.3125) Grad: 285217.0625  LR: 0.00001654  
Epoch: [2][100/880] Elapsed 4m 14s (remain 32m 46s) Loss: 1.4036(1.3050) Grad: 211192.7500  LR: 0.00001641  
Epoch: [2][120/880] Elapsed 5m 5s (remain 31m 56s) Loss: 1.2666(1.2918) Grad: 222297.9844  LR: 0.00001627  
Epoch: [2][140/880] Elapsed 5m 51s (remain 30m 42s) Loss: 1.1033(1.2902) Grad: 236691.5156  LR: 0.00001613  
Epoch: [2][160/880] Elapsed 6m 43s (remain 30m 3s) Loss: 1.0607(1.2807

Epoch 2 - avg_train_loss: 1.2461  avg_val_loss: 1.3139  time: 2169s
Epoch 2 - Score: 0.4875  Scores: [0.5144957554275266, 0.49807814791679533, 0.4116276717537983, 0.5101276105330244, 0.5163565056438392, 0.474409041459926]
Epoch 2 - Save Best Score: 0.4875 Model


EVAL: [48/49] Elapsed 1m 16s (remain 0m 0s) Loss: 1.2015(1.3139) 
Epoch: [3][0/880] Elapsed 0m 1s (remain 23m 5s) Loss: 0.9038(0.9038) Grad: 375303.3125  LR: 0.00000999  
Epoch: [3][20/880] Elapsed 0m 52s (remain 35m 39s) Loss: 1.0933(1.0865) Grad: 350975.6562  LR: 0.00000981  
Epoch: [3][40/880] Elapsed 1m 33s (remain 31m 43s) Loss: 1.0238(1.0957) Grad: 652998.3750  LR: 0.00000963  
Epoch: [3][60/880] Elapsed 2m 14s (remain 30m 7s) Loss: 1.1743(1.1026) Grad: 420410.5938  LR: 0.00000946  
Epoch: [3][80/880] Elapsed 2m 59s (remain 29m 29s) Loss: 1.0218(1.1083) Grad: 382134.8438  LR: 0.00000928  
Epoch: [3][100/880] Elapsed 3m 38s (remain 28m 5s) Loss: 1.0810(1.1002) Grad: 358513.2188  LR: 0.00000910  
Epoch: [3][120/880] Elapsed 4m 26s (remain 27m 52s) Loss: 0.9489(1.1053) Grad: 423241.2188  LR: 0.00000892  
Epoch: [3][140/880] Elapsed 5m 12s (remain 27m 20s) Loss: 1.2834(1.1098) Grad: 739431.9375  LR: 0.00000874  
Epoch: [3][160/880] Elapsed 5m 54s (remain 26m 23s) Loss: 1.0287(1.1102)

Epoch 3 - avg_train_loss: 1.0926  avg_val_loss: 1.3151  time: 2160s
Epoch 3 - Score: 0.4732  Scores: [0.4942120236261712, 0.4586480745255393, 0.42687827766076236, 0.4877003817961311, 0.5120042359159814, 0.46004002939364014]
Epoch 3 - Save Best Score: 0.4732 Model


EVAL: [48/49] Elapsed 1m 16s (remain 0m 0s) Loss: 1.2563(1.3151) 
Epoch: [4][0/880] Elapsed 0m 2s (remain 30m 32s) Loss: 0.8785(0.8785) Grad: 500399.5938  LR: 0.00000292  
Epoch: [4][20/880] Elapsed 0m 44s (remain 30m 36s) Loss: 0.9833(0.9753) Grad: 483158.0312  LR: 0.00000280  
Epoch: [4][40/880] Elapsed 1m 28s (remain 30m 20s) Loss: 0.9999(1.0017) Grad: 467141.6875  LR: 0.00000267  
Epoch: [4][60/880] Elapsed 2m 17s (remain 30m 52s) Loss: 0.9756(1.0114) Grad: 379894.2812  LR: 0.00000255  
Epoch: [4][80/880] Elapsed 3m 10s (remain 31m 17s) Loss: 0.8506(1.0145) Grad: 359246.5312  LR: 0.00000244  
Epoch: [4][100/880] Elapsed 4m 3s (remain 31m 15s) Loss: 0.8331(1.0236) Grad: 441365.4688  LR: 0.00000232  
Epoch: [4][120/880] Elapsed 4m 53s (remain 30m 42s) Loss: 1.0909(1.0301) Grad: 546352.5625  LR: 0.00000221  
Epoch: [4][140/880] Elapsed 5m 38s (remain 29m 32s) Loss: 1.2633(1.0352) Grad: 677458.2500  LR: 0.00000210  
Epoch: [4][160/880] Elapsed 6m 29s (remain 29m 0s) Loss: 0.9326(1.0303

Epoch 4 - avg_train_loss: 1.0169  avg_val_loss: 1.3270  time: 2192s
Epoch 4 - Score: 0.4760  Scores: [0.506353494993945, 0.4579505105103895, 0.43134832916584803, 0.49161771686915634, 0.5088726821608882, 0.46004002939364014]


EVAL: [48/49] Elapsed 1m 16s (remain 0m 0s) Loss: 1.2915(1.3270) 


Score: 0.4732  Scores: [0.4942120236261712, 0.4586480745255393, 0.42687827766076236, 0.4877003817961311, 0.5120042359159814, 0.46004002939364014]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_

Epoch: [1][0/879] Elapsed 0m 2s (remain 32m 44s) Loss: 2.2131(2.2131) Grad: inf  LR: 0.00002000  
Epoch: [1][20/879] Elapsed 0m 41s (remain 28m 17s) Loss: 1.9907(2.0391) Grad: 165777.3281  LR: 0.00002000  
Epoch: [1][40/879] Elapsed 1m 31s (remain 31m 4s) Loss: 1.5570(1.8601) Grad: 109265.7266  LR: 0.00001999  
Epoch: [1][60/879] Elapsed 2m 27s (remain 32m 53s) Loss: 1.7328(1.8207) Grad: 108986.1250  LR: 0.00001999  
Epoch: [1][80/879] Elapsed 3m 25s (remain 33m 49s) Loss: 1.4259(1.7656) Grad: 161328.7188  LR: 0.00001997  
Epoch: [1][100/879] Elapsed 4m 12s (remain 32m 21s) Loss: 1.5015(1.7223) Grad: 153742.5000  LR: 0.00001996  
Epoch: [1][120/879] Elapsed 4m 58s (remain 31m 10s) Loss: 1.5362(1.6811) Grad: 180003.7031  LR: 0.00001994  
Epoch: [1][140/879] Elapsed 5m 45s (remain 30m 9s) Loss: 1.5213(1.6500) Grad: 171148.1875  LR: 0.00001992  
Epoch: [1][160/879] Elapsed 6m 36s (remain 29m 29s) Loss: 1.7004(1.6264) Grad: 188530.1875  LR: 0.00001990  
Epoch: [1][180/879] Elapsed 7m 23s (

Epoch 1 - avg_train_loss: 1.4539  avg_val_loss: 1.3754  time: 2165s
Epoch 1 - Score: 0.4986  Scores: [0.549234160686775, 0.46974808707021093, 0.46153028511857447, 0.5069667702349425, 0.5363091934886145, 0.46770717334674267]
Epoch 1 - Save Best Score: 0.4986 Model


EVAL: [48/49] Elapsed 1m 15s (remain 0m 0s) Loss: 1.2330(1.3754) 
Epoch: [2][0/879] Elapsed 0m 2s (remain 36m 0s) Loss: 1.2122(1.2122) Grad: 388913.8438  LR: 0.00001707  
Epoch: [2][20/879] Elapsed 0m 50s (remain 34m 27s) Loss: 1.2816(1.2760) Grad: 600609.8125  LR: 0.00001694  
Epoch: [2][40/879] Elapsed 1m 37s (remain 33m 11s) Loss: 1.3148(1.2930) Grad: 503381.3438  LR: 0.00001681  
Epoch: [2][60/879] Elapsed 2m 24s (remain 32m 22s) Loss: 1.2374(1.2910) Grad: 455771.2188  LR: 0.00001668  
Epoch: [2][80/879] Elapsed 3m 9s (remain 31m 8s) Loss: 1.5517(1.2901) Grad: 388182.6250  LR: 0.00001655  
Epoch: [2][100/879] Elapsed 4m 0s (remain 30m 52s) Loss: 1.2023(1.2834) Grad: 372138.1562  LR: 0.00001641  
Epoch: [2][120/879] Elapsed 4m 56s (remain 30m 54s) Loss: 1.2072(1.2758) Grad: 337023.2188  LR: 0.00001627  
Epoch: [2][140/879] Elapsed 5m 42s (remain 29m 52s) Loss: 1.1718(1.2686) Grad: 404224.3125  LR: 0.00001613  
Epoch: [2][160/879] Elapsed 6m 27s (remain 28m 48s) Loss: 1.0935(1.2696) 

Epoch 2 - avg_train_loss: 1.2476  avg_val_loss: 1.3506  time: 2166s
Epoch 2 - Score: 0.4932  Scores: [0.5515516146957321, 0.4711037842240328, 0.4446392713567405, 0.4974424384708614, 0.5309310266899466, 0.46359839234811323]
Epoch 2 - Save Best Score: 0.4932 Model


EVAL: [48/49] Elapsed 1m 15s (remain 0m 0s) Loss: 1.2405(1.3506) 
Epoch: [3][0/879] Elapsed 0m 5s (remain 80m 15s) Loss: 1.0194(1.0194) Grad: 612480.7500  LR: 0.00001000  
Epoch: [3][20/879] Elapsed 0m 58s (remain 39m 37s) Loss: 1.0248(1.1459) Grad: 454729.1250  LR: 0.00000983  
Epoch: [3][40/879] Elapsed 1m 47s (remain 36m 27s) Loss: 1.0762(1.1292) Grad: 354418.5000  LR: 0.00000965  
Epoch: [3][60/879] Elapsed 2m 31s (remain 33m 53s) Loss: 1.5877(1.1479) Grad: 770199.5000  LR: 0.00000947  
Epoch: [3][80/879] Elapsed 3m 22s (remain 33m 17s) Loss: 0.8865(1.1441) Grad: 404962.9062  LR: 0.00000929  
Epoch: [3][100/879] Elapsed 4m 15s (remain 32m 50s) Loss: 0.9068(1.1369) Grad: 406316.0938  LR: 0.00000911  
Epoch: [3][120/879] Elapsed 5m 6s (remain 32m 2s) Loss: 1.0504(1.1347) Grad: 394758.9375  LR: 0.00000894  
Epoch: [3][140/879] Elapsed 5m 50s (remain 30m 34s) Loss: 1.2497(1.1402) Grad: 421014.8750  LR: 0.00000876  
Epoch: [3][160/879] Elapsed 6m 46s (remain 30m 12s) Loss: 1.0825(1.1397

Epoch 3 - avg_train_loss: 1.1261  avg_val_loss: 1.3641  time: 2185s
Epoch 3 - Score: 0.4944  Scores: [0.544569667213461, 0.47447608055665796, 0.44607128559988557, 0.5050762722761054, 0.5351187165632749, 0.4608388535915693]


EVAL: [48/49] Elapsed 1m 14s (remain 0m 0s) Loss: 1.2726(1.3641) 
Epoch: [4][0/879] Elapsed 0m 1s (remain 25m 55s) Loss: 0.8878(0.8878) Grad: 368395.0938  LR: 0.00000294  
Epoch: [4][20/879] Elapsed 0m 52s (remain 36m 4s) Loss: 1.0523(1.0633) Grad: 406275.0938  LR: 0.00000281  
Epoch: [4][40/879] Elapsed 1m 34s (remain 32m 9s) Loss: 0.9900(1.0786) Grad: 349812.1562  LR: 0.00000269  
Epoch: [4][60/879] Elapsed 2m 25s (remain 32m 27s) Loss: 0.8096(1.0638) Grad: 453989.0312  LR: 0.00000257  
Epoch: [4][80/879] Elapsed 3m 19s (remain 32m 41s) Loss: 0.9855(1.0689) Grad: 408767.6875  LR: 0.00000245  
Epoch: [4][100/879] Elapsed 4m 3s (remain 31m 15s) Loss: 0.9962(1.0605) Grad: 425501.5312  LR: 0.00000233  
Epoch: [4][120/879] Elapsed 4m 58s (remain 31m 7s) Loss: 0.8281(1.0594) Grad: 536965.8125  LR: 0.00000222  
Epoch: [4][140/879] Elapsed 5m 38s (remain 29m 34s) Loss: 1.1025(1.0588) Grad: 554659.6250  LR: 0.00000211  
Epoch: [4][160/879] Elapsed 6m 25s (remain 28m 37s) Loss: 0.9248(1.0604) 

Epoch 4 - avg_train_loss: 1.0559  avg_val_loss: 1.3729  time: 2170s
Epoch 4 - Score: 0.4947  Scores: [0.5555838995037159, 0.47514766770178823, 0.4417613170304636, 0.5082231953842038, 0.5333280186810028, 0.4538666081121012]


EVAL: [48/49] Elapsed 1m 15s (remain 0m 0s) Loss: 1.3088(1.3729) 


Score: 0.4932  Scores: [0.5515516146957321, 0.4711037842240328, 0.4446392713567405, 0.4974424384708614, 0.5309310266899466, 0.46359839234811323]
Score: 0.4879  Scores: [0.5157270992439399, 0.46798459874715614, 0.4534214974095285, 0.4996804090437293, 0.5118406301486104, 0.4787830686616442]
