In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install sentencepiece
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

In [3]:
!nvidia-smi

Fri Sep 29 15:31:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-base"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_size=2
    fc_dropout=0.2
    target_cols=['content','wording']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    freezing=False

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
import os

DIR = '/content/drive/MyDrive/Competitions/Kaggle/Commonlit'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_MODEL_DIR = DIR + '/output/EXP004/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [6]:
# ====================================================
# Library
# ====================================================
from google.colab import runtime
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs


import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW, Optimizer
from torch.utils.data import DataLoader, Dataset
from torch.autograd.function import InplaceFunction
import torch.nn.init as init

#os.system('pip uninstall -y transformers')
#os.system('pip uninstall -y tokenizers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb transformers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [7]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """

    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)

    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """

    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        if hasattr(embeddings_path, attr_name):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
prompts_train = pd.read_csv(os.path.join(INPUT_DIR,'prompts_train.csv'))
summary_train = pd.read_csv(os.path.join(INPUT_DIR,'summaries_train.csv'))

print(f"Prompt Train.shape: {prompts_train.shape}")
display(prompts_train.head())
print(f"Summary Train.shape: {summary_train.shape}")
display(summary_train.head())

Prompt Train.shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary Train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [10]:
train = prompts_train.merge(summary_train, on="prompt_id")
display(train.head())

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [11]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(train, groups=train["prompt_id"])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2057
1    2009
2    1996
3    1103
dtype: int64

In [12]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
prompt_title = [
    'On Tragedy',
    'Egyptian Social Structure',
    'The Third Wave',
    'Excerpt from The Jungle'
]
cls_tokens_map = {label: f"[CLS_{label.upper()}]" for label in prompt_title}
end_tokens_map = {label: f"[END_{label.upper()}]" for label in prompt_title}
tokenizer.add_special_tokens(
    {"additional_special_tokens": list(cls_tokens_map.values())+list(end_tokens_map.values())}
)
cls_id_map = {
    label: tokenizer.encode(tkn)[1]
    for label, tkn in cls_tokens_map.items()
}

In [15]:
def len2text(x:str):
    if len(x)<200: return 'Quite short'
    if len(x)<300: return 'Short'
    if len(x)<500: return 'Middle'
    else:
        return 'Long'

In [16]:
train["text_length"] = train['text'].apply(lambda x: len2text(x))
SEP = tokenizer.sep_token
train['full_text'] = train['prompt_title'] + SEP + train["text_length"] + SEP + train['prompt_question'] + SEP + train['text']

In [17]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [18]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        #feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [19]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [20]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        #if scaler is not None:
        #    scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [21]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    print(f"========== prompt_id: {valid_folds.prompt_id.unique()} validation ==========")

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = RMSELoss(reduction="mean") # nn.SmoothL1Loss(reduction='mean')

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')

        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')





DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/638] Elapsed 0m 3s (remain 38m 34s) Loss: 1.0787(1.0787) Grad: inf  LR: 0.00002000  
Epoch: [1][20/638] Elapsed 0m 6s (remain 3m 7s) Loss: 0.7830(0.8060) Grad: 107119.1016  LR: 0.00002000  
Epoch: [1][40/638] Elapsed 0m 10s (remain 2m 28s) Loss: 0.5049(0.7093) Grad: 18669.7812  LR: 0.00001999  
Epoch: [1][60/638] Elapsed 0m 13s (remain 2m 6s) Loss: 0.7573(0.7360) Grad: 131742.1094  LR: 0.00001997  
Epoch: [1][80/638] Elapsed 0m 15s (remain 1m 49s) Loss: 0.2470(0.7083) Grad: 57057.6484  LR: 0.00001995  
Epoch: [1][100/638] Elapsed 0m 18s (remain 1m 40s) Loss: 0.4739(0.6836) Grad: 79554.0312  LR: 0.00001992  
Epoch: [1][120/638] Elapsed 0m 21s (remain 1m 32s) Loss: 0.4988(0.6602) Grad: 55835.1797  LR: 0.00001989  
Epoch: [1][140/638] Elapsed 0m 24s (remain 1m 27s) Loss: 0.5879(0.6435) Grad: 66609.8516  LR: 0.00001985  
Epoch: [1][160/638] Elapsed 0m 28s (remain 1m 23s) Loss: 0.4867(0.6224) Grad: 109896.9375  LR: 0.00001980  
Epoch: [1][180/638] Elapsed 0m 30s (remain 1m 18s)

Epoch 1 - avg_train_loss: 0.4778  avg_val_loss: 0.3865  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4778  avg_val_loss: 0.3865  time: 112s
Epoch 1 - Score: 0.4975  Scores: [0.4211151739567935, 0.5739405553485939]
INFO:__main__:Epoch 1 - Score: 0.4975  Scores: [0.4211151739567935, 0.5739405553485939]
Epoch 1 - Save Best Score: 0.4975 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4975 Model


EVAL: [128/129] Elapsed 0m 13s (remain 0m 0s) Loss: 0.3759(0.3865) 
Epoch: [2][0/638] Elapsed 0m 0s (remain 4m 8s) Loss: 0.4514(0.4514) Grad: 327608.1250  LR: 0.00001707  
Epoch: [2][20/638] Elapsed 0m 3s (remain 1m 43s) Loss: 0.3768(0.3577) Grad: 50675.5938  LR: 0.00001689  
Epoch: [2][40/638] Elapsed 0m 7s (remain 1m 42s) Loss: 0.2426(0.3368) Grad: 49984.9414  LR: 0.00001671  
Epoch: [2][60/638] Elapsed 0m 10s (remain 1m 38s) Loss: 0.4445(0.3519) Grad: 48312.1328  LR: 0.00001653  
Epoch: [2][80/638] Elapsed 0m 13s (remain 1m 33s) Loss: 0.2835(0.3440) Grad: 52430.6719  LR: 0.00001634  
Epoch: [2][100/638] Elapsed 0m 16s (remain 1m 30s) Loss: 0.2072(0.3472) Grad: 50460.5391  LR: 0.00001615  
Epoch: [2][120/638] Elapsed 0m 20s (remain 1m 29s) Loss: 0.3149(0.3494) Grad: 44646.2383  LR: 0.00001595  
Epoch: [2][140/638] Elapsed 0m 24s (remain 1m 24s) Loss: 0.4141(0.3509) Grad: 45635.1172  LR: 0.00001575  
Epoch: [2][160/638] Elapsed 0m 26s (remain 1m 19s) Loss: 0.3110(0.3546) Grad: 83432.8

Epoch 2 - avg_train_loss: 0.3574  avg_val_loss: 0.4699  time: 110s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3574  avg_val_loss: 0.4699  time: 110s
Epoch 2 - Score: 0.5935  Scores: [0.43652665480696307, 0.7505288294886733]
INFO:__main__:Epoch 2 - Score: 0.5935  Scores: [0.43652665480696307, 0.7505288294886733]


EVAL: [128/129] Elapsed 0m 12s (remain 0m 0s) Loss: 0.5180(0.4699) 
Epoch: [3][0/638] Elapsed 0m 0s (remain 3m 55s) Loss: 0.3567(0.3567) Grad: inf  LR: 0.00001000  
Epoch: [3][20/638] Elapsed 0m 3s (remain 1m 44s) Loss: 0.2848(0.3111) Grad: inf  LR: 0.00000975  
Epoch: [3][40/638] Elapsed 0m 7s (remain 1m 45s) Loss: 0.2912(0.3169) Grad: 58382.3320  LR: 0.00000951  
Epoch: [3][60/638] Elapsed 0m 9s (remain 1m 34s) Loss: 0.3140(0.3196) Grad: 46249.4688  LR: 0.00000926  
Epoch: [3][80/638] Elapsed 0m 12s (remain 1m 26s) Loss: 0.4328(0.3180) Grad: 70255.2344  LR: 0.00000902  
Epoch: [3][100/638] Elapsed 0m 15s (remain 1m 22s) Loss: 0.2617(0.3174) Grad: 55582.2852  LR: 0.00000877  
Epoch: [3][120/638] Elapsed 0m 18s (remain 1m 18s) Loss: 0.4246(0.3146) Grad: 85558.2188  LR: 0.00000853  
Epoch: [3][140/638] Elapsed 0m 21s (remain 1m 17s) Loss: 0.4293(0.3125) Grad: 70511.3047  LR: 0.00000829  
Epoch: [3][160/638] Elapsed 0m 25s (remain 1m 14s) Loss: 0.2553(0.3137) Grad: 60723.4219  LR: 0.0000

Epoch 3 - avg_train_loss: 0.3056  avg_val_loss: 0.3830  time: 109s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3056  avg_val_loss: 0.3830  time: 109s
Epoch 3 - Score: 0.4969  Scores: [0.43226335132604904, 0.5616051054571688]
INFO:__main__:Epoch 3 - Score: 0.4969  Scores: [0.43226335132604904, 0.5616051054571688]
Epoch 3 - Save Best Score: 0.4969 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4969 Model


EVAL: [128/129] Elapsed 0m 12s (remain 0m 0s) Loss: 0.3585(0.3830) 
Epoch: [4][0/638] Elapsed 0m 0s (remain 6m 48s) Loss: 0.2865(0.2865) Grad: inf  LR: 0.00000293  
Epoch: [4][20/638] Elapsed 0m 4s (remain 2m 4s) Loss: 0.2531(0.2741) Grad: 163347.5156  LR: 0.00000276  
Epoch: [4][40/638] Elapsed 0m 7s (remain 1m 43s) Loss: 0.4168(0.2742) Grad: 109716.1641  LR: 0.00000259  
Epoch: [4][60/638] Elapsed 0m 10s (remain 1m 39s) Loss: 0.3674(0.2706) Grad: 53487.7695  LR: 0.00000243  
Epoch: [4][80/638] Elapsed 0m 13s (remain 1m 34s) Loss: 0.3385(0.2661) Grad: 35690.5625  LR: 0.00000227  
Epoch: [4][100/638] Elapsed 0m 17s (remain 1m 34s) Loss: 0.3454(0.2639) Grad: 89470.2656  LR: 0.00000212  
Epoch: [4][120/638] Elapsed 0m 21s (remain 1m 32s) Loss: 0.4037(0.2644) Grad: 37104.2422  LR: 0.00000197  
Epoch: [4][140/638] Elapsed 0m 24s (remain 1m 26s) Loss: 0.2952(0.2656) Grad: 59815.5430  LR: 0.00000183  
Epoch: [4][160/638] Elapsed 0m 27s (remain 1m 20s) Loss: 0.1767(0.2622) Grad: 44303.7461  L

Epoch 4 - avg_train_loss: 0.2645  avg_val_loss: 0.4013  time: 111s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2645  avg_val_loss: 0.4013  time: 111s
Epoch 4 - Score: 0.5139  Scores: [0.4208934722997784, 0.6068384164273498]
INFO:__main__:Epoch 4 - Score: 0.5139  Scores: [0.4208934722997784, 0.6068384164273498]


EVAL: [128/129] Elapsed 0m 12s (remain 0m 0s) Loss: 0.4367(0.4013) 


Score: 0.4969  Scores: [0.43226335132604904, 0.5616051054571688]
INFO:__main__:Score: 0.4969  Scores: [0.43226335132604904, 0.5616051054571688]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/644] Elapsed 0m 0s (remain 4m 26s) Loss: 0.7266(0.7266) Grad: inf  LR: 0.00002000  
Epoch: [1][20/644] Elapsed 0m 2s (remain 1m 28s) Loss: 0.7035(0.7771) Grad: 102988.6406  LR: 0.00002000  
Epoch: [1][40/644] Elapsed 0m 5s (remain 1m 23s) Loss: 0.6882(0.7053) Grad: 51925.2109  LR: 0.00001999  
Epoch: [1][60/644] Elapsed 0m 8s (remain 1m 18s) Loss: 0.4713(0.6745) Grad: 92157.0781  LR: 0.00001997  
Epoch: [1][80/644] Elapsed 0m 11s (remain 1m 20s) Loss: 0.4130(0.6401) Grad: 133652.5156  LR: 0.00001995  
Epoch: [1][100/644] Elapsed 0m 14s (remain 1m 19s) Loss: 0.6410(0.6097) Grad: 126500.0000  LR: 0.00001992  
Epoch: [1][120/644] Elapsed 0m 17s (remain 1m 14s) Loss: 0.3769(0.5821) Grad: 46681.7148  LR: 0.00001989  
Epoch: [1][140/644] Elapsed 0m 20s (remain 1m 11s) Loss: 0.6223(0.5733) Grad: 65994.0781  LR: 0.00001985  
Epoch: [1][160/644] Elapsed 0m 22s (remain 1m 7s) Loss: 0.3839(0.5559) Grad: 125997.1562  LR: 0.00001981  
Epoch: [1][180/644] Elapsed 0m 25s (remain 1m 6s) L

Epoch 1 - avg_train_loss: 0.4499  avg_val_loss: 0.5714  time: 112s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4499  avg_val_loss: 0.5714  time: 112s
Epoch 1 - Score: 0.7921  Scores: [0.6359271266387128, 0.9481777627387658]
INFO:__main__:Epoch 1 - Score: 0.7921  Scores: [0.6359271266387128, 0.9481777627387658]
Epoch 1 - Save Best Score: 0.7921 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7921 Model


EVAL: [125/126] Elapsed 0m 17s (remain 0m 0s) Loss: 0.3431(0.5714) 
Epoch: [2][0/644] Elapsed 0m 0s (remain 4m 23s) Loss: 0.3543(0.3543) Grad: inf  LR: 0.00001707  
Epoch: [2][20/644] Elapsed 0m 3s (remain 1m 29s) Loss: 0.4374(0.3695) Grad: 74666.7812  LR: 0.00001689  
Epoch: [2][40/644] Elapsed 0m 6s (remain 1m 30s) Loss: 0.2366(0.3451) Grad: 125345.0938  LR: 0.00001671  
Epoch: [2][60/644] Elapsed 0m 10s (remain 1m 37s) Loss: 0.2533(0.3448) Grad: 97387.3438  LR: 0.00001653  
Epoch: [2][80/644] Elapsed 0m 13s (remain 1m 32s) Loss: 0.3010(0.3518) Grad: 19264.4961  LR: 0.00001634  
Epoch: [2][100/644] Elapsed 0m 16s (remain 1m 27s) Loss: 0.3549(0.3502) Grad: 42013.0195  LR: 0.00001615  
Epoch: [2][120/644] Elapsed 0m 19s (remain 1m 23s) Loss: 0.3663(0.3475) Grad: 69500.1719  LR: 0.00001596  
Epoch: [2][140/644] Elapsed 0m 23s (remain 1m 22s) Loss: 0.3952(0.3489) Grad: 64708.0625  LR: 0.00001576  
Epoch: [2][160/644] Elapsed 0m 26s (remain 1m 19s) Loss: 0.3165(0.3480) Grad: 40740.9297  L

Epoch 2 - avg_train_loss: 0.3342  avg_val_loss: 0.4932  time: 112s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3342  avg_val_loss: 0.4932  time: 112s
Epoch 2 - Score: 0.6666  Scores: [0.5256948019892972, 0.8075208567359278]
INFO:__main__:Epoch 2 - Score: 0.6666  Scores: [0.5256948019892972, 0.8075208567359278]
Epoch 2 - Save Best Score: 0.6666 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6666 Model


EVAL: [125/126] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2529(0.4932) 
Epoch: [3][0/644] Elapsed 0m 0s (remain 5m 59s) Loss: 0.2546(0.2546) Grad: inf  LR: 0.00001000  
Epoch: [3][20/644] Elapsed 0m 3s (remain 1m 53s) Loss: 0.2801(0.2750) Grad: 53246.0547  LR: 0.00000976  
Epoch: [3][40/644] Elapsed 0m 6s (remain 1m 34s) Loss: 0.2094(0.2938) Grad: 74159.5078  LR: 0.00000951  
Epoch: [3][60/644] Elapsed 0m 9s (remain 1m 32s) Loss: 0.2950(0.2904) Grad: 43026.9844  LR: 0.00000927  
Epoch: [3][80/644] Elapsed 0m 12s (remain 1m 29s) Loss: 0.2340(0.2920) Grad: 41595.8438  LR: 0.00000903  
Epoch: [3][100/644] Elapsed 0m 16s (remain 1m 31s) Loss: 0.2817(0.2962) Grad: 56340.9453  LR: 0.00000878  
Epoch: [3][120/644] Elapsed 0m 20s (remain 1m 28s) Loss: 0.3069(0.2958) Grad: 85890.6797  LR: 0.00000854  
Epoch: [3][140/644] Elapsed 0m 23s (remain 1m 22s) Loss: 0.1372(0.2964) Grad: 94757.4609  LR: 0.00000830  
Epoch: [3][160/644] Elapsed 0m 25s (remain 1m 17s) Loss: 0.2618(0.2942) Grad: 112405.7266  LR

Epoch 3 - avg_train_loss: 0.2954  avg_val_loss: 0.4902  time: 113s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2954  avg_val_loss: 0.4902  time: 113s
Epoch 3 - Score: 0.6684  Scores: [0.5074997130637312, 0.8292296266389012]
INFO:__main__:Epoch 3 - Score: 0.6684  Scores: [0.5074997130637312, 0.8292296266389012]


EVAL: [125/126] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2662(0.4902) 
Epoch: [4][0/644] Elapsed 0m 0s (remain 4m 22s) Loss: 0.3388(0.3388) Grad: inf  LR: 0.00000293  
Epoch: [4][20/644] Elapsed 0m 2s (remain 1m 28s) Loss: 0.2360(0.2721) Grad: 47440.1445  LR: 0.00000276  
Epoch: [4][40/644] Elapsed 0m 5s (remain 1m 23s) Loss: 0.3182(0.2759) Grad: 47658.3672  LR: 0.00000260  
Epoch: [4][60/644] Elapsed 0m 8s (remain 1m 18s) Loss: 0.1651(0.2685) Grad: 37061.1523  LR: 0.00000244  
Epoch: [4][80/644] Elapsed 0m 11s (remain 1m 17s) Loss: 0.2357(0.2710) Grad: 76534.2031  LR: 0.00000228  
Epoch: [4][100/644] Elapsed 0m 14s (remain 1m 18s) Loss: 0.1923(0.2707) Grad: 76096.0938  LR: 0.00000213  
Epoch: [4][120/644] Elapsed 0m 17s (remain 1m 16s) Loss: 0.4068(0.2686) Grad: 30362.3105  LR: 0.00000198  
Epoch: [4][140/644] Elapsed 0m 20s (remain 1m 12s) Loss: 0.3169(0.2676) Grad: 92112.7422  LR: 0.00000183  
Epoch: [4][160/644] Elapsed 0m 22s (remain 1m 8s) Loss: 0.2205(0.2720) Grad: 78136.1016  LR: 

Epoch 4 - avg_train_loss: 0.2629  avg_val_loss: 0.4866  time: 110s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2629  avg_val_loss: 0.4866  time: 110s
Epoch 4 - Score: 0.6589  Scores: [0.5119967510087977, 0.8057431101933162]
INFO:__main__:Epoch 4 - Score: 0.6589  Scores: [0.5119967510087977, 0.8057431101933162]
Epoch 4 - Save Best Score: 0.6589 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.6589 Model


EVAL: [125/126] Elapsed 0m 17s (remain 0m 0s) Loss: 0.2531(0.4866) 


Score: 0.6589  Scores: [0.5119967510087977, 0.8057431101933162]
INFO:__main__:Score: 0.6589  Scores: [0.5119967510087977, 0.8057431101933162]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/646] Elapsed 0m 0s (remain 8m 17s) Loss: 0.5198(0.5198) Grad: inf  LR: 0.00002000  
Epoch: [1][20/646] Elapsed 0m 4s (remain 2m 6s) Loss: 0.6211(0.7591) Grad: 51621.8359  LR: 0.00002000  
Epoch: [1][40/646] Elapsed 0m 7s (remain 1m 49s) Loss: 1.0294(0.6963) Grad: 76483.6250  LR: 0.00001999  
Epoch: [1][60/646] Elapsed 0m 10s (remain 1m 43s) Loss: 0.5057(0.6974) Grad: 118909.6641  LR: 0.00001997  
Epoch: [1][80/646] Elapsed 0m 13s (remain 1m 36s) Loss: 0.4033(0.6804) Grad: 95096.2266  LR: 0.00001995  
Epoch: [1][100/646] Elapsed 0m 17s (remain 1m 34s) Loss: 0.5075(0.6730) Grad: 100814.7500  LR: 0.00001992  
Epoch: [1][120/646] Elapsed 0m 20s (remain 1m 27s) Loss: 0.3859(0.6495) Grad: 40030.1680  LR: 0.00001989  
Epoch: [1][140/646] Elapsed 0m 22s (remain 1m 21s) Loss: 0.6039(0.6319) Grad: 152518.3438  LR: 0.00001985  
Epoch: [1][160/646] Elapsed 0m 25s (remain 1m 16s) Loss: 0.3372(0.6148) Grad: 74472.9609  LR: 0.00001981  
Epoch: [1][180/646] Elapsed 0m 28s (remain 1m 13s) 

Epoch 1 - avg_train_loss: 0.4869  avg_val_loss: 0.4145  time: 113s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4869  avg_val_loss: 0.4145  time: 113s
Epoch 1 - Score: 0.5439  Scores: [0.4465286209213582, 0.6413217594671915]
INFO:__main__:Epoch 1 - Score: 0.5439  Scores: [0.4465286209213582, 0.6413217594671915]
Epoch 1 - Save Best Score: 0.5439 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5439 Model


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.4173(0.4145) 
Epoch: [2][0/646] Elapsed 0m 0s (remain 4m 22s) Loss: 0.3073(0.3073) Grad: inf  LR: 0.00001706  
Epoch: [2][20/646] Elapsed 0m 3s (remain 1m 32s) Loss: 0.3530(0.3434) Grad: 56915.3633  LR: 0.00001689  
Epoch: [2][40/646] Elapsed 0m 5s (remain 1m 25s) Loss: 0.3281(0.3553) Grad: 66585.9609  LR: 0.00001671  
Epoch: [2][60/646] Elapsed 0m 9s (remain 1m 32s) Loss: 0.2772(0.3587) Grad: 64220.2070  LR: 0.00001653  
Epoch: [2][80/646] Elapsed 0m 13s (remain 1m 33s) Loss: 0.4784(0.3678) Grad: 105434.9922  LR: 0.00001634  
Epoch: [2][100/646] Elapsed 0m 16s (remain 1m 29s) Loss: 0.5431(0.3710) Grad: 30956.0586  LR: 0.00001615  
Epoch: [2][120/646] Elapsed 0m 19s (remain 1m 25s) Loss: 0.3366(0.3709) Grad: 41415.1172  LR: 0.00001596  
Epoch: [2][140/646] Elapsed 0m 23s (remain 1m 22s) Loss: 0.5172(0.3737) Grad: 43909.2773  LR: 0.00001576  
Epoch: [2][160/646] Elapsed 0m 26s (remain 1m 20s) Loss: 0.3278(0.3774) Grad: 109551.0938  L

Epoch 2 - avg_train_loss: 0.3641  avg_val_loss: 0.3863  time: 114s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3641  avg_val_loss: 0.3863  time: 114s
Epoch 2 - Score: 0.4941  Scores: [0.46006221313106943, 0.5281407594638985]
INFO:__main__:Epoch 2 - Score: 0.4941  Scores: [0.46006221313106943, 0.5281407594638985]
Epoch 2 - Save Best Score: 0.4941 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4941 Model


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.4360(0.3863) 
Epoch: [3][0/646] Elapsed 0m 0s (remain 6m 25s) Loss: 0.3243(0.3243) Grad: inf  LR: 0.00000999  
Epoch: [3][20/646] Elapsed 0m 3s (remain 1m 57s) Loss: 0.2110(0.3501) Grad: 84546.7500  LR: 0.00000974  
Epoch: [3][40/646] Elapsed 0m 6s (remain 1m 37s) Loss: 0.3684(0.3475) Grad: 71167.8359  LR: 0.00000950  
Epoch: [3][60/646] Elapsed 0m 9s (remain 1m 31s) Loss: 0.2248(0.3408) Grad: 34614.6484  LR: 0.00000926  
Epoch: [3][80/646] Elapsed 0m 12s (remain 1m 28s) Loss: 0.3624(0.3320) Grad: 128854.6641  LR: 0.00000902  
Epoch: [3][100/646] Elapsed 0m 16s (remain 1m 28s) Loss: 0.3376(0.3306) Grad: 67594.3828  LR: 0.00000878  
Epoch: [3][120/646] Elapsed 0m 20s (remain 1m 28s) Loss: 0.2871(0.3258) Grad: 89288.9453  LR: 0.00000853  
Epoch: [3][140/646] Elapsed 0m 23s (remain 1m 23s) Loss: 0.3647(0.3251) Grad: 33724.3047  LR: 0.00000829  
Epoch: [3][160/646] Elapsed 0m 26s (remain 1m 19s) Loss: 0.3935(0.3226) Grad: 64703.6172  LR

Epoch 3 - avg_train_loss: 0.3115  avg_val_loss: 0.3766  time: 113s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3115  avg_val_loss: 0.3766  time: 113s
Epoch 3 - Score: 0.4893  Scores: [0.4382612916737304, 0.540297933624206]
INFO:__main__:Epoch 3 - Score: 0.4893  Scores: [0.4382612916737304, 0.540297933624206]
Epoch 3 - Save Best Score: 0.4893 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4893 Model


Epoch: [4][0/646] Elapsed 0m 0s (remain 4m 55s) Loss: 0.3921(0.3921) Grad: inf  LR: 0.00000292  
Epoch: [4][20/646] Elapsed 0m 2s (remain 1m 28s) Loss: 0.2513(0.2841) Grad: 103773.9141  LR: 0.00000275  
Epoch: [4][40/646] Elapsed 0m 5s (remain 1m 25s) Loss: 0.3152(0.2880) Grad: 67868.6328  LR: 0.00000259  
Epoch: [4][60/646] Elapsed 0m 10s (remain 1m 40s) Loss: 0.2501(0.2856) Grad: 39590.5547  LR: 0.00000242  
Epoch: [4][80/646] Elapsed 0m 14s (remain 1m 38s) Loss: 0.2226(0.2827) Grad: 31584.4590  LR: 0.00000227  
Epoch: [4][100/646] Elapsed 0m 17s (remain 1m 34s) Loss: 0.2113(0.2779) Grad: 70532.0000  LR: 0.00000212  
Epoch: [4][120/646] Elapsed 0m 20s (remain 1m 27s) Loss: 0.2695(0.2812) Grad: 93199.8828  LR: 0.00000197  
Epoch: [4][140/646] Elapsed 0m 23s (remain 1m 23s) Loss: 0.2882(0.2809) Grad: 60256.7344  LR: 0.00000183  
Epoch: [4][160/646] Elapsed 0m 26s (remain 1m 20s) Loss: 0.2480(0.2799) Grad: 48613.7383  LR: 0.00000169  
Epoch: [4][180/646] Elapsed 0m 29s (remain 1m 16s) L

Epoch 4 - avg_train_loss: 0.2688  avg_val_loss: 0.3755  time: 112s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2688  avg_val_loss: 0.3755  time: 112s
Epoch 4 - Score: 0.4887  Scores: [0.4346408853659621, 0.5426880252135402]
INFO:__main__:Epoch 4 - Score: 0.4887  Scores: [0.4346408853659621, 0.5426880252135402]
Epoch 4 - Save Best Score: 0.4887 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4887 Model


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.3819(0.3755) 


Score: 0.4887  Scores: [0.4346408853659621, 0.5426880252135402]
INFO:__main__:Score: 0.4887  Scores: [0.4346408853659621, 0.5426880252135402]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_dropout": 0.0,
  "at

Epoch: [1][0/757] Elapsed 0m 0s (remain 5m 8s) Loss: 0.9539(0.9539) Grad: inf  LR: 0.00002000  
Epoch: [1][20/757] Elapsed 0m 3s (remain 2m 10s) Loss: 0.5220(0.8740) Grad: 83291.8906  LR: 0.00002000  
Epoch: [1][40/757] Elapsed 0m 6s (remain 1m 54s) Loss: 0.5022(0.7402) Grad: 73173.4531  LR: 0.00001999  
Epoch: [1][60/757] Elapsed 0m 9s (remain 1m 49s) Loss: 0.5342(0.6783) Grad: 108182.3203  LR: 0.00001998  
Epoch: [1][80/757] Elapsed 0m 14s (remain 1m 57s) Loss: 0.5708(0.6546) Grad: 94036.2812  LR: 0.00001996  
Epoch: [1][100/757] Elapsed 0m 17s (remain 1m 54s) Loss: 0.4723(0.6225) Grad: 45829.6602  LR: 0.00001995  
Epoch: [1][120/757] Elapsed 0m 20s (remain 1m 48s) Loss: 0.4976(0.6120) Grad: 60603.8281  LR: 0.00001992  
Epoch: [1][140/757] Elapsed 0m 23s (remain 1m 41s) Loss: 0.5753(0.6039) Grad: 105883.1406  LR: 0.00001989  
Epoch: [1][160/757] Elapsed 0m 26s (remain 1m 37s) Loss: 0.3379(0.5875) Grad: 38797.7383  LR: 0.00001986  
Epoch: [1][180/757] Elapsed 0m 29s (remain 1m 34s) Lo

Epoch 1 - avg_train_loss: 0.4484  avg_val_loss: 0.4637  time: 121s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4484  avg_val_loss: 0.4637  time: 121s
Epoch 1 - Score: 0.6171  Scores: [0.5259867316357357, 0.7081533520930183]
INFO:__main__:Epoch 1 - Score: 0.6171  Scores: [0.5259867316357357, 0.7081533520930183]
Epoch 1 - Save Best Score: 0.6171 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6171 Model


Epoch: [2][0/757] Elapsed 0m 0s (remain 6m 35s) Loss: 0.3636(0.3636) Grad: inf  LR: 0.00001707  
Epoch: [2][20/757] Elapsed 0m 3s (remain 1m 49s) Loss: 0.2929(0.3716) Grad: 68157.5469  LR: 0.00001692  
Epoch: [2][40/757] Elapsed 0m 6s (remain 1m 45s) Loss: 0.3403(0.3679) Grad: 52149.8320  LR: 0.00001677  
Epoch: [2][60/757] Elapsed 0m 9s (remain 1m 46s) Loss: 0.3824(0.3532) Grad: 73868.0078  LR: 0.00001662  
Epoch: [2][80/757] Elapsed 0m 13s (remain 1m 51s) Loss: 0.2514(0.3482) Grad: 29947.9473  LR: 0.00001646  
Epoch: [2][100/757] Elapsed 0m 17s (remain 1m 51s) Loss: 0.4355(0.3511) Grad: 47176.4219  LR: 0.00001630  
Epoch: [2][120/757] Elapsed 0m 20s (remain 1m 47s) Loss: 0.3975(0.3482) Grad: 96243.2500  LR: 0.00001614  
Epoch: [2][140/757] Elapsed 0m 23s (remain 1m 41s) Loss: 0.4703(0.3494) Grad: 59501.0117  LR: 0.00001597  
Epoch: [2][160/757] Elapsed 0m 26s (remain 1m 36s) Loss: 0.4240(0.3507) Grad: 24360.5762  LR: 0.00001580  
Epoch: [2][180/757] Elapsed 0m 29s (remain 1m 33s) Los

Epoch 2 - avg_train_loss: 0.3486  avg_val_loss: 0.4452  time: 121s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3486  avg_val_loss: 0.4452  time: 121s
Epoch 2 - Score: 0.6021  Scores: [0.5145460976373987, 0.6895539269120516]
INFO:__main__:Epoch 2 - Score: 0.6021  Scores: [0.5145460976373987, 0.6895539269120516]
Epoch 2 - Save Best Score: 0.6021 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6021 Model


Epoch: [3][0/757] Elapsed 0m 0s (remain 5m 24s) Loss: 0.2707(0.2707) Grad: inf  LR: 0.00001001  
Epoch: [3][20/757] Elapsed 0m 3s (remain 1m 54s) Loss: 0.2694(0.3238) Grad: 63399.6133  LR: 0.00000980  
Epoch: [3][40/757] Elapsed 0m 5s (remain 1m 41s) Loss: 0.2238(0.3146) Grad: 21971.7656  LR: 0.00000959  
Epoch: [3][60/757] Elapsed 0m 9s (remain 1m 48s) Loss: 0.1961(0.3059) Grad: 86342.3828  LR: 0.00000938  
Epoch: [3][80/757] Elapsed 0m 13s (remain 1m 55s) Loss: 0.2703(0.3005) Grad: 60897.2539  LR: 0.00000918  
Epoch: [3][100/757] Elapsed 0m 17s (remain 1m 53s) Loss: 0.4327(0.2967) Grad: 62268.2109  LR: 0.00000897  
Epoch: [3][120/757] Elapsed 0m 20s (remain 1m 47s) Loss: 0.2753(0.2954) Grad: 68757.1172  LR: 0.00000876  
Epoch: [3][140/757] Elapsed 0m 23s (remain 1m 42s) Loss: 0.2410(0.3008) Grad: 34306.0508  LR: 0.00000856  
Epoch: [3][160/757] Elapsed 0m 26s (remain 1m 38s) Loss: 0.2841(0.3046) Grad: 120269.5234  LR: 0.00000835  
Epoch: [3][180/757] Elapsed 0m 30s (remain 1m 36s) Lo

Epoch 3 - avg_train_loss: 0.2976  avg_val_loss: 0.4933  time: 123s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2976  avg_val_loss: 0.4933  time: 123s
Epoch 3 - Score: 0.6566  Scores: [0.5716195355823235, 0.7414901577494691]
INFO:__main__:Epoch 3 - Score: 0.6566  Scores: [0.5716195355823235, 0.7414901577494691]


Epoch: [4][0/757] Elapsed 0m 0s (remain 7m 32s) Loss: 0.1616(0.1616) Grad: inf  LR: 0.00000294  
Epoch: [4][20/757] Elapsed 0m 3s (remain 2m 9s) Loss: 0.5199(0.2857) Grad: 146416.6719  LR: 0.00000279  
Epoch: [4][40/757] Elapsed 0m 6s (remain 1m 54s) Loss: 0.2749(0.2751) Grad: 27535.4121  LR: 0.00000265  
Epoch: [4][60/757] Elapsed 0m 9s (remain 1m 45s) Loss: 0.3937(0.2830) Grad: 64988.4883  LR: 0.00000251  
Epoch: [4][80/757] Elapsed 0m 12s (remain 1m 40s) Loss: 0.2893(0.2756) Grad: 45759.1016  LR: 0.00000238  
Epoch: [4][100/757] Elapsed 0m 15s (remain 1m 40s) Loss: 0.2371(0.2750) Grad: 47191.4609  LR: 0.00000224  
Epoch: [4][120/757] Elapsed 0m 18s (remain 1m 37s) Loss: 0.3578(0.2752) Grad: 90891.1406  LR: 0.00000211  
Epoch: [4][140/757] Elapsed 0m 21s (remain 1m 32s) Loss: 0.3273(0.2723) Grad: 66231.0156  LR: 0.00000199  
Epoch: [4][160/757] Elapsed 0m 23s (remain 1m 28s) Loss: 0.2653(0.2702) Grad: 34325.6992  LR: 0.00000187  
Epoch: [4][180/757] Elapsed 0m 26s (remain 1m 24s) Los

Epoch 4 - avg_train_loss: 0.2602  avg_val_loss: 0.4721  time: 119s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2602  avg_val_loss: 0.4721  time: 119s
Epoch 4 - Score: 0.6354  Scores: [0.5732736873591963, 0.6974843745052304]
INFO:__main__:Epoch 4 - Score: 0.6354  Scores: [0.5732736873591963, 0.6974843745052304]


EVAL: [68/69] Elapsed 0m 6s (remain 0m 0s) Loss: 0.6024(0.4721) 




In [None]:
runtime.unassign()