In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets
!pip install sentencepiece
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1



In [3]:
!nvidia-smi

Mon Oct  9 17:59:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-large"
    gradient_checkpointing=False
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=5e-6
    decoder_lr=5e-6
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=4
    max_len=1024
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_size=2
    fc_dropout=0.2
    target_cols=['content','wording']
    seed=2023
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    freezing=False
    freeze_layer=True
    n_freeze_layer=4


if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
import os

DIR = '/content/drive/MyDrive/Competitions/Kaggle/Commonlit'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_MODEL_DIR = DIR + '/output/EXP084/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [6]:
# ====================================================
# Library
# ====================================================
from google.colab import runtime
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs


import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW, Optimizer
from torch.utils.data import DataLoader, Dataset
from torch.autograd.function import InplaceFunction
import torch.nn.init as init

#os.system('pip uninstall -y transformers')
#os.system('pip uninstall -y tokenizers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb transformers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [7]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """

    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)

    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """

    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        if hasattr(embeddings_path, attr_name):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
prompts_train = pd.read_csv(os.path.join(INPUT_DIR,'prompts_train.csv'))
summary_train = pd.read_csv(os.path.join(INPUT_DIR,'summaries_train.csv'))

print(f"Prompt Train.shape: {prompts_train.shape}")
display(prompts_train.head())
print(f"Summary Train.shape: {summary_train.shape}")
display(summary_train.head())

Prompt Train.shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary Train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [10]:
train = prompts_train.merge(summary_train, on="prompt_id")
display(train.head())

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [11]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(train, groups=train["prompt_id"])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2057
1    2009
2    1996
3    1103
dtype: int64

In [12]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def len2text(x:str):
    if len(x)<200: return 'Quite short'
    if len(x)<300: return 'Short'
    if len(x)<500: return 'Middle'
    else:
        return 'Long'

In [15]:
train["text_length"] = train['text'].apply(lambda x: len2text(x))
SEP = tokenizer.sep_token
train['full_text'] = train['text'] + " " + train['prompt_question'] + " " + train["prompt_text"]

In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        if cfg.freeze_layer:
            for k, param in self.model.base_model.encoder.layer.named_parameters():
                l = int(k.split(".")[0])
                if l < cfg.n_freeze_layer:
                    param.requires_grad = False

        #self.pool = MeanPooling()
        self.output = nn.Sequential(
            nn.LayerNorm(self.config.hidden_size),
            nn.Linear(self.config.hidden_size, self.cfg.target_size)
        )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        feature = outputs[0][:, 0, :]
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.output(feature)
        return output

In [18]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        #if scaler is not None:
        #    scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [20]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    print(f"========== prompt_id: {valid_folds.prompt_id.unique()} validation ==========")

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

        group1=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.']
        group2=['layer.8.','layer.9.','layer.10.','layer.11.','layer.12.','layer.13.','layer.14.','layer.15.']
        group3=['layer.16.','layer.17.','layer.18.','layer.19.','layer.20.','layer.21.','layer.22.','layer.23.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.',
                  'layer.12.','layer.13.','layer.14.','layer.15.','layer.16.','layer.17.','layer.18.','layer.19.','layer.20.','layer.21.','layer.22.','layer.23.']


        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]

        optimizer_parameters2 = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters2

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = RMSELoss(reduction="mean")   # nn.SmoothL1Loss(reduction='mean')

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')

        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')





DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1277] Elapsed 0m 4s (remain 94m 1s) Loss: 0.6580(0.6580) Grad: inf  LR: 0.00000500  
Epoch: [1][20/1277] Elapsed 0m 13s (remain 13m 45s) Loss: 0.6825(0.9607) Grad: 40766.1484  LR: 0.00000500  
Epoch: [1][40/1277] Elapsed 0m 22s (remain 11m 15s) Loss: 1.2675(0.8782) Grad: 995562.0000  LR: 0.00000500  
Epoch: [1][60/1277] Elapsed 0m 31s (remain 10m 32s) Loss: 0.6851(0.8690) Grad: 372304.5938  LR: 0.00000500  
Epoch: [1][80/1277] Elapsed 0m 40s (remain 9m 59s) Loss: 0.6759(0.8431) Grad: 1124714.5000  LR: 0.00000500  
Epoch: [1][100/1277] Elapsed 0m 49s (remain 9m 36s) Loss: 0.3372(0.8080) Grad: 290683.3750  LR: 0.00000500  
Epoch: [1][120/1277] Elapsed 0m 58s (remain 9m 20s) Loss: 0.5940(0.7758) Grad: 4706861.5000  LR: 0.00000499  
Epoch: [1][140/1277] Elapsed 1m 7s (remain 9m 7s) Loss: 1.0393(0.7600) Grad: 484556.4375  LR: 0.00000499  
Epoch: [1][160/1277] Elapsed 1m 16s (remain 8m 53s) Loss: 0.8851(0.7332) Grad: 3070884.2500  LR: 0.00000499  
Epoch: [1][180/1277] Elapsed 1m

Epoch 1 - avg_train_loss: 0.4842  avg_val_loss: 0.3688  time: 736s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4842  avg_val_loss: 0.3688  time: 736s
Epoch 1 - Score: 0.4861  Scores: [0.40158530407392656, 0.5705234313062326]
INFO:__main__:Epoch 1 - Score: 0.4861  Scores: [0.40158530407392656, 0.5705234313062326]
Epoch 1 - Save Best Score: 0.4861 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.4861 Model


EVAL: [257/258] Elapsed 2m 33s (remain 0m 0s) Loss: 0.4076(0.3688) 
Epoch: [2][0/1277] Elapsed 0m 0s (remain 14m 55s) Loss: 0.3051(0.3051) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1277] Elapsed 0m 9s (remain 9m 42s) Loss: 0.1914(0.3454) Grad: 471787.6875  LR: 0.00000424  
Epoch: [2][40/1277] Elapsed 0m 18s (remain 9m 26s) Loss: 0.2059(0.3700) Grad: 231979.1094  LR: 0.00000422  
Epoch: [2][60/1277] Elapsed 0m 27s (remain 9m 7s) Loss: 0.2380(0.3614) Grad: 285488.0312  LR: 0.00000420  
Epoch: [2][80/1277] Elapsed 0m 36s (remain 8m 58s) Loss: 0.4108(0.3601) Grad: 272785.6875  LR: 0.00000418  
Epoch: [2][100/1277] Elapsed 0m 45s (remain 8m 45s) Loss: 0.3133(0.3529) Grad: 175668.1250  LR: 0.00000415  
Epoch: [2][120/1277] Elapsed 0m 54s (remain 8m 35s) Loss: 0.4037(0.3619) Grad: 478581.1875  LR: 0.00000413  
Epoch: [2][140/1277] Elapsed 1m 3s (remain 8m 29s) Loss: 0.4172(0.3589) Grad: 248789.6875  LR: 0.00000411  
Epoch: [2][160/1277] Elapsed 1m 12s (remain 8m 23s) Loss: 0.3941(0.3563) Grad

Epoch 2 - avg_train_loss: 0.3500  avg_val_loss: 0.3747  time: 732s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3500  avg_val_loss: 0.3747  time: 732s
Epoch 2 - Score: 0.4732  Scores: [0.3819285111659583, 0.5645677608753279]
INFO:__main__:Epoch 2 - Score: 0.4732  Scores: [0.3819285111659583, 0.5645677608753279]
Epoch 2 - Save Best Score: 0.4732 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4732 Model


EVAL: [257/258] Elapsed 2m 33s (remain 0m 0s) Loss: 0.2684(0.3747) 
Epoch: [3][0/1277] Elapsed 0m 0s (remain 14m 46s) Loss: 0.3547(0.3547) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1277] Elapsed 0m 9s (remain 9m 41s) Loss: 0.1641(0.3202) Grad: 281191.5000  LR: 0.00000247  
Epoch: [3][40/1277] Elapsed 0m 18s (remain 9m 22s) Loss: 0.4670(0.3349) Grad: 147281.6250  LR: 0.00000244  
Epoch: [3][60/1277] Elapsed 0m 27s (remain 9m 15s) Loss: 0.2758(0.3318) Grad: 218765.3281  LR: 0.00000241  
Epoch: [3][80/1277] Elapsed 0m 37s (remain 9m 10s) Loss: 0.2955(0.3196) Grad: 150487.1562  LR: 0.00000238  
Epoch: [3][100/1277] Elapsed 0m 46s (remain 9m 1s) Loss: 0.2305(0.3162) Grad: 233996.8750  LR: 0.00000234  
Epoch: [3][120/1277] Elapsed 0m 55s (remain 8m 52s) Loss: 0.2462(0.3188) Grad: 266854.9062  LR: 0.00000231  
Epoch: [3][140/1277] Elapsed 1m 5s (remain 8m 43s) Loss: 0.5905(0.3201) Grad: 177492.4531  LR: 0.00000228  
Epoch: [3][160/1277] Elapsed 1m 14s (remain 8m 34s) Loss: 0.5291(0.3251) Grad

Epoch 3 - avg_train_loss: 0.3085  avg_val_loss: 0.3618  time: 733s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3085  avg_val_loss: 0.3618  time: 733s
Epoch 3 - Score: 0.4632  Scores: [0.3758891248605638, 0.5504858324502362]
INFO:__main__:Epoch 3 - Score: 0.4632  Scores: [0.3758891248605638, 0.5504858324502362]
Epoch 3 - Save Best Score: 0.4632 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4632 Model


EVAL: [257/258] Elapsed 2m 33s (remain 0m 0s) Loss: 0.2853(0.3618) 
Epoch: [4][0/1277] Elapsed 0m 0s (remain 14m 47s) Loss: 0.2650(0.2650) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1277] Elapsed 0m 9s (remain 9m 37s) Loss: 0.2176(0.3126) Grad: 520009.7188  LR: 0.00000071  
Epoch: [4][40/1277] Elapsed 0m 18s (remain 9m 24s) Loss: 0.1551(0.2799) Grad: 208722.3906  LR: 0.00000069  
Epoch: [4][60/1277] Elapsed 0m 27s (remain 9m 9s) Loss: 0.2049(0.2650) Grad: 306526.9062  LR: 0.00000067  
Epoch: [4][80/1277] Elapsed 0m 36s (remain 8m 57s) Loss: 0.2483(0.2699) Grad: 254686.7344  LR: 0.00000065  
Epoch: [4][100/1277] Elapsed 0m 45s (remain 8m 52s) Loss: 0.2115(0.2845) Grad: 212900.7656  LR: 0.00000063  
Epoch: [4][120/1277] Elapsed 0m 54s (remain 8m 44s) Loss: 0.3910(0.2815) Grad: 248575.9219  LR: 0.00000061  
Epoch: [4][140/1277] Elapsed 1m 4s (remain 8m 36s) Loss: 0.1880(0.2786) Grad: 139865.3594  LR: 0.00000059  
Epoch: [4][160/1277] Elapsed 1m 13s (remain 8m 27s) Loss: 0.1616(0.2827) Grad

Epoch 4 - avg_train_loss: 0.2708  avg_val_loss: 0.3513  time: 733s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2708  avg_val_loss: 0.3513  time: 733s
Epoch 4 - Score: 0.4540  Scores: [0.37266360336853455, 0.535394213823385]
INFO:__main__:Epoch 4 - Score: 0.4540  Scores: [0.37266360336853455, 0.535394213823385]
Epoch 4 - Save Best Score: 0.4540 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4540 Model


EVAL: [257/258] Elapsed 2m 33s (remain 0m 0s) Loss: 0.2680(0.3513) 


Score: 0.4540  Scores: [0.37266360336853455, 0.535394213823385]
INFO:__main__:Score: 0.4540  Scores: [0.37266360336853455, 0.535394213823385]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1289] Elapsed 0m 0s (remain 16m 11s) Loss: 1.2376(1.2376) Grad: nan  LR: 0.00000500  
Epoch: [1][20/1289] Elapsed 0m 9s (remain 9m 56s) Loss: 1.6091(0.9318) Grad: 266850.5000  LR: 0.00000500  
Epoch: [1][40/1289] Elapsed 0m 19s (remain 9m 41s) Loss: 0.5609(0.9295) Grad: 124631.8203  LR: 0.00000500  
Epoch: [1][60/1289] Elapsed 0m 28s (remain 9m 24s) Loss: 0.7300(0.8919) Grad: 401838.5312  LR: 0.00000500  
Epoch: [1][80/1289] Elapsed 0m 37s (remain 9m 11s) Loss: 0.4442(0.8588) Grad: 171857.3438  LR: 0.00000500  
Epoch: [1][100/1289] Elapsed 0m 46s (remain 9m 1s) Loss: 0.3721(0.8082) Grad: 170999.4844  LR: 0.00000500  
Epoch: [1][120/1289] Elapsed 0m 54s (remain 8m 50s) Loss: 0.6139(0.7650) Grad: 142920.8125  LR: 0.00000499  
Epoch: [1][140/1289] Elapsed 1m 4s (remain 8m 42s) Loss: 0.5772(0.7407) Grad: 236507.6094  LR: 0.00000499  
Epoch: [1][160/1289] Elapsed 1m 13s (remain 8m 33s) Loss: 0.7997(0.7087) Grad: 385758.9062  LR: 0.00000499  
Epoch: [1][180/1289] Elapsed 1m 22s 

Epoch 1 - avg_train_loss: 0.4541  avg_val_loss: 0.4596  time: 735s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4541  avg_val_loss: 0.4596  time: 735s
Epoch 1 - Score: 0.5944  Scores: [0.5877207411216726, 0.6010909329187237]
INFO:__main__:Epoch 1 - Score: 0.5944  Scores: [0.5877207411216726, 0.6010909329187237]
Epoch 1 - Save Best Score: 0.5944 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5944 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3364(0.4596) 
Epoch: [2][0/1289] Elapsed 0m 0s (remain 11m 43s) Loss: 0.5294(0.5294) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1289] Elapsed 0m 9s (remain 9m 24s) Loss: 0.3933(0.3454) Grad: 313166.7188  LR: 0.00000425  
Epoch: [2][40/1289] Elapsed 0m 18s (remain 9m 17s) Loss: 0.4341(0.3620) Grad: 134612.7656  LR: 0.00000422  
Epoch: [2][60/1289] Elapsed 0m 27s (remain 9m 10s) Loss: 0.3282(0.3630) Grad: 161250.8594  LR: 0.00000420  
Epoch: [2][80/1289] Elapsed 0m 36s (remain 9m 2s) Loss: 0.3609(0.3581) Grad: 130250.9922  LR: 0.00000418  
Epoch: [2][100/1289] Elapsed 0m 45s (remain 8m 53s) Loss: 0.4451(0.3677) Grad: 217578.7031  LR: 0.00000416  
Epoch: [2][120/1289] Elapsed 0m 54s (remain 8m 44s) Loss: 0.3396(0.3667) Grad: 302873.7188  LR: 0.00000413  
Epoch: [2][140/1289] Elapsed 1m 3s (remain 8m 36s) Loss: 0.3137(0.3632) Grad: 185114.5938  LR: 0.00000411  
Epoch: [2][160/1289] Elapsed 1m 12s (remain 8m 27s) Loss: 0.3047(0.3622) Grad

Epoch 2 - avg_train_loss: 0.3382  avg_val_loss: 0.3954  time: 732s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3382  avg_val_loss: 0.3954  time: 732s
Epoch 2 - Score: 0.5176  Scores: [0.44339610022764575, 0.5918968850541536]
INFO:__main__:Epoch 2 - Score: 0.5176  Scores: [0.44339610022764575, 0.5918968850541536]
Epoch 2 - Save Best Score: 0.5176 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5176 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3151(0.3954) 
Epoch: [3][0/1289] Elapsed 0m 0s (remain 11m 42s) Loss: 0.3037(0.3037) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1289] Elapsed 0m 9s (remain 9m 15s) Loss: 0.1733(0.3169) Grad: 210608.2188  LR: 0.00000247  
Epoch: [3][40/1289] Elapsed 0m 18s (remain 9m 12s) Loss: 0.3073(0.3092) Grad: 206460.2500  LR: 0.00000244  
Epoch: [3][60/1289] Elapsed 0m 27s (remain 9m 8s) Loss: 0.2219(0.2955) Grad: 277439.6250  LR: 0.00000241  
Epoch: [3][80/1289] Elapsed 0m 36s (remain 9m 1s) Loss: 0.4229(0.2967) Grad: 303381.6562  LR: 0.00000238  
Epoch: [3][100/1289] Elapsed 0m 45s (remain 8m 52s) Loss: 0.4359(0.3030) Grad: 249671.7812  LR: 0.00000235  
Epoch: [3][120/1289] Elapsed 0m 54s (remain 8m 42s) Loss: 0.2834(0.2998) Grad: 299375.6250  LR: 0.00000232  
Epoch: [3][140/1289] Elapsed 1m 3s (remain 8m 33s) Loss: 0.2361(0.3034) Grad: 211761.5000  LR: 0.00000229  
Epoch: [3][160/1289] Elapsed 1m 11s (remain 8m 24s) Loss: 0.1736(0.3023) Grad:

Epoch 3 - avg_train_loss: 0.2950  avg_val_loss: 0.3934  time: 731s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2950  avg_val_loss: 0.3934  time: 731s
Epoch 3 - Score: 0.5154  Scores: [0.4475831202910627, 0.5832722188103178]
INFO:__main__:Epoch 3 - Score: 0.5154  Scores: [0.4475831202910627, 0.5832722188103178]
Epoch 3 - Save Best Score: 0.5154 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5154 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3511(0.3934) 
Epoch: [4][0/1289] Elapsed 0m 0s (remain 15m 13s) Loss: 0.1822(0.1822) Grad: 724344.4375  LR: 0.00000073  
Epoch: [4][20/1289] Elapsed 0m 9s (remain 9m 30s) Loss: 0.3886(0.2500) Grad: 145595.4375  LR: 0.00000071  
Epoch: [4][40/1289] Elapsed 0m 18s (remain 9m 17s) Loss: 0.2413(0.2440) Grad: 289256.1562  LR: 0.00000069  
Epoch: [4][60/1289] Elapsed 0m 27s (remain 9m 8s) Loss: 0.1585(0.2562) Grad: 308742.5938  LR: 0.00000067  
Epoch: [4][80/1289] Elapsed 0m 36s (remain 8m 58s) Loss: 0.1058(0.2560) Grad: 322421.5938  LR: 0.00000065  
Epoch: [4][100/1289] Elapsed 0m 45s (remain 8m 52s) Loss: 0.4043(0.2603) Grad: 211977.3750  LR: 0.00000063  
Epoch: [4][120/1289] Elapsed 0m 54s (remain 8m 45s) Loss: 0.3022(0.2601) Grad: 207838.7188  LR: 0.00000061  
Epoch: [4][140/1289] Elapsed 1m 3s (remain 8m 37s) Loss: 0.1975(0.2568) Grad: 147779.5781  LR: 0.00000059  
Epoch: [4][160/1289] Elapsed 1m 12s (remain 8m 27s) Loss: 0.1016(0.25

Epoch 4 - avg_train_loss: 0.2581  avg_val_loss: 0.3946  time: 732s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2581  avg_val_loss: 0.3946  time: 732s
Epoch 4 - Score: 0.5228  Scores: [0.450661410491065, 0.5948672079203755]
INFO:__main__:Epoch 4 - Score: 0.5228  Scores: [0.450661410491065, 0.5948672079203755]


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3116(0.3946) 


Score: 0.5154  Scores: [0.4475831202910627, 0.5832722188103178]
INFO:__main__:Score: 0.5154  Scores: [0.4475831202910627, 0.5832722188103178]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1292] Elapsed 0m 0s (remain 16m 9s) Loss: 1.4342(1.4342) Grad: nan  LR: 0.00000500  
Epoch: [1][20/1292] Elapsed 0m 8s (remain 8m 28s) Loss: 1.0119(0.9455) Grad: 201054.6562  LR: 0.00000500  
Epoch: [1][40/1292] Elapsed 0m 15s (remain 7m 57s) Loss: 1.0625(0.8883) Grad: 75256.3672  LR: 0.00000500  
Epoch: [1][60/1292] Elapsed 0m 22s (remain 7m 43s) Loss: 1.1341(0.8801) Grad: 81185.2031  LR: 0.00000500  
Epoch: [1][80/1292] Elapsed 0m 30s (remain 7m 34s) Loss: 0.9065(0.8563) Grad: 486867.8438  LR: 0.00000500  
Epoch: [1][100/1292] Elapsed 0m 37s (remain 7m 22s) Loss: 0.7311(0.8419) Grad: 333033.4375  LR: 0.00000500  
Epoch: [1][120/1292] Elapsed 0m 44s (remain 7m 12s) Loss: 0.8501(0.8026) Grad: 1027921.8750  LR: 0.00000499  
Epoch: [1][140/1292] Elapsed 0m 52s (remain 7m 4s) Loss: 0.7499(0.7703) Grad: 273457.9375  LR: 0.00000499  
Epoch: [1][160/1292] Elapsed 0m 59s (remain 6m 59s) Loss: 0.5519(0.7593) Grad: 168532.8750  LR: 0.00000499  
Epoch: [1][180/1292] Elapsed 1m 6s (r

Epoch 1 - avg_train_loss: 0.4806  avg_val_loss: 0.3996  time: 650s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4806  avg_val_loss: 0.3996  time: 650s
Epoch 1 - Score: 0.5002  Scores: [0.4389835248834194, 0.5613366926829896]
INFO:__main__:Epoch 1 - Score: 0.5002  Scores: [0.4389835248834194, 0.5613366926829896]
Epoch 1 - Save Best Score: 0.5002 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5002 Model


EVAL: [249/250] Elapsed 3m 3s (remain 0m 0s) Loss: 0.4872(0.3996) 
Epoch: [2][0/1292] Elapsed 0m 0s (remain 11m 26s) Loss: 0.2848(0.2848) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1292] Elapsed 0m 7s (remain 7m 43s) Loss: 0.4052(0.3811) Grad: 538999.3125  LR: 0.00000425  
Epoch: [2][40/1292] Elapsed 0m 15s (remain 7m 40s) Loss: 0.2990(0.3766) Grad: 385781.5000  LR: 0.00000422  
Epoch: [2][60/1292] Elapsed 0m 22s (remain 7m 34s) Loss: 0.3043(0.3611) Grad: 732251.7500  LR: 0.00000420  
Epoch: [2][80/1292] Elapsed 0m 29s (remain 7m 25s) Loss: 0.3198(0.3547) Grad: 813795.0625  LR: 0.00000418  
Epoch: [2][100/1292] Elapsed 0m 36s (remain 7m 15s) Loss: 0.2493(0.3517) Grad: 433702.3750  LR: 0.00000416  
Epoch: [2][120/1292] Elapsed 0m 43s (remain 7m 4s) Loss: 0.2955(0.3478) Grad: 559360.2500  LR: 0.00000413  
Epoch: [2][140/1292] Elapsed 0m 51s (remain 6m 58s) Loss: 0.5200(0.3454) Grad: inf  LR: 0.00000411  
Epoch: [2][160/1292] Elapsed 0m 58s (remain 6m 51s) Loss: 0.4143(0.3473) Grad: 400169

Epoch 2 - avg_train_loss: 0.3312  avg_val_loss: 0.3415  time: 650s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3312  avg_val_loss: 0.3415  time: 650s
Epoch 2 - Score: 0.4441  Scores: [0.40082272650325246, 0.487430838850914]
INFO:__main__:Epoch 2 - Score: 0.4441  Scores: [0.40082272650325246, 0.487430838850914]
Epoch 2 - Save Best Score: 0.4441 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4441 Model


EVAL: [249/250] Elapsed 3m 3s (remain 0m 0s) Loss: 0.4223(0.3415) 
Epoch: [3][0/1292] Elapsed 0m 0s (remain 13m 32s) Loss: 0.2184(0.2184) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1292] Elapsed 0m 7s (remain 7m 47s) Loss: 0.3060(0.2515) Grad: 761353.0625  LR: 0.00000247  
Epoch: [3][40/1292] Elapsed 0m 15s (remain 7m 51s) Loss: 0.2965(0.2878) Grad: 852475.7500  LR: 0.00000244  
Epoch: [3][60/1292] Elapsed 0m 22s (remain 7m 32s) Loss: 0.4065(0.2956) Grad: 383121.5312  LR: 0.00000241  
Epoch: [3][80/1292] Elapsed 0m 29s (remain 7m 18s) Loss: 0.2792(0.2959) Grad: 242881.2500  LR: 0.00000238  
Epoch: [3][100/1292] Elapsed 0m 36s (remain 7m 12s) Loss: 0.3050(0.2992) Grad: 364845.8438  LR: 0.00000235  
Epoch: [3][120/1292] Elapsed 0m 43s (remain 7m 4s) Loss: 0.4634(0.3012) Grad: 503092.6250  LR: 0.00000232  
Epoch: [3][140/1292] Elapsed 0m 50s (remain 6m 55s) Loss: 0.3593(0.3090) Grad: 257657.5938  LR: 0.00000229  
Epoch: [3][160/1292] Elapsed 0m 58s (remain 6m 48s) Loss: 0.3660(0.3110) Grad

Epoch 3 - avg_train_loss: 0.3018  avg_val_loss: 0.3427  time: 650s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3018  avg_val_loss: 0.3427  time: 650s
Epoch 3 - Score: 0.4430  Scores: [0.39420794966145534, 0.4918089882330459]
INFO:__main__:Epoch 3 - Score: 0.4430  Scores: [0.39420794966145534, 0.4918089882330459]
Epoch 3 - Save Best Score: 0.4430 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4430 Model


EVAL: [249/250] Elapsed 3m 3s (remain 0m 0s) Loss: 0.4407(0.3427) 
Epoch: [4][0/1292] Elapsed 0m 0s (remain 13m 27s) Loss: 0.2759(0.2759) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1292] Elapsed 0m 7s (remain 7m 55s) Loss: 0.1952(0.2357) Grad: 231553.3438  LR: 0.00000071  
Epoch: [4][40/1292] Elapsed 0m 14s (remain 7m 28s) Loss: 0.2322(0.2546) Grad: 471317.9375  LR: 0.00000069  
Epoch: [4][60/1292] Elapsed 0m 21s (remain 7m 15s) Loss: 0.3739(0.2542) Grad: 402525.7812  LR: 0.00000067  
Epoch: [4][80/1292] Elapsed 0m 28s (remain 7m 9s) Loss: 0.1285(0.2579) Grad: 358868.0000  LR: 0.00000065  
Epoch: [4][100/1292] Elapsed 0m 35s (remain 7m 3s) Loss: 0.4731(0.2740) Grad: 562502.0000  LR: 0.00000063  
Epoch: [4][120/1292] Elapsed 0m 43s (remain 7m 0s) Loss: 0.1825(0.2690) Grad: 583410.6250  LR: 0.00000061  
Epoch: [4][140/1292] Elapsed 0m 50s (remain 6m 51s) Loss: 0.3811(0.2696) Grad: 478652.0625  LR: 0.00000059  
Epoch: [4][160/1292] Elapsed 0m 57s (remain 6m 42s) Loss: 0.1696(0.2704) Grad: 

Epoch 4 - avg_train_loss: 0.2777  avg_val_loss: 0.3420  time: 651s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2777  avg_val_loss: 0.3420  time: 651s
Epoch 4 - Score: 0.4427  Scores: [0.399044614398343, 0.48625640610017307]
INFO:__main__:Epoch 4 - Score: 0.4427  Scores: [0.399044614398343, 0.48625640610017307]
Epoch 4 - Save Best Score: 0.4427 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4427 Model


EVAL: [249/250] Elapsed 3m 3s (remain 0m 0s) Loss: 0.4310(0.3420) 


Score: 0.4427  Scores: [0.399044614398343, 0.48625640610017307]
INFO:__main__:Score: 0.4427  Scores: [0.399044614398343, 0.48625640610017307]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1515] Elapsed 0m 0s (remain 15m 54s) Loss: 0.5770(0.5770) Grad: nan  LR: 0.00000500  
Epoch: [1][20/1515] Elapsed 0m 9s (remain 10m 59s) Loss: 0.7941(0.7209) Grad: 355551.6875  LR: 0.00000500  
Epoch: [1][40/1515] Elapsed 0m 18s (remain 10m 54s) Loss: 0.8628(0.7479) Grad: 784806.1250  LR: 0.00000500  
Epoch: [1][60/1515] Elapsed 0m 27s (remain 10m 47s) Loss: 1.2610(0.7374) Grad: 297604.6250  LR: 0.00000500  
Epoch: [1][80/1515] Elapsed 0m 36s (remain 10m 40s) Loss: 0.7148(0.7382) Grad: 3659890.2500  LR: 0.00000500  
Epoch: [1][100/1515] Elapsed 0m 44s (remain 10m 27s) Loss: 0.6281(0.7301) Grad: 1045654.6250  LR: 0.00000500  
Epoch: [1][120/1515] Elapsed 0m 53s (remain 10m 17s) Loss: 0.5832(0.7289) Grad: 974441.8125  LR: 0.00000500  
Epoch: [1][140/1515] Elapsed 1m 2s (remain 10m 9s) Loss: 0.7072(0.7137) Grad: 42711.2891  LR: 0.00000499  
Epoch: [1][160/1515] Elapsed 1m 11s (remain 10m 2s) Loss: 0.9755(0.6914) Grad: 699087.6250  LR: 0.00000499  
Epoch: [1][180/1515] Elapsed

Epoch 1 - avg_train_loss: 0.4579  avg_val_loss: 0.4367  time: 751s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4579  avg_val_loss: 0.4367  time: 751s
Epoch 1 - Score: 0.5830  Scores: [0.4821143540349593, 0.6838480480865927]
INFO:__main__:Epoch 1 - Score: 0.5830  Scores: [0.4821143540349593, 0.6838480480865927]
Epoch 1 - Save Best Score: 0.5830 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5830 Model


EVAL: [137/138] Elapsed 1m 18s (remain 0m 0s) Loss: 0.6436(0.4367) 
Epoch: [2][0/1515] Elapsed 0m 0s (remain 17m 19s) Loss: 0.1788(0.1788) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1515] Elapsed 0m 9s (remain 10m 49s) Loss: 0.2960(0.3939) Grad: 165036.8750  LR: 0.00000425  
Epoch: [2][40/1515] Elapsed 0m 18s (remain 10m 50s) Loss: 0.2871(0.3615) Grad: 221055.9844  LR: 0.00000423  
Epoch: [2][60/1515] Elapsed 0m 27s (remain 10m 52s) Loss: 0.3520(0.3655) Grad: 190240.4531  LR: 0.00000421  
Epoch: [2][80/1515] Elapsed 0m 36s (remain 10m 41s) Loss: 0.3831(0.3612) Grad: 247159.4688  LR: 0.00000419  
Epoch: [2][100/1515] Elapsed 0m 45s (remain 10m 32s) Loss: 0.3311(0.3464) Grad: 320395.9688  LR: 0.00000417  
Epoch: [2][120/1515] Elapsed 0m 53s (remain 10m 20s) Loss: 0.3808(0.3451) Grad: 249372.0156  LR: 0.00000415  
Epoch: [2][140/1515] Elapsed 1m 2s (remain 10m 12s) Loss: 0.2587(0.3424) Grad: 96885.3047  LR: 0.00000413  
Epoch: [2][160/1515] Elapsed 1m 11s (remain 10m 2s) Loss: 0.2502(0.343

Epoch 2 - avg_train_loss: 0.3233  avg_val_loss: 0.5017  time: 750s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3233  avg_val_loss: 0.5017  time: 750s
Epoch 2 - Score: 0.6588  Scores: [0.5407697951516193, 0.7769109001542583]
INFO:__main__:Epoch 2 - Score: 0.6588  Scores: [0.5407697951516193, 0.7769109001542583]


EVAL: [137/138] Elapsed 1m 18s (remain 0m 0s) Loss: 0.7540(0.5017) 
Epoch: [3][0/1515] Elapsed 0m 0s (remain 16m 50s) Loss: 0.1356(0.1356) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1515] Elapsed 0m 9s (remain 10m 54s) Loss: 0.3898(0.2745) Grad: 141290.2812  LR: 0.00000247  
Epoch: [3][40/1515] Elapsed 0m 18s (remain 10m 47s) Loss: 0.3547(0.2878) Grad: 113002.1094  LR: 0.00000245  
Epoch: [3][60/1515] Elapsed 0m 26s (remain 10m 27s) Loss: 0.3050(0.2786) Grad: 170458.1250  LR: 0.00000242  
Epoch: [3][80/1515] Elapsed 0m 35s (remain 10m 20s) Loss: 0.1680(0.2808) Grad: 124902.6406  LR: 0.00000240  
Epoch: [3][100/1515] Elapsed 0m 44s (remain 10m 17s) Loss: 0.2699(0.2757) Grad: 209243.7812  LR: 0.00000237  
Epoch: [3][120/1515] Elapsed 0m 53s (remain 10m 10s) Loss: 0.3140(0.2805) Grad: 200345.9531  LR: 0.00000234  
Epoch: [3][140/1515] Elapsed 1m 1s (remain 10m 2s) Loss: 0.3792(0.2824) Grad: 337925.9688  LR: 0.00000232  
Epoch: [3][160/1515] Elapsed 1m 10s (remain 9m 54s) Loss: 0.1196(0.283

Epoch 3 - avg_train_loss: 0.2929  avg_val_loss: 0.4597  time: 747s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2929  avg_val_loss: 0.4597  time: 747s
Epoch 3 - Score: 0.6129  Scores: [0.5357666646148573, 0.6901104257719244]
INFO:__main__:Epoch 3 - Score: 0.6129  Scores: [0.5357666646148573, 0.6901104257719244]


EVAL: [137/138] Elapsed 1m 18s (remain 0m 0s) Loss: 0.7014(0.4597) 
Epoch: [4][0/1515] Elapsed 0m 0s (remain 16m 57s) Loss: 0.2927(0.2927) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1515] Elapsed 0m 9s (remain 11m 9s) Loss: 0.2369(0.2473) Grad: 182689.9531  LR: 0.00000071  
Epoch: [4][40/1515] Elapsed 0m 18s (remain 10m 56s) Loss: 0.3259(0.2563) Grad: 125358.7188  LR: 0.00000070  
Epoch: [4][60/1515] Elapsed 0m 27s (remain 10m 45s) Loss: 0.3861(0.2837) Grad: 295428.0938  LR: 0.00000068  
Epoch: [4][80/1515] Elapsed 0m 35s (remain 10m 35s) Loss: 0.2564(0.2794) Grad: 195317.1094  LR: 0.00000066  
Epoch: [4][100/1515] Elapsed 0m 44s (remain 10m 24s) Loss: 0.2081(0.2752) Grad: 133530.8750  LR: 0.00000064  
Epoch: [4][120/1515] Elapsed 0m 53s (remain 10m 15s) Loss: 0.2561(0.2774) Grad: 141522.0156  LR: 0.00000063  
Epoch: [4][140/1515] Elapsed 1m 2s (remain 10m 5s) Loss: 0.1376(0.2750) Grad: 80357.1328  LR: 0.00000061  
Epoch: [4][160/1515] Elapsed 1m 10s (remain 9m 54s) Loss: 0.1892(0.2768)

Epoch 4 - avg_train_loss: 0.2648  avg_val_loss: 0.4459  time: 752s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2648  avg_val_loss: 0.4459  time: 752s
Epoch 4 - Score: 0.5905  Scores: [0.5192799085390658, 0.6617847598499629]
INFO:__main__:Epoch 4 - Score: 0.5905  Scores: [0.5192799085390658, 0.6617847598499629]


EVAL: [137/138] Elapsed 1m 18s (remain 0m 0s) Loss: 0.6599(0.4459) 




In [None]:
labels = oof_df[CFG.target_cols].values
preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
score, scores = get_score(labels, preds)
print(f'Score: {score:<.4f}  Scores: {scores}')

In [None]:
runtime.unassign()