In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install sentencepiece
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

In [3]:
!nvidia-smi

Fri Oct  6 15:44:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    52W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-large"
    gradient_checkpointing=False
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=5e-6
    decoder_lr=5e-6
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=4
    max_len=1024
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_size=2
    fc_dropout=0.2
    target_cols=['content','wording']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    freezing=True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
import os

DIR = '/content/drive/MyDrive/Competitions/Kaggle/Commonlit'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_MODEL_DIR = DIR + '/output/EXP051/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [6]:
# ====================================================
# Library
# ====================================================
from google.colab import runtime
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs


import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW, Optimizer
from torch.utils.data import DataLoader, Dataset
from torch.autograd.function import InplaceFunction
import torch.nn.init as init

#os.system('pip uninstall -y transformers')
#os.system('pip uninstall -y tokenizers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb transformers')
#os.system('python -m pip install --no-index --find-links=/content/drive/MyDrive/Competitions/Kaggle/FeedBack3/pip_wheel.ipynb tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


In [7]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False

def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """

    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)

    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """

    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"

        if hasattr(embeddings_path, attr_name):
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
prompts_train = pd.read_csv(os.path.join(INPUT_DIR,'prompts_train.csv'))
summary_train = pd.read_csv(os.path.join(INPUT_DIR,'summaries_train.csv'))

print(f"Prompt Train.shape: {prompts_train.shape}")
display(prompts_train.head())
print(f"Summary Train.shape: {summary_train.shape}")
display(summary_train.head())

Prompt Train.shape: (4, 4)


Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


Summary Train.shape: (7165, 5)


Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [10]:
train = prompts_train.merge(summary_train, on="prompt_id")
display(train.head())

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [11]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(train, groups=train["prompt_id"])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2057
1    2009
2    1996
3    1103
dtype: int64

In [12]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def len2text(x:str):
    if len(x)<200: return 'Quite short'
    if len(x)<300: return 'Short'
    if len(x)<500: return 'Middle'
    else:
        return 'Long'

In [15]:
train["text_length"] = train['text'].apply(lambda x: len2text(x))
SEP = tokenizer.sep_token
train['full_text'] = train['text'] + " " + train['prompt_question'] + " " + train["prompt_text"]

In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #self.pool = MeanPooling()
        self.output = nn.Sequential(
            nn.LayerNorm(self.config.hidden_size),
            nn.Linear(self.config.hidden_size, self.cfg.target_size)
        )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        feature = outputs[0][:, 0, :]
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.output(feature)
        return output

In [18]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        #if scaler is not None:
        #    scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [20]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    print(f"========== prompt_id: {valid_folds.prompt_id.unique()} validation ==========")

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

        group1=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.']
        group2=['layer.8.','layer.9.','layer.10.','layer.11.','layer.12.','layer.13.','layer.14.','layer.15.']
        group3=['layer.16.','layer.17.','layer.18.','layer.19.','layer.20.','layer.21.','layer.22.','layer.23.']
        group_all=['layer.0.','layer.1.','layer.2.','layer.3.','layer.4.','layer.5.','layer.6.','layer.7.','layer.8.','layer.9.','layer.10.','layer.11.',
                  'layer.12.','layer.13.','layer.14.','layer.15.','layer.16.','layer.17.','layer.18.','layer.19.','layer.20.','layer.21.','layer.22.','layer.23.']


        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]

        optimizer_parameters2 = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': weight_decay, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': weight_decay, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': weight_decay, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],'weight_decay': 0.0},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],'weight_decay': 0.0, 'lr': encoder_lr/2.6},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],'weight_decay': 0.0, 'lr': encoder_lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],'weight_decay': 0.0, 'lr': encoder_lr*2.6},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'lr':decoder_lr, "momentum" : 0.99},
    ]
        return optimizer_parameters2

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = RMSELoss(reduction="mean")   # nn.SmoothL1Loss(reduction='mean')

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')

        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')





DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Downloading pytorch_model.bin:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1277] Elapsed 0m 4s (remain 97m 49s) Loss: 0.4300(0.4300) Grad: inf  LR: 0.00000500  
Epoch: [1][20/1277] Elapsed 0m 13s (remain 13m 40s) Loss: 1.2283(0.9389) Grad: 578748.1875  LR: 0.00000500  
Epoch: [1][40/1277] Elapsed 0m 22s (remain 11m 15s) Loss: 1.0243(0.9173) Grad: 142105.7500  LR: 0.00000500  
Epoch: [1][60/1277] Elapsed 0m 30s (remain 10m 15s) Loss: 0.7999(0.8687) Grad: 182823.7031  LR: 0.00000500  
Epoch: [1][80/1277] Elapsed 0m 39s (remain 9m 46s) Loss: 0.8943(0.8425) Grad: 140373.2500  LR: 0.00000500  
Epoch: [1][100/1277] Elapsed 0m 48s (remain 9m 23s) Loss: 0.4836(0.8123) Grad: 196732.1562  LR: 0.00000500  
Epoch: [1][120/1277] Elapsed 0m 57s (remain 9m 8s) Loss: 0.4469(0.7797) Grad: 193994.9688  LR: 0.00000499  
Epoch: [1][140/1277] Elapsed 1m 6s (remain 8m 52s) Loss: 0.5742(0.7538) Grad: 509884.6250  LR: 0.00000499  
Epoch: [1][160/1277] Elapsed 1m 14s (remain 8m 38s) Loss: 0.5066(0.7300) Grad: 208688.6250  LR: 0.00000499  
Epoch: [1][180/1277] Elapsed 1m 

Epoch 1 - avg_train_loss: 0.4725  avg_val_loss: 0.3830  time: 716s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4725  avg_val_loss: 0.3830  time: 716s
Epoch 1 - Score: 0.5119  Scores: [0.44483420300990417, 0.5790308909567133]
INFO:__main__:Epoch 1 - Score: 0.5119  Scores: [0.44483420300990417, 0.5790308909567133]
Epoch 1 - Save Best Score: 0.5119 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5119 Model


EVAL: [257/258] Elapsed 2m 34s (remain 0m 0s) Loss: 0.3336(0.3830) 
Epoch: [2][0/1277] Elapsed 0m 0s (remain 14m 28s) Loss: 0.2931(0.2931) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1277] Elapsed 0m 9s (remain 9m 38s) Loss: 0.2806(0.3802) Grad: 329069.3438  LR: 0.00000424  
Epoch: [2][40/1277] Elapsed 0m 18s (remain 9m 14s) Loss: 0.4352(0.3682) Grad: 424373.7188  LR: 0.00000422  
Epoch: [2][60/1277] Elapsed 0m 27s (remain 9m 2s) Loss: 0.1496(0.3560) Grad: 203443.9062  LR: 0.00000420  
Epoch: [2][80/1277] Elapsed 0m 35s (remain 8m 45s) Loss: 0.2658(0.3566) Grad: 198044.3281  LR: 0.00000418  
Epoch: [2][100/1277] Elapsed 0m 44s (remain 8m 36s) Loss: 0.3042(0.3558) Grad: 188234.9531  LR: 0.00000415  
Epoch: [2][120/1277] Elapsed 0m 53s (remain 8m 29s) Loss: 0.3013(0.3610) Grad: 280612.1875  LR: 0.00000413  
Epoch: [2][140/1277] Elapsed 1m 1s (remain 8m 18s) Loss: 0.2407(0.3642) Grad: 198914.5469  LR: 0.00000411  
Epoch: [2][160/1277] Elapsed 1m 10s (remain 8m 11s) Loss: 0.1903(0.3635) Grad

Epoch 2 - avg_train_loss: 0.3501  avg_val_loss: 0.3563  time: 714s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3501  avg_val_loss: 0.3563  time: 714s
Epoch 2 - Score: 0.4753  Scores: [0.42690990090064596, 0.5236497811189503]
INFO:__main__:Epoch 2 - Score: 0.4753  Scores: [0.42690990090064596, 0.5236497811189503]
Epoch 2 - Save Best Score: 0.4753 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4753 Model


EVAL: [257/258] Elapsed 2m 34s (remain 0m 0s) Loss: 0.2953(0.3563) 
Epoch: [3][0/1277] Elapsed 0m 0s (remain 14m 59s) Loss: 0.3812(0.3812) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1277] Elapsed 0m 9s (remain 9m 8s) Loss: 0.5053(0.3381) Grad: 275114.9375  LR: 0.00000247  
Epoch: [3][40/1277] Elapsed 0m 18s (remain 9m 3s) Loss: 0.3069(0.3337) Grad: 132528.3438  LR: 0.00000244  
Epoch: [3][60/1277] Elapsed 0m 26s (remain 8m 52s) Loss: 0.4277(0.3244) Grad: 168061.0781  LR: 0.00000241  
Epoch: [3][80/1277] Elapsed 0m 35s (remain 8m 45s) Loss: 0.3552(0.3187) Grad: 771920.1875  LR: 0.00000238  
Epoch: [3][100/1277] Elapsed 0m 44s (remain 8m 36s) Loss: 0.6330(0.3284) Grad: 208215.0312  LR: 0.00000234  
Epoch: [3][120/1277] Elapsed 0m 53s (remain 8m 29s) Loss: 0.3087(0.3253) Grad: 167086.3281  LR: 0.00000231  
Epoch: [3][140/1277] Elapsed 1m 2s (remain 8m 22s) Loss: 0.1950(0.3206) Grad: 204483.8906  LR: 0.00000228  
Epoch: [3][160/1277] Elapsed 1m 11s (remain 8m 13s) Loss: 0.4075(0.3233) Grad:

Epoch 3 - avg_train_loss: 0.3060  avg_val_loss: 0.3710  time: 714s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3060  avg_val_loss: 0.3710  time: 714s
Epoch 3 - Score: 0.4779  Scores: [0.40340548522010666, 0.5523803101758128]
INFO:__main__:Epoch 3 - Score: 0.4779  Scores: [0.40340548522010666, 0.5523803101758128]


EVAL: [257/258] Elapsed 2m 34s (remain 0m 0s) Loss: 0.2561(0.3710) 
Epoch: [4][0/1277] Elapsed 0m 0s (remain 14m 37s) Loss: 0.2227(0.2227) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1277] Elapsed 0m 9s (remain 9m 26s) Loss: 0.3116(0.2796) Grad: 296240.7188  LR: 0.00000071  
Epoch: [4][40/1277] Elapsed 0m 18s (remain 9m 5s) Loss: 0.3548(0.2821) Grad: 164856.8594  LR: 0.00000069  
Epoch: [4][60/1277] Elapsed 0m 26s (remain 8m 55s) Loss: 0.3339(0.2874) Grad: 226343.3750  LR: 0.00000067  
Epoch: [4][80/1277] Elapsed 0m 35s (remain 8m 46s) Loss: 0.1377(0.2827) Grad: 380244.1250  LR: 0.00000065  
Epoch: [4][100/1277] Elapsed 0m 44s (remain 8m 38s) Loss: 0.2526(0.2823) Grad: 325458.7188  LR: 0.00000063  
Epoch: [4][120/1277] Elapsed 0m 53s (remain 8m 29s) Loss: 0.3951(0.2771) Grad: 259619.6562  LR: 0.00000061  
Epoch: [4][140/1277] Elapsed 1m 1s (remain 8m 18s) Loss: 0.1747(0.2720) Grad: 171878.9844  LR: 0.00000059  
Epoch: [4][160/1277] Elapsed 1m 10s (remain 8m 9s) Loss: 0.2646(0.2707) Grad:

Epoch 4 - avg_train_loss: 0.2672  avg_val_loss: 0.3591  time: 717s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2672  avg_val_loss: 0.3591  time: 717s
Epoch 4 - Score: 0.4654  Scores: [0.39583632324723655, 0.5349437231233777]
INFO:__main__:Epoch 4 - Score: 0.4654  Scores: [0.39583632324723655, 0.5349437231233777]
Epoch 4 - Save Best Score: 0.4654 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4654 Model


EVAL: [257/258] Elapsed 2m 34s (remain 0m 0s) Loss: 0.2574(0.3591) 


Score: 0.4654  Scores: [0.39583632324723655, 0.5349437231233777]
INFO:__main__:Score: 0.4654  Scores: [0.39583632324723655, 0.5349437231233777]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1289] Elapsed 0m 0s (remain 18m 10s) Loss: 1.6900(1.6900) Grad: nan  LR: 0.00000500  
Epoch: [1][20/1289] Elapsed 0m 9s (remain 9m 48s) Loss: 0.9303(0.8280) Grad: 193538.8438  LR: 0.00000500  
Epoch: [1][40/1289] Elapsed 0m 18s (remain 9m 24s) Loss: 1.0490(0.8066) Grad: 128732.6094  LR: 0.00000500  
Epoch: [1][60/1289] Elapsed 0m 27s (remain 9m 5s) Loss: 0.6676(0.7884) Grad: 81768.9531  LR: 0.00000500  
Epoch: [1][80/1289] Elapsed 0m 35s (remain 8m 56s) Loss: 1.3247(0.7667) Grad: 765052.8750  LR: 0.00000500  
Epoch: [1][100/1289] Elapsed 0m 44s (remain 8m 42s) Loss: 0.4763(0.7387) Grad: 359124.5312  LR: 0.00000500  
Epoch: [1][120/1289] Elapsed 0m 53s (remain 8m 33s) Loss: 0.5581(0.7130) Grad: 247514.8281  LR: 0.00000499  
Epoch: [1][140/1289] Elapsed 1m 2s (remain 8m 25s) Loss: 0.7721(0.6970) Grad: 370200.1875  LR: 0.00000499  
Epoch: [1][160/1289] Elapsed 1m 10s (remain 8m 15s) Loss: 0.7688(0.6861) Grad: 1456553.7500  LR: 0.00000499  
Epoch: [1][180/1289] Elapsed 1m 19s 

Epoch 1 - avg_train_loss: 0.4650  avg_val_loss: 0.6953  time: 713s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4650  avg_val_loss: 0.6953  time: 713s
Epoch 1 - Score: 0.8406  Scores: [0.7779752297077392, 0.9032787886020053]
INFO:__main__:Epoch 1 - Score: 0.8406  Scores: [0.7779752297077392, 0.9032787886020053]
Epoch 1 - Save Best Score: 0.8406 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8406 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.7895(0.6953) 
Epoch: [2][0/1289] Elapsed 0m 0s (remain 16m 43s) Loss: 0.7640(0.7640) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1289] Elapsed 0m 9s (remain 9m 32s) Loss: 0.2214(0.4304) Grad: 392023.1875  LR: 0.00000425  
Epoch: [2][40/1289] Elapsed 0m 18s (remain 9m 10s) Loss: 0.3764(0.3970) Grad: 217018.8438  LR: 0.00000422  
Epoch: [2][60/1289] Elapsed 0m 26s (remain 9m 1s) Loss: 0.2918(0.3738) Grad: 345003.7812  LR: 0.00000420  
Epoch: [2][80/1289] Elapsed 0m 35s (remain 8m 50s) Loss: 0.2816(0.3777) Grad: 471616.8438  LR: 0.00000418  
Epoch: [2][100/1289] Elapsed 0m 43s (remain 8m 36s) Loss: 0.2585(0.3703) Grad: 369491.1875  LR: 0.00000416  
Epoch: [2][120/1289] Elapsed 0m 52s (remain 8m 29s) Loss: 0.2747(0.3663) Grad: 305029.5625  LR: 0.00000413  
Epoch: [2][140/1289] Elapsed 1m 1s (remain 8m 21s) Loss: 0.2496(0.3658) Grad: 177834.7188  LR: 0.00000411  
Epoch: [2][160/1289] Elapsed 1m 10s (remain 8m 13s) Loss: 0.3504(0.3657) Grad

Epoch 2 - avg_train_loss: 0.3370  avg_val_loss: 0.4390  time: 709s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3370  avg_val_loss: 0.4390  time: 709s
Epoch 2 - Score: 0.5606  Scores: [0.5184668684609455, 0.602665491343888]
INFO:__main__:Epoch 2 - Score: 0.5606  Scores: [0.5184668684609455, 0.602665491343888]
Epoch 2 - Save Best Score: 0.5606 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5606 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.4517(0.4390) 
Epoch: [3][0/1289] Elapsed 0m 0s (remain 16m 30s) Loss: 0.4577(0.4577) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1289] Elapsed 0m 9s (remain 9m 13s) Loss: 0.2204(0.2868) Grad: 195042.0312  LR: 0.00000247  
Epoch: [3][40/1289] Elapsed 0m 17s (remain 9m 0s) Loss: 0.2570(0.2814) Grad: 285724.2500  LR: 0.00000244  
Epoch: [3][60/1289] Elapsed 0m 26s (remain 8m 49s) Loss: 0.4700(0.2770) Grad: 197902.5938  LR: 0.00000241  
Epoch: [3][80/1289] Elapsed 0m 35s (remain 8m 42s) Loss: 0.3233(0.2817) Grad: 373620.1875  LR: 0.00000238  
Epoch: [3][100/1289] Elapsed 0m 43s (remain 8m 33s) Loss: 0.2997(0.2848) Grad: 529268.9375  LR: 0.00000235  
Epoch: [3][120/1289] Elapsed 0m 52s (remain 8m 26s) Loss: 0.3767(0.2886) Grad: 191595.0000  LR: 0.00000232  
Epoch: [3][140/1289] Elapsed 1m 1s (remain 8m 17s) Loss: 0.3185(0.2930) Grad: 160826.1406  LR: 0.00000229  
Epoch: [3][160/1289] Elapsed 1m 9s (remain 8m 8s) Loss: 0.2224(0.2947) Grad: 

Epoch 3 - avg_train_loss: 0.2970  avg_val_loss: 0.4101  time: 708s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2970  avg_val_loss: 0.4101  time: 708s
Epoch 3 - Score: 0.5323  Scores: [0.45984544460468624, 0.6047384115187127]
INFO:__main__:Epoch 3 - Score: 0.5323  Scores: [0.45984544460468624, 0.6047384115187127]
Epoch 3 - Save Best Score: 0.5323 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5323 Model


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3365(0.4101) 
Epoch: [4][0/1289] Elapsed 0m 0s (remain 16m 23s) Loss: 0.4230(0.4230) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1289] Elapsed 0m 9s (remain 9m 39s) Loss: 0.2925(0.2850) Grad: 472680.2500  LR: 0.00000071  
Epoch: [4][40/1289] Elapsed 0m 18s (remain 9m 9s) Loss: 0.1523(0.2788) Grad: 224410.2500  LR: 0.00000069  
Epoch: [4][60/1289] Elapsed 0m 26s (remain 8m 56s) Loss: 0.2947(0.2732) Grad: 180090.2656  LR: 0.00000067  
Epoch: [4][80/1289] Elapsed 0m 35s (remain 8m 43s) Loss: 0.1691(0.2663) Grad: 186970.6094  LR: 0.00000065  
Epoch: [4][100/1289] Elapsed 0m 43s (remain 8m 34s) Loss: 0.1668(0.2672) Grad: 242058.1250  LR: 0.00000063  
Epoch: [4][120/1289] Elapsed 0m 52s (remain 8m 25s) Loss: 0.0862(0.2648) Grad: 138325.8906  LR: 0.00000061  
Epoch: [4][140/1289] Elapsed 1m 1s (remain 8m 18s) Loss: 0.4260(0.2695) Grad: 159167.4375  LR: 0.00000059  
Epoch: [4][160/1289] Elapsed 1m 10s (remain 8m 11s) Loss: 0.2158(0.2692) Grad

Epoch 4 - avg_train_loss: 0.2629  avg_val_loss: 0.4126  time: 707s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2629  avg_val_loss: 0.4126  time: 707s
Epoch 4 - Score: 0.5328  Scores: [0.4658163611150141, 0.5997514901145253]
INFO:__main__:Epoch 4 - Score: 0.5328  Scores: [0.4658163611150141, 0.5997514901145253]


EVAL: [251/252] Elapsed 2m 33s (remain 0m 0s) Loss: 0.3849(0.4126) 


Score: 0.5323  Scores: [0.45984544460468624, 0.6047384115187127]
INFO:__main__:Score: 0.5323  Scores: [0.45984544460468624, 0.6047384115187127]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1292] Elapsed 0m 0s (remain 16m 17s) Loss: 2.1244(2.1244) Grad: inf  LR: 0.00000500  
Epoch: [1][20/1292] Elapsed 0m 7s (remain 7m 45s) Loss: 0.7573(1.0166) Grad: 122319.5938  LR: 0.00000500  
Epoch: [1][40/1292] Elapsed 0m 14s (remain 7m 32s) Loss: 0.5295(0.9044) Grad: 815699.7500  LR: 0.00000500  
Epoch: [1][60/1292] Elapsed 0m 21s (remain 7m 21s) Loss: 0.8274(0.8522) Grad: 3309849.7500  LR: 0.00000500  
Epoch: [1][80/1292] Elapsed 0m 28s (remain 7m 12s) Loss: 0.6208(0.8300) Grad: 119938.0625  LR: 0.00000500  
Epoch: [1][100/1292] Elapsed 0m 35s (remain 7m 1s) Loss: 0.6372(0.7817) Grad: 429386.1562  LR: 0.00000500  
Epoch: [1][120/1292] Elapsed 0m 42s (remain 6m 54s) Loss: 0.9738(0.7517) Grad: 398015.9375  LR: 0.00000499  
Epoch: [1][140/1292] Elapsed 0m 49s (remain 6m 44s) Loss: 0.5852(0.7302) Grad: 217283.2500  LR: 0.00000499  
Epoch: [1][160/1292] Elapsed 0m 56s (remain 6m 38s) Loss: 0.4490(0.7132) Grad: 192011.3750  LR: 0.00000499  
Epoch: [1][180/1292] Elapsed 1m 3s

Epoch 1 - avg_train_loss: 0.4737  avg_val_loss: 0.3807  time: 635s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4737  avg_val_loss: 0.3807  time: 635s
Epoch 1 - Score: 0.5020  Scores: [0.49288389891292894, 0.5111690069662758]
INFO:__main__:Epoch 1 - Score: 0.5020  Scores: [0.49288389891292894, 0.5111690069662758]
Epoch 1 - Save Best Score: 0.5020 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5020 Model


EVAL: [249/250] Elapsed 3m 2s (remain 0m 0s) Loss: 0.5259(0.3807) 
Epoch: [2][0/1292] Elapsed 0m 0s (remain 15m 0s) Loss: 0.3451(0.3451) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1292] Elapsed 0m 8s (remain 8m 5s) Loss: 0.3735(0.3380) Grad: 207337.8125  LR: 0.00000425  
Epoch: [2][40/1292] Elapsed 0m 14s (remain 7m 29s) Loss: 0.7063(0.3466) Grad: 371340.0312  LR: 0.00000422  
Epoch: [2][60/1292] Elapsed 0m 21s (remain 7m 19s) Loss: 0.3748(0.3538) Grad: 126224.0781  LR: 0.00000420  
Epoch: [2][80/1292] Elapsed 0m 28s (remain 7m 9s) Loss: 0.2387(0.3430) Grad: 590144.0625  LR: 0.00000418  
Epoch: [2][100/1292] Elapsed 0m 35s (remain 7m 0s) Loss: 0.5908(0.3382) Grad: 513365.3438  LR: 0.00000416  
Epoch: [2][120/1292] Elapsed 0m 42s (remain 6m 53s) Loss: 0.5250(0.3359) Grad: 405289.5938  LR: 0.00000413  
Epoch: [2][140/1292] Elapsed 0m 50s (remain 6m 48s) Loss: 0.3880(0.3341) Grad: 181201.1406  LR: 0.00000411  
Epoch: [2][160/1292] Elapsed 0m 57s (remain 6m 41s) Loss: 0.3833(0.3311) Grad: 2

Epoch 2 - avg_train_loss: 0.3363  avg_val_loss: 0.3536  time: 633s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3363  avg_val_loss: 0.3536  time: 633s
Epoch 2 - Score: 0.4644  Scores: [0.43860453396902144, 0.4902857944600095]
INFO:__main__:Epoch 2 - Score: 0.4644  Scores: [0.43860453396902144, 0.4902857944600095]
Epoch 2 - Save Best Score: 0.4644 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4644 Model


EVAL: [249/250] Elapsed 3m 1s (remain 0m 0s) Loss: 0.4206(0.3536) 
Epoch: [3][0/1292] Elapsed 0m 0s (remain 14m 3s) Loss: 0.1283(0.1283) Grad: 719197.8125  LR: 0.00000250  
Epoch: [3][20/1292] Elapsed 0m 7s (remain 7m 31s) Loss: 0.3126(0.3006) Grad: 251820.3750  LR: 0.00000247  
Epoch: [3][40/1292] Elapsed 0m 14s (remain 7m 22s) Loss: 0.2738(0.2942) Grad: 457109.2812  LR: 0.00000244  
Epoch: [3][60/1292] Elapsed 0m 21s (remain 7m 11s) Loss: 0.4466(0.3012) Grad: 355875.0625  LR: 0.00000241  
Epoch: [3][80/1292] Elapsed 0m 28s (remain 7m 6s) Loss: 0.3796(0.3072) Grad: 425948.8125  LR: 0.00000238  
Epoch: [3][100/1292] Elapsed 0m 35s (remain 6m 57s) Loss: 0.2267(0.2939) Grad: 425327.6562  LR: 0.00000235  
Epoch: [3][120/1292] Elapsed 0m 42s (remain 6m 52s) Loss: 0.3050(0.2937) Grad: 249299.2344  LR: 0.00000232  
Epoch: [3][140/1292] Elapsed 0m 49s (remain 6m 44s) Loss: 0.2718(0.2953) Grad: 220472.8906  LR: 0.00000229  
Epoch: [3][160/1292] Elapsed 0m 56s (remain 6m 37s) Loss: 0.2739(0.294

Epoch 3 - avg_train_loss: 0.3037  avg_val_loss: 0.3494  time: 633s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3037  avg_val_loss: 0.3494  time: 633s
Epoch 3 - Score: 0.4546  Scores: [0.41174326642239495, 0.497505934019776]
INFO:__main__:Epoch 3 - Score: 0.4546  Scores: [0.41174326642239495, 0.497505934019776]
Epoch 3 - Save Best Score: 0.4546 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4546 Model


EVAL: [249/250] Elapsed 3m 2s (remain 0m 0s) Loss: 0.4935(0.3494) 
Epoch: [4][0/1292] Elapsed 0m 0s (remain 13m 53s) Loss: 0.2379(0.2379) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1292] Elapsed 0m 7s (remain 7m 55s) Loss: 0.1725(0.3144) Grad: 419119.0938  LR: 0.00000071  
Epoch: [4][40/1292] Elapsed 0m 14s (remain 7m 37s) Loss: 0.2443(0.3080) Grad: 313442.1562  LR: 0.00000069  
Epoch: [4][60/1292] Elapsed 0m 22s (remain 7m 24s) Loss: 0.4197(0.3040) Grad: 639898.5000  LR: 0.00000067  
Epoch: [4][80/1292] Elapsed 0m 28s (remain 7m 12s) Loss: 0.1273(0.2987) Grad: 483162.5625  LR: 0.00000065  
Epoch: [4][100/1292] Elapsed 0m 36s (remain 7m 6s) Loss: 0.2096(0.2936) Grad: 232325.3750  LR: 0.00000063  
Epoch: [4][120/1292] Elapsed 0m 43s (remain 6m 56s) Loss: 0.3829(0.2912) Grad: 181049.4688  LR: 0.00000061  
Epoch: [4][140/1292] Elapsed 0m 49s (remain 6m 46s) Loss: 0.1938(0.2847) Grad: 596755.4375  LR: 0.00000059  
Epoch: [4][160/1292] Elapsed 0m 56s (remain 6m 38s) Loss: 0.2299(0.2865) Grad

Epoch 4 - avg_train_loss: 0.2746  avg_val_loss: 0.3436  time: 633s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2746  avg_val_loss: 0.3436  time: 633s
Epoch 4 - Score: 0.4473  Scores: [0.4111314095520786, 0.48350018598369315]
INFO:__main__:Epoch 4 - Score: 0.4473  Scores: [0.4111314095520786, 0.48350018598369315]
Epoch 4 - Save Best Score: 0.4473 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.4473 Model


EVAL: [249/250] Elapsed 3m 2s (remain 0m 0s) Loss: 0.4637(0.3436) 


Score: 0.4473  Scores: [0.4111314095520786, 0.48350018598369315]
INFO:__main__:Score: 0.4473  Scores: [0.4111314095520786, 0.48350018598369315]




DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.21.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Epoch: [1][0/1515] Elapsed 0m 0s (remain 21m 37s) Loss: 0.9210(0.9210) Grad: 8166189.5000  LR: 0.00000500  
Epoch: [1][20/1515] Elapsed 0m 9s (remain 11m 17s) Loss: 1.0471(0.9011) Grad: 178095.7969  LR: 0.00000500  
Epoch: [1][40/1515] Elapsed 0m 18s (remain 10m 51s) Loss: 0.3203(0.8043) Grad: 98480.4219  LR: 0.00000500  
Epoch: [1][60/1515] Elapsed 0m 26s (remain 10m 39s) Loss: 0.6012(0.8041) Grad: 471706.9688  LR: 0.00000500  
Epoch: [1][80/1515] Elapsed 0m 35s (remain 10m 26s) Loss: 0.4242(0.7976) Grad: 1958238.1250  LR: 0.00000500  
Epoch: [1][100/1515] Elapsed 0m 43s (remain 10m 13s) Loss: 0.9157(0.7714) Grad: 247364.3125  LR: 0.00000500  
Epoch: [1][120/1515] Elapsed 0m 52s (remain 10m 3s) Loss: 0.5443(0.7474) Grad: 146340.2188  LR: 0.00000500  
Epoch: [1][140/1515] Elapsed 1m 0s (remain 9m 52s) Loss: 0.5285(0.7294) Grad: 817501.5000  LR: 0.00000499  
Epoch: [1][160/1515] Elapsed 1m 9s (remain 9m 45s) Loss: 0.3111(0.7112) Grad: 445388.5000  LR: 0.00000499  
Epoch: [1][180/1515] E

Epoch 1 - avg_train_loss: 0.4483  avg_val_loss: 0.4976  time: 723s
INFO:__main__:Epoch 1 - avg_train_loss: 0.4483  avg_val_loss: 0.4976  time: 723s
Epoch 1 - Score: 0.6531  Scores: [0.527906272189333, 0.7781947412741725]
INFO:__main__:Epoch 1 - Score: 0.6531  Scores: [0.527906272189333, 0.7781947412741725]
Epoch 1 - Save Best Score: 0.6531 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6531 Model


EVAL: [137/138] Elapsed 1m 19s (remain 0m 0s) Loss: 0.7244(0.4976) 
Epoch: [2][0/1515] Elapsed 0m 0s (remain 20m 0s) Loss: 0.4402(0.4402) Grad: inf  LR: 0.00000427  
Epoch: [2][20/1515] Elapsed 0m 9s (remain 10m 43s) Loss: 0.4217(0.3722) Grad: 208190.2344  LR: 0.00000425  
Epoch: [2][40/1515] Elapsed 0m 17s (remain 10m 20s) Loss: 0.3847(0.3755) Grad: 137133.4062  LR: 0.00000423  
Epoch: [2][60/1515] Elapsed 0m 25s (remain 10m 13s) Loss: 0.2771(0.3594) Grad: 324603.8750  LR: 0.00000421  
Epoch: [2][80/1515] Elapsed 0m 34s (remain 10m 9s) Loss: 0.2957(0.3500) Grad: 275898.8750  LR: 0.00000419  
Epoch: [2][100/1515] Elapsed 0m 43s (remain 10m 7s) Loss: 0.4388(0.3588) Grad: 173802.9375  LR: 0.00000417  
Epoch: [2][120/1515] Elapsed 0m 51s (remain 9m 58s) Loss: 0.3915(0.3540) Grad: 126000.1406  LR: 0.00000415  
Epoch: [2][140/1515] Elapsed 1m 0s (remain 9m 47s) Loss: 0.4625(0.3561) Grad: 259595.0625  LR: 0.00000413  
Epoch: [2][160/1515] Elapsed 1m 8s (remain 9m 39s) Loss: 0.2688(0.3555) Gr

Epoch 2 - avg_train_loss: 0.3358  avg_val_loss: 0.4502  time: 723s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3358  avg_val_loss: 0.4502  time: 723s
Epoch 2 - Score: 0.5909  Scores: [0.49063706703281357, 0.6911610379800536]
INFO:__main__:Epoch 2 - Score: 0.5909  Scores: [0.49063706703281357, 0.6911610379800536]
Epoch 2 - Save Best Score: 0.5909 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5909 Model


EVAL: [137/138] Elapsed 1m 19s (remain 0m 0s) Loss: 0.6126(0.4502) 
Epoch: [3][0/1515] Elapsed 0m 0s (remain 20m 8s) Loss: 0.1550(0.1550) Grad: inf  LR: 0.00000250  
Epoch: [3][20/1515] Elapsed 0m 9s (remain 11m 8s) Loss: 0.2993(0.3417) Grad: 123067.0156  LR: 0.00000247  
Epoch: [3][40/1515] Elapsed 0m 18s (remain 10m 48s) Loss: 0.3063(0.3215) Grad: 161034.2031  LR: 0.00000245  
Epoch: [3][60/1515] Elapsed 0m 26s (remain 10m 30s) Loss: 0.3846(0.3109) Grad: 218167.7344  LR: 0.00000242  
Epoch: [3][80/1515] Elapsed 0m 35s (remain 10m 20s) Loss: 0.2280(0.3142) Grad: 162868.4688  LR: 0.00000240  
Epoch: [3][100/1515] Elapsed 0m 43s (remain 10m 10s) Loss: 0.2083(0.3138) Grad: 221295.7812  LR: 0.00000237  
Epoch: [3][120/1515] Elapsed 0m 52s (remain 10m 0s) Loss: 0.2978(0.3149) Grad: 173589.9375  LR: 0.00000234  
Epoch: [3][140/1515] Elapsed 1m 0s (remain 9m 51s) Loss: 0.3705(0.3138) Grad: 244841.3906  LR: 0.00000232  
Epoch: [3][160/1515] Elapsed 1m 9s (remain 9m 44s) Loss: 0.2822(0.3066) G

Epoch 3 - avg_train_loss: 0.2955  avg_val_loss: 0.4589  time: 726s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2955  avg_val_loss: 0.4589  time: 726s
Epoch 3 - Score: 0.6043  Scores: [0.505107575209257, 0.7034044729086261]
INFO:__main__:Epoch 3 - Score: 0.6043  Scores: [0.505107575209257, 0.7034044729086261]


EVAL: [137/138] Elapsed 1m 19s (remain 0m 0s) Loss: 0.6391(0.4589) 
Epoch: [4][0/1515] Elapsed 0m 0s (remain 19m 58s) Loss: 0.2833(0.2833) Grad: inf  LR: 0.00000073  
Epoch: [4][20/1515] Elapsed 0m 9s (remain 10m 42s) Loss: 0.1499(0.2520) Grad: 326217.9062  LR: 0.00000071  
Epoch: [4][40/1515] Elapsed 0m 17s (remain 10m 35s) Loss: 0.2483(0.2712) Grad: 175347.5625  LR: 0.00000070  
Epoch: [4][60/1515] Elapsed 0m 26s (remain 10m 33s) Loss: 0.1883(0.2583) Grad: 172899.4219  LR: 0.00000068  
Epoch: [4][80/1515] Elapsed 0m 35s (remain 10m 25s) Loss: 0.2092(0.2518) Grad: 116701.5469  LR: 0.00000066  
Epoch: [4][100/1515] Elapsed 0m 43s (remain 10m 14s) Loss: 0.1376(0.2585) Grad: 151018.1406  LR: 0.00000064  
Epoch: [4][120/1515] Elapsed 0m 52s (remain 10m 5s) Loss: 0.3718(0.2641) Grad: 185162.7656  LR: 0.00000063  
Epoch: [4][140/1515] Elapsed 1m 1s (remain 9m 55s) Loss: 0.3959(0.2644) Grad: 188688.1094  LR: 0.00000061  
Epoch: [4][160/1515] Elapsed 1m 9s (remain 9m 47s) Loss: 0.2058(0.2645)

Epoch 4 - avg_train_loss: 0.2623  avg_val_loss: 0.4256  time: 726s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2623  avg_val_loss: 0.4256  time: 726s
Epoch 4 - Score: 0.5603  Scores: [0.4889781275212197, 0.631637156143467]
INFO:__main__:Epoch 4 - Score: 0.5603  Scores: [0.4889781275212197, 0.631637156143467]
Epoch 4 - Save Best Score: 0.5603 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.5603 Model


EVAL: [137/138] Elapsed 1m 19s (remain 0m 0s) Loss: 0.5669(0.4256) 




In [None]:
labels = oof_df[CFG.target_cols].values
preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
score, scores = get_score(labels, preds)
print(f'Score: {score:<.4f}  Scores: {scores}')

In [None]:
runtime.unassign()