## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [34]:
EXP_NAME = "nbme-exp001"
ENV = "local"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [35]:
class CFG:
    apex=True
    input_dir=None
    output_dir=None
    #env="colab"  # ["kaggle", "colab"]
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-base"
    tokenizer=None
    max_len=None
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=16
    lr=2e-5
    weight_decay=0.1
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=5
    train_fold=[0, 1, 2, 3, 4]
    seed=71
    gradient_accumulation_steps=1
    max_grad_norm=1000
    print_freq=100
    train=False
    inference=True

In [36]:
if DEBUG_MODE:
    CFG.epochs = 2
    CFG.train_fold = [0]

if SUBMISSION_MODE:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [37]:
import sys
from pathlib import Path


print(ENV)
if ENV == "colab":
    # colab環境
    CFG.env = "colab"
    from google.colab import drive
    drive._mount('/content/drive')
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/petfinder2/input/petfinder-pawpularity-score/")
    CFG.train_image_dir = CFG.input_dir / "train"
    CFG.test_image_dir = CFG.input_dir / "test"
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/petfinder2/output/") / EXP_NAME
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
elif ENV == "local":
    # ローカルサーバ
    CFG.env = "local"
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / EXP_NAME
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
elif ENV == "kaggle":
    # kaggle環境
    CFG.env = "kaggle"
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.train_image_dir = CFG.input_dir / "train"
    CFG.test_image_dir = CFG.input_dir / "test"
    CFG.output_dir = Path("./")

local


In [38]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [39]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

## Data Loading

In [40]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")
train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [41]:
if DEBUG_MODE:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)

## Preprocessing

In [42]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [43]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")
train.shape, test.shape

((14300, 8), (5, 6))

In [44]:
def fix_anno(df, target_id, annotation, location):
    idx = df["id"] == target_id
    df.loc[idx, "annotation"] = annotation
    df.loc[idx, "location"] = location

In [45]:
fix_anno(train, "00669_000", "['father heart attack']", "['764 783']")
fix_anno(train, "01110_010", "['for the last 2-3 months', 'over the last 2 months']", "['77 100', '398 420']")
fix_anno(train, "01146_005", "['no heat intolerance', 'no cold intolerance']", "['285 292;301 312', '285 287;296 312']")
fix_anno(train, "02428_001", "['mother thyroid problem']", "['551 557;565 580']")
fix_anno(train, "02428_004", '[\'felt like he was going to "pass out"\']', "['131 135;181 212']")
fix_anno(train, "10047_105", "['stool , with no blood']", "['259 280']")
fix_anno(train, "10196_105", "['diarrhoe non blooody']", "['176 184;201 212']")
fix_anno(train, "10206_103", "['diarrhea for last 2-3 days']", "['249 257;271 288']")
fix_anno(train, "10228_100", "['no vaginal discharge']", "['822 824;907 924']")
fix_anno(train, "10268_111", "['started about 8-10 hours ago']", "['101 129']")
fix_anno(train, "10459_105", "['no blood in the stool']", "['531 539;549 561']")
fix_anno(train, "10620_102", "['last sexually active 9 months ago']", "['540 560;581 593']")
fix_anno(train, "10646_107", "['right lower quadrant pain']", "['32 57']")
fix_anno(train, "10968_105", "['diarrhoea no blood']", "['308 317;376 384']")
fix_anno(train, "20747_214", "['sweating']", "['549 557']")
fix_anno(
    train,
    "21686_200",
    "['previously as regular', 'previously eveyr 28-29 days', 'previously lasting 5 days', 'previously regular flow']",
    "['102 123', '102 112;125 141', '102 112;143 157', '102 112;159 171']",
)
fix_anno(train, "30437_309", "['for 2 months']", "['33 45']")
fix_anno(train, "32657_315", "['35 year old']", "['5 16']")
fix_anno(train, "32996_302", "['darker brown stools']", "['175 194']")
fix_anno(train, "33531_300", "['uncle with peptic ulcer']", "['700 723']")
fix_anno(train, "40974_406", "['difficulty falling asleep']", "['225 250']")
fix_anno(train, "41825_402", "['helps to take care of aging mother and in-laws']", "['197 218;236 260']")
fix_anno(
    train,
    "42625_400",
    "['No hair changes', 'No skin changes', 'No GI changes', 'No palpitations', 'No excessive sweating']",
    "['480 482;507 519', '480 482;499 503;512 519', '480 482;521 531', '480 482;533 545', '480 482;564 582']",
)
fix_anno(
    train,
    "43451_402",
    "['stressed due to taking care of her mother', 'stressed due to taking care of husbands parents']",
    "['290 320;327 337', '290 320;342 358']",
)
fix_anno(train, "44958_402", "['stressor taking care of many sick family members']", "['288 296;324 363']")
fix_anno(train, "50574_514", "['heart started racing and felt numbness for the 1st time in her finger tips']", "['108 182']")
fix_anno(train, "52512_500", "['first started 5 yrs']", "['102 121']")
fix_anno(train, "60235_608", "['No shortness of breath']", "['481 483;533 552']")
fix_anno(train, "60469_603", "['recent URI', 'nasal stuffines, rhinorrhea, for 3-4 days']", "['92 102', '123 164']")
fix_anno(
    train,
    "70255_702",
    "['irregularity with her cycles', 'heavier bleeding', 'changes her pad every couple hours']",
    "['89 117', '122 138', '368 402']",
)
fix_anno(train, "70412_701", "['gaining 10-15 lbs']", "['344 361']")
fix_anno(train, "72660_701", "['weight gain', 'gain of 10-16lbs']", "['600 611', '607 623']")
fix_anno(train, "81856_813", "['seeing her son knows are not real']", "['386 400;443 461']")
fix_anno(train, "81985_813", "['saw him once in the kitchen after he died']", "['160 201']")
fix_anno(train, "83199_810", "['tried Ambien but it didnt work']", "['325 337;349 366']")
fix_anno(train, "83757_803", "['heard what she described as a party later than evening these things did not actually happen']", "['405 459;488 524']")
fix_anno(train, "83757_813", "['experienced seeing her son at the kitchen table these things did not actually happen']", "['353 400;488 524']")
fix_anno(train, "92224_909", "['SCRACHY THROAT', 'RUNNY NOSE']", "['293 307', '321 331']")
fix_anno(train, "92385_900", "['without improvement when taking tylenol', 'without improvement when taking ibuprofen']", "['182 221', '182 213;225 234']")
fix_anno(train, "92385_902", "['yesterday', 'yesterday']", "['79 88', '409 418']")
fix_anno(train, "93988_904", "['headache global', 'headache throughout her head']", "['86 94;230 236', '86 94;237 256']")
fix_anno(train, "94656_904", "['headache generalized in her head']", "['56 64;156 179']")

In [46]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [47]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8184
2    1293
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [48]:
def get_groupkfold(df, group_name):
    groups = df[group_name].unique()

    kf = KFold(
        n_splits=CFG.n_fold,
        shuffle=True,
        random_state=CFG.seed,
    )
    folds_ids = []
    for i_fold, (_, val_group_idx) in enumerate(kf.split(groups)):
        val_group = groups[val_group_idx]
        is_val = df[group_name].isin(val_group)
        val_idx = df[is_val].index
        df.loc[val_idx, "fold"] = int(i_fold)

    df["fold"] = df["fold"].astype(int)
    return df

In [49]:
train = get_groupkfold(train, "pn_num")
display(train.groupby("fold").size())

fold
0    2902
1    2894
2    2813
3    2791
4    2900
dtype: int64

## Setup tokenizer

In [50]:
if SUBMISSION_MODE:
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / EXP_NAME / "tokenizer/")
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

## Create dataset

In [51]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 433


In [52]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 30


In [53]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep
print("max length:", CFG.max_len)

max length: 466


In [54]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_label(self, pn_history, annotation_length, location_list):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        offset_mapping = encoded["offset_mapping"]
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1

        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    start_idx = -1
                    end_idx = -1
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1

        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        return input_, label

In [55]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        return input_

## Model

In [56]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
        else:
            self.backbone = AutoModel.from_config(self.model_config)

        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs):
        h = self.backbone(**inputs)["last_hidden_state"]
        output = self.fc(h)
        return output

## Training

In [57]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def get_result(df):
    labels = create_labels_for_scoring(df)
    predictions = df[[i for i in range(CFG.max_len)]].values
    char_probs = get_char_probs(df["pn_history"].values, predictions, CFG.tokenizer)
    results = get_results(char_probs, th=0.5)
    preds = get_predictions(results)
    score = get_score(labels, preds)
    return score

In [58]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    return losses.avg

In [59]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.no_grad():
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze().detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [60]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            output = model(inputs)
        preds.append(output.sigmoid().squeeze().detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [61]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainingDataset(CFG, train_folds)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    model = CustomModel(CFG, model_config_path=None, pretrained=True)
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=(0.9, 0.98),
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[i for i in range(CFG.max_len)]] = preds
        score = get_result(val_folds)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[i for i in range(CFG.max_len)]] = preds

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [62]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_csv(CFG.output_dir / "oof_df.csv", index=False)
        score = get_result(oof_df)
        print(f"Score: {score:.4f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if SUBMISSION_MODE:
                model = CustomModel(CFG, model_config_path=Path("../input/") / EXP_NAME / "model_config.pth", pretrained=False)
                path = Path("../input/") / EXP_NAME / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"
            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            prediction = inference_fn(test_dataloader, model, device)
            test_char_probs = get_char_probs(test["pn_history"].values, prediction, CFG.tokenizer)
            predictions.append(test_char_probs)
            del state, prediction; gc.collect()
            torch.cuda.empty_cache()
            
        predictions = np.mean(predictions, axis=0)
        best_th = 0.5
        results = get_results(predictions, th=best_th)
        test[CFG.target_col] = results
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [63]:
if __name__ == "__main__":
    main()

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

========== fold: 0 training ==========
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch: [1][0/712] Elapsed 0m 1s (remain 16m 12s) Loss: 0.7798(0.7798) Grad: inf  LR: 0.000000  
Epoch: [1][100/712] Elapsed 0m 51s (remain 5m 14s) Loss: 0.0757(0.2923) Grad: 1366.5475  LR: 0.000006  
Epoch: [1][200/712] Elapsed 1m 42s (remain 4m 21s) Loss: 0.0270(0.1687) Grad: 1315.1006  LR: 0.000011  
Epoch: [1][300/712] Elapsed 2m 33s (remain 3m 29s) Loss: 0.0225(0.1229) Grad: 2152.5745  LR: 0.000017  
Epoch: [1][400/712] Elapsed 3m 24s (remain 2m 38s) Loss: 0.0192(0.0984) Grad: 1430.9381  LR: 0.000020  
Epoch: [1][500/712] Elapsed 4m 15s (remain 1m 47s) Loss: 0.0432(0.0832) Grad: 4269.2373  LR: 0.000019  
Epoch: [1][600/712] Elapsed 5m 6s (remain 0m 56s) Loss: 0.0085(0.0726) Grad: 934.1167  LR: 0.000018  
Epoch: [1][700/712] Elapsed 5m 58s (remain 0m 5s) Loss: 0.0137(0.0649) Grad: 1024.1807  LR: 0.000018  
Epoch: [1][711/712] Elapsed 6m 3s (remain 0m 0s) Loss: 0.0100(0.0641) Grad: 1044.4680  LR: 0.000018  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0053(0.0053) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0186(0.0163) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0031(0.0147) 
Epoch 1 - avg_train_loss: 0.0641  avg_val_loss: 0.0147  time: 416s
Epoch 1 - Score: 0.8176
Epoch 1 - Save Best Score: 0.8176 Model
Epoch: [2][0/712] Elapsed 0m 0s (remain 9m 45s) Loss: 0.0081(0.0081) Grad: 13977.5127  LR: 0.000018  
Epoch: [2][100/712] Elapsed 0m 51s (remain 5m 14s) Loss: 0.0102(0.0141) Grad: 16649.7344  LR: 0.000017  
Epoch: [2][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0124(0.0135) Grad: 17529.8496  LR: 0.000017  
Epoch: [2][300/712] Elapsed 2m 34s (remain 3m 30s) Loss: 0.0140(0.0136) Grad: 23777.1445  LR: 0.000016  
Epoch: [2][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0186(0.0132) Grad: 37763.7578  LR: 0.000015  
Epoch: [2][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0090(0.0130) Grad: 34108.5156  LR: 0.000015  
Epoch: [2][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0150(0.0129) Grad: 42035.9922  LR: 0.000014  
Epoch: [2][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0100(0.0129) Grad: 29793.0078  LR: 0.000013  
Epoch: [2][711/712] Elapsed 6m 7s (remain 0m 0s) Loss: 0.0115(0.0130) Grad: 24105.8145  LR: 0.000013  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0080(0.0080) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0124(0.0137) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0013(0.0124) 
Epoch 2 - avg_train_loss: 0.0130  avg_val_loss: 0.0124  time: 419s
Epoch 2 - Score: 0.8480
Epoch 2 - Save Best Score: 0.8480 Model
Epoch: [3][0/712] Elapsed 0m 0s (remain 10m 26s) Loss: 0.0111(0.0111) Grad: 52333.1719  LR: 0.000013  
Epoch: [3][100/712] Elapsed 0m 52s (remain 5m 14s) Loss: 0.0068(0.0100) Grad: 20206.5234  LR: 0.000013  
Epoch: [3][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0013(0.0102) Grad: 8466.7520  LR: 0.000012  
Epoch: [3][300/712] Elapsed 2m 34s (remain 3m 30s) Loss: 0.0148(0.0102) Grad: 29660.7559  LR: 0.000011  
Epoch: [3][400/712] Elapsed 3m 25s (remain 2m 39s) Loss: 0.0051(0.0105) Grad: 7590.0947  LR: 0.000011  
Epoch: [3][500/712] Elapsed 4m 16s (remain 1m 47s) Loss: 0.0083(0.0104) Grad: 15675.9365  LR: 0.000010  
Epoch: [3][600/712] Elapsed 5m 7s (remain 0m 56s) Loss: 0.0055(0.0103) Grad: 16806.9258  LR: 0.000010  
Epoch: [3][700/712] Elapsed 5m 58s (remain 0m 5s) Loss: 0.0081(0.0102) Grad: 19197.1680  LR: 0.000009  
Epoch: [3][711/712] Elapsed 6m 3s (remain 0m 0s) Loss: 0.0050(0.0102) Grad: 21164.6172  LR: 0.000009  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0036(0.0036) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0133(0.0138) 
EVAL: [181/182] Elapsed 0m 47s (remain 0m 0s) Loss: 0.0008(0.0121) 
Epoch 3 - avg_train_loss: 0.0102  avg_val_loss: 0.0121  time: 416s
Epoch 3 - Score: 0.8602
Epoch 3 - Save Best Score: 0.8602 Model
Epoch: [4][0/712] Elapsed 0m 0s (remain 10m 20s) Loss: 0.0200(0.0200) Grad: 34881.4219  LR: 0.000009  
Epoch: [4][100/712] Elapsed 0m 52s (remain 5m 14s) Loss: 0.0047(0.0082) Grad: 11323.3809  LR: 0.000008  
Epoch: [4][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0061(0.0081) Grad: 15933.8809  LR: 0.000008  
Epoch: [4][300/712] Elapsed 2m 34s (remain 3m 31s) Loss: 0.0027(0.0081) Grad: 7276.8257  LR: 0.000007  
Epoch: [4][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0016(0.0080) Grad: 8965.0859  LR: 0.000006  
Epoch: [4][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0024(0.0079) Grad: 16135.4619  LR: 0.000006  
Epoch: [4][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0061(0.0080) Grad: 10176.5928  LR: 0.000005  
Epoch: [4][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0058(0.0080) Grad: 16652.6504  LR: 0.000005  
Epoch: [4][711/712] Elapsed 6m 7s (remain 0m 0s) Loss: 0.0101(0.0080) Grad: 18709.5957  LR: 0.000004  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0032(0.0032) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0121(0.0136) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0012(0.0121) 
Epoch 4 - avg_train_loss: 0.0080  avg_val_loss: 0.0121  time: 419s
Epoch 4 - Score: 0.8647
Epoch 4 - Save Best Score: 0.8647 Model
Epoch: [5][0/712] Elapsed 0m 0s (remain 10m 30s) Loss: 0.0029(0.0029) Grad: 5419.8066  LR: 0.000004  
Epoch: [5][100/712] Elapsed 0m 52s (remain 5m 17s) Loss: 0.0143(0.0071) Grad: 31381.6836  LR: 0.000004  
Epoch: [5][200/712] Elapsed 1m 44s (remain 4m 24s) Loss: 0.0041(0.0066) Grad: 22125.6816  LR: 0.000003  
Epoch: [5][300/712] Elapsed 2m 35s (remain 3m 32s) Loss: 0.0035(0.0068) Grad: 11048.9355  LR: 0.000003  
Epoch: [5][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0025(0.0068) Grad: 11144.6133  LR: 0.000002  
Epoch: [5][500/712] Elapsed 4m 17s (remain 1m 48s) Loss: 0.0170(0.0067) Grad: 44204.5703  LR: 0.000001  
Epoch: [5][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0080(0.0068) Grad: 24316.0000  LR: 0.000001  
Epoch: [5][700/712] Elapsed 6m 0s (remain 0m 5s) Loss: 0.0157(0.0067) Grad: 25459.1484  LR: 0.000000  
Epoch: [5][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0063(0.0067) Grad: 13977.8398  LR: 0.000000  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0026(0.0026) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0124(0.0141) 
EVAL: [181/182] Elapsed 0m 47s (remain 0m 0s) Loss: 0.0008(0.0126) 
Epoch 5 - avg_train_loss: 0.0067  avg_val_loss: 0.0126  time: 418s
Epoch 5 - Score: 0.8672
Epoch 5 - Save Best Score: 0.8672 Model
========== fold: 1 training ==========
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch: [1][0/712] Elapsed 0m 0s (remain 9m 47s) Loss: 0.9500(0.9500) Grad: inf  LR: 0.000000  
Epoch: [1][100/712] Elapsed 0m 52s (remain 5m 15s) Loss: 0.0802(0.3611) Grad: 952.4308  LR: 0.000006  
Epoch: [1][200/712] Elapsed 1m 43s (remain 4m 23s) Loss: 0.0504(0.2055) Grad: 2900.6521  LR: 0.000011  
Epoch: [1][300/712] Elapsed 2m 35s (remain 3m 31s) Loss: 0.0342(0.1472) Grad: 2159.0554  LR: 0.000017  
Epoch: [1][400/712] Elapsed 3m 26s (remain 2m 39s) Loss: 0.0076(0.1164) Grad: 1385.1884  LR: 0.000020  
Epoch: [1][500/712] Elapsed 4m 17s (remain 1m 48s) Loss: 0.0178(0.0976) Grad: 1936.5457  LR: 0.000019  
Epoch: [1][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0093(0.0846) Grad: 2250.4241  LR: 0.000018  
Epoch: [1][700/712] Elapsed 6m 0s (remain 0m 5s) Loss: 0.0058(0.0750) Grad: 1090.1874  LR: 0.000018  
Epoch: [1][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0190(0.0741) Grad: 2345.7434  LR: 0.000018  
EVAL: [0/181] Elapsed 0m 0s (remain 1m 49s) Loss: 0.0094(0.0094) 
EVAL: [100/181] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0283(0.0169) 
EVAL: [180/181] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0067(0.0158) 
Epoch 1 - avg_train_loss: 0.0741  avg_val_loss: 0.0158  time: 419s
Epoch 1 - Score: 0.8033
Epoch 1 - Save Best Score: 0.8033 Model
Epoch: [2][0/712] Elapsed 0m 0s (remain 9m 44s) Loss: 0.0310(0.0310) Grad: 54322.7734  LR: 0.000018  
Epoch: [2][100/712] Elapsed 0m 52s (remain 5m 15s) Loss: 0.0103(0.0140) Grad: 19509.5195  LR: 0.000017  
Epoch: [2][200/712] Elapsed 1m 43s (remain 4m 23s) Loss: 0.0091(0.0138) Grad: 20611.0625  LR: 0.000017  
Epoch: [2][300/712] Elapsed 2m 34s (remain 3m 31s) Loss: 0.0057(0.0137) Grad: 12656.3066  LR: 0.000016  
Epoch: [2][400/712] Elapsed 3m 26s (remain 2m 39s) Loss: 0.0105(0.0134) Grad: 17143.4453  LR: 0.000015  
Epoch: [2][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0101(0.0135) Grad: 16158.7529  LR: 0.000015  
Epoch: [2][600/712] Elapsed 5m 10s (remain 0m 57s) Loss: 0.0456(0.0133) Grad: 82531.8125  LR: 0.000014  
Epoch: [2][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0092(0.0131) Grad: 13869.7910  LR: 0.000013  
Epoch: [2][711/712] Elapsed 6m 7s (remain 0m 0s) Loss: 0.0095(0.0131) Grad: 12595.5254  LR: 0.000013  
EVAL: [0/181] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0054(0.0054) 
EVAL: [100/181] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0295(0.0148) 
EVAL: [180/181] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0055(0.0133) 
Epoch 2 - avg_train_loss: 0.0131  avg_val_loss: 0.0133  time: 419s
Epoch 2 - Score: 0.8393
Epoch 2 - Save Best Score: 0.8393 Model
Epoch: [3][0/712] Elapsed 0m 0s (remain 10m 49s) Loss: 0.0135(0.0135) Grad: 30406.5605  LR: 0.000013  
Epoch: [3][100/712] Elapsed 0m 52s (remain 5m 15s) Loss: 0.0125(0.0109) Grad: 36558.4336  LR: 0.000013  
Epoch: [3][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0276(0.0107) Grad: 99852.3516  LR: 0.000012  
Epoch: [3][300/712] Elapsed 2m 34s (remain 3m 31s) Loss: 0.0068(0.0105) Grad: 14221.4824  LR: 0.000011  
Epoch: [3][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0099(0.0107) Grad: 19987.7129  LR: 0.000011  
Epoch: [3][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0193(0.0107) Grad: 29633.6191  LR: 0.000010  
Epoch: [3][600/712] Elapsed 5m 11s (remain 0m 57s) Loss: 0.0226(0.0106) Grad: 35131.6875  LR: 0.000010  
Epoch: [3][700/712] Elapsed 6m 3s (remain 0m 5s) Loss: 0.0388(0.0104) Grad: 63908.1758  LR: 0.000009  
Epoch: [3][711/712] Elapsed 6m 9s (remain 0m 0s) Loss: 0.0020(0.0103) Grad: 17033.3809  LR: 0.000009  
EVAL: [0/181] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0070(0.0070) 
EVAL: [100/181] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0299(0.0143) 
EVAL: [180/181] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0042(0.0131) 
Epoch 3 - avg_train_loss: 0.0103  avg_val_loss: 0.0131  time: 421s
Epoch 3 - Score: 0.8527
Epoch 3 - Save Best Score: 0.8527 Model
Epoch: [4][0/712] Elapsed 0m 0s (remain 10m 13s) Loss: 0.0095(0.0095) Grad: 21045.0254  LR: 0.000009  
Epoch: [4][100/712] Elapsed 0m 52s (remain 5m 16s) Loss: 0.0142(0.0076) Grad: 24434.9883  LR: 0.000008  
Epoch: [4][200/712] Elapsed 1m 43s (remain 4m 23s) Loss: 0.0042(0.0077) Grad: 12008.2266  LR: 0.000008  
Epoch: [4][300/712] Elapsed 2m 35s (remain 3m 31s) Loss: 0.0041(0.0083) Grad: 17314.1875  LR: 0.000007  
Epoch: [4][400/712] Elapsed 3m 27s (remain 2m 40s) Loss: 0.0016(0.0086) Grad: 4729.5532  LR: 0.000006  
Epoch: [4][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0049(0.0083) Grad: 14056.5371  LR: 0.000006  
Epoch: [4][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0048(0.0084) Grad: 19522.2598  LR: 0.000005  
Epoch: [4][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0058(0.0084) Grad: 15232.4775  LR: 0.000005  
Epoch: [4][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0080(0.0083) Grad: 14370.6113  LR: 0.000004  
EVAL: [0/181] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0060(0.0060) 
EVAL: [100/181] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0345(0.0142) 
EVAL: [180/181] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0035(0.0133) 
Epoch 4 - avg_train_loss: 0.0083  avg_val_loss: 0.0133  time: 419s
Epoch 4 - Score: 0.8568
Epoch 4 - Save Best Score: 0.8568 Model
Epoch: [5][0/712] Elapsed 0m 0s (remain 11m 5s) Loss: 0.0046(0.0046) Grad: 10348.6074  LR: 0.000004  
Epoch: [5][100/712] Elapsed 0m 52s (remain 5m 17s) Loss: 0.0060(0.0072) Grad: 20586.1855  LR: 0.000004  
Epoch: [5][200/712] Elapsed 1m 44s (remain 4m 25s) Loss: 0.0048(0.0071) Grad: 20680.5859  LR: 0.000003  
Epoch: [5][300/712] Elapsed 2m 35s (remain 3m 32s) Loss: 0.0042(0.0073) Grad: 20825.9531  LR: 0.000003  
Epoch: [5][400/712] Elapsed 3m 27s (remain 2m 40s) Loss: 0.0052(0.0072) Grad: 14706.0859  LR: 0.000002  
Epoch: [5][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0065(0.0072) Grad: 19743.6758  LR: 0.000001  
Epoch: [5][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0014(0.0072) Grad: 5298.4155  LR: 0.000001  
Epoch: [5][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0078(0.0071) Grad: 27145.3359  LR: 0.000000  
Epoch: [5][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0039(0.0070) Grad: 10878.4453  LR: 0.000000  
EVAL: [0/181] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0056(0.0056) 
EVAL: [100/181] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0337(0.0142) 
EVAL: [180/181] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0031(0.0132) 
Epoch 5 - avg_train_loss: 0.0070  avg_val_loss: 0.0132  time: 419s
Epoch 5 - Score: 0.8603
Epoch 5 - Save Best Score: 0.8603 Model
========== fold: 2 training ==========
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch: [1][0/717] Elapsed 0m 0s (remain 9m 48s) Loss: 0.5760(0.5760) Grad: inf  LR: 0.000000  
Epoch: [1][100/717] Elapsed 0m 52s (remain 5m 19s) Loss: 0.0565(0.2336) Grad: 1776.0911  LR: 0.000006  
Epoch: [1][200/717] Elapsed 1m 43s (remain 4m 26s) Loss: 0.0312(0.1384) Grad: 5241.5347  LR: 0.000011  
Epoch: [1][300/717] Elapsed 2m 35s (remain 3m 35s) Loss: 0.0248(0.1016) Grad: 4024.2803  LR: 0.000017  
Epoch: [1][400/717] Elapsed 3m 27s (remain 2m 43s) Loss: 0.0297(0.0828) Grad: 3945.1711  LR: 0.000020  
Epoch: [1][500/717] Elapsed 4m 18s (remain 1m 51s) Loss: 0.0209(0.0709) Grad: 2584.2070  LR: 0.000019  
Epoch: [1][600/717] Elapsed 5m 9s (remain 0m 59s) Loss: 0.0168(0.0621) Grad: 3280.2688  LR: 0.000018  
Epoch: [1][700/717] Elapsed 6m 1s (remain 0m 8s) Loss: 0.0105(0.0558) Grad: 2134.2964  LR: 0.000018  
Epoch: [1][716/717] Elapsed 6m 9s (remain 0m 0s) Loss: 0.0033(0.0550) Grad: 719.3870  LR: 0.000018  
EVAL: [0/176] Elapsed 0m 0s (remain 1m 40s) Loss: 0.0256(0.0256) 
EVAL: [100/176] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0259(0.0167) 
EVAL: [175/176] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0043(0.0168) 
Epoch 1 - avg_train_loss: 0.0550  avg_val_loss: 0.0168  time: 420s
Epoch 1 - Score: 0.7849
Epoch 1 - Save Best Score: 0.7849 Model
Epoch: [2][0/717] Elapsed 0m 0s (remain 10m 2s) Loss: 0.0073(0.0073) Grad: 12840.9932  LR: 0.000018  
Epoch: [2][100/717] Elapsed 0m 52s (remain 5m 17s) Loss: 0.0141(0.0149) Grad: 20557.3613  LR: 0.000017  
Epoch: [2][200/717] Elapsed 1m 43s (remain 4m 25s) Loss: 0.0054(0.0143) Grad: 22376.9180  LR: 0.000017  
Epoch: [2][300/717] Elapsed 2m 35s (remain 3m 34s) Loss: 0.0070(0.0138) Grad: 33000.8711  LR: 0.000016  
Epoch: [2][400/717] Elapsed 3m 27s (remain 2m 43s) Loss: 0.0263(0.0135) Grad: 36933.8438  LR: 0.000015  
Epoch: [2][500/717] Elapsed 4m 19s (remain 1m 51s) Loss: 0.0252(0.0135) Grad: 39680.6719  LR: 0.000015  
Epoch: [2][600/717] Elapsed 5m 10s (remain 0m 59s) Loss: 0.0142(0.0134) Grad: 29544.3164  LR: 0.000014  
Epoch: [2][700/717] Elapsed 6m 2s (remain 0m 8s) Loss: 0.0115(0.0133) Grad: 24327.6602  LR: 0.000013  
Epoch: [2][716/717] Elapsed 6m 10s (remain 0m 0s) Loss: 0.0129(0.0132) Grad: 28564.6660  LR: 0.000013  
EVAL: [0/176] Elapsed 0m 0s (remain 1m 46s) Loss: 0.0141(0.0141) 
EVAL: [100/176] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0204(0.0140) 
EVAL: [175/176] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0019(0.0142) 
Epoch 2 - avg_train_loss: 0.0132  avg_val_loss: 0.0142  time: 421s
Epoch 2 - Score: 0.8240
Epoch 2 - Save Best Score: 0.8240 Model
Epoch: [3][0/717] Elapsed 0m 0s (remain 11m 8s) Loss: 0.0182(0.0182) Grad: 27961.2891  LR: 0.000013  
Epoch: [3][100/717] Elapsed 0m 52s (remain 5m 20s) Loss: 0.0097(0.0104) Grad: 16493.1973  LR: 0.000013  
Epoch: [3][200/717] Elapsed 1m 44s (remain 4m 27s) Loss: 0.0091(0.0104) Grad: 17227.7773  LR: 0.000012  
Epoch: [3][300/717] Elapsed 2m 36s (remain 3m 36s) Loss: 0.0105(0.0098) Grad: 23725.2188  LR: 0.000011  
Epoch: [3][400/717] Elapsed 3m 27s (remain 2m 43s) Loss: 0.0150(0.0100) Grad: 31148.3223  LR: 0.000011  
Epoch: [3][500/717] Elapsed 4m 18s (remain 1m 51s) Loss: 0.0077(0.0102) Grad: 36118.2305  LR: 0.000010  
Epoch: [3][600/717] Elapsed 5m 10s (remain 0m 59s) Loss: 0.0165(0.0100) Grad: 30110.5820  LR: 0.000010  
Epoch: [3][700/717] Elapsed 6m 1s (remain 0m 8s) Loss: 0.0117(0.0098) Grad: 29895.6621  LR: 0.000009  
Epoch: [3][716/717] Elapsed 6m 9s (remain 0m 0s) Loss: 0.0064(0.0099) Grad: 19579.7871  LR: 0.000009  
EVAL: [0/176] Elapsed 0m 0s (remain 1m 47s) Loss: 0.0058(0.0058) 
EVAL: [100/176] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0123(0.0132) 
EVAL: [175/176] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0016(0.0137) 
Epoch 3 - avg_train_loss: 0.0099  avg_val_loss: 0.0137  time: 420s
Epoch 3 - Score: 0.8475
Epoch 3 - Save Best Score: 0.8475 Model
Epoch: [4][0/717] Elapsed 0m 0s (remain 10m 32s) Loss: 0.0031(0.0031) Grad: 7216.0815  LR: 0.000009  
Epoch: [4][100/717] Elapsed 0m 53s (remain 5m 23s) Loss: 0.0066(0.0084) Grad: 16821.7930  LR: 0.000008  
Epoch: [4][200/717] Elapsed 1m 44s (remain 4m 28s) Loss: 0.0133(0.0075) Grad: 51619.0000  LR: 0.000008  
Epoch: [4][300/717] Elapsed 2m 35s (remain 3m 35s) Loss: 0.0133(0.0076) Grad: 42018.2344  LR: 0.000007  
Epoch: [4][400/717] Elapsed 3m 27s (remain 2m 43s) Loss: 0.0067(0.0077) Grad: 25267.9785  LR: 0.000006  
Epoch: [4][500/717] Elapsed 4m 18s (remain 1m 51s) Loss: 0.0039(0.0078) Grad: 9958.6562  LR: 0.000006  
Epoch: [4][600/717] Elapsed 5m 11s (remain 1m 0s) Loss: 0.0036(0.0078) Grad: 12617.1387  LR: 0.000005  
Epoch: [4][700/717] Elapsed 6m 4s (remain 0m 8s) Loss: 0.0113(0.0078) Grad: 16916.3613  LR: 0.000005  
Epoch: [4][716/717] Elapsed 6m 12s (remain 0m 0s) Loss: 0.0052(0.0078) Grad: 13479.9639  LR: 0.000004  
EVAL: [0/176] Elapsed 0m 0s (remain 1m 41s) Loss: 0.0033(0.0033) 
EVAL: [100/176] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0123(0.0131) 
EVAL: [175/176] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0007(0.0135) 
Epoch 4 - avg_train_loss: 0.0078  avg_val_loss: 0.0135  time: 423s
Epoch 4 - Score: 0.8517
Epoch 4 - Save Best Score: 0.8517 Model
Epoch: [5][0/717] Elapsed 0m 0s (remain 11m 5s) Loss: 0.0044(0.0044) Grad: 15515.7500  LR: 0.000004  
Epoch: [5][100/717] Elapsed 0m 52s (remain 5m 18s) Loss: 0.0015(0.0066) Grad: 12929.9893  LR: 0.000004  
Epoch: [5][200/717] Elapsed 1m 43s (remain 4m 25s) Loss: 0.0063(0.0066) Grad: 12418.2285  LR: 0.000003  
Epoch: [5][300/717] Elapsed 2m 34s (remain 3m 33s) Loss: 0.0048(0.0066) Grad: 11512.3496  LR: 0.000003  
Epoch: [5][400/717] Elapsed 3m 26s (remain 2m 42s) Loss: 0.0120(0.0065) Grad: 20664.1543  LR: 0.000002  
Epoch: [5][500/717] Elapsed 4m 17s (remain 1m 51s) Loss: 0.0030(0.0065) Grad: 7600.6919  LR: 0.000001  
Epoch: [5][600/717] Elapsed 5m 9s (remain 0m 59s) Loss: 0.0054(0.0066) Grad: 12602.7676  LR: 0.000001  
Epoch: [5][700/717] Elapsed 6m 0s (remain 0m 8s) Loss: 0.0033(0.0066) Grad: 7729.0146  LR: 0.000000  
Epoch: [5][716/717] Elapsed 6m 8s (remain 0m 0s) Loss: 0.0017(0.0066) Grad: 8355.9502  LR: 0.000000  
EVAL: [0/176] Elapsed 0m 0s (remain 1m 48s) Loss: 0.0024(0.0024) 
EVAL: [100/176] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0135(0.0132) 
EVAL: [175/176] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0008(0.0135) 
Epoch 5 - avg_train_loss: 0.0066  avg_val_loss: 0.0135  time: 420s
Epoch 5 - Score: 0.8555
Epoch 5 - Save Best Score: 0.8555 Model
========== fold: 3 training ==========
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch: [1][0/719] Elapsed 0m 0s (remain 9m 58s) Loss: 1.0619(1.0619) Grad: inf  LR: 0.000000  
Epoch: [1][100/719] Elapsed 0m 52s (remain 5m 18s) Loss: 0.0374(0.4016) Grad: 975.7155  LR: 0.000006  
Epoch: [1][200/719] Elapsed 1m 43s (remain 4m 27s) Loss: 0.0294(0.2240) Grad: 2603.8152  LR: 0.000011  
Epoch: [1][300/719] Elapsed 2m 34s (remain 3m 35s) Loss: 0.0263(0.1594) Grad: 2475.5859  LR: 0.000017  
Epoch: [1][400/719] Elapsed 3m 26s (remain 2m 43s) Loss: 0.0206(0.1260) Grad: 1416.5098  LR: 0.000020  
Epoch: [1][500/719] Elapsed 4m 17s (remain 1m 52s) Loss: 0.0199(0.1052) Grad: 1542.1224  LR: 0.000019  
Epoch: [1][600/719] Elapsed 5m 8s (remain 1m 0s) Loss: 0.0225(0.0907) Grad: 1038.6124  LR: 0.000019  
Epoch: [1][700/719] Elapsed 5m 59s (remain 0m 9s) Loss: 0.0170(0.0801) Grad: 2482.5432  LR: 0.000018  
Epoch: [1][718/719] Elapsed 6m 8s (remain 0m 0s) Loss: 0.0073(0.0785) Grad: 828.7495  LR: 0.000018  
EVAL: [0/175] Elapsed 0m 0s (remain 1m 39s) Loss: 0.0074(0.0074) 
EVAL: [100/175] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0119(0.0153) 
EVAL: [174/175] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0015(0.0145) 
Epoch 1 - avg_train_loss: 0.0785  avg_val_loss: 0.0145  time: 419s
Epoch 1 - Score: 0.8167
Epoch 1 - Save Best Score: 0.8167 Model
Epoch: [2][0/719] Elapsed 0m 0s (remain 10m 1s) Loss: 0.0189(0.0189) Grad: 44838.2969  LR: 0.000018  
Epoch: [2][100/719] Elapsed 0m 52s (remain 5m 19s) Loss: 0.0197(0.0135) Grad: 31879.7852  LR: 0.000017  
Epoch: [2][200/719] Elapsed 1m 43s (remain 4m 27s) Loss: 0.0124(0.0142) Grad: 17697.9863  LR: 0.000017  
Epoch: [2][300/719] Elapsed 2m 36s (remain 3m 36s) Loss: 0.0183(0.0140) Grad: 27233.4473  LR: 0.000016  
Epoch: [2][400/719] Elapsed 3m 28s (remain 2m 45s) Loss: 0.0154(0.0139) Grad: 17281.6445  LR: 0.000015  
Epoch: [2][500/719] Elapsed 4m 20s (remain 1m 53s) Loss: 0.0014(0.0134) Grad: 3942.8113  LR: 0.000015  
Epoch: [2][600/719] Elapsed 5m 12s (remain 1m 1s) Loss: 0.0063(0.0134) Grad: 20015.9297  LR: 0.000014  
Epoch: [2][700/719] Elapsed 6m 4s (remain 0m 9s) Loss: 0.0176(0.0132) Grad: 30243.2109  LR: 0.000013  
Epoch: [2][718/719] Elapsed 6m 13s (remain 0m 0s) Loss: 0.0080(0.0131) Grad: 8403.2920  LR: 0.000013  
EVAL: [0/175] Elapsed 0m 0s (remain 1m 35s) Loss: 0.0040(0.0040) 
EVAL: [100/175] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0120(0.0127) 
EVAL: [174/175] Elapsed 0m 44s (remain 0m 0s) Loss: 0.0006(0.0123) 
Epoch 2 - avg_train_loss: 0.0131  avg_val_loss: 0.0123  time: 423s
Epoch 2 - Score: 0.8477
Epoch 2 - Save Best Score: 0.8477 Model
Epoch: [3][0/719] Elapsed 0m 0s (remain 11m 0s) Loss: 0.0085(0.0085) Grad: 13265.3516  LR: 0.000013  
Epoch: [3][100/719] Elapsed 0m 52s (remain 5m 18s) Loss: 0.0102(0.0096) Grad: 29454.7188  LR: 0.000013  
Epoch: [3][200/719] Elapsed 1m 43s (remain 4m 26s) Loss: 0.0093(0.0099) Grad: 23030.0078  LR: 0.000012  
Epoch: [3][300/719] Elapsed 2m 35s (remain 3m 36s) Loss: 0.0083(0.0105) Grad: 17166.6699  LR: 0.000011  
Epoch: [3][400/719] Elapsed 3m 26s (remain 2m 44s) Loss: 0.0031(0.0105) Grad: 10209.6865  LR: 0.000011  
Epoch: [3][500/719] Elapsed 4m 18s (remain 1m 52s) Loss: 0.0006(0.0105) Grad: 3198.8638  LR: 0.000010  
Epoch: [3][600/719] Elapsed 5m 9s (remain 1m 0s) Loss: 0.0047(0.0104) Grad: 9279.9111  LR: 0.000010  
Epoch: [3][700/719] Elapsed 6m 0s (remain 0m 9s) Loss: 0.0268(0.0103) Grad: 42701.5664  LR: 0.000009  
Epoch: [3][718/719] Elapsed 6m 9s (remain 0m 0s) Loss: 0.0025(0.0103) Grad: 8551.9297  LR: 0.000009  
EVAL: [0/175] Elapsed 0m 0s (remain 1m 47s) Loss: 0.0023(0.0023) 
EVAL: [100/175] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0158(0.0122) 
EVAL: [174/175] Elapsed 0m 45s (remain 0m 0s) Loss: 0.0006(0.0117) 
Epoch 3 - avg_train_loss: 0.0103  avg_val_loss: 0.0117  time: 420s
Epoch 3 - Score: 0.8599
Epoch 3 - Save Best Score: 0.8599 Model
Epoch: [4][0/719] Elapsed 0m 0s (remain 11m 9s) Loss: 0.0033(0.0033) Grad: 13959.6514  LR: 0.000009  
Epoch: [4][100/719] Elapsed 0m 52s (remain 5m 19s) Loss: 0.0027(0.0083) Grad: 6956.8550  LR: 0.000008  
Epoch: [4][200/719] Elapsed 1m 43s (remain 4m 26s) Loss: 0.0032(0.0083) Grad: 15149.8184  LR: 0.000008  
Epoch: [4][300/719] Elapsed 2m 34s (remain 3m 34s) Loss: 0.0044(0.0080) Grad: 16648.5879  LR: 0.000007  
Epoch: [4][400/719] Elapsed 3m 26s (remain 2m 43s) Loss: 0.0028(0.0082) Grad: 13756.1797  LR: 0.000006  
Epoch: [4][500/719] Elapsed 4m 18s (remain 1m 52s) Loss: 0.0066(0.0084) Grad: 11495.6768  LR: 0.000006  
Epoch: [4][600/719] Elapsed 5m 9s (remain 1m 0s) Loss: 0.0113(0.0083) Grad: 23293.7441  LR: 0.000005  
Epoch: [4][700/719] Elapsed 6m 0s (remain 0m 9s) Loss: 0.0010(0.0082) Grad: 4205.5894  LR: 0.000005  
Epoch: [4][718/719] Elapsed 6m 10s (remain 0m 0s) Loss: 0.0027(0.0081) Grad: 17012.6465  LR: 0.000004  
EVAL: [0/175] Elapsed 0m 0s (remain 1m 37s) Loss: 0.0026(0.0026) 
EVAL: [100/175] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0111(0.0126) 
EVAL: [174/175] Elapsed 0m 44s (remain 0m 0s) Loss: 0.0003(0.0121) 
Epoch 4 - avg_train_loss: 0.0081  avg_val_loss: 0.0121  time: 420s
Epoch 4 - Score: 0.8642
Epoch 4 - Save Best Score: 0.8642 Model
Epoch: [5][0/719] Elapsed 0m 0s (remain 11m 20s) Loss: 0.0034(0.0034) Grad: 9203.2881  LR: 0.000004  
Epoch: [5][100/719] Elapsed 0m 52s (remain 5m 21s) Loss: 0.0066(0.0065) Grad: 15132.8213  LR: 0.000004  
Epoch: [5][200/719] Elapsed 1m 43s (remain 4m 27s) Loss: 0.0092(0.0068) Grad: 45111.8320  LR: 0.000003  
Epoch: [5][300/719] Elapsed 2m 34s (remain 3m 34s) Loss: 0.0085(0.0069) Grad: 24726.6562  LR: 0.000003  
Epoch: [5][400/719] Elapsed 3m 25s (remain 2m 43s) Loss: 0.0077(0.0069) Grad: 23677.4824  LR: 0.000002  
Epoch: [5][500/719] Elapsed 4m 16s (remain 1m 51s) Loss: 0.0008(0.0068) Grad: 4383.9536  LR: 0.000001  
Epoch: [5][600/719] Elapsed 5m 8s (remain 1m 0s) Loss: 0.0071(0.0068) Grad: 11846.6787  LR: 0.000001  
Epoch: [5][700/719] Elapsed 6m 1s (remain 0m 9s) Loss: 0.0043(0.0069) Grad: 11576.6797  LR: 0.000000  
Epoch: [5][718/719] Elapsed 6m 10s (remain 0m 0s) Loss: 0.0027(0.0069) Grad: 13109.6387  LR: 0.000000  
EVAL: [0/175] Elapsed 0m 0s (remain 1m 38s) Loss: 0.0023(0.0023) 
EVAL: [100/175] Elapsed 0m 26s (remain 0m 19s) Loss: 0.0107(0.0125) 
EVAL: [174/175] Elapsed 0m 44s (remain 0m 0s) Loss: 0.0004(0.0120) 
Epoch 5 - avg_train_loss: 0.0069  avg_val_loss: 0.0120  time: 421s
Epoch 5 - Score: 0.8645
Epoch 5 - Save Best Score: 0.8645 Model
========== fold: 4 training ==========
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch: [1][0/712] Elapsed 0m 0s (remain 9m 51s) Loss: 0.9575(0.9575) Grad: inf  LR: 0.000000  
Epoch: [1][100/712] Elapsed 0m 52s (remain 5m 14s) Loss: 0.0472(0.3601) Grad: 897.9247  LR: 0.000006  
Epoch: [1][200/712] Elapsed 1m 44s (remain 4m 24s) Loss: 0.0246(0.2014) Grad: 2885.2925  LR: 0.000011  
Epoch: [1][300/712] Elapsed 2m 35s (remain 3m 32s) Loss: 0.0237(0.1439) Grad: 2856.0505  LR: 0.000017  
Epoch: [1][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0248(0.1141) Grad: 2242.8555  LR: 0.000020  
Epoch: [1][500/712] Elapsed 4m 17s (remain 1m 48s) Loss: 0.0259(0.0956) Grad: 4059.1238  LR: 0.000019  
Epoch: [1][600/712] Elapsed 5m 8s (remain 0m 57s) Loss: 0.0278(0.0833) Grad: 3173.1501  LR: 0.000018  
Epoch: [1][700/712] Elapsed 5m 59s (remain 0m 5s) Loss: 0.0320(0.0739) Grad: 2918.3689  LR: 0.000018  
Epoch: [1][711/712] Elapsed 6m 5s (remain 0m 0s) Loss: 0.0147(0.0729) Grad: 1188.6802  LR: 0.000018  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 42s) Loss: 0.0256(0.0256) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 20s) Loss: 0.0495(0.0172) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0053(0.0153) 
Epoch 1 - avg_train_loss: 0.0729  avg_val_loss: 0.0153  time: 417s
Epoch 1 - Score: 0.8157
Epoch 1 - Save Best Score: 0.8157 Model
Epoch: [2][0/712] Elapsed 0m 0s (remain 9m 55s) Loss: 0.0092(0.0092) Grad: 20895.6699  LR: 0.000018  
Epoch: [2][100/712] Elapsed 0m 52s (remain 5m 14s) Loss: 0.0087(0.0143) Grad: 8895.1973  LR: 0.000017  
Epoch: [2][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0197(0.0132) Grad: 34384.1289  LR: 0.000017  
Epoch: [2][300/712] Elapsed 2m 34s (remain 3m 31s) Loss: 0.0065(0.0130) Grad: 14027.5430  LR: 0.000016  
Epoch: [2][400/712] Elapsed 3m 26s (remain 2m 39s) Loss: 0.0059(0.0129) Grad: 9931.1221  LR: 0.000015  
Epoch: [2][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0212(0.0131) Grad: 56407.7891  LR: 0.000015  
Epoch: [2][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0171(0.0129) Grad: 17855.0195  LR: 0.000014  
Epoch: [2][700/712] Elapsed 6m 0s (remain 0m 5s) Loss: 0.0092(0.0128) Grad: 16696.6738  LR: 0.000013  
Epoch: [2][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0142(0.0128) Grad: 18871.1973  LR: 0.000013  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 44s) Loss: 0.0120(0.0120) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0594(0.0160) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0061(0.0141) 
Epoch 2 - avg_train_loss: 0.0128  avg_val_loss: 0.0141  time: 418s
Epoch 2 - Score: 0.8426
Epoch 2 - Save Best Score: 0.8426 Model
Epoch: [3][0/712] Elapsed 0m 0s (remain 11m 4s) Loss: 0.0048(0.0048) Grad: 23358.0195  LR: 0.000013  
Epoch: [3][100/712] Elapsed 0m 51s (remain 5m 14s) Loss: 0.0090(0.0101) Grad: 16714.3223  LR: 0.000013  
Epoch: [3][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0187(0.0103) Grad: 72955.9219  LR: 0.000012  
Epoch: [3][300/712] Elapsed 2m 34s (remain 3m 31s) Loss: 0.0052(0.0102) Grad: 20347.4434  LR: 0.000011  
Epoch: [3][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0113(0.0102) Grad: 40586.5664  LR: 0.000011  
Epoch: [3][500/712] Elapsed 4m 18s (remain 1m 48s) Loss: 0.0037(0.0101) Grad: 9424.6201  LR: 0.000010  
Epoch: [3][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0103(0.0101) Grad: 28416.9805  LR: 0.000010  
Epoch: [3][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0139(0.0101) Grad: 27063.6992  LR: 0.000009  
Epoch: [3][711/712] Elapsed 6m 7s (remain 0m 0s) Loss: 0.0043(0.0101) Grad: 20963.6035  LR: 0.000009  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 55s) Loss: 0.0079(0.0079) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0480(0.0146) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0059(0.0127) 
Epoch 3 - avg_train_loss: 0.0101  avg_val_loss: 0.0127  time: 419s
Epoch 3 - Score: 0.8535
Epoch 3 - Save Best Score: 0.8535 Model
Epoch: [4][0/712] Elapsed 0m 0s (remain 11m 5s) Loss: 0.0026(0.0026) Grad: 5051.4468  LR: 0.000009  
Epoch: [4][100/712] Elapsed 0m 52s (remain 5m 16s) Loss: 0.0031(0.0079) Grad: 16139.7725  LR: 0.000008  
Epoch: [4][200/712] Elapsed 1m 43s (remain 4m 23s) Loss: 0.0145(0.0077) Grad: 30466.1426  LR: 0.000008  
Epoch: [4][300/712] Elapsed 2m 35s (remain 3m 31s) Loss: 0.0084(0.0079) Grad: 18274.2012  LR: 0.000007  
Epoch: [4][400/712] Elapsed 3m 26s (remain 2m 40s) Loss: 0.0226(0.0078) Grad: 35175.9219  LR: 0.000006  
Epoch: [4][500/712] Elapsed 4m 17s (remain 1m 48s) Loss: 0.0065(0.0079) Grad: 11477.7207  LR: 0.000006  
Epoch: [4][600/712] Elapsed 5m 9s (remain 0m 57s) Loss: 0.0068(0.0080) Grad: 21257.8867  LR: 0.000005  
Epoch: [4][700/712] Elapsed 6m 1s (remain 0m 5s) Loss: 0.0043(0.0080) Grad: 17176.4707  LR: 0.000005  
Epoch: [4][711/712] Elapsed 6m 6s (remain 0m 0s) Loss: 0.0048(0.0080) Grad: 11005.6768  LR: 0.000004  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 56s) Loss: 0.0103(0.0103) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0538(0.0157) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0065(0.0133) 
Epoch 4 - avg_train_loss: 0.0080  avg_val_loss: 0.0133  time: 419s
Epoch 4 - Score: 0.8580
Epoch 4 - Save Best Score: 0.8580 Model
Epoch: [5][0/712] Elapsed 0m 0s (remain 11m 8s) Loss: 0.0096(0.0096) Grad: 28264.8184  LR: 0.000004  
Epoch: [5][100/712] Elapsed 0m 52s (remain 5m 15s) Loss: 0.0034(0.0066) Grad: 14201.5898  LR: 0.000004  
Epoch: [5][200/712] Elapsed 1m 43s (remain 4m 22s) Loss: 0.0054(0.0068) Grad: 11046.1191  LR: 0.000003  
Epoch: [5][300/712] Elapsed 2m 34s (remain 3m 30s) Loss: 0.0087(0.0069) Grad: 18392.0234  LR: 0.000003  
Epoch: [5][400/712] Elapsed 3m 25s (remain 2m 39s) Loss: 0.0125(0.0071) Grad: 38556.3125  LR: 0.000002  
Epoch: [5][500/712] Elapsed 4m 17s (remain 1m 48s) Loss: 0.0006(0.0070) Grad: 3282.4463  LR: 0.000001  
Epoch: [5][600/712] Elapsed 5m 8s (remain 0m 57s) Loss: 0.0086(0.0069) Grad: 27053.7969  LR: 0.000001  
Epoch: [5][700/712] Elapsed 5m 59s (remain 0m 5s) Loss: 0.0025(0.0069) Grad: 13092.8965  LR: 0.000000  
Epoch: [5][711/712] Elapsed 6m 5s (remain 0m 0s) Loss: 0.0043(0.0069) Grad: 21682.1992  LR: 0.000000  
EVAL: [0/182] Elapsed 0m 0s (remain 1m 51s) Loss: 0.0103(0.0103) 
EVAL: [100/182] Elapsed 0m 26s (remain 0m 21s) Loss: 0.0503(0.0157) 
EVAL: [181/182] Elapsed 0m 46s (remain 0m 0s) Loss: 0.0065(0.0133) 
Epoch 5 - avg_train_loss: 0.0069  avg_val_loss: 0.0133  time: 418s
Epoch 5 - Score: 0.8607
Epoch 5 - Save Best Score: 0.8607 Model
Score: 0.8616