## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [31]:
EXP_NAME = "nbme-exp014"
ENV = "local"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [32]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-large"
    tokenizer=None
    max_len=None
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=4
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=5
    train_fold=[0, 1, 2, 3, 4]
    seed=71
    gradient_accumulation_steps=2
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [33]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [34]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

local


In [35]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForMaskedLM
from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [36]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [37]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5):
    labels = create_labels_for_scoring(df)

    token_probs = df[[str(i) for i in range(CFG.max_len)]].values
    char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [38]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [39]:
seed_everything()

## Data Loading

In [40]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [41]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [42]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [43]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [44]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [45]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8181
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [46]:
def get_groupkfold(df, group_name):
    groups = df[group_name].unique()

    kf = KFold(
        n_splits=CFG.n_fold,
        shuffle=True,
        random_state=CFG.seed,
    )
    folds_ids = []
    for i_fold, (_, val_group_idx) in enumerate(kf.split(groups)):
        val_group = groups[val_group_idx]
        is_val = df[group_name].isin(val_group)
        val_idx = df[is_val].index
        df.loc[val_idx, "fold"] = int(i_fold)

    df["fold"] = df["fold"].astype(int)
    return df

In [47]:
train = get_groupkfold(train, "pn_num")
display(train.groupby("fold").size())

fold
0    2902
1    2894
2    2813
3    2791
4    2900
dtype: int64

## Setup tokenizer

In [48]:
if CFG.submission:
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

## Create dataset

In [49]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 433


In [50]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 30


In [51]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 466


In [52]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_label(self, pn_history, annotation_length, location_list):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        offset_mapping = encoded["offset_mapping"]
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1

        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    start_idx = -1
                    end_idx = -1
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1

        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        return input_, label

In [53]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        return input_

## Model

In [54]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)
            itpt = AutoModelForMaskedLM.from_config(self.model_config)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp009/checkpoint-129000/pytorch_model.bin")
            path = "../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin"
            state_dict = torch.load(path)
            itpt.load_state_dict(state_dict)
            self.backbone = itpt.deberta
            print(f"Load weight from {path}")

        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs):
        h = self.backbone(**inputs)["last_hidden_state"]
        output = self.fc(h)
        return output

## Training

In [55]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    return losses.avg

In [56]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.no_grad():
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [57]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            output = model(inputs)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [58]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainingDataset(CFG, train_folds)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # model = CustomModel(CFG, model_config_path=None, pretrained=True)
    model = CustomModel(CFG, model_config_path=None, pretrained=False)   # itptを使うため
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf
    """

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_len)]] = val_preds
        score = scoring(val_folds, th=0.5)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )
    """
    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [59]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    score = scoring(oof_df, th=0.5)
    print(f"Best thres: 0.5, Score: {score:.4f}")
    best_thres = get_best_thres(oof_df)
    score = scoring(oof_df, th=best_thres)
    print(f"Best thres: {best_thres}, Score: {score:.4f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            test_token_probs = inference_fn(test_dataloader, model, device)
            test[[f"fold{i_fold}_{i}" for i in range(CFG.max_len)]] = test_token_probs
            test_char_probs = get_char_probs(test["pn_history"].values, test_token_probs, CFG.tokenizer)
            predictions.append(test_char_probs)

            del state, test_token_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

========== fold: 0 training ==========
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Epoch: [1][0/2849] Elapsed 0m 1s (remain 57m 11s) Loss: 0.3151(0.3151) Grad: 197880.7969  LR: 0.000000  
Epoch: [1][100/2849] Elapsed 0m 43s (remain 19m 54s) Loss: 0.2384(0.2924) Grad: 77975.1328  LR: 0.000001  
Epoch: [1][200/2849] Elapsed 1m 26s (remain 19m 3s) Loss: 0.1058(0.2348) Grad: 16965.8301  LR: 0.000003  
Epoch: [1][300/2849] Elapsed 2m 10s (remain 18m 21s) Loss: 0.0317(0.1741) Grad: 1877.3456  LR: 0.000004  
Epoch: [1][400/2849] Elapsed 2m 53s (remain 17m 39s) Loss: 0.0808(0.1406) Grad: 5765.3096  LR: 0.000006  
Epoch: [1][500/2849] Elapsed 3m 36s (remain 16m 55s) Loss: 0.0402(0.1196) Grad: 3003.1406  LR: 0.000007  
Epoch: [1][600/2849] Elapsed 4m 19s (remain 16m 12s) Loss: 0.0390(0.1037) Grad: 9670.9375  LR: 0.000008  
Epoch: [1][700/2849] Elapsed 5m 3s (remain 15m 29s) Loss: 0.0769(0.0918) Grad: 15728.8750  LR: 0.000010  
Epoch: [1][800/2849] Elapsed 5m 46s (remain 14m 46s) Loss: 0.0083(0.0823) Grad: 4873.8574  LR: 0.000011  
Epoch: [1][900/2849] Elapsed 6m 30s (remain 14m 3s) Loss: 0.0058(0.0746) Grad: 3241.6628  LR: 0.000013  
Epoch: [1][1000/2849] Elapsed 7m 13s (remain 13m 20s) Loss: 0.0059(0.0684) Grad: 2390.3203  LR: 0.000014  
Epoch: [1][1100/2849] Elapsed 7m 57s (remain 12m 37s) Loss: 0.0143(0.0631) Grad: 11790.8125  LR: 0.000015  
Epoch: [1][1200/2849] Elapsed 8m 40s (remain 11m 53s) Loss: 0.0071(0.0587) Grad: 3858.2175  LR: 0.000017  
Epoch: [1][1300/2849] Elapsed 9m 23s (remain 11m 10s) Loss: 0.0151(0.0549) Grad: 20028.5469  LR: 0.000018  
Epoch: [1][1400/2849] Elapsed 10m 7s (remain 10m 27s) Loss: 0.0092(0.0515) Grad: 5077.8594  LR: 0.000020  
Epoch: [1][1500/2849] Elapsed 10m 50s (remain 9m 44s) Loss: 0.0036(0.0487) Grad: 1275.7528  LR: 0.000020  
Epoch: [1][1600/2849] Elapsed 11m 33s (remain 9m 0s) Loss: 0.0092(0.0462) Grad: 2933.0239  LR: 0.000020  
Epoch: [1][1700/2849] Elapsed 12m 17s (remain 8m 17s) Loss: 0.0050(0.0440) Grad: 2224.7229  LR: 0.000020  
Epoch: [1][1800/2849] Elapsed 13m 0s (remain 7m 34s) Loss: 0.0012(0.0420) Grad: 770.3402  LR: 0.000019  
Epoch: [1][1900/2849] Elapsed 13m 43s (remain 6m 50s) Loss: 0.0001(0.0401) Grad: 125.5079  LR: 0.000019  
Epoch: [1][2000/2849] Elapsed 14m 27s (remain 6m 7s) Loss: 0.0118(0.0385) Grad: 9037.4229  LR: 0.000019  
Epoch: [1][2100/2849] Elapsed 15m 11s (remain 5m 24s) Loss: 0.0152(0.0370) Grad: 15561.3965  LR: 0.000019  
Epoch: [1][2200/2849] Elapsed 15m 54s (remain 4m 41s) Loss: 0.0001(0.0356) Grad: 132.8673  LR: 0.000019  
Epoch: [1][2300/2849] Elapsed 16m 37s (remain 3m 57s) Loss: 0.0687(0.0344) Grad: 10559.9092  LR: 0.000019  
Epoch: [1][2400/2849] Elapsed 17m 21s (remain 3m 14s) Loss: 0.0054(0.0332) Grad: 2685.0435  LR: 0.000018  
Epoch: [1][2500/2849] Elapsed 18m 4s (remain 2m 30s) Loss: 0.0058(0.0321) Grad: 3165.1885  LR: 0.000018  
Epoch: [1][2600/2849] Elapsed 18m 47s (remain 1m 47s) Loss: 0.0007(0.0312) Grad: 365.9367  LR: 0.000018  
Epoch: [1][2700/2849] Elapsed 19m 31s (remain 1m 4s) Loss: 0.0010(0.0303) Grad: 587.4985  LR: 0.000018  
Epoch: [1][2800/2849] Elapsed 20m 14s (remain 0m 20s) Loss: 0.0017(0.0294) Grad: 1027.0442  LR: 0.000018  
Epoch: [1][2848/2849] Elapsed 20m 34s (remain 0m 0s) Loss: 0.0062(0.0291) Grad: 3010.7744  LR: 0.000018  
EVAL: [0/726] Elapsed 0m 0s (remain 5m 2s) Loss: 0.0027(0.0027) 
EVAL: [100/726] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0026(0.0055) 
EVAL: [200/726] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0003(0.0068) 
EVAL: [300/726] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0020(0.0063) 
EVAL: [400/726] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0032(0.0072) 
EVAL: [500/726] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0240(0.0072) 
EVAL: [600/726] Elapsed 2m 5s (remain 0m 26s) Loss: 0.0015(0.0068) 
EVAL: [700/726] Elapsed 2m 26s (remain 0m 5s) Loss: 0.0009(0.0064) 
EVAL: [725/726] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0002(0.0063) 
Epoch 1 - avg_train_loss: 0.0291  avg_val_loss: 0.0063  time: 1392s
Epoch 1 - Score: 0.8566
Epoch 1 - Save Best Score: 0.8566 Model
Epoch: [2][0/2849] Elapsed 0m 0s (remain 29m 0s) Loss: 0.0080(0.0080) Grad: 7312.4053  LR: 0.000018  
Epoch: [2][100/2849] Elapsed 0m 43s (remain 19m 52s) Loss: 0.0007(0.0050) Grad: 3881.3765  LR: 0.000018  
Epoch: [2][200/2849] Elapsed 1m 27s (remain 19m 12s) Loss: 0.0001(0.0062) Grad: 1052.9917  LR: 0.000017  
Epoch: [2][300/2849] Elapsed 2m 11s (remain 18m 30s) Loss: 0.0017(0.0056) Grad: 5146.1035  LR: 0.000017  
Epoch: [2][400/2849] Elapsed 2m 54s (remain 17m 44s) Loss: 0.0063(0.0056) Grad: 9386.4141  LR: 0.000017  
Epoch: [2][500/2849] Elapsed 3m 37s (remain 17m 0s) Loss: 0.0034(0.0057) Grad: 19834.1211  LR: 0.000017  
Epoch: [2][600/2849] Elapsed 4m 21s (remain 16m 17s) Loss: 0.0002(0.0055) Grad: 850.4203  LR: 0.000017  
Epoch: [2][700/2849] Elapsed 5m 4s (remain 15m 33s) Loss: 0.0000(0.0054) Grad: 139.1904  LR: 0.000017  
Epoch: [2][800/2849] Elapsed 5m 47s (remain 14m 49s) Loss: 0.0002(0.0053) Grad: 1595.8861  LR: 0.000017  
Epoch: [2][900/2849] Elapsed 6m 31s (remain 14m 5s) Loss: 0.0151(0.0054) Grad: 23033.5664  LR: 0.000016  
Epoch: [2][1000/2849] Elapsed 7m 14s (remain 13m 22s) Loss: 0.0160(0.0055) Grad: 14913.0352  LR: 0.000016  
Epoch: [2][1100/2849] Elapsed 7m 57s (remain 12m 38s) Loss: 0.0023(0.0054) Grad: 11077.8105  LR: 0.000016  
Epoch: [2][1200/2849] Elapsed 8m 41s (remain 11m 54s) Loss: 0.0231(0.0054) Grad: 54241.1680  LR: 0.000016  
Epoch: [2][1300/2849] Elapsed 9m 24s (remain 11m 11s) Loss: 0.0015(0.0055) Grad: 6194.8848  LR: 0.000016  
Epoch: [2][1400/2849] Elapsed 10m 8s (remain 10m 28s) Loss: 0.0001(0.0055) Grad: 496.7228  LR: 0.000016  
Epoch: [2][1500/2849] Elapsed 10m 51s (remain 9m 45s) Loss: 0.0030(0.0056) Grad: 9482.0635  LR: 0.000015  
Epoch: [2][1600/2849] Elapsed 11m 34s (remain 9m 1s) Loss: 0.0495(0.0056) Grad: 180490.6875  LR: 0.000015  
Epoch: [2][1700/2849] Elapsed 12m 18s (remain 8m 18s) Loss: 0.0038(0.0056) Grad: 8194.7188  LR: 0.000015  
Epoch: [2][1800/2849] Elapsed 13m 1s (remain 7m 34s) Loss: 0.0042(0.0056) Grad: 7641.6826  LR: 0.000015  
Epoch: [2][1900/2849] Elapsed 13m 44s (remain 6m 51s) Loss: 0.0017(0.0057) Grad: 5601.8315  LR: 0.000015  
Epoch: [2][2000/2849] Elapsed 14m 27s (remain 6m 7s) Loss: 0.0015(0.0056) Grad: 4040.9968  LR: 0.000015  
Epoch: [2][2100/2849] Elapsed 15m 11s (remain 5m 24s) Loss: 0.0011(0.0056) Grad: 3570.0239  LR: 0.000014  
Epoch: [2][2200/2849] Elapsed 15m 54s (remain 4m 41s) Loss: 0.0202(0.0056) Grad: 36663.6250  LR: 0.000014  
Epoch: [2][2300/2849] Elapsed 16m 38s (remain 3m 57s) Loss: 0.0027(0.0055) Grad: 8051.5073  LR: 0.000014  
Epoch: [2][2400/2849] Elapsed 17m 22s (remain 3m 14s) Loss: 0.0186(0.0055) Grad: 54086.9453  LR: 0.000014  
Epoch: [2][2500/2849] Elapsed 18m 5s (remain 2m 31s) Loss: 0.0005(0.0055) Grad: 3878.3972  LR: 0.000014  
Epoch: [2][2600/2849] Elapsed 18m 49s (remain 1m 47s) Loss: 0.0074(0.0054) Grad: 6154.3813  LR: 0.000014  
Epoch: [2][2700/2849] Elapsed 19m 32s (remain 1m 4s) Loss: 0.0029(0.0054) Grad: 9037.5771  LR: 0.000014  
Epoch: [2][2800/2849] Elapsed 20m 16s (remain 0m 20s) Loss: 0.0003(0.0054) Grad: 2393.7598  LR: 0.000013  
Epoch: [2][2848/2849] Elapsed 20m 37s (remain 0m 0s) Loss: 0.0001(0.0055) Grad: 264.5609  LR: 0.000013  
EVAL: [0/726] Elapsed 0m 0s (remain 4m 54s) Loss: 0.0011(0.0011) 
EVAL: [100/726] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0081(0.0061) 
EVAL: [200/726] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0001(0.0062) 
EVAL: [300/726] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0001(0.0060) 
EVAL: [400/726] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0030(0.0071) 
EVAL: [500/726] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0256(0.0071) 
EVAL: [600/726] Elapsed 2m 5s (remain 0m 26s) Loss: 0.0034(0.0066) 
EVAL: [700/726] Elapsed 2m 26s (remain 0m 5s) Loss: 0.0008(0.0062) 
EVAL: [725/726] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0061) 
Epoch 2 - avg_train_loss: 0.0055  avg_val_loss: 0.0061  time: 1394s
Epoch 2 - Score: 0.8763
Epoch 2 - Save Best Score: 0.8763 Model
Epoch: [3][0/2849] Elapsed 0m 0s (remain 32m 12s) Loss: 0.0007(0.0007) Grad: 4067.4443  LR: 0.000013  
Epoch: [3][100/2849] Elapsed 0m 44s (remain 20m 0s) Loss: 0.0001(0.0038) Grad: 564.6037  LR: 0.000013  
Epoch: [3][200/2849] Elapsed 1m 27s (remain 19m 9s) Loss: 0.0126(0.0039) Grad: 20890.8145  LR: 0.000013  
Epoch: [3][300/2849] Elapsed 2m 10s (remain 18m 23s) Loss: 0.0000(0.0038) Grad: 105.4964  LR: 0.000013  
Epoch: [3][400/2849] Elapsed 2m 53s (remain 17m 38s) Loss: 0.0000(0.0037) Grad: 211.3320  LR: 0.000013  
Epoch: [3][500/2849] Elapsed 3m 36s (remain 16m 56s) Loss: 0.0091(0.0036) Grad: 13103.7900  LR: 0.000013  
Epoch: [3][600/2849] Elapsed 4m 20s (remain 16m 13s) Loss: 0.0117(0.0039) Grad: 7003.4312  LR: 0.000012  
Epoch: [3][700/2849] Elapsed 5m 3s (remain 15m 29s) Loss: 0.0073(0.0040) Grad: 16121.0410  LR: 0.000012  
Epoch: [3][800/2849] Elapsed 5m 46s (remain 14m 45s) Loss: 0.0000(0.0039) Grad: 108.9366  LR: 0.000012  
Epoch: [3][900/2849] Elapsed 6m 29s (remain 14m 2s) Loss: 0.0028(0.0040) Grad: 7211.7100  LR: 0.000012  
Epoch: [3][1000/2849] Elapsed 7m 12s (remain 13m 19s) Loss: 0.0036(0.0040) Grad: 11712.3447  LR: 0.000012  
Epoch: [3][1100/2849] Elapsed 7m 56s (remain 12m 36s) Loss: 0.0071(0.0040) Grad: 18848.7539  LR: 0.000012  
Epoch: [3][1200/2849] Elapsed 8m 39s (remain 11m 53s) Loss: 0.0000(0.0041) Grad: 33.9751  LR: 0.000011  
Epoch: [3][1300/2849] Elapsed 9m 22s (remain 11m 9s) Loss: 0.0002(0.0043) Grad: 902.3162  LR: 0.000011  
Epoch: [3][1400/2849] Elapsed 10m 6s (remain 10m 26s) Loss: 0.0051(0.0043) Grad: 6878.7690  LR: 0.000011  
Epoch: [3][1500/2849] Elapsed 10m 49s (remain 9m 43s) Loss: 0.0000(0.0042) Grad: 84.8286  LR: 0.000011  
Epoch: [3][1600/2849] Elapsed 11m 32s (remain 8m 59s) Loss: 0.0017(0.0042) Grad: 4392.9805  LR: 0.000011  
Epoch: [3][1700/2849] Elapsed 12m 15s (remain 8m 16s) Loss: 0.0107(0.0042) Grad: 22398.0273  LR: 0.000011  
Epoch: [3][1800/2849] Elapsed 12m 58s (remain 7m 32s) Loss: 0.0028(0.0042) Grad: 17426.3242  LR: 0.000011  
Epoch: [3][1900/2849] Elapsed 13m 41s (remain 6m 49s) Loss: 0.0034(0.0042) Grad: 9815.8613  LR: 0.000010  
Epoch: [3][2000/2849] Elapsed 14m 24s (remain 6m 6s) Loss: 0.0000(0.0042) Grad: 40.7478  LR: 0.000010  
Epoch: [3][2100/2849] Elapsed 15m 8s (remain 5m 23s) Loss: 0.0000(0.0041) Grad: 216.9669  LR: 0.000010  
Epoch: [3][2200/2849] Elapsed 15m 51s (remain 4m 40s) Loss: 0.0000(0.0042) Grad: 50.4579  LR: 0.000010  
Epoch: [3][2300/2849] Elapsed 16m 34s (remain 3m 56s) Loss: 0.0005(0.0042) Grad: 2121.1982  LR: 0.000010  
Epoch: [3][2400/2849] Elapsed 17m 17s (remain 3m 13s) Loss: 0.0044(0.0042) Grad: 12142.9717  LR: 0.000010  
Epoch: [3][2500/2849] Elapsed 18m 1s (remain 2m 30s) Loss: 0.0032(0.0042) Grad: 10789.1357  LR: 0.000009  
Epoch: [3][2600/2849] Elapsed 18m 44s (remain 1m 47s) Loss: 0.0263(0.0042) Grad: 34410.6016  LR: 0.000009  
Epoch: [3][2700/2849] Elapsed 19m 28s (remain 1m 4s) Loss: 0.0000(0.0042) Grad: 92.4404  LR: 0.000009  
Epoch: [3][2800/2849] Elapsed 20m 11s (remain 0m 20s) Loss: 0.0072(0.0042) Grad: 8229.0195  LR: 0.000009  
Epoch: [3][2848/2849] Elapsed 20m 32s (remain 0m 0s) Loss: 0.0000(0.0042) Grad: 110.1534  LR: 0.000009  
EVAL: [0/726] Elapsed 0m 0s (remain 4m 59s) Loss: 0.0008(0.0008) 
EVAL: [100/726] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0014(0.0061) 
EVAL: [200/726] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0000(0.0070) 
EVAL: [300/726] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0000(0.0066) 
EVAL: [400/726] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0033(0.0080) 
EVAL: [500/726] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0234(0.0077) 
EVAL: [600/726] Elapsed 2m 5s (remain 0m 26s) Loss: 0.0053(0.0071) 
EVAL: [700/726] Elapsed 2m 26s (remain 0m 5s) Loss: 0.0025(0.0067) 
EVAL: [725/726] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0066) 
Epoch 3 - avg_train_loss: 0.0042  avg_val_loss: 0.0066  time: 1389s
Epoch 3 - Score: 0.8805
Epoch 3 - Save Best Score: 0.8805 Model
Epoch: [4][0/2849] Elapsed 0m 0s (remain 32m 20s) Loss: 0.0011(0.0011) Grad: 5811.2661  LR: 0.000009  
Epoch: [4][100/2849] Elapsed 0m 44s (remain 20m 0s) Loss: 0.0119(0.0030) Grad: 39226.1445  LR: 0.000009  
Epoch: [4][200/2849] Elapsed 1m 27s (remain 19m 12s) Loss: 0.0031(0.0032) Grad: 12208.1074  LR: 0.000009  
Epoch: [4][300/2849] Elapsed 2m 10s (remain 18m 23s) Loss: 0.0020(0.0031) Grad: 8131.5825  LR: 0.000008  
Epoch: [4][400/2849] Elapsed 2m 53s (remain 17m 38s) Loss: 0.0014(0.0029) Grad: 4776.4668  LR: 0.000008  
Epoch: [4][500/2849] Elapsed 3m 36s (remain 16m 54s) Loss: 0.0000(0.0029) Grad: 106.5332  LR: 0.000008  
Epoch: [4][600/2849] Elapsed 4m 19s (remain 16m 11s) Loss: 0.0001(0.0031) Grad: 227.4460  LR: 0.000008  
Epoch: [4][700/2849] Elapsed 5m 2s (remain 15m 27s) Loss: 0.0013(0.0030) Grad: 6058.5527  LR: 0.000008  
Epoch: [4][800/2849] Elapsed 5m 45s (remain 14m 44s) Loss: 0.0070(0.0029) Grad: 19759.0176  LR: 0.000008  
Epoch: [4][900/2849] Elapsed 6m 29s (remain 14m 1s) Loss: 0.0009(0.0030) Grad: 10906.0391  LR: 0.000007  
Epoch: [4][1000/2849] Elapsed 7m 12s (remain 13m 18s) Loss: 0.0354(0.0031) Grad: 40945.7305  LR: 0.000007  
Epoch: [4][1100/2849] Elapsed 7m 55s (remain 12m 35s) Loss: 0.0072(0.0031) Grad: 17045.0996  LR: 0.000007  
Epoch: [4][1200/2849] Elapsed 8m 38s (remain 11m 52s) Loss: 0.0016(0.0032) Grad: 6983.1626  LR: 0.000007  
Epoch: [4][1300/2849] Elapsed 9m 21s (remain 11m 8s) Loss: 0.0001(0.0031) Grad: 726.7953  LR: 0.000007  
Epoch: [4][1400/2849] Elapsed 10m 5s (remain 10m 25s) Loss: 0.0029(0.0032) Grad: 11627.6162  LR: 0.000007  
Epoch: [4][1500/2849] Elapsed 10m 48s (remain 9m 42s) Loss: 0.0002(0.0033) Grad: 4452.9839  LR: 0.000007  
Epoch: [4][1600/2849] Elapsed 11m 31s (remain 8m 59s) Loss: 0.0003(0.0033) Grad: 2435.6809  LR: 0.000006  
Epoch: [4][1700/2849] Elapsed 12m 14s (remain 8m 16s) Loss: 0.0101(0.0033) Grad: 87587.8750  LR: 0.000006  
Epoch: [4][1800/2849] Elapsed 12m 58s (remain 7m 32s) Loss: 0.0026(0.0032) Grad: 18695.4004  LR: 0.000006  
Epoch: [4][1900/2849] Elapsed 13m 41s (remain 6m 49s) Loss: 0.0001(0.0033) Grad: 519.3492  LR: 0.000006  
Epoch: [4][2000/2849] Elapsed 14m 24s (remain 6m 6s) Loss: 0.0035(0.0033) Grad: 5416.2241  LR: 0.000006  
Epoch: [4][2100/2849] Elapsed 15m 7s (remain 5m 23s) Loss: 0.0008(0.0034) Grad: 18018.2227  LR: 0.000006  
Epoch: [4][2200/2849] Elapsed 15m 51s (remain 4m 39s) Loss: 0.0012(0.0033) Grad: 17542.5215  LR: 0.000005  
Epoch: [4][2300/2849] Elapsed 16m 34s (remain 3m 56s) Loss: 0.0000(0.0033) Grad: 79.2980  LR: 0.000005  
Epoch: [4][2400/2849] Elapsed 17m 17s (remain 3m 13s) Loss: 0.0098(0.0034) Grad: 17310.6621  LR: 0.000005  
Epoch: [4][2500/2849] Elapsed 18m 1s (remain 2m 30s) Loss: 0.0001(0.0034) Grad: 643.1182  LR: 0.000005  
Epoch: [4][2600/2849] Elapsed 18m 44s (remain 1m 47s) Loss: 0.0100(0.0034) Grad: 23739.8086  LR: 0.000005  
Epoch: [4][2700/2849] Elapsed 19m 27s (remain 1m 3s) Loss: 0.0000(0.0033) Grad: 115.4078  LR: 0.000005  
Epoch: [4][2800/2849] Elapsed 20m 10s (remain 0m 20s) Loss: 0.0000(0.0034) Grad: 293.7319  LR: 0.000005  
Epoch: [4][2848/2849] Elapsed 20m 30s (remain 0m 0s) Loss: 0.0001(0.0034) Grad: 273.1033  LR: 0.000004  
EVAL: [0/726] Elapsed 0m 0s (remain 5m 24s) Loss: 0.0003(0.0003) 
EVAL: [100/726] Elapsed 0m 21s (remain 2m 11s) Loss: 0.0055(0.0065) 
EVAL: [200/726] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0000(0.0069) 
EVAL: [300/726] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0000(0.0066) 
EVAL: [400/726] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0014(0.0081) 
EVAL: [500/726] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0294(0.0079) 
EVAL: [600/726] Elapsed 2m 5s (remain 0m 26s) Loss: 0.0032(0.0072) 
EVAL: [700/726] Elapsed 2m 26s (remain 0m 5s) Loss: 0.0004(0.0069) 
EVAL: [725/726] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0068) 
Epoch 4 - avg_train_loss: 0.0034  avg_val_loss: 0.0068  time: 1388s
Epoch 4 - Score: 0.8869
Epoch 4 - Save Best Score: 0.8869 Model
Epoch: [5][0/2849] Elapsed 0m 0s (remain 33m 0s) Loss: 0.0000(0.0000) Grad: 277.0902  LR: 0.000004  
Epoch: [5][100/2849] Elapsed 0m 43s (remain 19m 43s) Loss: 0.0007(0.0026) Grad: 4136.5483  LR: 0.000004  
Epoch: [5][200/2849] Elapsed 1m 26s (remain 19m 0s) Loss: 0.0001(0.0029) Grad: 607.3643  LR: 0.000004  
Epoch: [5][300/2849] Elapsed 2m 10s (remain 18m 21s) Loss: 0.0001(0.0028) Grad: 477.9574  LR: 0.000004  
Epoch: [5][400/2849] Elapsed 2m 53s (remain 17m 36s) Loss: 0.0000(0.0027) Grad: 23.4805  LR: 0.000004  
Epoch: [5][500/2849] Elapsed 3m 36s (remain 16m 52s) Loss: 0.0001(0.0026) Grad: 671.1517  LR: 0.000004  
Epoch: [5][600/2849] Elapsed 4m 19s (remain 16m 10s) Loss: 0.0000(0.0027) Grad: 21.2969  LR: 0.000004  
Epoch: [5][700/2849] Elapsed 5m 2s (remain 15m 26s) Loss: 0.0023(0.0025) Grad: 20411.8281  LR: 0.000003  
Epoch: [5][800/2849] Elapsed 5m 45s (remain 14m 43s) Loss: 0.0002(0.0025) Grad: 1166.5641  LR: 0.000003  
Epoch: [5][900/2849] Elapsed 6m 28s (remain 14m 0s) Loss: 0.0120(0.0026) Grad: 20248.7129  LR: 0.000003  
Epoch: [5][1000/2849] Elapsed 7m 11s (remain 13m 17s) Loss: 0.0120(0.0025) Grad: 16661.0547  LR: 0.000003  
Epoch: [5][1100/2849] Elapsed 7m 55s (remain 12m 34s) Loss: 0.0002(0.0026) Grad: 891.5912  LR: 0.000003  
Epoch: [5][1200/2849] Elapsed 8m 38s (remain 11m 51s) Loss: 0.0337(0.0026) Grad: 26300.3691  LR: 0.000003  
Epoch: [5][1300/2849] Elapsed 9m 22s (remain 11m 8s) Loss: 0.0000(0.0026) Grad: 4.8858  LR: 0.000002  
Epoch: [5][1400/2849] Elapsed 10m 4s (remain 10m 25s) Loss: 0.0026(0.0027) Grad: 9827.9209  LR: 0.000002  
Epoch: [5][1500/2849] Elapsed 10m 47s (remain 9m 41s) Loss: 0.0043(0.0027) Grad: 26296.8457  LR: 0.000002  
Epoch: [5][1600/2849] Elapsed 11m 31s (remain 8m 58s) Loss: 0.0494(0.0027) Grad: 63786.0156  LR: 0.000002  
Epoch: [5][1700/2849] Elapsed 12m 14s (remain 8m 15s) Loss: 0.0000(0.0027) Grad: 11.7317  LR: 0.000002  
Epoch: [5][1800/2849] Elapsed 12m 58s (remain 7m 32s) Loss: 0.0132(0.0027) Grad: 13353.3291  LR: 0.000002  
Epoch: [5][1900/2849] Elapsed 13m 41s (remain 6m 49s) Loss: 0.0000(0.0028) Grad: 12.7526  LR: 0.000001  
Epoch: [5][2000/2849] Elapsed 14m 25s (remain 6m 6s) Loss: 0.0000(0.0027) Grad: 11.1150  LR: 0.000001  
Epoch: [5][2100/2849] Elapsed 15m 8s (remain 5m 23s) Loss: 0.0030(0.0027) Grad: 12698.0322  LR: 0.000001  
Epoch: [5][2200/2849] Elapsed 15m 51s (remain 4m 40s) Loss: 0.0000(0.0028) Grad: 27.7720  LR: 0.000001  
Epoch: [5][2300/2849] Elapsed 16m 34s (remain 3m 56s) Loss: 0.0007(0.0028) Grad: 6377.6455  LR: 0.000001  
Epoch: [5][2400/2849] Elapsed 17m 17s (remain 3m 13s) Loss: 0.0008(0.0028) Grad: 4949.7466  LR: 0.000001  
Epoch: [5][2500/2849] Elapsed 18m 0s (remain 2m 30s) Loss: 0.0022(0.0028) Grad: 67140.3047  LR: 0.000001  
Epoch: [5][2600/2849] Elapsed 18m 43s (remain 1m 47s) Loss: 0.0000(0.0028) Grad: 10.3882  LR: 0.000000  
Epoch: [5][2700/2849] Elapsed 19m 26s (remain 1m 3s) Loss: 0.0000(0.0028) Grad: 8.8045  LR: 0.000000  
Epoch: [5][2800/2849] Elapsed 20m 9s (remain 0m 20s) Loss: 0.0000(0.0028) Grad: 75.6704  LR: 0.000000  
Epoch: [5][2848/2849] Elapsed 20m 29s (remain 0m 0s) Loss: 0.0002(0.0028) Grad: 1135.4016  LR: 0.000000  
EVAL: [0/726] Elapsed 0m 0s (remain 5m 16s) Loss: 0.0003(0.0003) 
EVAL: [100/726] Elapsed 0m 21s (remain 2m 11s) Loss: 0.0059(0.0070) 
EVAL: [200/726] Elapsed 0m 42s (remain 1m 49s) Loss: 0.0000(0.0076) 
EVAL: [300/726] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0000(0.0073) 
EVAL: [400/726] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0016(0.0088) 
EVAL: [500/726] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0286(0.0086) 
EVAL: [600/726] Elapsed 2m 5s (remain 0m 26s) Loss: 0.0044(0.0079) 
EVAL: [700/726] Elapsed 2m 26s (remain 0m 5s) Loss: 0.0002(0.0075) 
EVAL: [725/726] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0074) 
Epoch 5 - avg_train_loss: 0.0028  avg_val_loss: 0.0074  time: 1387s
Epoch 5 - Score: 0.8852
========== fold: 1 training ==========
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Epoch: [1][0/2851] Elapsed 0m 0s (remain 29m 59s) Loss: 0.3465(0.3465) Grad: 204467.4062  LR: 0.000000  
Epoch: [1][100/2851] Elapsed 0m 44s (remain 20m 16s) Loss: 0.3152(0.3376) Grad: 228724.4219  LR: 0.000001  
Epoch: [1][200/2851] Elapsed 1m 27s (remain 19m 19s) Loss: 0.1813(0.2896) Grad: 58470.9922  LR: 0.000003  
Epoch: [1][300/2851] Elapsed 2m 11s (remain 18m 31s) Loss: 0.0349(0.2212) Grad: 2396.7236  LR: 0.000004  
Epoch: [1][400/2851] Elapsed 2m 55s (remain 17m 49s) Loss: 0.0660(0.1756) Grad: 4492.5952  LR: 0.000006  
Epoch: [1][500/2851] Elapsed 3m 38s (remain 17m 5s) Loss: 0.0365(0.1475) Grad: 1857.1151  LR: 0.000007  
Epoch: [1][600/2851] Elapsed 4m 22s (remain 16m 21s) Loss: 0.0212(0.1279) Grad: 5222.0479  LR: 0.000008  
Epoch: [1][700/2851] Elapsed 5m 5s (remain 15m 36s) Loss: 0.0090(0.1123) Grad: 1762.0983  LR: 0.000010  
Epoch: [1][800/2851] Elapsed 5m 48s (remain 14m 52s) Loss: 0.0047(0.1005) Grad: 1236.7269  LR: 0.000011  
Epoch: [1][900/2851] Elapsed 6m 32s (remain 14m 9s) Loss: 0.0092(0.0908) Grad: 1665.8403  LR: 0.000013  
Epoch: [1][1000/2851] Elapsed 7m 15s (remain 13m 25s) Loss: 0.0015(0.0828) Grad: 502.2487  LR: 0.000014  
Epoch: [1][1100/2851] Elapsed 7m 59s (remain 12m 41s) Loss: 0.0015(0.0761) Grad: 927.5496  LR: 0.000015  
Epoch: [1][1200/2851] Elapsed 8m 42s (remain 11m 57s) Loss: 0.0209(0.0704) Grad: 3168.2468  LR: 0.000017  
Epoch: [1][1300/2851] Elapsed 9m 25s (remain 11m 13s) Loss: 0.0008(0.0657) Grad: 293.7862  LR: 0.000018  
Epoch: [1][1400/2851] Elapsed 10m 8s (remain 10m 29s) Loss: 0.0036(0.0617) Grad: 696.6323  LR: 0.000020  
Epoch: [1][1500/2851] Elapsed 10m 51s (remain 9m 46s) Loss: 0.0021(0.0582) Grad: 710.2718  LR: 0.000020  
Epoch: [1][1600/2851] Elapsed 11m 35s (remain 9m 2s) Loss: 0.0034(0.0551) Grad: 744.0278  LR: 0.000020  
Epoch: [1][1700/2851] Elapsed 12m 18s (remain 8m 19s) Loss: 0.0049(0.0524) Grad: 1398.2842  LR: 0.000020  
Epoch: [1][1800/2851] Elapsed 13m 1s (remain 7m 35s) Loss: 0.0131(0.0498) Grad: 4879.5508  LR: 0.000019  
Epoch: [1][1900/2851] Elapsed 13m 44s (remain 6m 52s) Loss: 0.0046(0.0476) Grad: 1275.6598  LR: 0.000019  
Epoch: [1][2000/2851] Elapsed 14m 27s (remain 6m 8s) Loss: 0.0029(0.0456) Grad: 543.2283  LR: 0.000019  
Epoch: [1][2100/2851] Elapsed 15m 10s (remain 5m 25s) Loss: 0.0051(0.0437) Grad: 1368.8165  LR: 0.000019  
Epoch: [1][2200/2851] Elapsed 15m 53s (remain 4m 41s) Loss: 0.0017(0.0420) Grad: 419.8411  LR: 0.000019  
Epoch: [1][2300/2851] Elapsed 16m 36s (remain 3m 58s) Loss: 0.0098(0.0406) Grad: 4453.2266  LR: 0.000019  
Epoch: [1][2400/2851] Elapsed 17m 19s (remain 3m 14s) Loss: 0.0135(0.0392) Grad: 1847.0735  LR: 0.000018  
Epoch: [1][2500/2851] Elapsed 18m 3s (remain 2m 31s) Loss: 0.0016(0.0379) Grad: 412.2578  LR: 0.000018  
Epoch: [1][2600/2851] Elapsed 18m 46s (remain 1m 48s) Loss: 0.0039(0.0367) Grad: 704.8419  LR: 0.000018  
Epoch: [1][2700/2851] Elapsed 19m 28s (remain 1m 4s) Loss: 0.0047(0.0355) Grad: 850.6146  LR: 0.000018  
Epoch: [1][2800/2851] Elapsed 20m 11s (remain 0m 21s) Loss: 0.0082(0.0344) Grad: 1364.6437  LR: 0.000018  
Epoch: [1][2850/2851] Elapsed 20m 33s (remain 0m 0s) Loss: 0.0032(0.0339) Grad: 1912.3817  LR: 0.000018  
EVAL: [0/724] Elapsed 0m 0s (remain 5m 15s) Loss: 0.0014(0.0014) 
EVAL: [100/724] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0023(0.0050) 
EVAL: [200/724] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0017(0.0063) 
EVAL: [300/724] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0010(0.0062) 
EVAL: [400/724] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0001(0.0064) 
EVAL: [500/724] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0116(0.0067) 
EVAL: [600/724] Elapsed 2m 5s (remain 0m 25s) Loss: 0.0026(0.0065) 
EVAL: [700/724] Elapsed 2m 26s (remain 0m 4s) Loss: 0.0001(0.0060) 
EVAL: [723/724] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0008(0.0060) 
Epoch 1 - avg_train_loss: 0.0339  avg_val_loss: 0.0060  time: 1390s
Epoch 1 - Score: 0.8504
Epoch 1 - Save Best Score: 0.8504 Model
Epoch: [2][0/2851] Elapsed 0m 0s (remain 30m 53s) Loss: 0.0008(0.0008) Grad: 1906.1155  LR: 0.000018  
Epoch: [2][100/2851] Elapsed 0m 44s (remain 20m 6s) Loss: 0.0046(0.0046) Grad: 13039.7061  LR: 0.000018  
Epoch: [2][200/2851] Elapsed 1m 27s (remain 19m 18s) Loss: 0.0004(0.0049) Grad: 1738.7960  LR: 0.000017  
Epoch: [2][300/2851] Elapsed 2m 11s (remain 18m 30s) Loss: 0.0019(0.0054) Grad: 5689.4839  LR: 0.000017  
Epoch: [2][400/2851] Elapsed 2m 54s (remain 17m 45s) Loss: 0.0005(0.0050) Grad: 2241.6653  LR: 0.000017  
Epoch: [2][500/2851] Elapsed 3m 37s (remain 17m 2s) Loss: 0.0145(0.0052) Grad: 11512.6787  LR: 0.000017  
Epoch: [2][600/2851] Elapsed 4m 21s (remain 16m 19s) Loss: 0.0019(0.0051) Grad: 6138.7495  LR: 0.000017  
Epoch: [2][700/2851] Elapsed 5m 5s (remain 15m 36s) Loss: 0.0008(0.0052) Grad: 3717.0415  LR: 0.000017  
Epoch: [2][800/2851] Elapsed 5m 48s (remain 14m 52s) Loss: 0.0001(0.0052) Grad: 261.6287  LR: 0.000017  
Epoch: [2][900/2851] Elapsed 6m 32s (remain 14m 8s) Loss: 0.0001(0.0054) Grad: 1197.3711  LR: 0.000016  
Epoch: [2][1000/2851] Elapsed 7m 15s (remain 13m 24s) Loss: 0.0110(0.0055) Grad: 81412.9297  LR: 0.000016  
Epoch: [2][1100/2851] Elapsed 7m 58s (remain 12m 40s) Loss: 0.0002(0.0055) Grad: 640.3331  LR: 0.000016  
Epoch: [2][1200/2851] Elapsed 8m 41s (remain 11m 56s) Loss: 0.0037(0.0055) Grad: 4818.8833  LR: 0.000016  
Epoch: [2][1300/2851] Elapsed 9m 25s (remain 11m 13s) Loss: 0.0000(0.0054) Grad: 50.7537  LR: 0.000016  
Epoch: [2][1400/2851] Elapsed 10m 8s (remain 10m 29s) Loss: 0.0024(0.0054) Grad: 3364.3904  LR: 0.000016  
Epoch: [2][1500/2851] Elapsed 10m 51s (remain 9m 46s) Loss: 0.0030(0.0054) Grad: 13003.8076  LR: 0.000015  
Epoch: [2][1600/2851] Elapsed 11m 34s (remain 9m 2s) Loss: 0.0003(0.0054) Grad: 495.2165  LR: 0.000015  
Epoch: [2][1700/2851] Elapsed 12m 18s (remain 8m 19s) Loss: 0.0007(0.0054) Grad: 1308.0134  LR: 0.000015  
Epoch: [2][1800/2851] Elapsed 13m 1s (remain 7m 35s) Loss: 0.0001(0.0054) Grad: 275.3367  LR: 0.000015  
Epoch: [2][1900/2851] Elapsed 13m 44s (remain 6m 52s) Loss: 0.0104(0.0054) Grad: 8233.1436  LR: 0.000015  
Epoch: [2][2000/2851] Elapsed 14m 27s (remain 6m 8s) Loss: 0.0036(0.0054) Grad: 8519.0186  LR: 0.000015  
Epoch: [2][2100/2851] Elapsed 15m 11s (remain 5m 25s) Loss: 0.0000(0.0053) Grad: 3.6904  LR: 0.000015  
Epoch: [2][2200/2851] Elapsed 15m 54s (remain 4m 41s) Loss: 0.0030(0.0052) Grad: 7958.4575  LR: 0.000014  
Epoch: [2][2300/2851] Elapsed 16m 38s (remain 3m 58s) Loss: 0.0026(0.0052) Grad: 2311.8394  LR: 0.000014  
Epoch: [2][2400/2851] Elapsed 17m 21s (remain 3m 15s) Loss: 0.0000(0.0052) Grad: 62.0075  LR: 0.000014  
Epoch: [2][2500/2851] Elapsed 18m 5s (remain 2m 31s) Loss: 0.0088(0.0053) Grad: 19682.8047  LR: 0.000014  
Epoch: [2][2600/2851] Elapsed 18m 49s (remain 1m 48s) Loss: 0.0008(0.0052) Grad: 1186.3380  LR: 0.000014  
Epoch: [2][2700/2851] Elapsed 19m 32s (remain 1m 5s) Loss: 0.0003(0.0053) Grad: 553.4240  LR: 0.000014  
Epoch: [2][2800/2851] Elapsed 20m 15s (remain 0m 21s) Loss: 0.0008(0.0052) Grad: 954.4187  LR: 0.000013  
Epoch: [2][2850/2851] Elapsed 20m 37s (remain 0m 0s) Loss: 0.0000(0.0052) Grad: 7.9962  LR: 0.000013  
EVAL: [0/724] Elapsed 0m 0s (remain 5m 17s) Loss: 0.0010(0.0010) 
EVAL: [100/724] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0009(0.0047) 
EVAL: [200/724] Elapsed 0m 42s (remain 1m 49s) Loss: 0.0000(0.0068) 
EVAL: [300/724] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0004(0.0062) 
EVAL: [400/724] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0000(0.0063) 
EVAL: [500/724] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0106(0.0069) 
EVAL: [600/724] Elapsed 2m 5s (remain 0m 25s) Loss: 0.0013(0.0067) 
EVAL: [700/724] Elapsed 2m 26s (remain 0m 4s) Loss: 0.0000(0.0062) 
EVAL: [723/724] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0002(0.0062) 
Epoch 2 - avg_train_loss: 0.0052  avg_val_loss: 0.0062  time: 1394s
Epoch 2 - Score: 0.8674
Epoch 2 - Save Best Score: 0.8674 Model
Epoch: [3][0/2851] Elapsed 0m 0s (remain 34m 14s) Loss: 0.0008(0.0008) Grad: 10763.8809  LR: 0.000013  
Epoch: [3][100/2851] Elapsed 0m 44s (remain 20m 4s) Loss: 0.0037(0.0037) Grad: 6687.5073  LR: 0.000013  
Epoch: [3][200/2851] Elapsed 1m 27s (remain 19m 16s) Loss: 0.0011(0.0038) Grad: 6494.1211  LR: 0.000013  
Epoch: [3][300/2851] Elapsed 2m 11s (remain 18m 31s) Loss: 0.0009(0.0040) Grad: 3113.8872  LR: 0.000013  
Epoch: [3][400/2851] Elapsed 2m 54s (remain 17m 45s) Loss: 0.0000(0.0040) Grad: 12.3376  LR: 0.000013  
Epoch: [3][500/2851] Elapsed 3m 37s (remain 17m 0s) Loss: 0.0024(0.0040) Grad: 19309.5312  LR: 0.000013  
Epoch: [3][600/2851] Elapsed 4m 20s (remain 16m 15s) Loss: 0.0001(0.0039) Grad: 1207.3245  LR: 0.000012  
Epoch: [3][700/2851] Elapsed 5m 4s (remain 15m 32s) Loss: 0.0000(0.0040) Grad: 83.0510  LR: 0.000012  
Epoch: [3][800/2851] Elapsed 5m 47s (remain 14m 48s) Loss: 0.0000(0.0040) Grad: 22.0785  LR: 0.000012  
Epoch: [3][900/2851] Elapsed 6m 30s (remain 14m 5s) Loss: 0.0000(0.0040) Grad: 77.5542  LR: 0.000012  
Epoch: [3][1000/2851] Elapsed 7m 14s (remain 13m 22s) Loss: 0.0041(0.0040) Grad: 6978.5083  LR: 0.000012  
Epoch: [3][1100/2851] Elapsed 7m 57s (remain 12m 39s) Loss: 0.0000(0.0040) Grad: 150.2409  LR: 0.000012  
Epoch: [3][1200/2851] Elapsed 8m 41s (remain 11m 56s) Loss: 0.0000(0.0039) Grad: 18.8023  LR: 0.000011  
Epoch: [3][1300/2851] Elapsed 9m 24s (remain 11m 12s) Loss: 0.0002(0.0039) Grad: 732.1606  LR: 0.000011  
Epoch: [3][1400/2851] Elapsed 10m 7s (remain 10m 29s) Loss: 0.0136(0.0041) Grad: 32969.6680  LR: 0.000011  
Epoch: [3][1500/2851] Elapsed 10m 50s (remain 9m 45s) Loss: 0.0058(0.0040) Grad: 9186.9834  LR: 0.000011  
Epoch: [3][1600/2851] Elapsed 11m 34s (remain 9m 2s) Loss: 0.0004(0.0040) Grad: 11648.7764  LR: 0.000011  
Epoch: [3][1700/2851] Elapsed 12m 17s (remain 8m 18s) Loss: 0.0019(0.0040) Grad: 6553.5430  LR: 0.000011  
Epoch: [3][1800/2851] Elapsed 13m 1s (remain 7m 35s) Loss: 0.0003(0.0040) Grad: 1967.8130  LR: 0.000011  
Epoch: [3][1900/2851] Elapsed 13m 44s (remain 6m 51s) Loss: 0.0135(0.0040) Grad: 9758.5381  LR: 0.000010  
Epoch: [3][2000/2851] Elapsed 14m 27s (remain 6m 8s) Loss: 0.0000(0.0040) Grad: 241.2142  LR: 0.000010  
Epoch: [3][2100/2851] Elapsed 15m 10s (remain 5m 25s) Loss: 0.0004(0.0040) Grad: 1374.7668  LR: 0.000010  
Epoch: [3][2200/2851] Elapsed 15m 54s (remain 4m 41s) Loss: 0.0052(0.0040) Grad: 9497.1279  LR: 0.000010  
Epoch: [3][2300/2851] Elapsed 16m 37s (remain 3m 58s) Loss: 0.0104(0.0039) Grad: 19310.5840  LR: 0.000010  
Epoch: [3][2400/2851] Elapsed 17m 21s (remain 3m 15s) Loss: 0.0105(0.0040) Grad: 12752.3438  LR: 0.000010  
Epoch: [3][2500/2851] Elapsed 18m 4s (remain 2m 31s) Loss: 0.0014(0.0039) Grad: 13741.3301  LR: 0.000009  
Epoch: [3][2600/2851] Elapsed 18m 47s (remain 1m 48s) Loss: 0.0015(0.0039) Grad: 8623.0625  LR: 0.000009  
Epoch: [3][2700/2851] Elapsed 19m 31s (remain 1m 5s) Loss: 0.0139(0.0039) Grad: 33197.9219  LR: 0.000009  
Epoch: [3][2800/2851] Elapsed 20m 14s (remain 0m 21s) Loss: 0.0072(0.0039) Grad: 23131.1094  LR: 0.000009  
Epoch: [3][2850/2851] Elapsed 20m 35s (remain 0m 0s) Loss: 0.0035(0.0039) Grad: 26111.8965  LR: 0.000009  
EVAL: [0/724] Elapsed 0m 0s (remain 5m 14s) Loss: 0.0012(0.0012) 
EVAL: [100/724] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0018(0.0054) 
EVAL: [200/724] Elapsed 0m 41s (remain 1m 49s) Loss: 0.0000(0.0071) 
EVAL: [300/724] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0002(0.0064) 
EVAL: [400/724] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0000(0.0063) 
EVAL: [500/724] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0126(0.0071) 
EVAL: [600/724] Elapsed 2m 5s (remain 0m 25s) Loss: 0.0013(0.0069) 
EVAL: [700/724] Elapsed 2m 26s (remain 0m 4s) Loss: 0.0000(0.0063) 
EVAL: [723/724] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0063) 
Epoch 3 - avg_train_loss: 0.0039  avg_val_loss: 0.0063  time: 1392s
Epoch 3 - Score: 0.8856
Epoch 3 - Save Best Score: 0.8856 Model
Epoch: [4][0/2851] Elapsed 0m 0s (remain 35m 31s) Loss: 0.0005(0.0005) Grad: 2259.9712  LR: 0.000009  
Epoch: [4][100/2851] Elapsed 0m 43s (remain 19m 56s) Loss: 0.0010(0.0030) Grad: 2411.9326  LR: 0.000009  
Epoch: [4][200/2851] Elapsed 1m 26s (remain 19m 5s) Loss: 0.0033(0.0028) Grad: 15176.5283  LR: 0.000009  
Epoch: [4][300/2851] Elapsed 2m 10s (remain 18m 22s) Loss: 0.0007(0.0032) Grad: 5301.3037  LR: 0.000008  
Epoch: [4][400/2851] Elapsed 2m 53s (remain 17m 40s) Loss: 0.0208(0.0032) Grad: 36567.1445  LR: 0.000008  
Epoch: [4][500/2851] Elapsed 3m 36s (remain 16m 56s) Loss: 0.0009(0.0032) Grad: 5816.9473  LR: 0.000008  
Epoch: [4][600/2851] Elapsed 4m 19s (remain 16m 12s) Loss: 0.0027(0.0032) Grad: 9889.6885  LR: 0.000008  
Epoch: [4][700/2851] Elapsed 5m 3s (remain 15m 29s) Loss: 0.0001(0.0033) Grad: 357.5256  LR: 0.000008  
Epoch: [4][800/2851] Elapsed 5m 46s (remain 14m 47s) Loss: 0.0000(0.0032) Grad: 70.1443  LR: 0.000008  
Epoch: [4][900/2851] Elapsed 6m 29s (remain 14m 3s) Loss: 0.0183(0.0031) Grad: 33551.7969  LR: 0.000007  
Epoch: [4][1000/2851] Elapsed 7m 12s (remain 13m 20s) Loss: 0.0010(0.0032) Grad: 4935.8389  LR: 0.000007  
Epoch: [4][1100/2851] Elapsed 7m 55s (remain 12m 36s) Loss: 0.0275(0.0032) Grad: 21660.4102  LR: 0.000007  
Epoch: [4][1200/2851] Elapsed 8m 39s (remain 11m 53s) Loss: 0.0002(0.0032) Grad: 1629.5698  LR: 0.000007  
Epoch: [4][1300/2851] Elapsed 9m 22s (remain 11m 10s) Loss: 0.0078(0.0032) Grad: 51846.8750  LR: 0.000007  
Epoch: [4][1400/2851] Elapsed 10m 6s (remain 10m 27s) Loss: 0.0006(0.0032) Grad: 2201.4275  LR: 0.000007  
Epoch: [4][1500/2851] Elapsed 10m 49s (remain 9m 43s) Loss: 0.0000(0.0033) Grad: 17.0025  LR: 0.000007  
Epoch: [4][1600/2851] Elapsed 11m 32s (remain 9m 0s) Loss: 0.0001(0.0032) Grad: 1333.6726  LR: 0.000006  
Epoch: [4][1700/2851] Elapsed 12m 15s (remain 8m 17s) Loss: 0.0000(0.0032) Grad: 147.2238  LR: 0.000006  
Epoch: [4][1800/2851] Elapsed 12m 58s (remain 7m 34s) Loss: 0.0037(0.0032) Grad: 13139.6484  LR: 0.000006  
Epoch: [4][1900/2851] Elapsed 13m 41s (remain 6m 50s) Loss: 0.0010(0.0033) Grad: 3705.9448  LR: 0.000006  
Epoch: [4][2000/2851] Elapsed 14m 25s (remain 6m 7s) Loss: 0.0051(0.0033) Grad: 40057.1055  LR: 0.000006  
Epoch: [4][2100/2851] Elapsed 15m 8s (remain 5m 24s) Loss: 0.0002(0.0033) Grad: 1812.7024  LR: 0.000006  
Epoch: [4][2200/2851] Elapsed 15m 51s (remain 4m 41s) Loss: 0.0005(0.0032) Grad: 6290.8804  LR: 0.000005  
Epoch: [4][2300/2851] Elapsed 16m 35s (remain 3m 57s) Loss: 0.0000(0.0032) Grad: 81.8823  LR: 0.000005  
Epoch: [4][2400/2851] Elapsed 17m 18s (remain 3m 14s) Loss: 0.0309(0.0032) Grad: 29732.0625  LR: 0.000005  
Epoch: [4][2500/2851] Elapsed 18m 1s (remain 2m 31s) Loss: 0.0054(0.0032) Grad: 10044.0879  LR: 0.000005  
Epoch: [4][2600/2851] Elapsed 18m 45s (remain 1m 48s) Loss: 0.0015(0.0031) Grad: 5492.8687  LR: 0.000005  
Epoch: [4][2700/2851] Elapsed 19m 28s (remain 1m 4s) Loss: 0.0048(0.0031) Grad: 7724.6943  LR: 0.000005  
Epoch: [4][2800/2851] Elapsed 20m 11s (remain 0m 21s) Loss: 0.0108(0.0031) Grad: 9939.3916  LR: 0.000005  
Epoch: [4][2850/2851] Elapsed 20m 32s (remain 0m 0s) Loss: 0.0000(0.0031) Grad: 387.3120  LR: 0.000004  
EVAL: [0/724] Elapsed 0m 0s (remain 5m 4s) Loss: 0.0005(0.0005) 
EVAL: [100/724] Elapsed 0m 21s (remain 2m 10s) Loss: 0.0023(0.0059) 
EVAL: [200/724] Elapsed 0m 42s (remain 1m 49s) Loss: 0.0000(0.0077) 
EVAL: [300/724] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0007(0.0069) 
EVAL: [400/724] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0000(0.0068) 
EVAL: [500/724] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0113(0.0077) 
EVAL: [600/724] Elapsed 2m 5s (remain 0m 25s) Loss: 0.0015(0.0075) 
EVAL: [700/724] Elapsed 2m 26s (remain 0m 4s) Loss: 0.0000(0.0069) 
EVAL: [723/724] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0069) 
Epoch 4 - avg_train_loss: 0.0031  avg_val_loss: 0.0069  time: 1390s
Epoch 4 - Score: 0.8845
Epoch: [5][0/2851] Elapsed 0m 0s (remain 30m 50s) Loss: 0.0125(0.0125) Grad: 14903.7529  LR: 0.000004  
Epoch: [5][100/2851] Elapsed 0m 44s (remain 20m 4s) Loss: 0.0004(0.0028) Grad: 4639.6431  LR: 0.000004  
Epoch: [5][200/2851] Elapsed 1m 27s (remain 19m 10s) Loss: 0.0002(0.0024) Grad: 3665.4402  LR: 0.000004  
Epoch: [5][300/2851] Elapsed 2m 10s (remain 18m 21s) Loss: 0.0004(0.0024) Grad: 7950.5488  LR: 0.000004  
Epoch: [5][400/2851] Elapsed 2m 52s (remain 17m 36s) Loss: 0.0148(0.0025) Grad: 20103.8574  LR: 0.000004  
Epoch: [5][500/2851] Elapsed 3m 36s (remain 16m 54s) Loss: 0.0029(0.0026) Grad: 12255.2412  LR: 0.000004  
Epoch: [5][600/2851] Elapsed 4m 19s (remain 16m 11s) Loss: 0.0001(0.0027) Grad: 539.7539  LR: 0.000004  
Epoch: [5][700/2851] Elapsed 5m 2s (remain 15m 27s) Loss: 0.0152(0.0026) Grad: 28115.6602  LR: 0.000003  
Epoch: [5][800/2851] Elapsed 5m 45s (remain 14m 44s) Loss: 0.0057(0.0027) Grad: 15851.1162  LR: 0.000003  
Epoch: [5][900/2851] Elapsed 6m 28s (remain 14m 1s) Loss: 0.0022(0.0027) Grad: 7730.7046  LR: 0.000003  
Epoch: [5][1000/2851] Elapsed 7m 11s (remain 13m 18s) Loss: 0.0030(0.0026) Grad: 9275.9609  LR: 0.000003  
Epoch: [5][1100/2851] Elapsed 7m 55s (remain 12m 35s) Loss: 0.0073(0.0026) Grad: 24938.4883  LR: 0.000003  
Epoch: [5][1200/2851] Elapsed 8m 38s (remain 11m 52s) Loss: 0.0016(0.0026) Grad: 4339.9609  LR: 0.000003  
Epoch: [5][1300/2851] Elapsed 9m 21s (remain 11m 9s) Loss: 0.0000(0.0026) Grad: 40.1037  LR: 0.000002  
Epoch: [5][1400/2851] Elapsed 10m 5s (remain 10m 26s) Loss: 0.0000(0.0026) Grad: 25.2733  LR: 0.000002  
Epoch: [5][1500/2851] Elapsed 10m 48s (remain 9m 43s) Loss: 0.0000(0.0026) Grad: 119.2565  LR: 0.000002  
Epoch: [5][1600/2851] Elapsed 11m 32s (remain 9m 0s) Loss: 0.0116(0.0026) Grad: 39435.6680  LR: 0.000002  
Epoch: [5][1700/2851] Elapsed 12m 15s (remain 8m 17s) Loss: 0.0000(0.0026) Grad: 13.8637  LR: 0.000002  
Epoch: [5][1800/2851] Elapsed 12m 58s (remain 7m 33s) Loss: 0.0000(0.0026) Grad: 18.8631  LR: 0.000002  
Epoch: [5][1900/2851] Elapsed 13m 41s (remain 6m 50s) Loss: 0.0000(0.0026) Grad: 9.4319  LR: 0.000001  
Epoch: [5][2000/2851] Elapsed 14m 24s (remain 6m 7s) Loss: 0.0008(0.0026) Grad: 19903.5098  LR: 0.000001  
Epoch: [5][2100/2851] Elapsed 15m 7s (remain 5m 24s) Loss: 0.0042(0.0026) Grad: 19385.8770  LR: 0.000001  
Epoch: [5][2200/2851] Elapsed 15m 51s (remain 4m 40s) Loss: 0.0000(0.0026) Grad: 6.0979  LR: 0.000001  
Epoch: [5][2300/2851] Elapsed 16m 34s (remain 3m 57s) Loss: 0.0053(0.0026) Grad: 33444.4258  LR: 0.000001  
Epoch: [5][2400/2851] Elapsed 17m 17s (remain 3m 14s) Loss: 0.0005(0.0027) Grad: 7382.8896  LR: 0.000001  
Epoch: [5][2500/2851] Elapsed 18m 0s (remain 2m 31s) Loss: 0.0562(0.0027) Grad: 63682.2500  LR: 0.000001  
Epoch: [5][2600/2851] Elapsed 18m 43s (remain 1m 47s) Loss: 0.0002(0.0027) Grad: 1873.1287  LR: 0.000000  
Epoch: [5][2700/2851] Elapsed 19m 27s (remain 1m 4s) Loss: 0.0001(0.0027) Grad: 463.9747  LR: 0.000000  
Epoch: [5][2800/2851] Elapsed 20m 10s (remain 0m 21s) Loss: 0.0002(0.0026) Grad: 9337.2783  LR: 0.000000  
Epoch: [5][2850/2851] Elapsed 20m 31s (remain 0m 0s) Loss: 0.0012(0.0026) Grad: 6785.6362  LR: 0.000000  
EVAL: [0/724] Elapsed 0m 0s (remain 5m 44s) Loss: 0.0007(0.0007) 
EVAL: [100/724] Elapsed 0m 21s (remain 2m 11s) Loss: 0.0024(0.0059) 
EVAL: [200/724] Elapsed 0m 42s (remain 1m 49s) Loss: 0.0000(0.0077) 
EVAL: [300/724] Elapsed 1m 2s (remain 1m 28s) Loss: 0.0022(0.0069) 
EVAL: [400/724] Elapsed 1m 23s (remain 1m 7s) Loss: 0.0000(0.0070) 
EVAL: [500/724] Elapsed 1m 44s (remain 0m 46s) Loss: 0.0128(0.0080) 
EVAL: [600/724] Elapsed 2m 5s (remain 0m 25s) Loss: 0.0006(0.0078) 
EVAL: [700/724] Elapsed 2m 26s (remain 0m 4s) Loss: 0.0000(0.0073) 
EVAL: [723/724] Elapsed 2m 31s (remain 0m 0s) Loss: 0.0000(0.0072) 
Epoch 5 - avg_train_loss: 0.0026  avg_val_loss: 0.0072  time: 1389s
Epoch 5 - Score: 0.8845

In [60]:
if __name__ == "__main__":
    main()

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin
Best thres: 0.5, Score: 0.8844
Best thres: 0.4453125, Score: 0.8846


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained


  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained


  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained


  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained


  0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained


  0%|          | 0/2 [00:00<?, ?it/s]