## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [1]:
EXP_NAME = "nbme-exp067"
ENV = "local"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [2]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-xlarge"
    tokenizer=None
    max_len=None
    max_char_len=None
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=4
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=4
    train_fold=[0, 1, 2, 3]
    seed=71
    gradient_accumulation_steps=1
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [3]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [4]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers==4.16.2

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

local


In [5]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForMaskedLM
from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

In [6]:
os.environ["TOKENIZERS_PARALLELISM"]= "false"

## Utilities

In [7]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [8]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        # result = np.where(char_prob >= th)[0] + 1
        result = np.where(char_prob >= th)[0]
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        # result = [f"{min(r)} {max(r)}" for r in result]
        result = [f"{min(r)} {max(r) + 1}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5, use_token_prob=True):
    labels = create_labels_for_scoring(df)

    if use_token_prob:
        token_probs = df[[str(i) for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    else:
        char_probs = df[[str(i) for i in range(CFG.max_char_len)]].values
        char_probs = [char_probs[i] for i in range(len(char_probs))]

    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [9]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
seed_everything()

## Data Loading

In [11]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [12]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [13]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [14]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [15]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [16]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8181
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [17]:
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

## Setup tokenizer

In [18]:
if CFG.submission:
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

## Create dataset

In [19]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 433


In [20]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 30


In [21]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 466


In [22]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(text)
    pn_history_lengths.append(length)

CFG.max_char_len = max(pn_history_lengths)

print("max length:", CFG.max_char_len)

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 950


In [23]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def _create_label(self, pn_history, annotation_length, location_list):
        label = np.zeros(self.max_char_len)
        label[len(pn_history):] = -1
        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    label[start:end] = 1
        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, label, mapping_from_token_to_char

In [24]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, mapping_from_token_to_char

## Model

In [25]:
# ====================================================
# Model
# ====================================================
class MaskedModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.pretrained_model_name,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.pretrained_model_name, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_model_name, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

In [26]:
class Exp066Model(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)
            # itpt = AutoModelForMaskedLM.from_config(self.model_config)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp010/checkpoint-130170/pytorch_model.bin")
            # path = "../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin"
            # state_dict = torch.load(path)
            # itpt.load_state_dict(state_dict)
            path = "../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin"
            masked_model = MaskedModel(CFG, config_path=None, pretrained=True)
            state = torch.load(path, map_location=torch.device("cpu"))
            masked_model.load_state_dict(state)
            self.backbone = masked_model.model
            print(f"Load weight from {path}")

        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs):
        h = self.backbone(**inputs)["last_hidden_state"]
        output = self.fc(h)
        return output

In [27]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False, i_fold=None):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)

            model = Exp066Model(cfg, model_config_path=None, pretrained=False)
            # path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp066" /  f"fold{i_fold}_best.pth")
            path = f"../output/nbme-score-clinical-patient-notes/nbme-exp066/fold{i_fold}_best.pth"
            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            self.backbone = model.backbone
            print(f"Load weight from {path}")

        self.lstm = nn.LSTM(
            input_size=self.model_config.hidden_size,
            bidirectional=True,
            hidden_size=self.model_config.hidden_size // 2,
            num_layers=4,
            dropout=self.cfg.dropout,
            batch_first=True,
        )
        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs, mappings_from_token_to_char):
        h = self.backbone(**inputs)["last_hidden_state"]  # [batch, seq_len, d_model]
        mappings_from_token_to_char = mappings_from_token_to_char.unsqueeze(2).expand(-1, -1, self.model_config.hidden_size)
        h = torch.gather(h, 1, mappings_from_token_to_char)    # [batch, seq_len, d_model]
        h, _ = self.lstm(h)
        output = self.fc(h)

        return output

## Training

In [28]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    return losses.avg

In [29]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()
    
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [30]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for (inputs, mappings_from_token_to_char) in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.no_grad():
            output = model(inputs, mappings_from_token_to_char)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [31]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainingDataset(CFG, train_folds)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # model = CustomModel(CFG, model_config_path=None, pretrained=True)
    model = CustomModel(CFG, model_config_path=None, pretrained=False, i_fold=i_fold)   # itptを使うため
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    # freeze
    for param in model.backbone.parameters():
        param.requires_grad = False

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_char_len)]] = val_preds
        score = scoring(val_folds, th=0.5, use_token_prob=False)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_char_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [32]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    best_thres = 0.5
    best_score = 0.
    for th in np.arange(0.45, 0.55, 0.01):
        th = np.round(th, 2)
        score = scoring(oof_df, th=th, use_token_prob=False)
        if best_score < score:
            best_thres = th
            best_score = score
    print(f"best_thres: {best_thres}  score: {best_score:.5f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            print(f"load weights from {path}")
            test_char_probs = inference_fn(test_dataloader, model, device)
            predictions.append(test_char_probs)

            del state, test_char_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [33]:
if __name__ == "__main__":
    main()



Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_p

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp066/fold0_best.pth
Epoch: [1][0/2681] Elapsed 0m 1s (remain 46m 4s) Loss: 0.6844(0.6844) Grad: 52778.6680  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 1m 4s (remain 27m 15s) Loss: 0.6441(0.6718) Grad: 52420.0938  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 2m 7s (remain 26m 7s) Loss: 0.4969(0.6271) Grad: 57028.8516  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 3m 10s (remain 25m 2s) Loss: 0.1881(0.5360) Grad: 50494.0430  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 4m 13s (remain 23m 58s) Loss: 0.0330(0.4242) Grad: 6636.1689  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 5m 16s (remain 22m 55s) Loss: 0.0066(0.3430) Grad: 1146.6115  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 6m 19s (remain 21m 51s) Loss: 0.0048(0.2873) Grad: 484.9509  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 7m 22s (remain 20m 48s) Loss: 0



EVAL: [0/894] Elapsed 0m 0s (remain 8m 49s) Loss: 0.0005(0.0005) 
EVAL: [100/894] Elapsed 0m 32s (remain 4m 18s) Loss: 0.0001(0.0161) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 44s) Loss: 0.0019(0.0186) 
EVAL: [300/894] Elapsed 1m 37s (remain 3m 12s) Loss: 0.0123(0.0197) 
EVAL: [400/894] Elapsed 2m 9s (remain 2m 39s) Loss: 0.0531(0.0186) 
EVAL: [500/894] Elapsed 2m 42s (remain 2m 7s) Loss: 0.0126(0.0223) 
EVAL: [600/894] Elapsed 3m 14s (remain 1m 34s) Loss: 0.0076(0.0244) 
EVAL: [700/894] Elapsed 3m 47s (remain 1m 2s) Loss: 0.0001(0.0247) 
EVAL: [800/894] Elapsed 4m 19s (remain 0m 30s) Loss: 0.0025(0.0240) 
EVAL: [893/894] Elapsed 4m 49s (remain 0m 0s) Loss: 0.0001(0.0228) 
Epoch 1 - avg_train_loss: 0.0680  avg_val_loss: 0.0228  time: 1981s
Epoch 1 - Score: 0.8814
Epoch 1 - Save Best Score: 0.8814 Model




Epoch: [2][0/2681] Elapsed 0m 0s (remain 43m 0s) Loss: 0.0472(0.0472) Grad: 7530.2241  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 1m 4s (remain 27m 16s) Loss: 0.0011(0.0027) Grad: 423.0115  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 2m 7s (remain 26m 8s) Loss: 0.0006(0.0027) Grad: 324.5608  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 3m 10s (remain 25m 3s) Loss: 0.0050(0.0026) Grad: 1471.2389  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 4m 13s (remain 23m 59s) Loss: 0.0001(0.0029) Grad: 61.3083  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 5m 16s (remain 22m 56s) Loss: 0.0006(0.0027) Grad: 360.2280  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 6m 19s (remain 21m 52s) Loss: 0.0000(0.0028) Grad: 39.6150  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 7m 22s (remain 20m 49s) Loss: 0.0002(0.0028) Grad: 128.0180  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 8m 25s (remain 19m 46s) Loss: 0.0033(0.0029) Grad: 1406.2792  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 9m 28s (remain 18m 43s) Loss: 0.0



Epoch: [2][2680/2681] Elapsed 28m 11s (remain 0m 0s) Loss: 0.0000(0.0035) Grad: 53.2582  LR: 0.000013  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 43s) Loss: 0.0001(0.0001) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 19s) Loss: 0.0000(0.0171) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 45s) Loss: 0.0022(0.0196) 
EVAL: [300/894] Elapsed 1m 37s (remain 3m 12s) Loss: 0.0150(0.0207) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0537(0.0197) 
EVAL: [500/894] Elapsed 2m 42s (remain 2m 7s) Loss: 0.0130(0.0238) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0087(0.0262) 
EVAL: [700/894] Elapsed 3m 47s (remain 1m 2s) Loss: 0.0001(0.0265) 
EVAL: [800/894] Elapsed 4m 19s (remain 0m 30s) Loss: 0.0016(0.0258) 




EVAL: [893/894] Elapsed 4m 50s (remain 0m 0s) Loss: 0.0000(0.0245) 
Epoch 2 - avg_train_loss: 0.0035  avg_val_loss: 0.0245  time: 1983s
Epoch 2 - Score: 0.8824
Epoch 2 - Save Best Score: 0.8824 Model




Epoch: [3][0/2681] Elapsed 0m 0s (remain 42m 42s) Loss: 0.0002(0.0002) Grad: 109.8384  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 1m 4s (remain 27m 17s) Loss: 0.0008(0.0029) Grad: 737.4916  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 2m 7s (remain 26m 9s) Loss: 0.0005(0.0028) Grad: 309.4971  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 3m 10s (remain 25m 4s) Loss: 0.0006(0.0030) Grad: 399.4721  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 4m 13s (remain 24m 0s) Loss: 0.0052(0.0034) Grad: 2635.2759  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 5m 16s (remain 22m 56s) Loss: 0.0002(0.0036) Grad: 204.2128  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 6m 19s (remain 21m 53s) Loss: 0.0001(0.0035) Grad: 66.1801  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 7m 22s (remain 20m 50s) Loss: 0.0004(0.0039) Grad: 597.8340  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 8m 25s (remain 19m 47s) Loss: 0.0003(0.0038) Grad: 319.8338  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 9m 28s (remain 18m 43s) Loss: 0.00



EVAL: [0/894] Elapsed 0m 0s (remain 8m 47s) Loss: 0.0001(0.0001) 
EVAL: [100/894] Elapsed 0m 32s (remain 4m 18s) Loss: 0.0000(0.0163) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 45s) Loss: 0.0024(0.0184) 
EVAL: [300/894] Elapsed 1m 37s (remain 3m 12s) Loss: 0.0145(0.0194) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 39s) Loss: 0.0494(0.0185) 
EVAL: [500/894] Elapsed 2m 42s (remain 2m 7s) Loss: 0.0106(0.0223) 
EVAL: [600/894] Elapsed 3m 14s (remain 1m 35s) Loss: 0.0083(0.0247) 
EVAL: [700/894] Elapsed 3m 47s (remain 1m 2s) Loss: 0.0001(0.0250) 
EVAL: [800/894] Elapsed 4m 19s (remain 0m 30s) Loss: 0.0014(0.0243) 




EVAL: [893/894] Elapsed 4m 49s (remain 0m 0s) Loss: 0.0000(0.0232) 
Epoch 3 - avg_train_loss: 0.0034  avg_val_loss: 0.0232  time: 1984s
Epoch 3 - Score: 0.8835
Epoch 3 - Save Best Score: 0.8835 Model




Epoch: [4][0/2681] Elapsed 0m 0s (remain 43m 23s) Loss: 0.0000(0.0000) Grad: 36.9130  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 1m 4s (remain 27m 16s) Loss: 0.0000(0.0025) Grad: 16.8867  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 2m 7s (remain 26m 8s) Loss: 0.0081(0.0028) Grad: 1915.0236  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 3m 10s (remain 25m 4s) Loss: 0.0016(0.0033) Grad: 435.2961  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 4m 13s (remain 24m 0s) Loss: 0.0043(0.0032) Grad: 1731.4443  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 5m 16s (remain 22m 56s) Loss: 0.0055(0.0032) Grad: 2495.4966  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 6m 19s (remain 21m 53s) Loss: 0.0004(0.0031) Grad: 434.3258  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 7m 22s (remain 20m 49s) Loss: 0.0001(0.0034) Grad: 158.1666  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 8m 25s (remain 19m 46s) Loss: 0.0000(0.0034) Grad: 19.7404  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 9m 28s (remain 18m 43s) Loss: 0.00



EVAL: [0/894] Elapsed 0m 0s (remain 8m 22s) Loss: 0.0001(0.0001) 
EVAL: [100/894] Elapsed 0m 32s (remain 4m 18s) Loss: 0.0000(0.0164) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 45s) Loss: 0.0025(0.0189) 
EVAL: [300/894] Elapsed 1m 37s (remain 3m 12s) Loss: 0.0146(0.0201) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0503(0.0192) 
EVAL: [500/894] Elapsed 2m 42s (remain 2m 7s) Loss: 0.0109(0.0232) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0075(0.0257) 
EVAL: [700/894] Elapsed 3m 47s (remain 1m 2s) Loss: 0.0000(0.0260) 
EVAL: [800/894] Elapsed 4m 19s (remain 0m 30s) Loss: 0.0018(0.0253) 




EVAL: [893/894] Elapsed 4m 50s (remain 0m 0s) Loss: 0.0000(0.0242) 
Epoch 4 - avg_train_loss: 0.0033  avg_val_loss: 0.0242  time: 1983s
Epoch 4 - Score: 0.8831




Epoch: [5][0/2681] Elapsed 0m 0s (remain 40m 36s) Loss: 0.0002(0.0002) Grad: 191.7501  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 1m 3s (remain 27m 14s) Loss: 0.0000(0.0047) Grad: 28.4498  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 2m 7s (remain 26m 7s) Loss: 0.0000(0.0043) Grad: 13.0859  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 3m 10s (remain 25m 3s) Loss: 0.0001(0.0042) Grad: 151.8632  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 4m 13s (remain 23m 59s) Loss: 0.0000(0.0041) Grad: 15.6246  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 5m 16s (remain 22m 56s) Loss: 0.0004(0.0038) Grad: 335.6844  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 6m 19s (remain 21m 52s) Loss: 0.0004(0.0035) Grad: 938.1263  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 7m 22s (remain 20m 49s) Loss: 0.0001(0.0034) Grad: 166.6261  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 8m 25s (remain 19m 46s) Loss: 0.0009(0.0032) Grad: 590.3839  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 9m 28s (remain 18m 43s) Loss: 0.0001



Epoch: [5][2680/2681] Elapsed 28m 12s (remain 0m 0s) Loss: 0.0009(0.0033) Grad: 2211.1179  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 27s) Loss: 0.0001(0.0001) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 19s) Loss: 0.0000(0.0158) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0025(0.0181) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0147(0.0192) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0485(0.0183) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0097(0.0221) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0075(0.0245) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0001(0.0248) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0016(0.0242) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0231) 
Epoch 5 - avg_train_loss: 0.0033  avg_val_loss: 0.0231  time: 1985s
Epoch 5 - Score: 0.8833


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_p

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp066/fold1_best.pth




Epoch: [1][0/2681] Elapsed 0m 1s (remain 52m 12s) Loss: 0.6821(0.6821) Grad: 55009.8164  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.6377(0.6677) Grad: 53870.3477  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 2m 7s (remain 26m 12s) Loss: 0.4821(0.6194) Grad: 57921.1094  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.1812(0.5236) Grad: 47428.1016  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0286(0.4132) Grad: 7014.2153  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0079(0.3347) Grad: 1353.7759  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 6m 20s (remain 21m 55s) Loss: 0.0035(0.2809) Grad: 574.4606  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 7m 23s (remain 20m 51s) Loss: 0.0080(0.2420) Grad: 934.3875  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 8m 26s (remain 19m 48s) Loss: 0.0026(0.2130) Grad: 560.3124  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 9m 29s (remain 18m 45s)



Epoch: [1][2680/2681] Elapsed 28m 12s (remain 0m 0s) Loss: 0.0039(0.0687) Grad: 1556.4785  LR: 0.000018  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 37s) Loss: 0.0199(0.0199) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0105(0.0145) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0718(0.0241) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0383(0.0251) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0004(0.0229) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0263(0.0235) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0144(0.0237) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0002(0.0221) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0607(0.0214) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0122(0.0198) 
Epoch 1 - avg_train_loss: 0.0687  avg_val_loss: 0.0198  time: 1986s
Epoch 1 - Score: 0.8780
Epoch 1 - Save Best Score: 0.8780 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 50m 34s) Loss: 0.0014(0.0014) Grad: 606.6125  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 1m 4s (remain 27m 21s) Loss: 0.0001(0.0077) Grad: 85.1166  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 2m 7s (remain 26m 12s) Loss: 0.0031(0.0071) Grad: 1478.6708  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 3m 10s (remain 25m 6s) Loss: 0.0001(0.0065) Grad: 52.7685  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0397(0.0069) Grad: 9213.5947  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0054(0.0066) Grad: 1303.5679  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0018(0.0063) Grad: 692.8738  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 7m 22s (remain 20m 51s) Loss: 0.0002(0.0064) Grad: 85.6409  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 8m 26s (remain 19m 47s) Loss: 0.0010(0.0064) Grad: 478.8583  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 9m 29s (remain 18m 44s) Loss: 0.0



Epoch: [2][2680/2681] Elapsed 28m 12s (remain 0m 0s) Loss: 0.0093(0.0062) Grad: 3270.8147  LR: 0.000013  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 31s) Loss: 0.0235(0.0235) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0093(0.0157) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0787(0.0270) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0502(0.0285) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0001(0.0258) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0367(0.0267) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0201(0.0271) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0000(0.0252) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0729(0.0244) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0131(0.0225) 
Epoch 2 - avg_train_loss: 0.0062  avg_val_loss: 0.0225  time: 1986s
Epoch 2 - Score: 0.8795
Epoch 2 - Save Best Score: 0.8795 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 51m 13s) Loss: 0.0092(0.0092) Grad: 1721.4969  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0015(0.0073) Grad: 1052.8585  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0138(0.0068) Grad: 4112.2520  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0000(0.0063) Grad: 42.6391  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0000(0.0062) Grad: 35.8916  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 5m 16s (remain 22m 59s) Loss: 0.0002(0.0062) Grad: 165.2703  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 6m 20s (remain 21m 55s) Loss: 0.0003(0.0064) Grad: 358.2878  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0001(0.0064) Grad: 71.4236  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 8m 26s (remain 19m 48s) Loss: 0.0094(0.0064) Grad: 3211.1172  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss: 0.



EVAL: [0/894] Elapsed 0m 0s (remain 11m 41s) Loss: 0.0231(0.0231) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0066(0.0140) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0662(0.0236) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0484(0.0248) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0001(0.0224) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0341(0.0234) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0157(0.0238) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0000(0.0222) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0601(0.0215) 




EVAL: [893/894] Elapsed 4m 50s (remain 0m 0s) Loss: 0.0125(0.0198) 
Epoch 3 - avg_train_loss: 0.0059  avg_val_loss: 0.0198  time: 1986s
Epoch 3 - Score: 0.8817
Epoch 3 - Save Best Score: 0.8817 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 50m 30s) Loss: 0.0074(0.0074) Grad: 2951.3618  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0000(0.0067) Grad: 16.3995  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0001(0.0070) Grad: 70.1462  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0001(0.0066) Grad: 101.0886  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0345(0.0061) Grad: 4745.8628  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0022(0.0062) Grad: 777.3861  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 6m 20s (remain 21m 56s) Loss: 0.0001(0.0065) Grad: 161.8965  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0000(0.0061) Grad: 31.0137  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 8m 26s (remain 19m 49s) Loss: 0.0014(0.0061) Grad: 782.1714  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss: 0.00



Epoch: [4][2680/2681] Elapsed 28m 14s (remain 0m 0s) Loss: 0.0183(0.0059) Grad: 17305.7246  LR: 0.000004  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 40s) Loss: 0.0245(0.0245) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 22s) Loss: 0.0053(0.0139) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0622(0.0229) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 14s) Loss: 0.0510(0.0241) 
EVAL: [400/894] Elapsed 2m 11s (remain 2m 41s) Loss: 0.0001(0.0218) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0349(0.0229) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0144(0.0233) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 3s) Loss: 0.0000(0.0218) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0576(0.0210) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0129(0.0194) 
Epoch 4 - avg_train_loss: 0.0059  avg_val_loss: 0.0194  time: 1988s
Epoch 4 - Score: 0.8822
Epoch 4 - Save Best Score: 0.8822 Model




Epoch: [5][0/2681] Elapsed 0m 1s (remain 51m 13s) Loss: 0.0014(0.0014) Grad: 888.1537  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0027(0.0061) Grad: 1519.3555  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0299(0.0058) Grad: 6733.7290  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0002(0.0054) Grad: 396.6176  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0001(0.0057) Grad: 42.3447  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0003(0.0058) Grad: 298.6197  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 6m 20s (remain 21m 55s) Loss: 0.0054(0.0061) Grad: 2146.9014  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0001(0.0060) Grad: 118.7149  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 8m 26s (remain 19m 49s) Loss: 0.0024(0.0059) Grad: 705.7523  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss: 0



Epoch: [5][2680/2681] Elapsed 28m 14s (remain 0m 0s) Loss: 0.0016(0.0057) Grad: 2283.4485  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 28s) Loss: 0.0243(0.0243) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0057(0.0144) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0686(0.0242) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0515(0.0256) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0001(0.0232) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0393(0.0242) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0168(0.0246) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0000(0.0230) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0653(0.0222) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0130(0.0205) 
Epoch 5 - avg_train_loss: 0.0057  avg_val_loss: 0.0205  time: 1988s
Epoch 5 - Score: 0.8816


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_p

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp066/fold2_best.pth




Epoch: [1][0/2681] Elapsed 0m 1s (remain 51m 24s) Loss: 0.7173(0.7173) Grad: 54956.4492  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 1m 4s (remain 27m 20s) Loss: 0.6746(0.7032) Grad: 54148.2070  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 2m 7s (remain 26m 11s) Loss: 0.5225(0.6576) Grad: 58934.8828  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 3m 10s (remain 25m 6s) Loss: 0.2048(0.5631) Grad: 52431.2227  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0249(0.4457) Grad: 7687.8608  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0204(0.3602) Grad: 1833.6693  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0074(0.3019) Grad: 1257.7811  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 7m 23s (remain 20m 51s) Loss: 0.0209(0.2599) Grad: 2816.8013  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 8m 26s (remain 19m 48s) Loss: 0.0136(0.2285) Grad: 4161.6362  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 9m 29s (remain 18m 4



EVAL: [0/894] Elapsed 0m 0s (remain 11m 54s) Loss: 0.0017(0.0017) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0003(0.0195) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0035(0.0172) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0003(0.0184) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0001(0.0165) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0036(0.0178) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0000(0.0188) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.1022(0.0198) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0026(0.0193) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0001(0.0181) 
Epoch 1 - avg_train_loss: 0.0722  avg_val_loss: 0.0181  time: 1986s
Epoch 1 - Score: 0.8862
Epoch 1 - Save Best Score: 0.8862 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 52m 25s) Loss: 0.0004(0.0004) Grad: 186.0639  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0070(0.0048) Grad: 2853.3518  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 2m 7s (remain 26m 12s) Loss: 0.0002(0.0052) Grad: 126.7569  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 3m 10s (remain 25m 6s) Loss: 0.0000(0.0048) Grad: 46.2562  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0001(0.0051) Grad: 69.8260  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0004(0.0052) Grad: 152.5297  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0244(0.0050) Grad: 6481.2188  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 7m 23s (remain 20m 51s) Loss: 0.0109(0.0050) Grad: 3548.2593  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 8m 26s (remain 19m 47s) Loss: 0.0004(0.0048) Grad: 176.9986  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 9m 29s (remain 18m 44s) Loss: 0.



EVAL: [0/894] Elapsed 0m 0s (remain 11m 59s) Loss: 0.0012(0.0012) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0001(0.0195) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0029(0.0173) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0002(0.0185) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0000(0.0166) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0028(0.0178) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0000(0.0188) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.1046(0.0199) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0029(0.0194) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0001(0.0182) 
Epoch 2 - avg_train_loss: 0.0047  avg_val_loss: 0.0182  time: 1986s
Epoch 2 - Score: 0.8877
Epoch 2 - Save Best Score: 0.8877 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 51m 52s) Loss: 0.0021(0.0021) Grad: 1240.9869  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 1m 4s (remain 27m 21s) Loss: 0.0000(0.0030) Grad: 65.6331  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 2m 7s (remain 26m 11s) Loss: 0.0001(0.0035) Grad: 83.8684  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 3m 10s (remain 25m 6s) Loss: 0.0034(0.0040) Grad: 1545.6168  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 4m 13s (remain 24m 1s) Loss: 0.0000(0.0039) Grad: 38.4622  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 5m 16s (remain 22m 57s) Loss: 0.0013(0.0039) Grad: 365.9211  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0212(0.0041) Grad: 4587.3901  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 7m 22s (remain 20m 50s) Loss: 0.0099(0.0045) Grad: 3879.4707  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 8m 26s (remain 19m 47s) Loss: 0.0001(0.0046) Grad: 102.5259  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 9m 29s (remain 18m 44s) Loss: 0.



EVAL: [0/894] Elapsed 0m 0s (remain 12m 2s) Loss: 0.0010(0.0010) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0001(0.0205) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 46s) Loss: 0.0032(0.0182) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0001(0.0197) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0000(0.0176) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0022(0.0189) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0000(0.0199) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.1084(0.0211) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0036(0.0206) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0194) 
Epoch 3 - avg_train_loss: 0.0046  avg_val_loss: 0.0194  time: 1986s
Epoch 3 - Score: 0.8881
Epoch 3 - Save Best Score: 0.8881 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 52m 25s) Loss: 0.0096(0.0096) Grad: 3180.8752  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0009(0.0028) Grad: 486.4574  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0200(0.0041) Grad: 6091.1392  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 3m 10s (remain 25m 8s) Loss: 0.0008(0.0040) Grad: 505.4217  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0088(0.0042) Grad: 3200.5654  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0000(0.0041) Grad: 60.3278  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 6m 20s (remain 21m 56s) Loss: 0.0002(0.0044) Grad: 144.9449  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0005(0.0043) Grad: 341.0161  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 8m 26s (remain 19m 49s) Loss: 0.0001(0.0042) Grad: 144.4004  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss: 0



EVAL: [0/894] Elapsed 0m 0s (remain 11m 58s) Loss: 0.0010(0.0010) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0001(0.0203) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0033(0.0180) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 14s) Loss: 0.0002(0.0195) 
EVAL: [400/894] Elapsed 2m 11s (remain 2m 41s) Loss: 0.0000(0.0175) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0021(0.0188) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0000(0.0198) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.1117(0.0210) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0035(0.0205) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0193) 
Epoch 4 - avg_train_loss: 0.0044  avg_val_loss: 0.0193  time: 1988s
Epoch 4 - Score: 0.8876




Epoch: [5][0/2681] Elapsed 0m 1s (remain 52m 27s) Loss: 0.0001(0.0001) Grad: 44.5397  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 1m 4s (remain 27m 25s) Loss: 0.0105(0.0046) Grad: 4741.7988  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0053(0.0039) Grad: 2232.1726  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0007(0.0042) Grad: 719.7734  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0004(0.0048) Grad: 617.2029  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0087(0.0047) Grad: 3688.8933  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 6m 20s (remain 21m 55s) Loss: 0.0383(0.0048) Grad: 7587.3511  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0091(0.0046) Grad: 4943.7700  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 8m 26s (remain 19m 48s) Loss: 0.0004(0.0044) Grad: 454.5786  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss:



Epoch: [5][2680/2681] Elapsed 28m 13s (remain 0m 0s) Loss: 0.0003(0.0043) Grad: 731.5002  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 12m 6s) Loss: 0.0010(0.0010) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0001(0.0203) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0031(0.0180) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0001(0.0195) 
EVAL: [400/894] Elapsed 2m 11s (remain 2m 41s) Loss: 0.0000(0.0174) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0018(0.0187) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0000(0.0197) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.1073(0.0210) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0034(0.0205) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0193) 
Epoch 5 - avg_train_loss: 0.0043  avg_val_loss: 0.0193  time: 1987s
Epoch 5 - Score: 0.8876


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_p

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp066/fold3_best.pth




Epoch: [1][0/2681] Elapsed 0m 1s (remain 53m 18s) Loss: 0.7141(0.7141) Grad: 55104.5938  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 1m 4s (remain 27m 22s) Loss: 0.6717(0.7004) Grad: 52545.5469  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 2m 7s (remain 26m 12s) Loss: 0.5214(0.6542) Grad: 58449.2422  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 3m 10s (remain 25m 6s) Loss: 0.2022(0.5592) Grad: 53201.7812  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0278(0.4425) Grad: 7518.7617  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0088(0.3577) Grad: 1086.3611  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0269(0.3000) Grad: 1419.2355  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 7m 22s (remain 20m 51s) Loss: 0.0057(0.2584) Grad: 974.1198  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 8m 26s (remain 19m 47s) Loss: 0.0019(0.2272) Grad: 468.8759  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 9m 29s (remain 18m 44s



Epoch: [1][2680/2681] Elapsed 28m 12s (remain 0m 0s) Loss: 0.0003(0.0719) Grad: 386.3389  LR: 0.000018  




EVAL: [0/894] Elapsed 0m 0s (remain 12m 24s) Loss: 0.0057(0.0057) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0004(0.0144) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0646(0.0170) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0442(0.0166) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0222(0.0158) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0179(0.0182) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0255(0.0186) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0204(0.0186) 
EVAL: [800/894] Elapsed 4m 20s (remain 0m 30s) Loss: 0.0016(0.0182) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0002(0.0173) 
Epoch 1 - avg_train_loss: 0.0719  avg_val_loss: 0.0173  time: 1986s
Epoch 1 - Score: 0.8853
Epoch 1 - Save Best Score: 0.8853 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 52m 2s) Loss: 0.0001(0.0001) Grad: 41.7841  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 1m 4s (remain 27m 24s) Loss: 0.0007(0.0058) Grad: 399.2690  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 2m 7s (remain 26m 12s) Loss: 0.0077(0.0053) Grad: 2753.3420  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0084(0.0052) Grad: 2699.6711  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 4m 13s (remain 24m 2s) Loss: 0.0013(0.0053) Grad: 588.5737  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 5m 16s (remain 22m 58s) Loss: 0.0001(0.0049) Grad: 51.5795  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 6m 19s (remain 21m 54s) Loss: 0.0070(0.0047) Grad: 1872.7284  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 7m 23s (remain 20m 51s) Loss: 0.0065(0.0046) Grad: 1863.8700  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 8m 26s (remain 19m 47s) Loss: 0.0003(0.0046) Grad: 117.3682  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 9m 29s (remain 18m 44s) Loss: 0.



Epoch: [2][2680/2681] Elapsed 28m 12s (remain 0m 0s) Loss: 0.0000(0.0049) Grad: 44.3579  LR: 0.000013  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 58s) Loss: 0.0052(0.0052) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0003(0.0163) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0746(0.0193) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0447(0.0189) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0219(0.0181) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0196(0.0208) 
EVAL: [600/894] Elapsed 3m 15s (remain 1m 35s) Loss: 0.0274(0.0214) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0245(0.0214) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0010(0.0209) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0198) 
Epoch 2 - avg_train_loss: 0.0049  avg_val_loss: 0.0198  time: 1986s
Epoch 2 - Score: 0.8861
Epoch 2 - Save Best Score: 0.8861 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 52m 54s) Loss: 0.0011(0.0011) Grad: 805.2070  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 1m 4s (remain 27m 25s) Loss: 0.0003(0.0062) Grad: 229.0934  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0004(0.0055) Grad: 318.5094  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0029(0.0055) Grad: 1486.9294  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0002(0.0055) Grad: 132.7601  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0013(0.0052) Grad: 1099.1113  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 6m 20s (remain 21m 56s) Loss: 0.0025(0.0051) Grad: 1439.1260  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0002(0.0051) Grad: 130.7961  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 8m 26s (remain 19m 49s) Loss: 0.0006(0.0048) Grad: 445.6460  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 9m 29s (remain 18m 45s) Loss: 



Epoch: [3][2680/2681] Elapsed 28m 13s (remain 0m 0s) Loss: 0.0001(0.0047) Grad: 106.3244  LR: 0.000009  




EVAL: [0/894] Elapsed 0m 0s (remain 12m 1s) Loss: 0.0041(0.0041) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 21s) Loss: 0.0004(0.0143) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0569(0.0169) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 13s) Loss: 0.0423(0.0167) 
EVAL: [400/894] Elapsed 2m 10s (remain 2m 40s) Loss: 0.0168(0.0160) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0171(0.0184) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0242(0.0190) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0231(0.0189) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0007(0.0184) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0001(0.0175) 
Epoch 3 - avg_train_loss: 0.0047  avg_val_loss: 0.0175  time: 1987s
Epoch 3 - Score: 0.8869
Epoch 3 - Save Best Score: 0.8869 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 52m 3s) Loss: 0.0051(0.0051) Grad: 2655.7786  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 1m 4s (remain 27m 23s) Loss: 0.0000(0.0043) Grad: 15.8979  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 2m 7s (remain 26m 13s) Loss: 0.0001(0.0042) Grad: 129.9810  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 3m 10s (remain 25m 7s) Loss: 0.0010(0.0045) Grad: 840.8720  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 4m 13s (remain 24m 3s) Loss: 0.0001(0.0042) Grad: 88.7952  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 5m 17s (remain 22m 59s) Loss: 0.0001(0.0043) Grad: 93.2377  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 6m 20s (remain 21m 56s) Loss: 0.0039(0.0046) Grad: 2002.0087  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 7m 23s (remain 20m 52s) Loss: 0.0093(0.0047) Grad: 4676.5293  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 8m 26s (remain 19m 49s) Loss: 0.0271(0.0047) Grad: 5822.5156  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 9m 30s (remain 18m 46s) Loss: 0.0



Epoch: [4][2680/2681] Elapsed 28m 15s (remain 0m 0s) Loss: 0.0078(0.0046) Grad: 8292.4180  LR: 0.000004  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 45s) Loss: 0.0056(0.0056) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 22s) Loss: 0.0003(0.0155) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0645(0.0184) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 14s) Loss: 0.0455(0.0180) 
EVAL: [400/894] Elapsed 2m 11s (remain 2m 41s) Loss: 0.0193(0.0171) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0166(0.0198) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0263(0.0203) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0248(0.0203) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0006(0.0198) 




EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0188) 
Epoch 4 - avg_train_loss: 0.0046  avg_val_loss: 0.0188  time: 1989s
Epoch 4 - Score: 0.8867




Epoch: [5][0/2681] Elapsed 0m 1s (remain 52m 9s) Loss: 0.0040(0.0040) Grad: 3136.3979  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 1m 4s (remain 27m 26s) Loss: 0.0004(0.0042) Grad: 355.4073  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 2m 7s (remain 26m 15s) Loss: 0.0001(0.0045) Grad: 140.7210  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 3m 10s (remain 25m 9s) Loss: 0.0033(0.0044) Grad: 2360.5388  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 4m 14s (remain 24m 4s) Loss: 0.0004(0.0044) Grad: 520.1922  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 5m 17s (remain 23m 0s) Loss: 0.0001(0.0044) Grad: 56.7684  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 6m 20s (remain 21m 56s) Loss: 0.0010(0.0047) Grad: 1157.1301  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 7m 23s (remain 20m 53s) Loss: 0.0000(0.0048) Grad: 14.9413  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 8m 27s (remain 19m 50s) Loss: 0.0010(0.0049) Grad: 1175.1434  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 9m 30s (remain 18m 46s) Loss: 0.0



Epoch: [5][2680/2681] Elapsed 28m 15s (remain 0m 0s) Loss: 0.0000(0.0046) Grad: 29.2244  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 11m 56s) Loss: 0.0053(0.0053) 
EVAL: [100/894] Elapsed 0m 33s (remain 4m 22s) Loss: 0.0003(0.0153) 
EVAL: [200/894] Elapsed 1m 5s (remain 3m 47s) Loss: 0.0633(0.0182) 
EVAL: [300/894] Elapsed 1m 38s (remain 3m 14s) Loss: 0.0445(0.0178) 
EVAL: [400/894] Elapsed 2m 11s (remain 2m 41s) Loss: 0.0192(0.0169) 
EVAL: [500/894] Elapsed 2m 43s (remain 2m 8s) Loss: 0.0162(0.0195) 
EVAL: [600/894] Elapsed 3m 16s (remain 1m 35s) Loss: 0.0261(0.0201) 
EVAL: [700/894] Elapsed 3m 48s (remain 1m 2s) Loss: 0.0244(0.0201) 
EVAL: [800/894] Elapsed 4m 21s (remain 0m 30s) Loss: 0.0006(0.0195) 
EVAL: [893/894] Elapsed 4m 51s (remain 0m 0s) Loss: 0.0000(0.0185) 
Epoch 5 - avg_train_loss: 0.0046  avg_val_loss: 0.0185  time: 1989s
Epoch 5 - Score: 0.8869
best_thres: 0.49  score: 0.88520


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp067/fold0_best.pth




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
    close()
    close()  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
        self._close()
self._close()  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the a

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp067/fold1_best.pth




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
        close()close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
        _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self._close()self.run()

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
  File "/usr/lib/python3.8/threading.py", line 870, in run
        self._t

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp067/fold2_best.pth




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()    self._close()

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
        _close(self._handle)
_close(self._handle)
OSErrorOSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
: [Errno 9] Bad file descriptor

During handling of the a

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp067/fold3_best.pth




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
        close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
        self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
self._close()    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/connec