## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [1]:
EXP_NAME = "nbme-exp066"
ENV = "local"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [2]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-xlarge"
    tokenizer=None
    max_len=None
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=4
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=4
    train_fold=[0, 1, 2, 3]
    seed=71
    gradient_accumulation_steps=1
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [3]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [4]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

local


In [5]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [6]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForMaskedLM
from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [7]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [8]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5):
    labels = create_labels_for_scoring(df)

    token_probs = df[[str(i) for i in range(CFG.max_len)]].values
    char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [9]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
seed_everything()

## Data Loading

In [11]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [12]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [13]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [14]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [15]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [16]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8181
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [17]:
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

## Setup tokenizer

In [18]:
if CFG.submission:
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

In [19]:
tmp = 'dad with recent heart attack'
encode = tokenizer(tmp, return_offsets_mapping=True)
for (start,end) in encode['offset_mapping']:
    print(f"'{tmp[start:end]}', {start}, {end}")

print("ans")
print("""
'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0
""")

'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0
ans

'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0



## Create dataset

In [20]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 433


In [21]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 30


In [22]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 466


In [23]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_label(self, pn_history, annotation_length, location_list):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        offset_mapping = encoded["offset_mapping"]
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1

        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    start_idx = -1
                    end_idx = -1
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1

        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        return input_, label

In [24]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        return input_

## Model

In [25]:
# ====================================================
# Model
# ====================================================
class MaskedModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.pretrained_model_name,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.pretrained_model_name, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_model_name, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

In [26]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)
            # itpt = AutoModelForMaskedLM.from_config(self.model_config)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp010/checkpoint-130170/pytorch_model.bin")
            # path = "../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin"
            # state_dict = torch.load(path)
            # itpt.load_state_dict(state_dict)
            path = "../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin"
            masked_model = MaskedModel(CFG, config_path=None, pretrained=True)
            state = torch.load(path, map_location=torch.device("cpu"))
            masked_model.load_state_dict(state)
            self.backbone = masked_model.model
            print(f"Load weight from {path}")

        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs):
        h = self.backbone(**inputs)["last_hidden_state"]
        output = self.fc(h)
        return output

## Training

In [27]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    return losses.avg

In [28]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.no_grad():
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [29]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            output = model(inputs)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [30]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainingDataset(CFG, train_folds)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    #model = CustomModel(CFG, model_config_path=None, pretrained=True)
    model = CustomModel(CFG, model_config_path=None, pretrained=False)   # itptを使うため
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_len)]] = val_preds
        score = scoring(val_folds, th=0.5)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [31]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    best_thres = 0.5
    best_score = 0.
    for th in np.arange(0.45, 0.55, 0.01):
        th = np.round(th, 2)
        score = scoring(oof_df, th=th)
        if best_score < score:
            best_thres = th
            best_score = score
    print(f"best_thres: {best_thres}  score: {best_score:.5f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            test_token_probs = inference_fn(test_dataloader, model, device)
            test[[f"fold{i_fold}_{i}" for i in range(CFG.max_len)]] = test_token_probs
            test_char_probs = get_char_probs(test["pn_history"].values, test_token_probs, CFG.tokenizer)
            predictions.append(test_char_probs)

            del state, test_token_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [32]:
if __name__ == "__main__":
    main()



Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', '

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin
Epoch: [1][0/2681] Elapsed 0m 0s (remain 39m 58s) Loss: 0.8121(0.8121) Grad: inf  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 0m 56s (remain 24m 0s) Loss: 0.1963(0.5512) Grad: 78507.8281  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 1m 51s (remain 23m 0s) Loss: 0.0881(0.3240) Grad: 17285.6797  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 2m 47s (remain 22m 3s) Loss: 0.1075(0.2371) Grad: 50321.4961  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 3m 43s (remain 21m 8s) Loss: 0.0379(0.1863) Grad: 31640.4727  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 4m 38s (remain 20m 12s) Loss: 0.0057(0.1542) Grad: 5957.8691  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 5m 34s (remain 19m 16s) Loss: 0.0009(0.1314) Grad: 2949.3943  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 6m 29s (remain 18m 20s) Loss: 0.0116(0.1154) Grad: 6106.6758  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 7m 25s (remain 17



EVAL: [0/894] Elapsed 0m 0s (remain 4m 57s) Loss: 0.0030(0.0030) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 43s) Loss: 0.0001(0.0113) 
EVAL: [200/894] Elapsed 0m 25s (remain 1m 29s) Loss: 0.0090(0.0118) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0056(0.0121) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0187(0.0117) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0066(0.0133) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0060(0.0142) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0035(0.0143) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0024(0.0137) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0003(0.0131) 
Epoch 1 - avg_train_loss: 0.0431  avg_val_loss: 0.0131  time: 1606s
Epoch 1 - Score: 0.8491
Epoch 1 - Save Best Score: 0.8491 Model




Epoch: [2][0/2681] Elapsed 0m 0s (remain 35m 55s) Loss: 0.0023(0.0023) Grad: 4122.7847  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 0m 56s (remain 23m 51s) Loss: 0.0001(0.0112) Grad: 706.4938  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 1m 51s (remain 22m 52s) Loss: 0.0001(0.0110) Grad: 367.2842  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 2m 46s (remain 21m 57s) Loss: 0.0004(0.0105) Grad: 1455.3451  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 3m 42s (remain 21m 2s) Loss: 0.0048(0.0110) Grad: 15592.0850  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 4m 37s (remain 20m 6s) Loss: 0.0224(0.0111) Grad: 17369.6094  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 5m 32s (remain 19m 11s) Loss: 0.0009(0.0109) Grad: 1748.6824  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 6m 27s (remain 18m 15s) Loss: 0.0001(0.0108) Grad: 99.1142  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 7m 23s (remain 17m 20s) Loss: 0.0158(0.0113) Grad: 6521.2417  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 8m 18s (remain 16m 24s) 



Epoch: [2][2680/2681] Elapsed 24m 43s (remain 0m 0s) Loss: 0.0001(0.0112) Grad: 210.3608  LR: 0.000013  




EVAL: [0/894] Elapsed 0m 0s (remain 4m 52s) Loss: 0.0027(0.0027) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 43s) Loss: 0.0000(0.0098) 
EVAL: [200/894] Elapsed 0m 25s (remain 1m 29s) Loss: 0.0087(0.0112) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0130(0.0118) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0360(0.0111) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0012(0.0128) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0029(0.0147) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0206(0.0148) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0021(0.0142) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0135) 
Epoch 2 - avg_train_loss: 0.0112  avg_val_loss: 0.0135  time: 1603s
Epoch 2 - Score: 0.8734
Epoch 2 - Save Best Score: 0.8734 Model




Epoch: [3][0/2681] Elapsed 0m 0s (remain 36m 6s) Loss: 0.0000(0.0000) Grad: 277.6299  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 0m 55s (remain 23m 50s) Loss: 0.0124(0.0094) Grad: 19807.5820  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 1m 51s (remain 22m 52s) Loss: 0.0110(0.0081) Grad: 31514.4199  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 2m 46s (remain 21m 56s) Loss: 0.0616(0.0085) Grad: 15892.6191  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 3m 41s (remain 21m 0s) Loss: 0.0011(0.0082) Grad: 6826.9429  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 4m 36s (remain 20m 4s) Loss: 0.0009(0.0088) Grad: 2027.2095  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 5m 32s (remain 19m 9s) Loss: 0.0063(0.0089) Grad: 9560.1406  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 6m 27s (remain 18m 14s) Loss: 0.0002(0.0090) Grad: 378.4389  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 7m 22s (remain 17m 18s) Loss: 0.0060(0.0089) Grad: 9074.1523  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 8m 17s (remain 16m 23s)



Epoch: [3][2680/2681] Elapsed 24m 41s (remain 0m 0s) Loss: 0.0029(0.0082) Grad: 11715.0078  LR: 0.000009  




EVAL: [0/894] Elapsed 0m 0s (remain 4m 58s) Loss: 0.0011(0.0011) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 43s) Loss: 0.0000(0.0117) 
EVAL: [200/894] Elapsed 0m 25s (remain 1m 29s) Loss: 0.0100(0.0120) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0077(0.0119) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0437(0.0108) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0008(0.0129) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0006(0.0147) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0003(0.0145) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0006(0.0140) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0134) 
Epoch 3 - avg_train_loss: 0.0082  avg_val_loss: 0.0134  time: 1601s
Epoch 3 - Score: 0.8798
Epoch 3 - Save Best Score: 0.8798 Model




Epoch: [4][0/2681] Elapsed 0m 0s (remain 36m 2s) Loss: 0.0058(0.0058) Grad: 26790.1504  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 0m 55s (remain 23m 48s) Loss: 0.0005(0.0053) Grad: 1699.7330  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 1m 51s (remain 22m 52s) Loss: 0.0031(0.0054) Grad: 28456.4043  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 2m 46s (remain 21m 56s) Loss: 0.0267(0.0055) Grad: 125925.1328  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 3m 41s (remain 21m 0s) Loss: 0.0000(0.0061) Grad: 92.0513  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 4m 36s (remain 20m 4s) Loss: 0.0131(0.0063) Grad: 26497.2617  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 5m 32s (remain 19m 9s) Loss: 0.0000(0.0061) Grad: 52.4845  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 6m 27s (remain 18m 13s) Loss: 0.0000(0.0058) Grad: 29.7766  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 7m 22s (remain 17m 18s) Loss: 0.0000(0.0058) Grad: 56.8544  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 8m 17s (remain 16m 23s) Los



Epoch: [4][2680/2681] Elapsed 24m 39s (remain 0m 0s) Loss: 0.0001(0.0058) Grad: 345.6910  LR: 0.000004  




EVAL: [0/894] Elapsed 0m 0s (remain 5m 1s) Loss: 0.0003(0.0003) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 43s) Loss: 0.0000(0.0121) 
EVAL: [200/894] Elapsed 0m 25s (remain 1m 29s) Loss: 0.0118(0.0139) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0141(0.0142) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0394(0.0129) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0017(0.0154) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0033(0.0173) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0001(0.0173) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0036(0.0167) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0161) 
Epoch 4 - avg_train_loss: 0.0058  avg_val_loss: 0.0161  time: 1600s
Epoch 4 - Score: 0.8811
Epoch 4 - Save Best Score: 0.8811 Model




Epoch: [5][0/2681] Elapsed 0m 0s (remain 35m 58s) Loss: 0.0003(0.0003) Grad: 2742.3743  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 0m 55s (remain 23m 47s) Loss: 0.0103(0.0033) Grad: 15999.1748  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 1m 50s (remain 22m 49s) Loss: 0.0102(0.0037) Grad: 79884.3594  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 2m 46s (remain 21m 52s) Loss: 0.0062(0.0039) Grad: 28205.1133  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 3m 40s (remain 20m 56s) Loss: 0.0000(0.0039) Grad: 4.6683  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 4m 36s (remain 20m 1s) Loss: 0.0044(0.0039) Grad: 11876.0615  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 5m 31s (remain 19m 5s) Loss: 0.0187(0.0039) Grad: 18578.6426  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 6m 26s (remain 18m 10s) Loss: 0.0000(0.0039) Grad: 37.0401  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 7m 21s (remain 17m 15s) Loss: 0.0016(0.0040) Grad: 10262.4854  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 8m 16s (remain 16m 20



Epoch: [5][2680/2681] Elapsed 24m 35s (remain 0m 0s) Loss: 0.0685(0.0038) Grad: 260267.9219  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 5m 1s) Loss: 0.0001(0.0001) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 43s) Loss: 0.0000(0.0137) 
EVAL: [200/894] Elapsed 0m 25s (remain 1m 29s) Loss: 0.0122(0.0154) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0184(0.0161) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0543(0.0149) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0066(0.0180) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0040(0.0198) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0001(0.0202) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0018(0.0196) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0188) 
Epoch 5 - avg_train_loss: 0.0038  avg_val_loss: 0.0188  time: 1596s
Epoch 5 - Score: 0.8827
Epoch 5 - Save Best Score: 0.8827 Model


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', '

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin




Epoch: [1][0/2681] Elapsed 0m 0s (remain 43m 33s) Loss: 0.7146(0.7146) Grad: inf  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 0m 56s (remain 24m 4s) Loss: 0.1598(0.5036) Grad: 68297.2500  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 1m 52s (remain 23m 2s) Loss: 0.0539(0.2960) Grad: 8778.4893  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 2m 47s (remain 22m 6s) Loss: 0.0497(0.2160) Grad: 168020.9062  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 3m 43s (remain 21m 9s) Loss: 0.0374(0.1692) Grad: 98323.3516  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 4m 38s (remain 20m 13s) Loss: 0.0092(0.1407) Grad: 17929.8379  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 5m 34s (remain 19m 17s) Loss: 0.0008(0.1208) Grad: 2721.0647  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 6m 30s (remain 18m 21s) Loss: 0.0463(0.1064) Grad: 59968.2578  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 7m 25s (remain 17m 26s) Loss: 0.0090(0.0957) Grad: 22609.2285  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 8m 21s (remain 16m 30s)



EVAL: [0/894] Elapsed 0m 0s (remain 8m 4s) Loss: 0.0225(0.0225) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 44s) Loss: 0.0011(0.0112) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0368(0.0159) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0193(0.0169) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0014(0.0161) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0196(0.0172) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0072(0.0173) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0000(0.0167) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0088(0.0158) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0272(0.0146) 
Epoch 1 - avg_train_loss: 0.0401  avg_val_loss: 0.0146  time: 1611s
Epoch 1 - Score: 0.8563
Epoch 1 - Save Best Score: 0.8563 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 45m 45s) Loss: 0.0004(0.0004) Grad: 1550.2314  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 0m 56s (remain 24m 2s) Loss: 0.0000(0.0098) Grad: 14.5993  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 1m 52s (remain 23m 2s) Loss: 0.0002(0.0105) Grad: 707.6400  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 2m 47s (remain 22m 4s) Loss: 0.0051(0.0112) Grad: 9073.1914  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 3m 42s (remain 21m 7s) Loss: 0.0261(0.0112) Grad: 67889.3672  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 4m 38s (remain 20m 10s) Loss: 0.0002(0.0112) Grad: 841.0383  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 5m 33s (remain 19m 15s) Loss: 0.0038(0.0115) Grad: 3589.9844  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 6m 29s (remain 18m 19s) Loss: 0.0015(0.0116) Grad: 12262.4463  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 7m 24s (remain 17m 23s) Loss: 0.0045(0.0115) Grad: 3677.6150  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 8m 20s (remain 16m 27s) Lo



EVAL: [0/894] Elapsed 0m 0s (remain 8m 7s) Loss: 0.0232(0.0232) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0035(0.0096) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0375(0.0147) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0288(0.0155) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0016(0.0149) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0130(0.0158) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0192(0.0161) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0001(0.0155) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0130(0.0147) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0257(0.0135) 
Epoch 2 - avg_train_loss: 0.0118  avg_val_loss: 0.0135  time: 1607s
Epoch 2 - Score: 0.8630
Epoch 2 - Save Best Score: 0.8630 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 45m 25s) Loss: 0.0188(0.0188) Grad: 17611.8418  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 0m 56s (remain 24m 6s) Loss: 0.0085(0.0105) Grad: 25125.0195  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 1m 51s (remain 23m 1s) Loss: 0.0000(0.0088) Grad: 597.4589  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 2m 47s (remain 22m 2s) Loss: 0.0375(0.0091) Grad: 49659.6680  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 3m 42s (remain 21m 6s) Loss: 0.0008(0.0090) Grad: 3150.3855  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 4m 38s (remain 20m 10s) Loss: 0.0008(0.0088) Grad: 3423.0784  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 5m 33s (remain 19m 14s) Loss: 0.0029(0.0086) Grad: 7466.3555  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 6m 28s (remain 18m 18s) Loss: 0.0209(0.0084) Grad: 30629.4824  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 7m 24s (remain 17m 22s) Loss: 0.0102(0.0086) Grad: 31126.9004  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 8m 19s (remain 16m 2



EVAL: [0/894] Elapsed 0m 0s (remain 8m 16s) Loss: 0.0251(0.0251) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0153(0.0123) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0473(0.0188) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0383(0.0202) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0002(0.0191) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0264(0.0197) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0290(0.0199) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0000(0.0188) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0534(0.0179) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0135(0.0165) 
Epoch 3 - avg_train_loss: 0.0082  avg_val_loss: 0.0165  time: 1605s
Epoch 3 - Score: 0.8786
Epoch 3 - Save Best Score: 0.8786 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 44m 57s) Loss: 0.0101(0.0101) Grad: 25467.9590  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 0m 56s (remain 24m 1s) Loss: 0.0003(0.0070) Grad: 4099.1211  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 1m 51s (remain 22m 58s) Loss: 0.0128(0.0069) Grad: 15409.0771  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 2m 46s (remain 22m 0s) Loss: 0.0074(0.0061) Grad: 32531.3047  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 3m 42s (remain 21m 3s) Loss: 0.0001(0.0058) Grad: 261.2540  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 4m 37s (remain 20m 7s) Loss: 0.0526(0.0059) Grad: 64010.9102  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 5m 32s (remain 19m 11s) Loss: 0.0196(0.0062) Grad: 10959.7969  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 6m 27s (remain 18m 15s) Loss: 0.0036(0.0062) Grad: 29576.1055  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 7m 23s (remain 17m 19s) Loss: 0.0000(0.0060) Grad: 29.5572  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 8m 18s (remain 16m 24



Epoch: [4][2680/2681] Elapsed 24m 41s (remain 0m 0s) Loss: 0.0026(0.0061) Grad: 83775.5938  LR: 0.000004  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 11s) Loss: 0.0328(0.0328) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0130(0.0142) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0586(0.0219) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0452(0.0238) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0000(0.0223) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0300(0.0237) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0452(0.0241) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0000(0.0232) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0360(0.0221) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0108(0.0204) 
Epoch 4 - avg_train_loss: 0.0061  avg_val_loss: 0.0204  time: 1603s
Epoch 4 - Score: 0.8776




Epoch: [5][0/2681] Elapsed 0m 1s (remain 45m 41s) Loss: 0.0000(0.0000) Grad: 451.9927  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 0m 56s (remain 23m 55s) Loss: 0.0003(0.0032) Grad: 1609.8801  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 1m 51s (remain 22m 53s) Loss: 0.0215(0.0041) Grad: 248607.5000  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 2m 46s (remain 21m 55s) Loss: 0.0000(0.0045) Grad: 23.2836  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 3m 41s (remain 20m 59s) Loss: 0.0123(0.0044) Grad: 61439.9297  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 4m 36s (remain 20m 4s) Loss: 0.0075(0.0044) Grad: 62357.4844  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 5m 31s (remain 19m 8s) Loss: 0.0000(0.0043) Grad: 115.9431  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 6m 27s (remain 18m 13s) Loss: 0.0089(0.0042) Grad: 234015.5469  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 7m 22s (remain 17m 17s) Loss: 0.0630(0.0042) Grad: 23240.2773  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 8m 17s (remain 16m 



EVAL: [0/894] Elapsed 0m 0s (remain 8m 10s) Loss: 0.0339(0.0339) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0048(0.0136) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0489(0.0210) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0448(0.0229) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0000(0.0216) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0210(0.0229) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0362(0.0232) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0000(0.0222) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0427(0.0211) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0095(0.0195) 
Epoch 5 - avg_train_loss: 0.0042  avg_val_loss: 0.0195  time: 1599s
Epoch 5 - Score: 0.8761


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', '

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin




Epoch: [1][0/2681] Elapsed 0m 0s (remain 44m 26s) Loss: 0.7507(0.7507) Grad: inf  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 0m 56s (remain 24m 3s) Loss: 0.1411(0.5086) Grad: 28175.8906  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 1m 51s (remain 23m 0s) Loss: 0.0538(0.2940) Grad: 3730.4683  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 2m 47s (remain 22m 4s) Loss: 0.0229(0.2122) Grad: 10768.3203  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 3m 43s (remain 21m 8s) Loss: 0.0063(0.1674) Grad: 4831.8120  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 4m 38s (remain 20m 12s) Loss: 0.0108(0.1391) Grad: 4893.3076  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 5m 34s (remain 19m 16s) Loss: 0.0173(0.1199) Grad: 6388.0977  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 6m 29s (remain 18m 20s) Loss: 0.0005(0.1058) Grad: 253.4816  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 7m 25s (remain 17m 25s) Loss: 0.0045(0.0953) Grad: 1686.4609  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 8m 20s (remain 16m 29s) Loss:



EVAL: [0/894] Elapsed 0m 0s (remain 8m 38s) Loss: 0.0027(0.0027) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0061(0.0129) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0006(0.0123) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0030(0.0130) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0016(0.0117) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0047(0.0123) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0091(0.0129) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0386(0.0137) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0025(0.0130) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0007(0.0124) 
Epoch 1 - avg_train_loss: 0.0398  avg_val_loss: 0.0124  time: 1609s
Epoch 1 - Score: 0.8603
Epoch 1 - Save Best Score: 0.8603 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 46m 44s) Loss: 0.0006(0.0006) Grad: 2377.9453  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 0m 56s (remain 24m 5s) Loss: 0.0050(0.0089) Grad: 11827.9912  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 1m 52s (remain 23m 2s) Loss: 0.0002(0.0093) Grad: 553.0225  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 2m 47s (remain 22m 4s) Loss: 0.0007(0.0108) Grad: 2944.2153  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 3m 42s (remain 21m 7s) Loss: 0.0295(0.0108) Grad: 64155.1523  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 4m 38s (remain 20m 10s) Loss: 0.0249(0.0107) Grad: 49751.2734  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 5m 33s (remain 19m 15s) Loss: 0.0005(0.0115) Grad: 1894.0925  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 6m 29s (remain 18m 19s) Loss: 0.0236(0.0113) Grad: 40425.8633  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 7m 24s (remain 17m 23s) Loss: 0.0113(0.0115) Grad: 70418.1406  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 8m 20s (remain 16m 2



EVAL: [0/894] Elapsed 0m 0s (remain 8m 36s) Loss: 0.0014(0.0014) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0056(0.0151) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0028(0.0129) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0024(0.0142) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0002(0.0127) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0027(0.0139) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0000(0.0151) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0585(0.0156) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0049(0.0148) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0000(0.0140) 
Epoch 2 - avg_train_loss: 0.0110  avg_val_loss: 0.0140  time: 1607s
Epoch 2 - Score: 0.8806
Epoch 2 - Save Best Score: 0.8806 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 46m 47s) Loss: 0.0008(0.0008) Grad: 2382.3699  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 0m 56s (remain 24m 0s) Loss: 0.0041(0.0068) Grad: 9214.9492  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 1m 51s (remain 22m 59s) Loss: 0.0007(0.0071) Grad: 6656.0083  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 2m 47s (remain 22m 1s) Loss: 0.0184(0.0078) Grad: 865916.5625  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 3m 42s (remain 21m 5s) Loss: 0.0018(0.0077) Grad: 21942.4316  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 4m 37s (remain 20m 8s) Loss: 0.0000(0.0076) Grad: 107.0134  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 5m 33s (remain 19m 13s) Loss: 0.0001(0.0075) Grad: 718.5249  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 6m 28s (remain 18m 17s) Loss: 0.0002(0.0080) Grad: 769.9167  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 7m 23s (remain 17m 21s) Loss: 0.0049(0.0082) Grad: 20779.7871  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 8m 19s (remain 16m 25s)



EVAL: [0/894] Elapsed 0m 0s (remain 8m 36s) Loss: 0.0023(0.0023) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0004(0.0115) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0086(0.0113) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 17s) Loss: 0.0013(0.0124) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0000(0.0109) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0029(0.0118) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0000(0.0125) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0504(0.0128) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0029(0.0123) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0001(0.0117) 
Epoch 3 - avg_train_loss: 0.0084  avg_val_loss: 0.0117  time: 1605s
Epoch 3 - Score: 0.8820
Epoch 3 - Save Best Score: 0.8820 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 46m 56s) Loss: 0.0294(0.0294) Grad: 112462.6953  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 0m 56s (remain 23m 57s) Loss: 0.0012(0.0058) Grad: 23168.2715  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 1m 51s (remain 22m 57s) Loss: 0.0000(0.0058) Grad: 76.3402  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 2m 46s (remain 21m 58s) Loss: 0.0016(0.0058) Grad: 23473.5449  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 3m 42s (remain 21m 2s) Loss: 0.0000(0.0059) Grad: 48.1605  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 4m 37s (remain 20m 6s) Loss: 0.0000(0.0059) Grad: 28.3536  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 5m 32s (remain 19m 10s) Loss: 0.0001(0.0056) Grad: 3474.8064  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 6m 27s (remain 18m 15s) Loss: 0.0612(0.0058) Grad: 95647.9219  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 7m 22s (remain 17m 19s) Loss: 0.0000(0.0059) Grad: 11.3833  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 8m 18s (remain 16m 24s) L



Epoch: [4][2680/2681] Elapsed 24m 41s (remain 0m 0s) Loss: 0.0001(0.0062) Grad: 2911.8994  LR: 0.000004  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 35s) Loss: 0.0008(0.0008) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0001(0.0168) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0059(0.0150) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0002(0.0158) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0000(0.0141) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0026(0.0150) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0000(0.0158) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0772(0.0168) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0039(0.0163) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0000(0.0154) 
Epoch 4 - avg_train_loss: 0.0062  avg_val_loss: 0.0154  time: 1602s
Epoch 4 - Score: 0.8863
Epoch 4 - Save Best Score: 0.8863 Model




Epoch: [5][0/2681] Elapsed 0m 1s (remain 46m 57s) Loss: 0.0001(0.0001) Grad: 726.1271  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 0m 56s (remain 23m 54s) Loss: 0.0086(0.0050) Grad: 9268.3154  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 1m 51s (remain 22m 56s) Loss: 0.0037(0.0052) Grad: 17326.9023  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 2m 46s (remain 21m 57s) Loss: 0.0073(0.0050) Grad: 34676.6172  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 3m 41s (remain 21m 0s) Loss: 0.0000(0.0050) Grad: 143.7717  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 4m 36s (remain 20m 5s) Loss: 0.0000(0.0048) Grad: 659.5748  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 5m 32s (remain 19m 9s) Loss: 0.0101(0.0045) Grad: 51952.4102  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 6m 27s (remain 18m 13s) Loss: 0.0000(0.0043) Grad: 217.1687  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 7m 22s (remain 17m 18s) Loss: 0.0000(0.0042) Grad: 233.8797  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 8m 17s (remain 16m 22s) L



Epoch: [5][2680/2681] Elapsed 24m 38s (remain 0m 0s) Loss: 0.0001(0.0044) Grad: 3057.0549  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 43s) Loss: 0.0006(0.0006) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0000(0.0188) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0091(0.0173) 
EVAL: [300/894] Elapsed 0m 39s (remain 1m 16s) Loss: 0.0001(0.0186) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0000(0.0166) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0003(0.0180) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0000(0.0188) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0943(0.0199) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0004(0.0193) 
EVAL: [893/894] Elapsed 1m 55s (remain 0m 0s) Loss: 0.0000(0.0183) 
Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0183  time: 1600s
Epoch 5 - Score: 0.8862


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', '

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp045/microsoft-deberta-xlarge-mlm-epoch-v4.bin




Epoch: [1][0/2681] Elapsed 0m 0s (remain 44m 13s) Loss: 0.7411(0.7411) Grad: inf  LR: 0.000000  
Epoch: [1][100/2681] Elapsed 0m 56s (remain 24m 2s) Loss: 0.1134(0.4952) Grad: 23757.6484  LR: 0.000002  
Epoch: [1][200/2681] Elapsed 1m 51s (remain 23m 1s) Loss: 0.0252(0.2888) Grad: 5953.7939  LR: 0.000003  
Epoch: [1][300/2681] Elapsed 2m 47s (remain 22m 5s) Loss: 0.0088(0.2080) Grad: 2198.9417  LR: 0.000004  
Epoch: [1][400/2681] Elapsed 3m 43s (remain 21m 8s) Loss: 0.0512(0.1645) Grad: 27917.3906  LR: 0.000006  
Epoch: [1][500/2681] Elapsed 4m 38s (remain 20m 11s) Loss: 0.0071(0.1361) Grad: 5406.4727  LR: 0.000007  
Epoch: [1][600/2681] Elapsed 5m 33s (remain 19m 15s) Loss: 0.1107(0.1168) Grad: 19078.9434  LR: 0.000009  
Epoch: [1][700/2681] Elapsed 6m 29s (remain 18m 20s) Loss: 0.0221(0.1036) Grad: 5209.6406  LR: 0.000010  
Epoch: [1][800/2681] Elapsed 7m 24s (remain 17m 24s) Loss: 0.0164(0.0932) Grad: 5312.4653  LR: 0.000012  
Epoch: [1][900/2681] Elapsed 8m 20s (remain 16m 28s) Los



Epoch: [1][2680/2681] Elapsed 24m 44s (remain 0m 0s) Loss: 0.0089(0.0399) Grad: 2051.2898  LR: 0.000018  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 40s) Loss: 0.0121(0.0121) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0006(0.0131) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0557(0.0132) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0217(0.0120) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0119(0.0113) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0140(0.0129) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0144(0.0129) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0135(0.0131) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0076(0.0126) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0006(0.0119) 
Epoch 1 - avg_train_loss: 0.0399  avg_val_loss: 0.0119  time: 1604s
Epoch 1 - Score: 0.8516
Epoch 1 - Save Best Score: 0.8516 Model




Epoch: [2][0/2681] Elapsed 0m 1s (remain 46m 27s) Loss: 0.0168(0.0168) Grad: 14606.9951  LR: 0.000018  
Epoch: [2][100/2681] Elapsed 0m 56s (remain 23m 56s) Loss: 0.0002(0.0111) Grad: 439.2554  LR: 0.000018  
Epoch: [2][200/2681] Elapsed 1m 51s (remain 22m 54s) Loss: 0.0026(0.0122) Grad: 15461.7295  LR: 0.000017  
Epoch: [2][300/2681] Elapsed 2m 46s (remain 22m 0s) Loss: 0.0293(0.0121) Grad: 38778.6641  LR: 0.000017  
Epoch: [2][400/2681] Elapsed 3m 42s (remain 21m 4s) Loss: 0.0001(0.0120) Grad: 237.7211  LR: 0.000017  
Epoch: [2][500/2681] Elapsed 4m 37s (remain 20m 7s) Loss: 0.0012(0.0119) Grad: 5995.9980  LR: 0.000017  
Epoch: [2][600/2681] Elapsed 5m 32s (remain 19m 10s) Loss: 0.0009(0.0119) Grad: 11387.6348  LR: 0.000017  
Epoch: [2][700/2681] Elapsed 6m 27s (remain 18m 15s) Loss: 0.0199(0.0125) Grad: 61467.6641  LR: 0.000017  
Epoch: [2][800/2681] Elapsed 7m 22s (remain 17m 19s) Loss: 0.0199(0.0123) Grad: 72969.8984  LR: 0.000016  
Epoch: [2][900/2681] Elapsed 8m 18s (remain 16m 



EVAL: [0/894] Elapsed 0m 0s (remain 8m 39s) Loss: 0.0039(0.0039) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0004(0.0112) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0536(0.0128) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0108(0.0116) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0105(0.0106) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0176(0.0126) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0177(0.0130) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0153(0.0128) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0042(0.0122) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0002(0.0117) 
Epoch 2 - avg_train_loss: 0.0114  avg_val_loss: 0.0117  time: 1601s
Epoch 2 - Score: 0.8782
Epoch 2 - Save Best Score: 0.8782 Model




Epoch: [3][0/2681] Elapsed 0m 1s (remain 46m 13s) Loss: 0.0021(0.0021) Grad: 23223.8438  LR: 0.000013  
Epoch: [3][100/2681] Elapsed 0m 56s (remain 23m 59s) Loss: 0.0092(0.0091) Grad: 36351.5078  LR: 0.000013  
Epoch: [3][200/2681] Elapsed 1m 51s (remain 22m 56s) Loss: 0.0000(0.0096) Grad: 122.2657  LR: 0.000013  
Epoch: [3][300/2681] Elapsed 2m 46s (remain 22m 0s) Loss: 0.0000(0.0091) Grad: 15.3984  LR: 0.000013  
Epoch: [3][400/2681] Elapsed 3m 42s (remain 21m 3s) Loss: 0.0026(0.0090) Grad: 7705.3242  LR: 0.000013  
Epoch: [3][500/2681] Elapsed 4m 37s (remain 20m 7s) Loss: 0.0001(0.0088) Grad: 581.9245  LR: 0.000013  
Epoch: [3][600/2681] Elapsed 5m 32s (remain 19m 11s) Loss: 0.0015(0.0087) Grad: 14233.3369  LR: 0.000012  
Epoch: [3][700/2681] Elapsed 6m 28s (remain 18m 16s) Loss: 0.0000(0.0086) Grad: 131.0315  LR: 0.000012  
Epoch: [3][800/2681] Elapsed 7m 23s (remain 17m 20s) Loss: 0.0172(0.0085) Grad: 472017.7188  LR: 0.000012  
Epoch: [3][900/2681] Elapsed 8m 18s (remain 16m 25s)



Epoch: [3][2680/2681] Elapsed 24m 42s (remain 0m 0s) Loss: 0.0000(0.0087) Grad: 66.1255  LR: 0.000009  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 46s) Loss: 0.0040(0.0040) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0003(0.0127) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0420(0.0143) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0096(0.0136) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0335(0.0127) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0188(0.0151) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0151(0.0156) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0101(0.0154) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0004(0.0148) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0141) 
Epoch 3 - avg_train_loss: 0.0087  avg_val_loss: 0.0141  time: 1603s
Epoch 3 - Score: 0.8840
Epoch 3 - Save Best Score: 0.8840 Model




Epoch: [4][0/2681] Elapsed 0m 1s (remain 46m 22s) Loss: 0.0004(0.0004) Grad: 74854.2578  LR: 0.000009  
Epoch: [4][100/2681] Elapsed 0m 56s (remain 23m 54s) Loss: 0.0001(0.0058) Grad: 175.7405  LR: 0.000009  
Epoch: [4][200/2681] Elapsed 1m 51s (remain 22m 53s) Loss: 0.0145(0.0053) Grad: 4866.7734  LR: 0.000009  
Epoch: [4][300/2681] Elapsed 2m 46s (remain 21m 57s) Loss: 0.0006(0.0054) Grad: 9997.1826  LR: 0.000008  
Epoch: [4][400/2681] Elapsed 3m 41s (remain 21m 0s) Loss: 0.0031(0.0059) Grad: 8991.8428  LR: 0.000008  
Epoch: [4][500/2681] Elapsed 4m 37s (remain 20m 5s) Loss: 0.0156(0.0061) Grad: 19278.6465  LR: 0.000008  
Epoch: [4][600/2681] Elapsed 5m 32s (remain 19m 9s) Loss: 0.0001(0.0062) Grad: 777.4606  LR: 0.000008  
Epoch: [4][700/2681] Elapsed 6m 27s (remain 18m 14s) Loss: 0.0096(0.0063) Grad: 32794.4375  LR: 0.000008  
Epoch: [4][800/2681] Elapsed 7m 22s (remain 17m 18s) Loss: 0.0000(0.0062) Grad: 30.7857  LR: 0.000008  
Epoch: [4][900/2681] Elapsed 8m 17s (remain 16m 23s) 



EVAL: [0/894] Elapsed 0m 0s (remain 8m 32s) Loss: 0.0034(0.0034) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 45s) Loss: 0.0001(0.0115) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0442(0.0129) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0212(0.0127) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0240(0.0120) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0168(0.0138) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0154(0.0142) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0105(0.0141) 
EVAL: [800/894] Elapsed 1m 43s (remain 0m 11s) Loss: 0.0008(0.0138) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0131) 
Epoch 4 - avg_train_loss: 0.0062  avg_val_loss: 0.0131  time: 1600s
Epoch 4 - Score: 0.8860
Epoch 4 - Save Best Score: 0.8860 Model




Epoch: [5][0/2681] Elapsed 0m 1s (remain 46m 20s) Loss: 0.0000(0.0000) Grad: 113.7476  LR: 0.000004  
Epoch: [5][100/2681] Elapsed 0m 56s (remain 23m 54s) Loss: 0.0000(0.0032) Grad: 10.7630  LR: 0.000004  
Epoch: [5][200/2681] Elapsed 1m 51s (remain 22m 52s) Loss: 0.0549(0.0039) Grad: 109082.3203  LR: 0.000004  
Epoch: [5][300/2681] Elapsed 2m 46s (remain 21m 55s) Loss: 0.0001(0.0043) Grad: 627.6280  LR: 0.000004  
Epoch: [5][400/2681] Elapsed 3m 41s (remain 21m 0s) Loss: 0.0000(0.0044) Grad: 20.6574  LR: 0.000004  
Epoch: [5][500/2681] Elapsed 4m 36s (remain 20m 3s) Loss: 0.0029(0.0041) Grad: 11976.1914  LR: 0.000004  
Epoch: [5][600/2681] Elapsed 5m 31s (remain 19m 7s) Loss: 0.0026(0.0041) Grad: 6724.9102  LR: 0.000003  
Epoch: [5][700/2681] Elapsed 6m 26s (remain 18m 12s) Loss: 0.0001(0.0039) Grad: 278.9788  LR: 0.000003  
Epoch: [5][800/2681] Elapsed 7m 21s (remain 17m 16s) Loss: 0.0005(0.0040) Grad: 2624.7629  LR: 0.000003  
Epoch: [5][900/2681] Elapsed 8m 16s (remain 16m 21s) Los



Epoch: [5][2680/2681] Elapsed 24m 37s (remain 0m 0s) Loss: 0.0112(0.0044) Grad: 15123.7021  LR: 0.000000  




EVAL: [0/894] Elapsed 0m 0s (remain 8m 29s) Loss: 0.0033(0.0033) 
EVAL: [100/894] Elapsed 0m 13s (remain 1m 44s) Loss: 0.0002(0.0145) 
EVAL: [200/894] Elapsed 0m 26s (remain 1m 30s) Loss: 0.0537(0.0160) 
EVAL: [300/894] Elapsed 0m 38s (remain 1m 16s) Loss: 0.0171(0.0156) 
EVAL: [400/894] Elapsed 0m 51s (remain 1m 3s) Loss: 0.0262(0.0147) 
EVAL: [500/894] Elapsed 1m 4s (remain 0m 50s) Loss: 0.0197(0.0166) 
EVAL: [600/894] Elapsed 1m 17s (remain 0m 37s) Loss: 0.0204(0.0169) 
EVAL: [700/894] Elapsed 1m 30s (remain 0m 24s) Loss: 0.0181(0.0168) 
EVAL: [800/894] Elapsed 1m 42s (remain 0m 11s) Loss: 0.0003(0.0163) 
EVAL: [893/894] Elapsed 1m 54s (remain 0m 0s) Loss: 0.0000(0.0155) 
Epoch 5 - avg_train_loss: 0.0044  avg_val_loss: 0.0155  time: 1598s
Epoch 5 - Score: 0.8855
best_thres: 0.45  score: 0.88374


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception ignored in: <function _ConnectionBase.__del__ at 0x7f790992e430>
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 132, in __del__
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
        self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
        _close(self._handle)
_close(self._handle)
OSError: [Errno 9] Bad file descriptor
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run

Load weight from pretrained




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 266, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times
Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', '

Load weight from pretrained




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Exception ignored in: <function _ConnectionBase.__del__ at 0x7f790992e430>Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed

Traceback (most recent call last):
    close()  File "/usr/lib/python3.8/multiprocessing/connection.py", line 132, in __del__

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()
    self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
_close(self._handle)    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    
self._target(*self._args, **self._

Load weight from pretrained




  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
        self._close()self._close()

  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)    _close(self._handle)
OSError
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
:     [Errno 9] Bad file descriptor

During handling of t