## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [1]:
EXP_NAME = "nbme-exp002"
ENV = "colab"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [2]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-base"
    tokenizer=None
    max_len=None
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=8
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=5
    train_fold=[0, 1, 2, 3, 4]
    seed=71
    gradient_accumulation_steps=1
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [3]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [4]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [6]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [7]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5):
    labels = create_labels_for_scoring(df)

    token_probs = df[[str(i) for i in range(CFG.max_len)]].values
    char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [8]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [9]:
seed_everything()

## Data Loading

In [10]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [11]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [12]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [13]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [14]:
def fix_anno(df, target_id, annotation, location):
    idx = df["id"] == target_id
    df.loc[idx, "annotation"] = annotation
    df.loc[idx, "location"] = location

In [15]:
fix_anno(train, "00669_000", "['father heart attack']", "['764 783']")
fix_anno(train, "01110_010", "['for the last 2-3 months', 'over the last 2 months']", "['77 100', '398 420']")
fix_anno(train, "01146_005", "['no heat intolerance', 'no cold intolerance']", "['285 292;301 312', '285 287;296 312']")
fix_anno(train, "02428_001", "['mother thyroid problem']", "['551 557;565 580']")
fix_anno(train, "02428_004", '[\'felt like he was going to "pass out"\']', "['131 135;181 212']")
fix_anno(train, "10047_105", "['stool , with no blood']", "['259 280']")
fix_anno(train, "10196_105", "['diarrhoe non blooody']", "['176 184;201 212']")
fix_anno(train, "10206_103", "['diarrhea for last 2-3 days']", "['249 257;271 288']")
fix_anno(train, "10228_100", "['no vaginal discharge']", "['822 824;907 924']")
fix_anno(train, "10268_111", "['started about 8-10 hours ago']", "['101 129']")
fix_anno(train, "10459_105", "['no blood in the stool']", "['531 539;549 561']")
fix_anno(train, "10620_102", "['last sexually active 9 months ago']", "['540 560;581 593']")
fix_anno(train, "10646_107", "['right lower quadrant pain']", "['32 57']")
fix_anno(train, "10968_105", "['diarrhoea no blood']", "['308 317;376 384']")
fix_anno(train, "20747_214", "['sweating']", "['549 557']")
fix_anno(
    train,
    "21686_200",
    "['previously as regular', 'previously eveyr 28-29 days', 'previously lasting 5 days', 'previously regular flow']",
    "['102 123', '102 112;125 141', '102 112;143 157', '102 112;159 171']",
)
fix_anno(train, "30437_309", "['for 2 months']", "['33 45']")
fix_anno(train, "32657_315", "['35 year old']", "['5 16']")
fix_anno(train, "32996_302", "['darker brown stools']", "['175 194']")
fix_anno(train, "33531_300", "['uncle with peptic ulcer']", "['700 723']")
fix_anno(train, "40974_406", "['difficulty falling asleep']", "['225 250']")
fix_anno(train, "41825_402", "['helps to take care of aging mother and in-laws']", "['197 218;236 260']")
fix_anno(
    train,
    "42625_400",
    "['No hair changes', 'No skin changes', 'No GI changes', 'No palpitations', 'No excessive sweating']",
    "['480 482;507 519', '480 482;499 503;512 519', '480 482;521 531', '480 482;533 545', '480 482;564 582']",
)
fix_anno(
    train,
    "43451_402",
    "['stressed due to taking care of her mother', 'stressed due to taking care of husbands parents']",
    "['290 320;327 337', '290 320;342 358']",
)
fix_anno(train, "44958_402", "['stressor taking care of many sick family members']", "['288 296;324 363']")
fix_anno(train, "50574_514", "['heart started racing and felt numbness for the 1st time in her finger tips']", "['108 182']")
fix_anno(train, "52512_500", "['first started 5 yrs']", "['102 121']")
fix_anno(train, "60235_608", "['No shortness of breath']", "['481 483;533 552']")
fix_anno(train, "60469_603", "['recent URI', 'nasal stuffines, rhinorrhea, for 3-4 days']", "['92 102', '123 164']")
fix_anno(
    train,
    "70255_702",
    "['irregularity with her cycles', 'heavier bleeding', 'changes her pad every couple hours']",
    "['89 117', '122 138', '368 402']",
)
fix_anno(train, "70412_701", "['gaining 10-15 lbs']", "['344 361']")
fix_anno(train, "72660_701", "['weight gain', 'gain of 10-16lbs']", "['600 611', '607 623']")
fix_anno(train, "81856_813", "['seeing her son knows are not real']", "['386 400;443 461']")
fix_anno(train, "81985_813", "['saw him once in the kitchen after he died']", "['160 201']")
fix_anno(train, "83199_810", "['tried Ambien but it didnt work']", "['325 337;349 366']")
fix_anno(train, "83757_803", "['heard what she described as a party later than evening these things did not actually happen']", "['405 459;488 524']")
fix_anno(train, "83757_813", "['experienced seeing her son at the kitchen table these things did not actually happen']", "['353 400;488 524']")
fix_anno(train, "92224_909", "['SCRACHY THROAT', 'RUNNY NOSE']", "['293 307', '321 331']")
fix_anno(train, "92385_900", "['without improvement when taking tylenol', 'without improvement when taking ibuprofen']", "['182 221', '182 213;225 234']")
fix_anno(train, "92385_902", "['yesterday', 'yesterday']", "['79 88', '409 418']")
fix_anno(train, "93988_904", "['headache global', 'headache throughout her head']", "['86 94;230 236', '86 94;237 256']")
fix_anno(train, "94656_904", "['headache generalized in her head']", "['56 64;156 179']")

In [16]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [17]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8184
2    1293
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [18]:
def get_groupkfold(df, group_name):
    groups = df[group_name].unique()

    kf = KFold(
        n_splits=CFG.n_fold,
        shuffle=True,
        random_state=CFG.seed,
    )
    folds_ids = []
    for i_fold, (_, val_group_idx) in enumerate(kf.split(groups)):
        val_group = groups[val_group_idx]
        is_val = df[group_name].isin(val_group)
        val_idx = df[is_val].index
        df.loc[val_idx, "fold"] = int(i_fold)

    df["fold"] = df["fold"].astype(int)
    return df

In [19]:
train = get_groupkfold(train, "pn_num")
display(train.groupby("fold").size())

fold
0    2902
1    2894
2    2813
3    2791
4    2900
dtype: int64

## Setup tokenizer

In [20]:
if CFG.submission:
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

## Create dataset

In [21]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 433


In [22]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 30


In [23]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 466


In [24]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_label(self, pn_history, annotation_length, location_list):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        offset_mapping = encoded["offset_mapping"]
        ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
        label = np.zeros(len(offset_mapping))
        label[ignore_idxes] = -1

        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    start_idx = -1
                    end_idx = -1
                    for idx in range(len(offset_mapping)):
                        if (start_idx == -1) & (start < offset_mapping[idx][0]):
                            start_idx = idx - 1
                        if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                            end_idx = idx + 1
                    if start_idx == -1:
                        start_idx = end_idx
                    if (start_idx != -1) & (end_idx != -1):
                        label[start_idx:end_idx] = 1

        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        return input_, label

In [25]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        return input_

## Model

In [26]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
        else:
            self.backbone = AutoModel.from_config(self.model_config)

        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs):
        h = self.backbone(**inputs)["last_hidden_state"]
        output = self.fc(h)
        return output

## Training

In [27]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    return losses.avg

In [28]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        with torch.no_grad():
            output = model(inputs)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze().detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [29]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            output = model(inputs)
        preds.append(output.sigmoid().squeeze().detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [30]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    train_dataset = TrainingDataset(CFG, train_folds)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    model = CustomModel(CFG, model_config_path=None, pretrained=True)
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_len)]] = val_preds
        score = scoring(val_folds, th=0.5)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [31]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        #oof_df.to_csv(CFG.output_dir / "oof_df.csv", index=False)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    score = scoring(oof_df, th=0.5)
    print(f"Best thres: 0.5, Score: {score:.4f}")
    best_thres = get_best_thres(oof_df)
    score = scoring(oof_df, th=best_thres)
    print(f"Best thres: {best_thres}, Score: {score:.4f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            test_token_probs = inference_fn(test_dataloader, model, device)
            test[[f"fold{i_fold}_{i}" for i in range(CFG.max_len)]] = test_token_probs
            test_char_probs = get_char_probs(test["pn_history"].values, test_token_probs, CFG.tokenizer)
            predictions.append(test_char_probs)

            del state, test_token_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [32]:
if __name__ == "__main__":
    main()



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1424] Elapsed 0m 1s (remain 24m 15s) Loss: 0.7841(0.7841) Grad: inf  LR: 0.000000  
Epoch: [1][100/1424] Elapsed 0m 22s (remain 5m 0s) Loss: 0.1131(0.3776) Grad: 1992.6510  LR: 0.000003  
Epoch: [1][200/1424] Elapsed 0m 44s (remain 4m 33s) Loss: 0.0269(0.2211) Grad: 3103.3484  LR: 0.000006  
Epoch: [1][300/1424] Elapsed 1m 6s (remain 4m 9s) Loss: 0.0132(0.1612) Grad: 2843.2793  LR: 0.000008  
Epoch: [1][400/1424] Elapsed 1m 29s (remain 3m 48s) Loss: 0.0341(0.1293) Grad: 3110.3171  LR: 0.000011  
Epoch: [1][500/1424] Elapsed 1m 51s (remain 3m 25s) Loss: 0.0445(0.1092) Grad: 5013.7622  LR: 0.000014  
Epoch: [1][600/1424] Elapsed 2m 13s (remain 3m 2s) Loss: 0.0402(0.0950) Grad: 4871.8579  LR: 0.000017  
Epoch: [1][700/1424] Elapsed 2m 35s (remain 2m 40s) Loss: 0.0260(0.0849) Grad: 3865.8916  LR: 0.000020  
Epoch: [1][800/1424] Elapsed 2m 57s (remain 2m 18s) Loss: 0.0061(0.0772) Grad: 1086.5222  LR: 0.000020  
Epoch: [1][900/1424] Elapsed 3m 19s (remain 1m 55s) Loss: 0.0226(0.

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1425] Elapsed 0m 0s (remain 16m 2s) Loss: 0.3971(0.3971) Grad: inf  LR: 0.000000  
Epoch: [1][100/1425] Elapsed 0m 23s (remain 5m 3s) Loss: 0.1334(0.2113) Grad: 4675.3442  LR: 0.000003  
Epoch: [1][200/1425] Elapsed 0m 45s (remain 4m 35s) Loss: 0.0897(0.1375) Grad: 10432.7158  LR: 0.000006  
Epoch: [1][300/1425] Elapsed 1m 7s (remain 4m 10s) Loss: 0.0215(0.1049) Grad: 5375.0044  LR: 0.000008  
Epoch: [1][400/1425] Elapsed 1m 29s (remain 3m 47s) Loss: 0.0320(0.0881) Grad: 2461.2358  LR: 0.000011  
Epoch: [1][500/1425] Elapsed 1m 51s (remain 3m 25s) Loss: 0.0117(0.0764) Grad: 3291.5432  LR: 0.000014  
Epoch: [1][600/1425] Elapsed 2m 13s (remain 3m 2s) Loss: 0.0228(0.0680) Grad: 5703.9058  LR: 0.000017  
Epoch: [1][700/1425] Elapsed 2m 35s (remain 2m 40s) Loss: 0.0076(0.0620) Grad: 2155.9717  LR: 0.000020  
Epoch: [1][800/1425] Elapsed 2m 57s (remain 2m 18s) Loss: 0.0224(0.0572) Grad: 4415.9048  LR: 0.000020  
Epoch: [1][900/1425] Elapsed 3m 19s (remain 1m 55s) Loss: 0.0400(0

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1435] Elapsed 0m 0s (remain 15m 21s) Loss: 1.2420(1.2420) Grad: inf  LR: 0.000000  
Epoch: [1][100/1435] Elapsed 0m 23s (remain 5m 7s) Loss: 0.0329(0.6274) Grad: 572.4616  LR: 0.000003  
Epoch: [1][200/1435] Elapsed 0m 45s (remain 4m 38s) Loss: 0.0178(0.3453) Grad: 1442.6128  LR: 0.000006  
Epoch: [1][300/1435] Elapsed 1m 7s (remain 4m 13s) Loss: 0.0331(0.2436) Grad: 4928.6362  LR: 0.000008  
Epoch: [1][400/1435] Elapsed 1m 29s (remain 3m 50s) Loss: 0.0084(0.1908) Grad: 1225.8955  LR: 0.000011  
Epoch: [1][500/1435] Elapsed 1m 51s (remain 3m 27s) Loss: 0.0159(0.1586) Grad: 2368.3337  LR: 0.000014  
Epoch: [1][600/1435] Elapsed 2m 13s (remain 3m 5s) Loss: 0.0235(0.1367) Grad: 2449.5957  LR: 0.000017  
Epoch: [1][700/1435] Elapsed 2m 35s (remain 2m 42s) Loss: 0.0386(0.1209) Grad: 3049.7554  LR: 0.000020  
Epoch: [1][800/1435] Elapsed 2m 57s (remain 2m 20s) Loss: 0.0158(0.1085) Grad: 1226.0144  LR: 0.000020  
Epoch: [1][900/1435] Elapsed 3m 19s (remain 1m 58s) Loss: 0.0020(0.

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1438] Elapsed 0m 0s (remain 14m 41s) Loss: 1.0292(1.0292) Grad: inf  LR: 0.000000  
Epoch: [1][100/1438] Elapsed 0m 23s (remain 5m 10s) Loss: 0.0620(0.5202) Grad: 774.8645  LR: 0.000003  
Epoch: [1][200/1438] Elapsed 0m 45s (remain 4m 39s) Loss: 0.0600(0.2938) Grad: 184538.2188  LR: 0.000006  
Epoch: [1][300/1438] Elapsed 1m 7s (remain 4m 15s) Loss: 0.0509(0.2104) Grad: 4400.4370  LR: 0.000008  
Epoch: [1][400/1438] Elapsed 1m 29s (remain 3m 51s) Loss: 0.0126(0.1660) Grad: 1064.3547  LR: 0.000011  
Epoch: [1][500/1438] Elapsed 1m 51s (remain 3m 28s) Loss: 0.0080(0.1387) Grad: 873.7530  LR: 0.000014  
Epoch: [1][600/1438] Elapsed 2m 13s (remain 3m 5s) Loss: 0.0131(0.1199) Grad: 1913.4095  LR: 0.000017  
Epoch: [1][700/1438] Elapsed 2m 35s (remain 2m 43s) Loss: 0.0103(0.1061) Grad: 1435.3951  LR: 0.000019  
Epoch: [1][800/1438] Elapsed 2m 57s (remain 2m 21s) Loss: 0.0115(0.0957) Grad: 1772.2068  LR: 0.000020  
Epoch: [1][900/1438] Elapsed 3m 19s (remain 1m 58s) Loss: 0.0248(

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/1425] Elapsed 0m 0s (remain 14m 24s) Loss: 1.0857(1.0857) Grad: inf  LR: 0.000000  
Epoch: [1][100/1425] Elapsed 0m 23s (remain 5m 4s) Loss: 0.0819(0.5536) Grad: 2765.7539  LR: 0.000003  
Epoch: [1][200/1425] Elapsed 0m 45s (remain 4m 35s) Loss: 0.0587(0.3063) Grad: 5891.1353  LR: 0.000006  
Epoch: [1][300/1425] Elapsed 1m 7s (remain 4m 11s) Loss: 0.0881(0.2188) Grad: 9443.4424  LR: 0.000008  
Epoch: [1][400/1425] Elapsed 1m 29s (remain 3m 48s) Loss: 0.0208(0.1731) Grad: 2249.7583  LR: 0.000011  
Epoch: [1][500/1425] Elapsed 1m 51s (remain 3m 25s) Loss: 0.0101(0.1441) Grad: 2044.3568  LR: 0.000014  
Epoch: [1][600/1425] Elapsed 2m 13s (remain 3m 3s) Loss: 0.0196(0.1244) Grad: 1927.8291  LR: 0.000017  
Epoch: [1][700/1425] Elapsed 2m 35s (remain 2m 40s) Loss: 0.0206(0.1099) Grad: 2645.6074  LR: 0.000020  
Epoch: [1][800/1425] Elapsed 2m 57s (remain 2m 18s) Loss: 0.0062(0.0989) Grad: 1454.6267  LR: 0.000020  
Epoch: [1][900/1425] Elapsed 3m 19s (remain 1m 56s) Loss: 0.0053(0

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
   

  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7fb978010b00>
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.7/threading.py", line 870, in run

  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7fb978010b00>
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exac

  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7fb978010b00>
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exac

  0%|          | 0/1 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7fb978010b00>
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
