## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [1]:
!nvidia-smi

Sun Apr 24 11:00:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
EXP_NAME = "nbme-exp094"
ENV = "colab"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [3]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [4]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-v3-large"
    tokenizer=None
    max_len=None
    #pseudo_plain_path='../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl'
    pseudo_plain_path="./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl"
    n_pseudo_labels=100000
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=3
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    alpha=1
    gamma=2
    smoothing=0.0001
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=1
    n_fold=4
    train_fold=[0, 1, 2, 3]
    mask_aug_p=0.5
    mask_ratio=0.15
    seed=71
    gradient_accumulation_steps=2
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [5]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [6]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers==4.16.2
    !pip install -q sentencepiece==0.1.96

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

if CFG.env == "colab":
    input_dir = Path("./drive/MyDrive/00.kaggle/input/deberta-v2-3-fast-tokenizer")
    transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
else:
    input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)
    
    
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

In [8]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForMaskedLM
from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [9]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [10]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        # result = np.where(char_prob >= th)[0] + 1
        result = np.where(char_prob >= th)[0]
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        # result = [f"{min(r)} {max(r)}" for r in result]
        result = [f"{min(r)} {max(r) + 1}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5, use_token_prob=True):
    labels = create_labels_for_scoring(df)

    if use_token_prob:
        token_probs = df[[str(i) for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    else:
        char_probs = df[[str(i) for i in range(CFG.max_char_len)]].values
        char_probs = [char_probs[i] for i in range(len(char_probs))]

    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [11]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [12]:
seed_everything()

In [13]:
def postprocess(texts, preds):
    fix_tokenize_dict = {
        'heart': ['h', 'eart'],
        'hair': ['h', 'air'],
        'adderal': ['a', 'dderal'],
        'mother': ['m', 'other'],
        'intermittent': ['i', 'ntermittent'],
        'temperature': ['t', 'emperature'],
        'episodes': ['e', 'pisodes'],
        'no': ['n', 'o'],
        'has': ['h', 'as'],
        'LMP': ['L', 'MP'],
        '10': ['1', '0'],
        'blood': ['b', 'lood'],
        'recurrent': ['r', 'ecurrent'],
        'denies': ['d', 'enies'],
        'sudden': ['s', 'udden'],
        'Sexually': ['S', 'exually'],
        'up': ['u', 'p'],
        'wakes': ['w', 'akes'],
        'sweats': ['s', 'weats'],
        'hot': ['h', 'ot'],
        'drenched': ['d', 'renched'],
        'gnawing': ['g', 'nawing'],
        'Uses': ['U', 'ses'],
        'Begin': ['B', 'egin'],
        'Nausea': ['N', 'ausea'],
        'Burning': ['B', 'urning'],
        'Started': ['S', 'tarted'],
        'neurvousness': ['n', 'eurvousness'],
        'constipation': ['c', 'onstipation'],
        'nervousness': ['n', 'ervousness'],
        'cold': ['c', 'old'],
        'loss': ['l', 'oss'],
        'CBC': ['C', 'BC'],
        'Hx': ['H', 'x'],
        'tingling': ['t', 'ingling'],
        'feels': ['f', 'eels'],
        'Lost': ['L', 'ost'],
        'she': ['s', 'he'],
        'racing': ['r', 'acing'],
        'throat': ['t', 'hroat'],
        'PATIENT': ['P', 'ATIENT'],
        'recreational': ['r', 'ecreational'],
        'clammy': ['c', 'lammy'],
        'numbness': ['n', 'umbness'],
        'like': ['l', 'ike'],
        'reports': ['r', 'eports'],
        'exercise': ['e', 'xercise'],
        'started': ['s', 'tarted'],
        'brough': ['b', 'rough'],
        'Associated': ['A', 'ssociated'],
        'exacerbated': ['e', 'xacerbated'],
        'sharp': ['s', 'harp'],
        'cannot': ['c', 'annot'],
        'heavy': ['h', 'eavy'],
        'fatigue': ['f', 'atigue'],
        'trouble': ['t', 'rouble'],
        'hearing': ['h', 'earing'],
        'reduced': ['r', 'educed'],
        'lack': ['l', 'ack'],
        'vomiting': ['v', 'omiting'],
        'generalized': ['g', 'eneralized'],
        'body': ['b', 'ody'],
        'all': ['a', 'll'],
        'scratchy': ['s', 'cratchy'],
        'mom': ['m', 'om'],
        'discomfort': ['d', 'iscomfort'],
        'CAD': ['C', 'AD'],
        'Thyroid': ['T', 'hyroid'],
        'BLADDER': ['B', 'LADDER'],
        'diarrhea': ['d', 'iarrhea'],
        'Started': ['S', 'tarted'],
        'Vaginal': ['V', 'aginal'],
        'sleeping': ['s', 'leeping'],
        'UNCLE': ['U', 'NCLE'],
        'USING': ['U', 'SING'],
        'BURNING': ['B', 'URNING'],
        'GETTING': ['G', 'ETTING'],
        'ETOH': ['E', 'TOH'],
        'ON': ['O', 'N'],
        'INITIALLY': ['I', 'NITIALLY'],
        'epigastric': ['e', 'pigastric'],
        'occurs': ['o', 'ccurs'],
        'began': ['b', 'egan'],
        'alleviated': ['a', 'lleviated'],
        'overwhelmed': ['o', 'verwhelmed'],
        'clamminess': ['c', 'lamminess'],
        'strongly': ['s', 'trongly'],
        'lump': ['l', 'ump'],
        'drugs': ['d', 'rugs'],
        'chest': ['c', 'hest'],
        'stuffy': ['s', 'tuffy'],
        'changes': ['c', 'hanges'],
        'trouble': ['t', 'rouble'],
        'takes': ['t', 'akes'],
        'tossing': ['t', 'ossing'],
        'Fam': ['F', 'am'],
        'sweating': ['s', 'weating'],
        'dyspareunia': ['d', 'yspareunia'],
        'irregular': ['i', 'rregular'],
        'time': ['t', 'ime'],
        'unpredictable': ['u', 'npredictable'],
        'darkened': ['d', 'arkened'],
        'anxiety': ['a', 'nxiety'],
        'nervous': ['n', 'ervous'],
        'TAKING': ['T', 'AKING'],
        'losing': ['l', 'osing'],
        'Difficulyt': ['D', 'ifficulyt'],
        'Appetite': ['A', 'ppetite'],
        'increased': ['i', 'ncreased'],
        'fingers': ['f', 'ingers'],
        'illicit': ['i', 'llicit'],
        'claminess': ['c', 'laminess'],
        'clamy': ['c', 'lamy'],
        'Recently': ['R', 'ecently'],
        'feeling': ['f', 'eeling'],
        'aggrav': ['a', 'ggrav'],
        'changing': ['c', 'hanging'],
        'unable': ['u', 'nable'],
        'SEEING': ['S', 'EEING'],
        'staying': ['s', 'taying'],
        'lightheadedness': ['l', 'ightheadedness'],
        'lighheadeness': ['l', 'ighheadeness'],
        'nail': ['n', 'ail'],
        'pounding': ['p', 'ounding'],
        'My': ['M', 'y'],
        'Father': ['F', 'ather'],
        'urinary': ['u', 'rinary'],
        'pain': ['p', 'ain'],
        'not': ['n', 'ot'],
        'lower': ['l', 'ower'],
        'menses': ['m', 'enses'],
        'at': ['a', 't'],
        'takes': ['t', 'akes'],
        'initally': ['i', 'nitally'],
        'melena': ['m', 'elena'],
        'BOWEL': ['B', 'OWEL'],
        'WEIGHT': ['W', 'EIGHT'],
        'difficulty': ['d', 'ifficulty'],
        'condo': ['c', 'ondo'],
        'experiences': ['e', 'xperiences'],
        'stuffy': ['s', 'tuffy'],
        'rhinorrhea': ['r', 'hinorrhea'],
        'felt': ['f', 'elt'],
        'feverish': ['f', 'everish'],
        'CYCLE': ['C', 'YCLE'],
        'tampon': ['t', 'ampon'],
        'Last': ['L', 'ast'],
        'Son': ['S', 'on'],
        'saw': ['s', 'aw'],
        'tightness': ['t', 'ightness'],
        'rash': ['r', 'ash'],
        'ibuprofen': ['i', 'buprofen'],
        'SCRATHY': ['S', 'CRATHY'],
        'PHOTOPHOBIA': ['P', 'HOTOPHOBIA'],
    }
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        pred = preds[raw_idx]
        text = texts[raw_idx]
        if len(pred) != 0:
            # pp1: indexが1から始まる予測値は0から始まるように修正 ## 0.88579 -> 0.88702
            if pred[0][0] == 1:
                preds_pp[raw_idx][0][0] = 0
            for p_index, pp in enumerate(pred):
                start, end = pred[p_index]
                # pp2: startとendが同じ予測値はstartを前に１ずらす ## 0.88702 -> 0.88714
                if start == end:
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp3: 始点が改行の場合始点を1つ後ろにずらす ## 0.88714 -> 0.88746
                if text[start] == '\n':
                    preds_pp[raw_idx][p_index][0] = start + 1
                    start = start + 1
                # pp4: 1-2などは-2で予測されることがあるので修正 ## 0.88746 -> 0.88747
                if text[start-1].isdigit() and text[start] == '-' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-1].isdigit() and text[start] == '/' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp5: 67などは7で予測されることがあるので修正 ## 0.88747 -> 0.88748
                if text[start-1].isdigit() and text[start].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp6: 文頭が大文字で始まるものは大文字部分が除かれて予測されることがあるので修正 ## 0.88748 -> 0.88761
                if text[start-2] == '.' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ',' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ':' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == '-' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp7: heart -> h + eart となっているようなものを修正する ## 0.88761 -> 0.88806
                for key, fix_tokenize in fix_tokenize_dict.items():
                    _s, s = fix_tokenize[0], fix_tokenize[1]
                    if text[start-1].lower() == _s.lower() and text[start:start+len(s)].lower() == s.lower():
                        preds_pp[raw_idx][p_index][0] = start - 1
                        start = start - 1
    return preds_pp

In [14]:
def get_results_from_preds_list(preds):
    results = []
    for pred in preds:
        s = []
        for p in pred:
            s.append(' '.join(list(map(str, p))))
        s = ';'.join(s)
        results.append(s)
    return results

In [15]:
def trunc_pred(texts, preds):
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        text = texts[raw_idx]
        num_text = len(text)
        preds_pp[raw_idx, num_text:] = 0
    return preds_pp

In [16]:
def create_label(pn_history, location_list, max_char_len):
    label = np.zeros(max_char_len)
    label[len(pn_history):] = -1
    if len(location_list) > 0:
        for location in location_list:
            start, end = int(location[0]), int(location[1])
            label[start:end] = 1
    return label

def get_preds_from_results(results, texts, max_char_len):
    labels = []
    for idx, result in enumerate(results):
        label = create_label(texts[idx], result, max_char_len)
        labels.append(label)
    labels = np.stack(labels)
    print(labels.shape)
    return labels

## Data Loading

In [17]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [18]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [19]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [20]:
features['feature_text'] = features['feature_text'].str.lower()
patient_notes['pn_history'] = patient_notes['pn_history'].str.lower()

In [21]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [22]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [23]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8181
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [24]:
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

## Setup tokenizer

In [25]:
if CFG.submission:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create dataset

In [26]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 284


In [27]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 28


In [28]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 315


In [29]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(text)
    pn_history_lengths.append(length)

CFG.max_char_len = max(pn_history_lengths)

print("max length:", CFG.max_char_len)

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 950


In [30]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df, pseudo_label=None):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.mask_aug_p = self.cfg.mask_aug_p
        self.mask_ratio = self.cfg.mask_ratio
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values
        if "pseudo_idx" in df.columns:
            self.pseudo_idx = self.df["pseudo_idx"].values
            self.pseudo_label = pseudo_label

    def __len__(self):
        return len(self.df)

    def mask_augment(self, inputs):
        all_inds = np.arange(1, len(inputs["input_ids"]) - 1)
        n_mask = max(int(len(all_inds) * self.mask_ratio), 1)
        np.random.shuffle(all_inds)
        mask_inds = all_inds[:n_mask]
        sep_ind = np.where(np.array(inputs["input_ids"]) == 2)[0]
        mask_inds = np.array([i for i in mask_inds if i < sep_ind[0]])
        inputs_ids = np.array(inputs["input_ids"])
        try:
            inputs_ids[mask_inds] = tokenizer.mask_token_id
            inputs["input_ids"] = list(inputs_ids)
            return inputs
        except:
            return inputs

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        if float(torch.rand(1)) < self.mask_aug_p:
            encoded = self.mask_augment(encoded)

        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def _create_label(self, pn_history, annotation_length, location_list):
        label = np.zeros(self.max_char_len)
        label[len(pn_history):] = -1
        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    label[start:end] = 1
        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        if not np.isnan(self.annotation_lengths[idx]):
            label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        else:
            p_idx = int(self.pseudo_idx[idx])
            label = torch.tensor(self.pseudo_label[p_idx], dtype=torch.float)
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, label, mapping_from_token_to_char

In [31]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, mapping_from_token_to_char

## Model

In [32]:
from transformers.modeling_outputs import MaskedLMOutput

class MaskedModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.pretrained_model_name,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.pretrained_model_name, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_model_name, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

In [33]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)
            # itpt = AutoModelForMaskedLM.from_config(self.model_config)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp010/checkpoint-130170/pytorch_model.bin")
            # path = "../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin"
            # state_dict = torch.load(path)
            # itpt.load_state_dict(state_dict)
            path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin")
            #path = str(Path("../output") / CFG.competition_name /  "nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin")
            masked_model = MaskedModel(CFG, config_path=None, pretrained=True)
            state = torch.load(path, map_location=torch.device("cpu"))
            masked_model.load_state_dict(state)
            self.backbone = masked_model.model
            print(f"Load weight from {path}")
            del state, masked_model; gc.collect()

        self.lstm = nn.GRU(
            input_size=self.model_config.hidden_size,
            bidirectional=True,
            hidden_size=self.model_config.hidden_size // 2,
            num_layers=4,
            dropout=self.cfg.dropout,
            batch_first=True,
        )
        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs, mappings_from_token_to_char):
        h = self.backbone(**inputs)["last_hidden_state"]  # [batch, seq_len, d_model]
        mappings_from_token_to_char = mappings_from_token_to_char.unsqueeze(2).expand(-1, -1, self.model_config.hidden_size)
        h = torch.gather(h, 1, mappings_from_token_to_char)    # [batch, seq_len, d_model]
        h, _ = self.lstm(h)
        output = self.fc(h)

        return output

## Training

In [34]:
class FocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super().__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * bce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss


class SmoothFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.reduction = reduction
        self.focal_loss = FocalLoss(reduction='none', alpha=alpha, gamma=gamma)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        loss = self.focal_loss(inputs, targets)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class CEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super(CEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class SmoothCEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super(SmoothCEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.smoothing) # torch >= 1.10.0
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [35]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    del output, loss, inputs, labels, mappings_from_token_to_char, scaler, grad_norm; gc.collect()
    torch.cuda.empty_cache()
    return losses.avg

In [36]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()
    
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [37]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for (inputs, mappings_from_token_to_char) in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.no_grad():
            output = model(inputs, mappings_from_token_to_char)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [38]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    if CFG.pseudo_plain_path is not None:
        pseudo_plain = pd.read_pickle(CFG.pseudo_plain_path)
        print(f"get pseudo plain from {CFG.pseudo_plain_path}")
        pseudo_label_list = []
        weights = [0.4433659049657008, 0.20859987143371844, 0.3480342236005807]
        for exp_name in ["nbme-exp060", "nbme-exp067", "nbme-exp083"]:
            pseudo_label_path = f'./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
            #pseudo_label_path = f'../output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
            pseudo_label = np.load(pseudo_label_path)
            print(f"get pseudo labels from {pseudo_label_path}")
            pseudo_label_list.append(pseudo_label)

        pseudo_label = weights[0] * pseudo_label_list[0] + weights[1] * pseudo_label_list[1] + weights[2] * pseudo_label_list[2]
        pseudo_label = trunc_pred(pseudo_plain["pn_history"].values, pseudo_label)
        predicted_location_str = get_predicted_location_str(pseudo_label, th=0.5)
        preds = get_predictions(predicted_location_str)
        results_postprocess = postprocess(pseudo_plain["pn_history"].values, preds)
        #results_postprocess = get_results_from_preds_list(results_postprocess)
        pseudo_label = get_preds_from_results(results_postprocess, pseudo_plain["pn_history"].values, pseudo_label.shape[1])
        print(pseudo_plain.shape, pseudo_label.shape)

        pseudo_plain['feature_text'] = pseudo_plain['feature_text'].str.lower()
        pseudo_plain['pn_history'] = pseudo_plain['pn_history'].str.lower()

        pseudo_plain["pseudo_idx"] = np.arange(len(pseudo_plain))
        pseudo_plain = pseudo_plain.sample(n=CFG.n_pseudo_labels)
        print(pseudo_plain.shape)
        train_folds = pd.concat([train_folds, pseudo_plain], axis=0, ignore_index=True)
        print(train_folds.shape)

    train_dataset = TrainingDataset(CFG, train_folds, pseudo_label)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # model = CustomModel(CFG, model_config_path=None, pretrained=True)
    model = CustomModel(CFG, model_config_path=None, pretrained=False)   # itptを使うため
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = SmoothFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
    #criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf

    for epoch in range(CFG.epochs):
        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_char_len)]] = val_preds
        score = scoring(val_folds, th=0.5, use_token_prob=False)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_char_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [39]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    best_thres = 0.5
    best_score = 0.
    for th in np.arange(0.45, 0.55, 0.01):
        th = np.round(th, 2)
        score = scoring(oof_df, th=th, use_token_prob=False)
        if best_score < score:
            best_thres = th
            best_score = score
    print(f"best_thres: {best_thres}  score: {best_score:.5f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            print(f"load weights from {path}")
            test_char_probs = inference_fn(test_dataloader, model, device)
            predictions.append(test_char_probs)

            del state, test_char_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [None]:
if __name__ == "__main__":
    main()

get pseudo plain from ./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(100000, 7)
(110725, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
Epoch: [1][0/36908] Elapsed 0m 1s (remain 740m 15s) Loss: 0.0860(0.0860) Grad: 81389.8906  LR: 0.000000  
Epoch: [1][100/36908] Elapsed 1m 4s (remain 392m 50s) Loss: 0.0684(0.0810) Grad: 65753.2969  LR: 0.000001  
Epoch: [1][200/36908] Elapsed 2m 8s (remain 391m 38s) Loss: 0.0338(0.0662) Grad: 32102.8203  LR: 0.000001  
Epoch: [1][300/36908] Elapsed 3m 12s (remain 391m 3s) Loss: 0.0164(0.0513) Grad: 4301.4058  LR: 0.000002  
Epoch: [1][400/36908] Elapsed 4m 17s (remain 389m 58s) Loss: 0.0130(0.0419) Grad: 3926.3936  LR: 0.000002  
Epoch: [1][500/36908] Elapsed 5m 21s (remain 388m 53s) Loss: 0.0299(0.0361) Grad: 14480.1758  LR: 0.000003  
Epoch: [1][600/36908] Elapsed 6m 25s (remain 388m 19s) Loss: 0.0100(0.0322) Grad: 3048.4497  LR: 0.000003  
Epoch: [1][700/36908] Elapsed 7m 30s (remain 387m 47s) Loss: 0.0079(0.0294) Grad: 3465.6860  LR: 0.000004  


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(100000, 7)
(110725, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
Epoch: [1][0/36908] Elapsed 0m 1s (remain 856m 12s) Loss: 0.0702(0.0702) Grad: 68483.4453  LR: 0.000000  
Epoch: [1][100/36908] Elapsed 1m 6s (remain 402m 40s) Loss: 0.0520(0.0634) Grad: 53005.6445  LR: 0.000001  
Epoch: [1][200/36908] Elapsed 2m 11s (remain 398m 44s) Loss: 0.0279(0.0518) Grad: 24213.1719  LR: 0.000001  
Epoch: [1][300/36908] Elapsed 3m 15s (remain 395m 53s) Loss: 0.0249(0.0407) Grad: 8569.5146  LR: 0.000002  
Epoch: [1][400/36908] Elapsed 4m 19s (remain 394m 13s) Loss: 0.0114(0.0339) Grad: 3000.4729  LR: 0.000002  
Epoch: [1][500/36908] Elapsed 5m 24s (remain 393m 6s) Loss: 0.0199(0.0298) Grad: 7129.7993  LR: 0.000003  
Epoch: [1][600/36908] Elapsed 6m 29s (remain 391m 46s) Loss: 0.0123(0.0267) Grad: 3749.8811  LR: 0.000003  
Epoch: [1][700/36908] Elapsed 7m 33s (remain 390m 10s) Loss: 0.0222(0.0247) Grad: 10543.9502  LR: 0.000004  

  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(100000, 7)
(110725, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
Epoch: [1][0/36908] Elapsed 0m 1s (remain 992m 1s) Loss: 0.0851(0.0851) Grad: 74909.3984  LR: 0.000000  
Epoch: [1][100/36908] Elapsed 1m 5s (remain 395m 34s) Loss: 0.0663(0.0788) Grad: 64577.2852  LR: 0.000001  
Epoch: [1][200/36908] Elapsed 2m 8s (remain 391m 12s) Loss: 0.0301(0.0649) Grad: 34872.2578  LR: 0.000001  
Epoch: [1][300/36908] Elapsed 3m 11s (remain 388m 40s) Loss: 0.0158(0.0503) Grad: 4097.6338  LR: 0.000002  
Epoch: [1][400/36908] Elapsed 4m 14s (remain 386m 53s) Loss: 0.0202(0.0408) Grad: 8213.8027  LR: 0.000002  
Epoch: [1][500/36908] Elapsed 5m 17s (remain 384m 11s) Loss: 0.0099(0.0350) Grad: 2644.7234  LR: 0.000003  
Epoch: [1][600/36908] Elapsed 6m 19s (remain 382m 8s) Loss: 0.0123(0.0312) Grad: 3901.3474  LR: 0.000003  
Epoch: [1][700/36908] Elapsed 7m 21s (remain 380m 11s) Loss: 0.0114(0.0284) Grad: 3704.8105  LR: 0.000004  
Ep

  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(100000, 7)
(110725, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
Epoch: [1][0/36908] Elapsed 0m 1s (remain 1017m 4s) Loss: 0.0751(0.0751) Grad: 68713.4688  LR: 0.000000  
Epoch: [1][100/36908] Elapsed 1m 5s (remain 398m 47s) Loss: 0.0608(0.0720) Grad: 58263.2617  LR: 0.000001  
Epoch: [1][200/36908] Elapsed 2m 9s (remain 393m 49s) Loss: 0.0293(0.0593) Grad: 29868.2812  LR: 0.000001  
Epoch: [1][300/36908] Elapsed 3m 13s (remain 391m 28s) Loss: 0.0050(0.0460) Grad: 7708.4980  LR: 0.000002  
Epoch: [1][400/36908] Elapsed 4m 16s (remain 389m 23s) Loss: 0.0053(0.0381) Grad: 3401.1326  LR: 0.000002  
Epoch: [1][500/36908] Elapsed 5m 19s (remain 387m 21s) Loss: 0.0200(0.0331) Grad: 5394.5464  LR: 0.000003  
Epoch: [1][600/36908] Elapsed 6m 23s (remain 386m 19s) Loss: 0.0114(0.0297) Grad: 3771.9961  LR: 0.000003  
Epoch: [1][700/36908] Elapsed 7m 27s (remain 384m 54s) Loss: 0.0102(0.0273) Grad: 4215.2383  LR: 0.000004  
