## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
- https://www.kaggle.com/abebe9849/nbmeexp019?select=itpt.py
- https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-itpt
- https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

## Configurations

In [1]:
# Experiment identifier; used to build the colab/local output directory and the saved-tokenizer path.
EXP_NAME = "nbme-exp033"
# Runtime environment; the directory-settings cell has branches for "colab", "local" and "kaggle".
ENV = "local"
# When True, epochs/folds are reduced and the training table is subsampled to 1000 rows.
DEBUG_MODE = False
# When True, CFG.train is switched off and the tokenizer is loaded from a saved copy under ../input.
SUBMISSION_MODE = False

In [2]:
# Record the GPU status table (driver / CUDA version, memory in use) in the notebook output.
!nvidia-smi

Sat Mar 12 00:11:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.73.01    Driver Version: 460.73.01    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           Off  | 00000000:81:00.0 Off |                  N/A |
| 40%   29C    P8     8W / 280W |     15MiB / 24219MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
class CFG:
    """Experiment-wide settings, accessed through class attributes (e.g. ``CFG.env``).

    ``input_dir``, ``output_dir`` and ``tokenizer`` are placeholders here and
    are assigned by later cells of the notebook.
    """

    # --- run context, copied from the notebook-level switches ---
    env = ENV
    exp_name = EXP_NAME
    debug = DEBUG_MODE
    submission = SUBMISSION_MODE
    apex = True

    # --- paths (assigned by the directory-settings cell) ---
    input_dir = None
    output_dir = None

    # --- platform ---
    library = "pytorch"  # ["tf", "pytorch"]
    device = "GPU"  # ["GPU", "TPU"]

    # --- competition / data columns ---
    competition_name = "nbme-score-clinical-patient-notes"
    id_col = "id"
    target_col = "location"

    # --- model and tokenizer ---
    pretrained_model_name = "roberta-large"
    tokenizer = None  # assigned by the tokenizer-setup cell
    max_len = None
    output_dim = 1
    dropout = 0.2

    # --- data loading and optimisation ---
    num_workers = 4
    batch_size = 8
    lr = 2e-5
    betas = (0.9, 0.98)
    weight_decay = 0.1
    num_warmup_steps_rate = 0.1
    batch_scheduler = True
    epochs = 5

    # --- folds and reproducibility ---
    n_fold = 5
    train_fold = [0, 1, 2, 3, 4]
    seed = 71

    # --- training-loop details ---
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    print_freq = 100

    # --- stage switches (overridden when CFG.submission is set) ---
    train = True
    inference = True

In [4]:
class CFG_For_MLM:
    """Hyper-parameters for the masked-language-model pretraining run."""

    # --- schedule ---
    epochs = 15  # adjust
    learning_rate = 5e-5

    # --- batching (per-device train batch 8 x 4 accumulation steps) ---
    train_batch_size = 8
    gradient_accum_steps = 4
    eval_batch_size = 16

    # --- evaluation cadence; the same value is reused as the checkpoint interval ---
    eval_steps = 8678

    # --- data / masking ---
    block_size = 466  # max_length used by the tokenizer
    mlm_prob = 0.15

    # --- precision ---
    fp16 = True

In [5]:
# Debug runs: shorten the schedule to 2 epochs and restrict training to folds 0 and 1.
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

# Submission runs: switch training off and keep inference on.
if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [6]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install -q sentencepiece==0.1.96
    !pip install -q transformers==4.16.2

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

local


In [7]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

In [8]:
from transformers import (AutoModel, 
                          AutoModelForMaskedLM,
                          AutoTokenizer,
                          AutoConfig,
                          AdamW,
                          LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments)

## Utilities

In [9]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Also exports the value as the ``PYTHONHASHSEED`` environment variable and
    turns on cuDNN's deterministic mode.

    Args:
        seed: Seed applied to every generator (defaults to 42).
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The seeding calls are independent of each other, so apply them in one pass.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
# Apply the seed declared in CFG; the bare call silently fell back to the
# function's default of 42 and left CFG.seed unused.
seed_everything(CFG.seed)

## Data Loading

In [11]:
# Read the four competition CSV files from CFG.input_dir.
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

# Cell output: (rows, columns) of each table.
train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [12]:
# Debug runs only: keep a fixed (random_state=0) sample of 1000 training rows.
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [13]:
# Pretraining corpus: every distinct patient-note text.
pretrain_texts = patient_notes["pn_history"].unique()
print(len(pretrain_texts))


# Write the corpus as UTF-8 explicitly. LineByLineTextDataset opens the file
# with encoding="utf-8", so relying on the platform default encoding here could
# corrupt non-ASCII characters or fail on machines whose default is not UTF-8.
# NOTE(review): notes are joined with "\n", and the training log reports far
# more examples (277742) than notes (42146), so notes that contain their own
# line breaks appear to be split into several examples downstream — confirm
# this is intended.
with open(CFG.output_dir / "text.txt", "w", encoding="utf-8") as f:
    texts = "\n".join(pretrain_texts.tolist())
    f.write(texts)

42146


## Setup Tokenizer

In [14]:
# Both branches pass trim_offsets=False; the offset check in the following cell
# shows the resulting spans include the leading space (e.g. ' with' -> 3, 8).
if not CFG.submission:
    # Regular run: load by model name and keep a copy next to the outputs.
    tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name, trim_offsets=False)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")
else:
    # Submission run: reuse the copy stored under ../input/<exp_name>/tokenizer/.
    tokenizer = AutoTokenizer.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/", trim_offsets=False)

# Make the tokenizer reachable through the shared config object.
CFG.tokenizer = tokenizer

In [15]:
# Sanity check of the tokenizer's character offsets on a short sample phrase:
# print the substring and (start, end) span of every token.
tmp = 'dad with recent heart attack'
encode = tokenizer(tmp, return_offsets_mapping=True)
for (start,end) in encode['offset_mapping']:
    print(f"'{tmp[start:end]}', {start}, {end}")

# Print the expected spans after an "ans" label so the two listings can be
# compared by eye in the cell output.
print("ans")
print("""
'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0
""")

'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0
ans

'', 0, 0
'dad', 0, 3
' with', 3, 8
' recent', 8, 15
' heart', 15, 21
' attack', 21, 28
'', 0, 0



## Setup Dataset

In [16]:
# Tokenize text.txt once; block_size is the configured maximum length.
train_dataset = LineByLineTextDataset(
    tokenizer=CFG.tokenizer,
    file_path=CFG.output_dir / "text.txt",
    block_size=CFG_For_MLM.block_size)

# The validation set was built from the very same file with the same arguments,
# so reuse the object instead of tokenizing and holding the corpus in memory a
# second time.
# NOTE: validation loss is therefore measured on the training corpus, not on
# held-out text.
valid_dataset = train_dataset

In [17]:
# Collator for masked-language modelling (mlm=True); the masking probability
# comes from CFG_For_MLM.mlm_prob.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=CFG.tokenizer, 
    mlm=True, 
    mlm_probability=CFG_For_MLM.mlm_prob)

## Model

In [18]:
# Load the pretrained checkpoint named in CFG.pretrained_model_name as a masked-LM model.
model = AutoModelForMaskedLM.from_pretrained(CFG.pretrained_model_name)

## Training

In [19]:
# Trainer configuration for the MLM run; numeric settings come from CFG_For_MLM
# and checkpoints go to CFG.output_dir.
training_args = TrainingArguments(
    output_dir=CFG.output_dir,
    overwrite_output_dir=True,
    num_train_epochs=CFG_For_MLM.epochs,
    per_device_train_batch_size=CFG_For_MLM.train_batch_size,
    per_device_eval_batch_size=CFG_For_MLM.eval_batch_size,
    learning_rate=CFG_For_MLM.learning_rate,
    gradient_accumulation_steps=CFG_For_MLM.gradient_accum_steps,
    fp16=CFG_For_MLM.fp16,
    # Evaluate and save on the same step interval (both use eval_steps).
    eval_steps=CFG_For_MLM.eval_steps,
    save_steps=CFG_For_MLM.eval_steps,
    evaluation_strategy="steps",
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    # Model selection: lower eval_loss is better, and the best checkpoint is
    # loaded back at the end of training.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to="none",
)
# NOTE: eval_dataset is built from the same text.txt as train_dataset, so the
# reported validation loss is computed on the training corpus.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

Using amp half precision backend


In [20]:
# Run MLM pretraining, then write the resulting model to CFG.output_dir.
trainer.train()
trainer.save_model(CFG.output_dir)

***** Running training *****
  Num examples = 277742
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 130185


Step,Training Loss,Validation Loss
8678,1.075,0.991411
17356,0.9835,0.91657
26034,0.9395,0.872384
34712,0.8848,0.822928
43390,0.8523,0.790142
52068,0.8312,0.769994
60746,0.806,0.743874
69424,0.7737,0.725133
78102,0.7347,0.699365
86780,0.7255,0.687088


***** Running Evaluation *****
  Num examples = 277742
  Batch size = 16
Saving model checkpoint to ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-8678
Configuration saved in ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-8678/config.json
Model weights saved in ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-8678/pytorch_model.bin
Deleting older checkpoint [../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-121492] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 277742
  Batch size = 16
Saving model checkpoint to ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-17356
Configuration saved in ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-17356/config.json
Model weights saved in ../output/nbme-score-clinical-patient-notes/nbme-exp033/checkpoint-17356/pytorch_model.bin
Deleting older checkpoint [../output/nbme-score-clinical-patient-notes/nbme-exp033