ref: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes/discussion/323095

In [1]:
import os
import sys
import torch

class Config:
    """Static experiment configuration, used as a plain namespace of class attributes.

    ``setup`` receives this class and attaches further attributes to it
    (directories, device, ...). Nothing here is instantiated.
    """

    AUTHOR = "kuruton"

    # Experiment name; setup() uses it as the per-experiment output folder name.
    NAME = "USP-" + "MLM-deberta-v3-large-kf"

    MODEL_PATH = "microsoft/deberta-v3-large"
    # Kaggle dataset slugs ("owner/dataset") that setup() downloads on Colab.
    DATASET_PATH = [
        "yasufuminakama/cpc-data"
    ]

    COMPETITION = "us-patent-phrase-to-phrase-matching"
    COLAB_PATH = "/content/drive/Shareddrives/USPatent"
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    # Location of the Kaggle API token (kaggle.json) read by setup() on Colab.
    api_path = "/content/drive/MyDrive/kaggle.json"

    seed = 42
    num_fold = 4
    trn_fold = [0, 1, 2, 3]
    batch_size = 32
    n_epochs = 10
    max_len = 256

    weight_decay = 2e-5
    beta = (0.9, 0.98)
    lr = 2e-5
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    num_eval = 1

    # Slack incoming-webhook URL. A webhook URL is a credential (anyone holding
    # it can post to the channel), so it must not live in the notebook source.
    # It is read from the SLACK_WEBHOOK_URL environment variable and is None
    # when that variable is unset; callers should skip notification in that case.
    WEB_HOOK_URL = os.environ.get("SLACK_WEBHOOK_URL")

    debug = False

    upload_from_colab = False

In [2]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy 
import itertools
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)

from google.colab import drive
# Mount Google Drive only when the mount point is not already a directory.
if not Path('/content/drive').is_dir():
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# torch downgrade: pin torch to 1.10.0 for this session.
# NOTE(review): the pip output of this cell reports that the preinstalled
# torchvision / torchtext / torchaudio require torch==1.11.0, so those packages
# are left in a conflicting state after this install.
# NOTE(review): torch is already imported by an earlier cell; presumably the
# downgraded build is only picked up after a runtime restart -- confirm.
! pip install -q torch==1.10.0

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

[K     |██████████████████████████████▎ | 834.1 MB 1.2 MB/s eta 0:00:41tcmalloc: large alloc 1147494400 bytes == 0x649f8000 @  0x7f3e432c3615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 19 kB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.10.0 which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.10.0 which is incompatible.
torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.10.0 which is incompatible.[0m
[?25h

In [4]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install -q torch==1.10.0
        ! pip install -q transformers
        ! pip install -q sentencepiece

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg

In [6]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers

cfg = setup(Config)

import importlib.util
import shutil
from pathlib import Path

# Locate the installed transformers package WITHOUT importing it
# (find_spec on a top-level name does not execute the package). This works on
# both Colab and Kaggle; the hard-coded Kaggle path is kept only as a fallback.
importlib.invalidate_caches()  # setup() may have pip-installed transformers just now
_transformers_spec = importlib.util.find_spec("transformers")
if _transformers_spec is not None and _transformers_spec.submodule_search_locations:
    transformers_path = Path(list(_transformers_spec.submodule_search_locations)[0])
else:
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

# Directory holding the replacement tokenizer sources. Wrapped in Path because
# cfg.OUTPUT_EXP is a plain string and `.name` / `/` are needed below.
patch_source_dir = Path(cfg.OUTPUT_EXP)

convert_file = patch_source_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename

    if filepath.exists():
        filepath.unlink()

    shutil.copy(patch_source_dir / filename, filepath)

This environment is Google Colab


TypeError: ignored

In [None]:
%%writefile mlm.py

import argparse
import os
import json
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
import torch
from datasets import load_dataset
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM, Trainer
from transformers import TrainingArguments
from transformers.utils import logging
from IPython import embed  # noqa

logging.set_verbosity_info()
logger = logging.get_logger(__name__)
logger.info("INFO")
logger.warning("WARN")
KAGGLE_ENV = 'KAGGLE_URL_BASE' in os.environ


print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# This file is executed as its own process (`python mlm.py ...`), so variables
# of the notebook kernel are not visible here. The names the rest of the script
# reads -- INPUT_DIR and cfg.DATASET / cfg.OUTPUT_EXP -- are therefore defined
# in this module. Defaults follow the Kaggle layout (inputs under ../input,
# outputs in the working directory); override them with the MLM_INPUT_DIR /
# MLM_OUTPUT_DIR environment variables, e.g. on Colab.
from types import SimpleNamespace

INPUT_DIR = Path(os.environ.get("MLM_INPUT_DIR", "../input/"))
OUTPUT_DIR = Path(os.environ.get("MLM_OUTPUT_DIR", "."))

# Minimal stand-in for the notebook's config object; both values are strings.
cfg = SimpleNamespace(DATASET=str(INPUT_DIR), OUTPUT_EXP=str(OUTPUT_DIR))


def get_patient_notes_not_used_train(csv_path=None):
    """Load the abstract corpus and split it 70/30 into train and validation.

    Rows containing any missing value are dropped, the remaining rows are
    shuffled with a fixed seed (``random_state=0``) and then split by position.

    The split uses positional slicing (``iloc``). Label-based ``loc`` slicing
    is end-inclusive, which would put the boundary row into BOTH splits and
    leak one training sample into validation.

    Args:
        csv_path: optional path to the CSV file. When omitted, the file is
            ``<cfg.DATASET>/pppm_abstract/pppm_abstract.csv`` where ``cfg`` is
            read from this script's module scope.

    Returns:
        ``(train_df, valid_df)``, two disjoint DataFrames with fresh 0..n-1
        indexes.
    """
    if csv_path is None:
        csv_path = cfg.DATASET + "/pppm_abstract/pppm_abstract.csv"

    pppm_abstract = (
        pd.read_csv(csv_path)
        .dropna()
        .sample(frac=1, random_state=0)
        .reset_index(drop=True)
    )
    print(pppm_abstract.shape)

    # Number of rows that go to the training split.
    n_train = int(len(pppm_abstract) * 0.7)
    train_pppm_abstract = pppm_abstract.iloc[:n_train, :].reset_index(drop=True)
    valid_pppm_abstract = pppm_abstract.iloc[n_train:, :].reset_index(drop=True)

    print(train_pppm_abstract.shape)
    print(valid_pppm_abstract.shape)
    return train_pppm_abstract, valid_pppm_abstract


def tokenize_function(examples):
    """Run the module-level ``tokenizer`` on the ``"text"`` entry of ``examples``.

    Used as the mapping function for ``datasets.map``; ``tokenizer`` must be
    assigned at module scope before this is called.
    """
    encode = tokenizer
    return encode(examples["text"])


def get_tokenizer(args):
    """Create the tokenizer selected by the parsed command-line arguments.

    * If ``args.model_path`` contains ``'v3'``: the fast DeBERTa-v2/v3
      tokenizer loaded from ``INPUT_DIR / args.model_path``.
    * Otherwise an ``AutoTokenizer`` loaded from ``args.model_name`` when it is
      non-empty, else from ``INPUT_DIR / args.model_path``.

    All tokenizers are created with ``trim_offsets=False``.
    """
    local_path = args.model_path

    if 'v3' in str(local_path):
        # Imported lazily: only needed for the v3 local-path case.
        from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
        print('DebertaV2TokenizerFast')
        return DebertaV2TokenizerFast.from_pretrained(INPUT_DIR / local_path, trim_offsets=False)

    if args.model_name:
        print('model_name', args.model_name)
        source = args.model_name
    else:
        print('model_path', local_path)
        source = INPUT_DIR / local_path
    return AutoTokenizer.from_pretrained(source, trim_offsets=False)


def parse_args():
    """Parse the command-line options of the MLM pre-training script.

    ``--exp_num`` is the only mandatory option; every other option has a
    default. Returns the ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flag, add_argument keyword arguments) -- one entry per option.
    option_table = (
        ("--model_name", dict(type=str, default="microsoft/deberta-v3-large")),
        ("--model_path", dict(type=str, default="")),
        ("--seed", dict(type=int, default=0)),
        ("--debug", dict(action='store_true')),
        ("--exp_num", dict(type=str, required=True)),
        ("--param_freeze", dict(action='store_true')),
        ("--num_train_epochs", dict(type=int, default=5)),
        ("--batch_size", dict(type=int, default=8)),
        ("--lr", dict(type=float, default=2e-5)),
        ("--gradient_accumulation_steps", dict(type=int, default=1)),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


if __name__ == "__main__":
    # Masked-language-model pre-training driver:
    #   1. load and split the abstract corpus,
    #   2. dump it to temporary JSON-lines files and load them as a dataset,
    #   3. tokenize, build the MLM collator and the model,
    #   4. train with the Hugging Face Trainer and save the resulting weights.
    #
    # `cfg` is read from this script's module scope; variables of the notebook
    # kernel are not visible to the `python mlm.py` subprocess.

    args = parse_args()
    # Debug mode is controlled only by the --debug flag; it is not forced on here.
    train, valid = get_patient_notes_not_used_train()

    if args.debug:
        # Tiny smoke-test run: 10 rows per split, batch size 1.
        train = train.iloc[:10, :]
        valid = valid.iloc[:10, :]
        args.batch_size = 1

    def get_text(df):
        """Return the abstracts of ``df`` that are at least 30 characters long."""
        return [text for text in tqdm(df['abstract']) if len(text) >= 30]

    train_text_list = get_text(train)
    valid_text_list = get_text(valid)

    # cfg.OUTPUT_EXP may be a plain string, so wrap it before using `/`.
    output_dir = Path(cfg.OUTPUT_EXP)
    output_dir.mkdir(parents=True, exist_ok=True)
    mlm_train_json_path = output_dir / 'train_mlm.json'
    mlm_valid_json_path = output_dir / 'valid_mlm.json'

    # One JSON object per line: {"text": <abstract>}.
    for json_path, list_ in zip([mlm_train_json_path, mlm_valid_json_path],
                                [train_text_list, valid_text_list]):
        with open(str(json_path), 'w') as f:
            for sentence in list_:
                row_json = {'text': sentence}
                json.dump(row_json, f)
                f.write('\n')

    datasets = load_dataset(
        'json',
        data_files={'train': str(mlm_train_json_path),
                    'valid': str(mlm_valid_json_path)},
        )

    # The JSON files are only a hand-over format; remove them once loaded.
    if mlm_train_json_path.is_file():
        mlm_train_json_path.unlink()
    if mlm_valid_json_path.is_file():
        mlm_valid_json_path.unlink()
    print(datasets["train"][:2])

    # Must stay a module-level name: tokenize_function reads it as a global.
    tokenizer = get_tokenizer(args)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=1,
        remove_columns=["text"],
        batch_size=args.batch_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    if args.model_name:
        print('model_name:', args.model_name)
        model_name = args.model_name
    else:
        print('model_path:', args.model_path)
        model_name = INPUT_DIR / args.model_path
    config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)

    # `model_name` is already a hub id or a fully resolved local path, so it is
    # passed as-is (prefixing INPUT_DIR again would point at a wrong location).
    if 'v3' in str(model_name):
        model = transformers.DebertaV2ForMaskedLM.from_pretrained(model_name, config=config)
    else:
        model = AutoModelForMaskedLM.from_pretrained(model_name, config=config)

    if args.param_freeze:
        # if freeze, Write freeze settings here

        # deberta-v3-large
        # model.deberta.embeddings.requires_grad_(False)
        # model.deberta.encoder.layer[:12].requires_grad_(False)

        # deberta-large
        model.deberta.embeddings.requires_grad_(False)
        model.deberta.encoder.layer[:24].requires_grad_(False)

        for name, p in model.named_parameters():
            print(name, p.requires_grad)

    if args.debug:
        # Keep the smoke test short.
        args.num_train_epochs = 1

    training_args = TrainingArguments(
        output_dir="output-mlm",
        evaluation_strategy="epoch",
        learning_rate=args.lr,
        weight_decay=0.01,
        save_strategy='no',
        per_device_train_batch_size=args.batch_size,
        num_train_epochs=args.num_train_epochs,
        # report_to="wandb",
        run_name=f'output-mlm-{args.exp_num}',
        # logging_dir='./logs',
        lr_scheduler_type='cosine',
        warmup_ratio=0.2,
        fp16=True,
        logging_steps=500,
        gradient_accumulation_steps=args.gradient_accumulation_steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets['valid'],
        data_collator=data_collator,
        # optimizers=(optimizer, scheduler)
    )

    trainer.train()

    # Short model name for the save directory = last path component of the hub
    # id or local path, e.g. "microsoft/deberta-v3-large" -> "deberta-v3-large".
    # This covers every model without a per-model branch and keeps the save
    # directory flat (no nested "microsoft/" folder).
    short_model_name = Path(str(model_name)).name
    trainer.model.save_pretrained(str(output_dir / f'{args.exp_num}_mlm_{short_model_name}'))



In [None]:
!python mlm.py --debug --exp_num 0

In [None]:
ls 

[0m[01;34m0_mlm_deberta-v3-large[0m/  __notebook_source__.ipynb  mlm.py  [01;34moutput-mlm[0m/
