In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory use).
! nvidia-smi

Wed May 18 01:02:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    """Settings for the masked-language-model training run in this notebook.

    The class itself (not an instance) is handed to ``setup``, which adds
    runtime attributes such as directories and the torch device to it.
    """

    # --- experiment identity -------------------------------------------
    AUTHOR = 'kuruton'
    NAME = 'USP-' + 'MLM001-deberta-v3-large'  # experiment / output folder name

    # --- model and data sources ----------------------------------------
    MODEL_PATH = 'microsoft/deberta-v3-large'
    # Kaggle datasets ("owner/slug") downloaded by setup() on Colab.
    DATASET_PATH = ['fankaixie/pppm-abstract', 'fankaixie/cpc-description']
    COMPETITION = 'us-patent-phrase-to-phrase-matching'

    # --- Colab / Google Drive locations --------------------------------
    COLAB_PATH = '/content/drive/Shareddrives/USPatent'
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)
    # JSON file from which setup() reads the Kaggle 'username' and 'key'.
    api_path = '/content/drive/MyDrive/kaggle.json'

    # --- data split and tokenization -----------------------------------
    seed = 42          # random_state of the train/valid split
    train_size = 0.8   # fraction of texts used for training
    batch_size = 2     # per-device train batch size (also used for tokenization batches)
    num_train_epochs = 5
    max_len = 512      # tokenizer max_length

    # --- optimisation ---------------------------------------------------
    weight_decay = 2e-5
    lr = 2e-5
    mlm_probability = 0.15  # masking probability given to the MLM data collator
    warmup_ratio = 0.01
    gradient_accumulation_steps = 1
    lr_scheduler_type = 'cosine'

    # Not read anywhere in the visible notebook cells.
    upload_from_colab = True

In [3]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    train_test_split,
    KFold,
    StratifiedKFold,
    GroupKFold,
    StratifiedGroupKFold,
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)

import torch

# Mount Google Drive when the Colab helper package is available. The import is
# guarded so this cell also runs outside Colab (e.g. in a Kaggle kernel), where
# `google.colab` does not exist and setup() takes its non-Colab branch.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None and not os.path.isdir('/content/drive'):
    drive.mount('/content/drive')

In [4]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install -q transformers
        ! pip install -q sentencepiece
        ! pip install -q datasets

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    """Write ``dataset-metadata.json`` into ``upload_dir`` and create a Kaggle dataset.

    Args:
        dataset_name: Used both as the dataset title and, prefixed with the
            ``KAGGLE_USERNAME`` environment variable, as the dataset id.
        upload_dir: Folder that receives the metadata file and is passed to the
            Kaggle API as the folder to upload.

    Raises:
        KeyError: If ``KAGGLE_USERNAME`` is not set in the environment.
    """
    # NOTE(review): `KaggleApi` is not imported in any visible cell; it has to be
    # brought into scope (presumably from the `kaggle` package) before this
    # function can be called -- confirm.
    metadata = {
        'id': f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_file = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_file, 'w') as fp:
        json.dump(metadata, fp, indent=4)

    client = KaggleApi()
    client.authenticate()
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [5]:
def get_text(cfg):
    """Load the non-null ``claim`` texts and split them into two lists.

    Reads ``cpc-description/CPC_description.csv`` below ``cfg.DATASET``, keeps
    the non-null entries of its ``claim`` column and shuffles them into a
    train part of relative size ``cfg.train_size`` and a remainder, using
    ``cfg.seed`` as the random state.

    Returns:
        A ``(train, test)`` tuple of plain Python lists.
    """
    csv_path = os.path.join(cfg.DATASET, 'cpc-description/CPC_description.csv')
    claims = pd.read_csv(csv_path)['claim'].dropna().reset_index(drop=True)

    train_part, held_out_part = train_test_split(
        claims,
        train_size=cfg.train_size,
        shuffle=True,
        random_state=cfg.seed,
    )
    return train_part.tolist(), held_out_part.tolist()

def get_dataset(cfg):
    """Dump the train/valid texts as JSON-lines files and load them as a dataset.

    Each text from ``get_text`` becomes one ``{"text": ...}`` line in
    ``train_mlm.json`` / ``valid_mlm.json`` in the current directory; both
    files are then loaded with ``load_dataset('json', ...)``.

    Returns:
        The object returned by ``load_dataset``, with the splits ``'train'``
        and ``'valid'``.
    """
    train_texts, valid_texts = get_text(cfg)
    json_paths = {
        'train': os.path.join('./', 'train_mlm.json'),
        'valid': os.path.join('./', 'valid_mlm.json'),
    }

    # One JSON object per line, in the order the texts were returned.
    for split, texts in (('train', train_texts), ('valid', valid_texts)):
        with open(json_paths[split], 'w') as fp:
            fp.writelines(json.dumps({'text': sentence}) + '\n' for sentence in texts)

    return load_dataset('json', data_files=json_paths)

def main(cfg):
    """Continue masked-language-model training of ``cfg.MODEL_PATH`` and save it.

    Builds the train/valid text datasets, tokenizes them, trains an
    ``AutoModelForMaskedLM`` with the Hugging Face ``Trainer`` (evaluating once
    per epoch, no intermediate checkpoints) and finally writes the trained
    model's config and weights into ``cfg.EXP_MODEL``.

    Args:
        cfg: Configuration object as returned by ``setup``; reads ``MODEL_PATH``,
            ``EXP_MODEL``, ``max_len``, ``batch_size``, ``mlm_probability``,
            ``lr``, ``weight_decay``, ``num_train_epochs``, ``lr_scheduler_type``,
            ``warmup_ratio``, ``gradient_accumulation_steps`` and ``seed``.
    """
    # Turn off Weights & Biases reporting when running inside a Kaggle kernel.
    if 'KAGGLE_URL_BASE' in os.environ:
        os.environ["WANDB_DISABLED"] = "true"

    datasets = get_dataset(cfg)
    tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
    # truncation=True makes the cut at max_len explicit; with only max_length
    # given, the tokenizer warns and falls back to the same 'longest_first'
    # strategy.
    # NOTE(review): batch_size below is the tokenization batch size of
    # Dataset.map and reuses the training batch size; a larger value would
    # presumably just tokenize faster -- confirm before changing.
    tokenized_datasets = datasets.map(
        lambda x: tokenizer(x['text'], truncation=True, max_length=cfg.max_len),
        batched=True,
        num_proc=1,
        remove_columns=["text"],
        batch_size=cfg.batch_size
    )

    # Applies masking with probability cfg.mlm_probability when batches are built.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=cfg.mlm_probability,
    )

    config = AutoConfig.from_pretrained(cfg.MODEL_PATH, output_hidden_states=True)
    model = AutoModelForMaskedLM.from_pretrained(cfg.MODEL_PATH, config=config)

    training_args = TrainingArguments(
        output_dir=os.path.join(cfg.EXP_MODEL, "mlm"),
        evaluation_strategy="epoch",
        learning_rate=cfg.lr,
        weight_decay=cfg.weight_decay,
        save_strategy='no',
        per_device_train_batch_size=cfg.batch_size,
        num_train_epochs=cfg.num_train_epochs,
        lr_scheduler_type=cfg.lr_scheduler_type,
        warmup_ratio=cfg.warmup_ratio,
        fp16=True,
        logging_steps=500,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        # Use the same seed as the train/valid split so one setting controls both.
        seed=cfg.seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets['valid'],
        data_collator=data_collator,
    )
    trainer.train()
    # save_pretrained takes the target directory as its only path argument; the
    # model config and weights are written directly into cfg.EXP_MODEL.
    trainer.model.save_pretrained(cfg.EXP_MODEL)

In [None]:
# =====================
# Main
# =====================
# setup() has to run before the imports below: on Colab it pip-installs
# transformers / sentencepiece / datasets, which are imported only afterwards.
cfg = setup(Config)

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from transformers.utils import logging
from datasets import load_dataset

# Train the masked language model and save it under cfg.EXP_MODEL.
main(cfg)

This environment is Google Colab


Using custom data configuration default-c3eec0dadfe1d276


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-c3eec0dadfe1d276/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-c3eec0dadfe1d276/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/20000 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/5000 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Epoch,Training Loss,Validation Loss
1,1.4846,1.363767
2,1.1977,1.163145
3,1.1282,1.04926
4,1.0397,0.985993
5,0.967,0.967917


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /content/drive/Shareddrives/USPatent/kuruton/Output/USP-MLM001-deberta-v3-large/model/config.json
Model weights saved in /content/drive/Shareddrives/USPatent/kuruton/Output/USP-MLM001-deberta-v3-large/model/pytorch_model.bin
