In [None]:
!pip install datasets transformers daaja fugashi ipadic numpy seqeval unidic-lite sentencepiece

In [18]:
from pathlib import Path
from typing import List, Tuple
import numpy as np
import datasets
from datasets import load_metric
from datasets import Dataset
from collections import defaultdict
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments 

In [19]:
MODEL_NAME = 'nlp-waseda/roberta-base-japanese'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

loading file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/spiece.model from cache at /root/.cache/huggingface/transformers/b425b145da6b899502074f9762ba498875debe6257cd51a90dafe3bd15ce3222.8da2281ef7354bc5cf79dc8bc8c1fce50b321191b1c8974ac95aa87b027dc163
loading file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/2bd909f7d6065f4e8e3111ca7563e3067dfeb4566b36489f867105bbef84e0b4.3db0799720217f7da35e92d033f167ac40c8d2c02fa035130b7bb070f6355074
loading file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/7958f82c95545acba1e551d38fc8e68cb345b9b5e5a6

In [20]:
def load_conll(input_path: Path):
    tokens_list, labels_list = [], []
    tokens, labels = [], []
    for line in input_path.read_text().split("\n"):
        cols = line.split("\t")
        if len(cols) != 2:
            if len(tokens) > 0:
                tokens_list.append(tokens)
                labels_list.append(labels)
                tokens, labels = [], []
            continue
        tokens.append(cols[0])
        labels.append(cols[1])
    if len(tokens) > 0:
        tokens_list.append(tokens)
        labels_list.append(labels)

    result_d = defaultdict(list)
    for i, (tokens, labels) in enumerate(zip(tokens_list, labels_list)):
        result_d["id"].append(i)
        result_d["tokens"].append(tokens)
        result_d["labels"].append(labels)

    return Dataset.from_dict(result_d, features=datasets.Features({
        "id": datasets.Value("string"),
        "tokens": datasets.Sequence(datasets.Value("string")),
        "labels": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    'O',
                    'B-ARTIFACT',
                    'I-ARTIFACT',
                    'B-DATE',
                    'I-DATE',
                    'B-EVENT',
                    'I-EVENT',
                    'B-LOCATION',
                    'I-LOCATION',
                    'B-MONEY',
                    'I-MONEY',
                    'B-NUMBER',
                    'I-NUMBER',
                    'B-ORGANIZATION',
                    'I-ORGANIZATION',
                    'B-OTHER',
                    'I-OTHER',
                    'B-PERCENT',
                    'I-PERCENT',
                    'B-PERSON',
                    'I-PERSON',
                    'B-TIME',
                    'I-TIME',
                    ]
                )
            ),
    }))

In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], max_length=128, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [23]:
metric = load_metric("seqeval")

In [24]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [25]:
train_dataset = load_conll(Path("./subsets/aug_500_p_0.7_num_10.tsv"))
valid_dataset = load_conll(Path("./subsets/valid.tsv"))
test_dataset = load_conll(Path("./subsets/test.tsv"))

tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_valid_dataset = valid_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

label_list = train_dataset.features["labels"].feature.names

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(label_list))
args = TrainingArguments(
    "./outputs",
    evaluation_strategy="epoch",
    num_train_epochs=75,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    metric_for_best_model='f1',
    save_strategy="epoch",
    load_best_model_at_end=True
)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,    
    eval_dataset=tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=5)]

)

trainer.train()

def round(val, digit=0):
    p = 10 ** digit
    return (val * p * 2 + 1) // 2 / p

print(round(trainer.evaluate(tokenized_test_dataset)["eval_f1"], 3) * 100)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/nlp-waseda/roberta-base-japanese/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dfb95760f9326cf799d9c01329f932bc2fb0ffa3dfd8bda477d8d5065de46539.91a9185d7ddf708a2bea0706761a53483964a06643fee0de70801c69800fe9a0
Model config RobertaConfig {
  "_name_or_path": "nlp-waseda/roberta-base-japanese",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 2,
  "classifier_dropout": null,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_1

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.123495,0.731104,0.77551,0.752653,0.963095
2,No log,0.116365,0.781221,0.808552,0.794651,0.966738
3,0.180200,0.127492,0.773196,0.838192,0.804383,0.966447
4,0.180200,0.134611,0.791648,0.838192,0.814255,0.968888
5,0.180200,0.15533,0.77793,0.825559,0.801037,0.966738
6,0.030400,0.161484,0.765632,0.827017,0.795141,0.966265
7,0.030400,0.167679,0.774514,0.832847,0.802622,0.96652
8,0.030400,0.166972,0.765306,0.838192,0.800093,0.968013
9,0.013200,0.186322,0.77038,0.826531,0.797468,0.966884


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, id.
***** Running Evaluation *****
  Num examples = 250
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./outputs/checkpoint-172
Configuration saved in ./outputs/checkpoint-172/config.json
Model weights saved in ./outputs/checkpoint-172/pytorch_model.bin
tokenizer config file saved in ./outputs/checkpoint-172/tokenizer_config.json
Special tokens file saved in ./outputs/checkpoint-172/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-344] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, id.
***** Running Evaluation *****
  Num examples = 250
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model check

80.10000000000001


In [26]:
trainer.evaluate(tokenized_test_dataset)["eval_f1"]

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens, id.
***** Running Evaluation *****
  Num examples = 250
  Batch size = 64


0.8008817046289493