# 設定

In [None]:
%%capture
# Install the packages imported by this notebook; %%capture hides the pip output.
!pip install transformers ipadic fugashi

In [None]:
import tensorflow as tf
import numpy as np
import random
import itertools
import json
import unicodedata
import datetime
import os
import matplotlib.pyplot as plt

from tqdm import tqdm
from transformers import BertJapaneseTokenizer, TFBertForTokenClassification

In [None]:
# Pretrained checkpoint name passed to every from_pretrained() call in this file.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
# Token length that the training/validation/test features are padded or truncated to.
MAX_LENGTH = 128

# IO法トークナイザクラス

# 新しいセクション

In [None]:
class NER_tokenizer(BertJapaneseTokenizer):
    """BertJapaneseTokenizer with helpers for IO-style named entity tagging.

    Every token carries one integer label: 0 for "not part of an entity"
    and the entity's ``type_id`` otherwise.  The helpers cover building
    labelled training input, encoding unlabelled text together with
    character spans, and turning predicted labels back into entities.
    """

    def encode_plus_tagged(self, text, entities, max_length):
        """Encode ``text`` and attach one label per token.

        Args:
            text: Sentence to encode.
            entities: Iterable of dicts with a ``"span"`` (``[start, end]``
                character offsets into ``text``, end exclusive) and a
                ``"type_id"`` (label given to the entity's tokens).
            max_length: Length the encoding and the label list are
                truncated/padded to.

        Returns:
            The encoding produced by ``prepare_for_model`` with an extra
            ``"labels"`` entry holding a list of ``max_length`` labels.
        """
        # Cut the text at the entity boundaries and remember each piece's label.
        entities = sorted(entities, key=lambda x: x["span"][0])
        splitted = []
        position = 0
        for entity in entities:
            start = entity["span"][0]
            end = entity["span"][1]
            label = entity["type_id"]

            # Text between the previous entity and this one is labelled 0.
            splitted.append({"text": text[position:start], "label": 0})
            # The entity itself is labelled with its type id.
            splitted.append({"text": text[start:end], "label": label})
            position = end

        splitted.append({"text": text[position:], "label": 0})
        # Drop zero-length pieces.
        splitted = [s for s in splitted if s["text"]]

        # Tokenize every piece on its own so each token inherits the piece's label.
        tokens = []
        labels = []
        for piece in splitted:
            piece_tokens = self.tokenize(piece["text"])
            tokens.extend(piece_tokens)
            labels.extend([piece["label"]] * len(piece_tokens))

        # Convert to ids and let the tokenizer add special tokens, pad and truncate.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="tf"
        )
        # Special tokens [CLS] and [SEP] are labelled 0.
        labels = [0] + labels[:max_length - 2] + [0]
        # [PAD] positions are labelled 0 as well.
        labels = labels + [0] * (max_length - len(labels))
        encoding["labels"] = labels

        return encoding

    def encode_plus_untagged(self, text, max_length=None):
        """Encode ``text`` and locate every token inside the original string.

        Args:
            text: Sentence to encode.
            max_length: If given, the encoding is padded/truncated to this
                length; otherwise neither padding nor truncation is applied.

        Returns:
            ``(encoding, spans)`` where ``spans`` has one ``[start, end]``
            character range per position of ``encoding["input_ids"]``;
            special and padding positions get ``[-1, -1]``.

        Raises:
            ValueError: If a token cannot be found in ``text`` at or after
                the end of the previous token.
        """
        # Tokenize into words, then subwords, keeping the surface string of each token.
        tokens = []
        tokens_original = []
        words = self.word_tokenizer.tokenize(text)
        for word in words:
            tokens_word = self.subword_tokenizer.tokenize(word)
            if not tokens_word:
                # Nothing to align for this word; also avoids indexing an empty list.
                continue
            tokens.extend(tokens_word)
            if tokens_word[0] == "[UNK]":
                # An unknown word stands for the whole original word.
                tokens_original.append(word)
            else:
                tokens_original.extend(
                    token.replace("##", "") for token in tokens_word
                )

        # Find each token's character range, scanning left to right.
        position = 0
        spans = []
        for token in tokens_original:
            start = text.find(token, position)
            if start == -1:
                # The previous manual scan would loop forever in this situation.
                raise ValueError(
                    f"cannot align token {token!r} with the text "
                    f"at or after offset {position}"
                )
            end = start + len(token)
            spans.append([start, end])
            position = end

        # Convert to ids and build the model input.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding="max_length" if max_length else False,
            truncation=True if max_length else False,
            return_tensors="tf"
        )
        sequence_length = len(encoding["input_ids"])
        # Dummy span for the leading special token [CLS].
        spans = [[-1, -1]] + spans[:sequence_length - 2]
        # Dummy spans for the trailing [SEP] and any [PAD] positions.
        spans = spans + [[-1, -1]] * (sequence_length - len(spans))

        return encoding, spans

    def convert_bert_output_to_entities(self, text, labels, spans):
        """Turn per-token labels into a list of entities.

        Args:
            text: The original sentence.
            labels: One predicted label per token position.
            spans: Character ranges as returned by ``encode_plus_untagged``.

        Returns:
            A list of dicts with ``"name"``, ``"span"`` and ``"type_id"``;
            consecutive tokens sharing a non-zero label form one entity.
        """
        # Drop the positions that correspond to special tokens (span start of -1).
        labels = [label for label, span in zip(labels, spans) if span[0] != -1]
        spans = [span for span in spans if span[0] != -1]

        # Merge runs of identical labels; every non-zero run becomes an entity.
        entities = []
        for label, group in itertools.groupby(enumerate(labels), key=lambda x: x[1]):
            if label == 0:
                continue
            group = list(group)
            start = spans[group[0][0]][0]
            end = spans[group[-1][0]][1]
            entities.append({
                "name": text[start:end],
                "span": [start, end],
                "type_id": label
            })

        return entities

In [None]:
# Instantiate the NER tokenizer subclass from the checkpoint named by MODEL_NAME.
tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Demo of encode_plus_tagged: one entity with type id 1 at character offsets 3..9.
text = "昨日のみらい事務所との打ち合わせは順調だった。"
entities = [
            {"name":"みらい事務所", "span":[3, 9], "type_id":1}
]
# Encode to a fixed length of 20 and show the resulting encoding, including its labels.
encoding = tokenizer.encode_plus_tagged(text, entities, max_length=20)
print(encoding)

In [None]:
# Demo of encode_plus_untagged: no max_length, so no padding or truncation is requested.
text = "騰訊の英語名はTencent Holdings Ltdである。"
encoding, spans = tokenizer.encode_plus_untagged(text)
print(encoding)
# One [start, end] character range per token; [-1, -1] marks special tokens.
print(spans)

In [None]:
# Hand-written label sequence standing in for model output.
# NOTE(review): presumably one label per entry of `spans` from the cell above — confirm the lengths match.
labels_predicted = [0, 1, 1, 0, 0, 0, 0, 1, 1, 1 ,1 ,1 ,1 ,1 ,1 ,1, 0, 0, 0, 0]
# Merge runs of non-zero labels into entities using the character spans.
entities = tokenizer.convert_bert_output_to_entities(text, labels_predicted, spans)
print(entities)

# トークン分類BERT

In [None]:
# Token-classification model with 4 output labels, used for the demo cells that follow;
# no training call on `bert` is visible in this file.
bert = TFBertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=4)

In [None]:
# Encode a short sentence without padding; `spans` is reused when decoding the prediction.
text = "AさんはB大学に入学した。"
encoding, spans = tokenizer.encode_plus_untagged(text)
print(encoding)

In [None]:
# Run the demo model on the single encoded sentence.
# tf.expand_dims adds the leading batch axis without hard-coding the token
# count, so this cell keeps working when `text` tokenizes to another length.
output = bert(
    tf.expand_dims(encoding["input_ids"], 0),
    tf.expand_dims(encoding["attention_mask"], 0),
    tf.expand_dims(encoding["token_type_ids"], 0)
)
print(output)
# Take the highest-scoring label for each token of the first (only) batch item.
scores = output.logits
labels_predicted = tf.argmax(scores[0], -1).numpy().tolist()

In [None]:
# Convert the predicted per-token labels back into entity dicts for `text`.
entities = tokenizer.convert_bert_output_to_entities(
    text, labels_predicted, spans
)
print(entities)

# データセット

In [None]:
# Clone the v2.0 branch/tag of the NER Wikipedia dataset repository into the working directory.
!git clone --branch v2.0 https://github.com/stockmarkteam/ner-wikipedia-dataset.git

In [None]:
# Load the dataset JSON. The context manager closes the file handle, and the
# explicit encoding keeps the read independent of the platform default.
with open("ner-wikipedia-dataset/ner.json", "r", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

In [None]:
# Inspect one raw record.
print(dataset[10])

In [None]:
# Collect the distinct entity type names. Sorting makes the name -> id mapping
# the same on every run; iterating the bare set could order the names
# differently between interpreter sessions.
type_to_id = sorted({entity["type"] for data in dataset for entity in data["entities"]})
# Ids start at 1 because label 0 is used for tokens outside any entity.
type_to_id = {label: num + 1 for num, label in enumerate(type_to_id)}
print(type_to_id)

In [None]:
# Normalize every text to Unicode NFKC and replace each entity's type name by its integer id.
# The dataset records are modified in place.
for data in dataset:
    # NOTE(review): entity spans are not adjusted here; presumably they already
    # refer to the normalized text — confirm against the dataset.
    data["text"] = unicodedata.normalize("NFKC", data["text"])
    for entity in data["entities"]:
        entity["type_id"] = type_to_id[entity["type"]]
        # The type name is dropped once its id has been stored.
        del entity["type"]

In [None]:
# Inspect the same record after normalization and type-id conversion.
print(dataset[10])

In [None]:
# Show the labelled encoding of that record, padded/truncated to MAX_LENGTH.
print(tokenizer.encode_plus_tagged(dataset[10]["text"], dataset[10]["entities"], MAX_LENGTH))

In [None]:
# Shuffle the dataset in place, then split it 60% / 20% / remainder
# into train / validation / test.
# NOTE(review): no random seed is set in the visible code, so the split differs between runs.
random.shuffle(dataset)
num_dataset = len(dataset)
num_train = int(num_dataset * 0.6)
num_val = int(num_dataset * 0.2)

dataset_train = dataset[:num_train]
dataset_val = dataset[num_train:num_train+num_val]
# The test split takes everything left over, including rounding remainders.
dataset_test = dataset[num_train+num_val:]

In [None]:
def to_train_feature(ds, tokenizer, max_length, num_labels):
    """Encode a list of records into fixed-size int32 arrays.

    Args:
        ds: Sequence of records with ``"text"`` and ``"entities"`` entries.
        tokenizer: Object providing ``encode_plus_tagged(text, entities, max_length)``.
        max_length: Number of columns of every returned array.
        num_labels: Accepted for interface compatibility; not used here.

    Returns:
        ``([input_ids, attention_mask, token_type_ids], labels)`` where each
        array has shape ``(len(ds), max_length)`` and dtype ``int32``.
    """
    keys = ("input_ids", "attention_mask", "token_type_ids", "labels")
    # One zero-initialised matrix per encoding field, one row per record.
    features = {key: np.zeros((len(ds), max_length), np.int32) for key in keys}

    for row, record in enumerate(ds):
        encoded = tokenizer.encode_plus_tagged(
            record["text"], record["entities"], max_length
        )
        for key in keys:
            features[key][row] = encoded[key]

    model_inputs = [features[key] for key in keys[:3]]
    return model_inputs, features["labels"]

In [None]:
# Build model inputs and labels for the train, validation and test splits (in that order).
# The label count is the number of entity types plus one for label 0.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    to_train_feature(split, tokenizer, MAX_LENGTH, len(type_to_id)+1)
    for split in (dataset_train, dataset_val, dataset_test)
)

# 訓練開始

In [None]:
# Delete the logs directory left over from earlier runs.
!rm -rf logs

In [None]:
# Timestamp used to give this run its own log and checkpoint paths.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Path handed to the TensorBoard callback.
log_dir = os.path.join('logs/', current_time)
# Path handed to the ModelCheckpoint callback.
ckpt_dir = os.path.join('ckpt/', current_time)

In [None]:
# Model to fine-tune: one output label per entity type plus label 0 for non-entity tokens.
bert_tc = TFBertForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(type_to_id)+1
)

In [None]:
# The model returns raw logits (they are read as `output.logits` elsewhere in
# this file), so the loss must apply softmax itself: from_logits=True.
# The default from_logits=False would treat the logits as probabilities.
bert_tc.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=["accuracy"]
)

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 20

callbacks = [
             # Stop when val_loss has not improved for 5 consecutive epochs.
             tf.keras.callbacks.EarlyStopping(
                 monitor="val_loss", mode="min",
                 patience=5
             ),
             # Write TensorBoard logs (with histograms every epoch) to this run's log directory.
             tf.keras.callbacks.TensorBoard(
                 log_dir=log_dir,
                 histogram_freq=1
             ),
             # Save weights only, and only when the callback's monitored metric improves.
             tf.keras.callbacks.ModelCheckpoint(
                 ckpt_dir,
                 save_best_only=True, save_weights_only=True
             )
]


# Fine-tune on the training split and validate on the validation split after each epoch.
history = bert_tc.fit(
    X_train, y_train, epochs=EPOCHS,
    batch_size=32,
    callbacks=callbacks,
    validation_data=(X_val, y_val), 
    validation_batch_size=32
)

# 性能評価

In [None]:
# Evaluate the trained model on the held-out test split.
test_eval = bert_tc.evaluate(X_test, y_test)

In [None]:
# Show the evaluation result returned by evaluate().
print(test_eval)

In [None]:
# Print the gold entities and the predicted entities for five random test records.
# random.randrange excludes the upper bound; random.randint(0, n) could return n
# itself and index one past the end of the test set.
for i in [random.randrange(len(X_test[0])) for _ in range(5)]:
    print(dataset_test[i]["text"])
    print(dataset_test[i]["entities"])
    # reshape((1, -1)) adds the batch axis without hard-coding the sequence length.
    output = bert_tc(
        [X_test[0][i].reshape((1, -1)),
         X_test[1][i].reshape((1, -1)),
         X_test[2][i].reshape((1, -1))]
    )
    labels_predicted = tf.argmax(output.logits[0], axis=1)
    # Only the character spans are needed here; the encoding itself is discarded.
    # NOTE(review): the features were built with encode_plus_tagged, which tokenizes
    # entity and non-entity pieces separately, while these spans come from tokenizing
    # the whole text — confirm the token boundaries line up.
    _, spans = tokenizer.encode_plus_untagged(dataset_test[i]["text"], MAX_LENGTH)

    print(tokenizer.convert_bert_output_to_entities(
        dataset_test[i]["text"], labels_predicted.numpy(), spans
    ))

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Open TensorBoard on the directory the training callback wrote to.
%tensorboard --logdir logs/