# 設定

In [None]:
# Install Hugging Face transformers together with fugashi and ipadic.
# NOTE(review): fugashi/ipadic are presumably the MeCab backend needed by
# BertJapaneseTokenizer -- confirm against the transformers documentation.
!pip install -q transformers ipadic fugashi

In [None]:
import tensorflow as tf
import json
import random
import matplotlib.pyplot as plt
import datetime
import numpy as np

from tqdm import tqdm
from transformers import TFBertModel, BertJapaneseTokenizer

In [None]:
# Hugging Face model id used for both the tokenizer and the BERT encoder.
BERT_MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
# Every sentence is padded/truncated to this many tokens; it is also the
# sequence length the classifier inputs are built with.
MAX_LENGTH = 128

In [None]:
# Load the tokenizer that belongs to the pretrained BERT checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained(BERT_MODEL_NAME)

# データのダウンロード

In [None]:
!wget https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/chABSA-dataset.zip .

In [None]:
# Extract the downloaded archive quietly (-q suppresses the per-file listing).
!unzip -q chABSA-dataset.zip

In [None]:
# Map each polarity string to its index in the multi-hot label vector.
polarity_to_label = {"negative":0, "neutral":1, "positive":2}
dataset = list()
filepaths = tf.io.gfile.glob("chABSA-dataset/*.json")
# Shuffle at file level; the train/test/valid split made later is positional.
random.shuffle(filepaths)

for filepath in filepaths:
    # Use a context manager so the handle is closed promptly, and decode as
    # UTF-8 explicitly so the Japanese text does not depend on the OS locale.
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Collect the polarities of each sentence into one multi-hot list.
    for sentence in data["sentences"]:
        text = sentence["sentence"]
        labels = [0] * len(polarity_to_label)
        # A sentence carries a polarity as soon as one of its opinions has it.
        for opinion in sentence["opinions"]:
            labels[polarity_to_label[opinion["polarity"]]] = 1
        example = {"text":text, "labels":labels}
        dataset.append(example)

In [None]:
num_examples = len(dataset)

# Tokenize every sentence in one batched call. Because all rows are padded or
# truncated to MAX_LENGTH, the tokenizer returns one (num_examples, MAX_LENGTH)
# tensor per field instead of one tiny tensor per sentence that would have to
# be re-stacked later.
encoded = tokenizer(
    [example["text"] for example in dataset],
    max_length=MAX_LENGTH,
    padding="max_length", truncation=True,
    return_tensors="tf"
)

# Keep exactly the three model inputs and add the multi-hot labels next to them.
encoded_data = {
    key: encoded[key]
    for key in ("input_ids", "attention_mask", "token_type_ids")
}
encoded_data["labels"] = [example["labels"] for example in dataset]

In [None]:
# Slice the encoded fields along their first axis: each dataset element is a
# dict with the same keys as encoded_data, holding the values of one example.
encoded_dataset = tf.data.Dataset.from_tensor_slices(encoded_data)

In [None]:
# Positional 60% / 20% / 20% split into train / test / valid.
# The test size is derived from the two boundaries rather than from
# int(num_examples*0.2): flooring each fraction separately can leave one
# example unassigned between the test and validation ranges.
train_end = int(num_examples*0.6)
valid_start = int(num_examples*0.8)

train_dataset = encoded_dataset.take(train_end)
test_dataset = encoded_dataset.skip(train_end).take(valid_start - train_end)
# Everything from valid_start to the end of the dataset.
valid_dataset = encoded_dataset.skip(valid_start)

# マルチラベリングモデルの定義

In [None]:
def create_classifier(
    num_labels,
    sequence_length,
    bert_model_name=BERT_MODEL_NAME,
    bert_trainable=False,
    activation=None):
    """Build a Keras model: BERT encoder, masked mean pooling, dense head.

    Args:
        num_labels: Number of units of the final dense layer.
        sequence_length: Fixed token length of each of the three inputs.
        bert_model_name: Name passed to ``TFBertModel.from_pretrained``.
        bert_trainable: Value assigned to ``trainable`` on the first layer of
            the loaded BERT model (``bert.layers[0]``).
        activation: Activation of the dense head; ``None`` keeps it linear.

    Returns:
        A ``tf.keras.Model`` that takes
        ``[input_ids, attention_mask, token_type_ids]`` and returns the output
        of the dense head.
    """
    # The three int32 inputs, in the order the BERT layer is called with.
    model_inputs = [
        tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "attention_mask", "token_type_ids")
    ]
    attention_mask = model_inputs[1]

    bert = TFBertModel.from_pretrained(bert_model_name, name="bert")
    bert.layers[0].trainable = bert_trainable
    hidden_size = bert.config.hidden_size
    apply_mask = tf.keras.layers.Multiply(name="multiply")
    head = tf.keras.layers.Dense(num_labels, name="dense", activation=activation)

    # Use BERT's per-token outputs rather than a pooled output.
    token_states = bert.bert(model_inputs).last_hidden_state

    # Broadcast the attention mask over the hidden dimension so that positions
    # whose attention_mask is 0 contribute nothing to the sum.
    mask = tf.expand_dims(attention_mask, 2)
    mask = tf.tile(mask, [1, 1, hidden_size])
    mask = tf.cast(mask, tf.float32)

    # Mean over the positions whose attention_mask is 1.
    masked_sum = tf.math.reduce_sum(apply_mask([token_states, mask]), axis=1)
    mask_total = tf.math.reduce_sum(mask, axis=1)
    pooled = masked_sum / mask_total

    return tf.keras.Model(inputs=model_inputs, outputs=head(pooled))

# モデルの学習

In [None]:
EPOCHS = 3
BATCHSIZE = 32
# Ceiling division: the number of batches needed to see every training example
# once. A plain `// BATCHSIZE + 1` would add a surplus step whenever the
# dataset size is an exact multiple of the batch size.
STEPS_PER_EPOCH = (len(train_dataset) + BATCHSIZE - 1)//BATCHSIZE

In [None]:
# Shuffle within a 100-element buffer, repeat indefinitely, then batch.
# Because the stream never ends, the consumer decides how many batches to draw.
train_dataset_batched = (
    train_dataset
    .shuffle(100)
    .repeat()
    .batch(BATCHSIZE)
)

## BERTの中身を学習させる

In [None]:
# Delete the logs/ directory (if any) so logs from earlier runs are discarded.
!rm -rf logs

In [None]:
# Multi-label classifier: three sigmoid outputs, with the BERT layer trainable.
multi_labeling = create_classifier(
    num_labels=3,
    sequence_length=MAX_LENGTH,
    bert_model_name=BERT_MODEL_NAME,
    bert_trainable=True,
    activation="sigmoid",
)

In [None]:
# Smoke test: push one batch of 10 training examples through the untrained
# model and let the notebook display the resulting scores.
smoke_batches = iter(train_dataset.batch(10))
d = next(smoke_batches)
multi_labeling([d[key] for key in ("input_ids", "attention_mask", "token_type_ids")])

In [None]:
# Optimizer and objective. from_logits=False means the loss expects
# probabilities rather than raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Running metrics; they accumulate across update_state calls until reset.
loss_metrics_obj = tf.keras.metrics.BinaryCrossentropy()
acc_metrics_obj = tf.keras.metrics.BinaryAccuracy()

In [None]:
# One timestamped run directory with separate writers for train and valid.
current_time = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
run_log_root = f"logs/gradient_tape/{current_time}"
train_log_dir = f"{run_log_root}/train"
valid_log_dir = f"{run_log_root}/valid"
train_summary_writer, valid_summary_writer = (
    tf.summary.create_file_writer(log_dir)
    for log_dir in (train_log_dir, valid_log_dir)
)

In [None]:
# Training step function
@tf.function
def train_step(inputs, loss_fn, optimizer, acc_metrics_obj, loss_metrics_obj):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        inputs: Batch dict with "input_ids", "attention_mask",
            "token_type_ids" and "labels".
        loss_fn: Callable ``loss_fn(labels, scores)`` returning the loss.
        optimizer: Optimizer whose ``apply_gradients`` updates the model.
        acc_metrics_obj: Metric updated with ``(labels, scores)``.
        loss_metrics_obj: Metric updated with ``(labels, scores)``.
    """
    labels = inputs["labels"]
    with tf.GradientTape() as tape:
        # Call the model in training mode. Without training=True a model
        # invoked directly runs in inference mode, so train-only layers such
        # as dropout would stay disabled during fine-tuning.
        scores = multi_labeling(
            [inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"]],
            training=True
        )
        loss = loss_fn(labels, scores)

    trainable_weights = multi_labeling.trainable_weights
    grads = tape.gradient(loss, trainable_weights)
    # Variables that do not contribute to the loss come back with a None
    # gradient; leave them out instead of handing them to the optimizer.
    optimizer.apply_gradients(
        [(grad, weight) for grad, weight in zip(grads, trainable_weights) if grad is not None]
    )

    acc_metrics_obj.update_state(labels, scores)
    loss_metrics_obj.update_state(labels, scores)

# Validation step function
@tf.function
def eval_step(inputs, acc_metrics_obj, loss_metrics_obj):
    """Score one batch with the model and feed both metrics; no weight update.

    Args:
        inputs: Batch dict with "input_ids", "attention_mask",
            "token_type_ids" and "labels".
        acc_metrics_obj: Metric updated with ``(labels, scores)``.
        loss_metrics_obj: Metric updated with ``(labels, scores)``.
    """
    features = [
        inputs[name]
        for name in ("input_ids", "attention_mask", "token_type_ids")
    ]
    targets = inputs["labels"]
    scores = multi_labeling(features)

    # Same order as before: accuracy first, then loss.
    for metric in (acc_metrics_obj, loss_metrics_obj):
        metric.update_state(targets, scores)

In [None]:
# Per-epoch history of the training and validation metrics
metrics = dict(
    acc=list(), loss=list(),
    val_acc=list(), val_loss=list()
)

# A single iterator is created once and shared by all epochs, so each epoch
# continues drawing batches where the previous one stopped.
train_iter = iter(train_dataset_batched)
for epoch in range(EPOCHS):    

    print(f"{epoch+1} EPOCH START")
    for step in tqdm(range(STEPS_PER_EPOCH)):
        inputs = next(train_iter)
        # One training step
        train_step(inputs, loss_fn, optimizer, acc_metrics_obj, loss_metrics_obj)
        
    # Record the values accumulated over all steps of this epoch
    metrics["acc"].append(acc_metrics_obj.result().numpy())
    metrics["loss"].append(loss_metrics_obj.result().numpy())

    # Write the training results for TensorBoard
    with train_summary_writer.as_default():
        tf.summary.scalar('accuracy', acc_metrics_obj.result(), step=epoch)
        tf.summary.scalar('loss', loss_metrics_obj.result(), step=epoch)

        # One histogram per trainable variable of every layer
        for layer in multi_labeling.layers:
            for trainable_variable in layer.trainable_variables:
                tf.summary.histogram(
                    trainable_variable.name,
                    trainable_variable,
                    step=epoch)

    # Clear the accumulated metrics before they are reused for validation
    acc_metrics_obj.reset_state()
    loss_metrics_obj.reset_state()

    # Validation loop
    for inputs in valid_dataset.batch(BATCHSIZE):
        eval_step(inputs, acc_metrics_obj, loss_metrics_obj)
    
    metrics["val_acc"].append(acc_metrics_obj.result().numpy())
    metrics["val_loss"].append(loss_metrics_obj.result().numpy())

    with valid_summary_writer.as_default():
        tf.summary.scalar('accuracy', acc_metrics_obj.result(), step=epoch)
        tf.summary.scalar('loss', loss_metrics_obj.result(), step=epoch)

    # Clear again so the next epoch's training metrics start from zero
    acc_metrics_obj.reset_state()
    loss_metrics_obj.reset_state()

    print(
        f"acc:{metrics['acc'][-1]:.4f}, loss:{metrics['loss'][-1]:.4f}, val_acc:{metrics['val_acc'][-1]:.4f}, val_loss:{metrics['val_loss'][-1]:.4f}\t"
    )

In [None]:
# Accuracy curves on the left axis, loss curves (dashed) on a twin right axis.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))
epoch_axis = range(EPOCHS)
for series in ("acc", "val_acc"):
    ax.plot(epoch_axis, metrics[series], label=series)

ay = ax.twinx()
for series in ("loss", "val_loss"):
    ay.plot(epoch_axis, metrics[series], label=series, linestyle="--")

# Each axis keeps its own legend.
ax.legend(loc="center right")
ay.legend(loc="lower right")
plt.show()

In [None]:
# Evaluate the trained model on the test split with the shared metric objects.
feature_keys = ("input_ids", "attention_mask", "token_type_ids")
for inputs in test_dataset.batch(BATCHSIZE):
    pred = multi_labeling([inputs[key] for key in feature_keys])

    for metric in (acc_metrics_obj, loss_metrics_obj):
        metric.update_state(inputs["labels"], pred)

test_accuracy = acc_metrics_obj.result().numpy()
test_loss = loss_metrics_obj.result().numpy()
print(f"accuracy for test dataset : {test_accuracy:.4f}\n")
print(f"loss for test dataset : {test_loss:.4f}\n")

# Leave the metrics empty for whoever uses them next.
for metric in (acc_metrics_obj, loss_metrics_obj):
    metric.reset_state()

In [None]:
# NOTE: these three objects are not used in this cell; they are kept only in
# case other cells refer to them.
binary = tf.keras.losses.BinaryCrossentropy()
categ = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.BinaryAccuracy()

# Print five test sentences with the predicted scores and the gold labels.
for inputs in test_dataset.shuffle(100).batch(5).take(1):
    for a,b,c,d in zip(inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"],inputs["labels"]):
        # Decoded tokens are space-separated; drop the spaces to read as Japanese text.
        print(tokenizer.decode(a, skip_special_tokens=True).replace(" ", ""))
        # Add a leading batch axis instead of reshaping to a hard-coded
        # (1, 128), so this keeps working when MAX_LENGTH is changed.
        score = multi_labeling(
            [tf.expand_dims(a, 0), tf.expand_dims(b, 0), tf.expand_dims(c, 0)]
        )
        print("# predict")
        print(score.numpy()[0])
        print("# label")
        print(d.numpy())
        print()

In [None]:
# Save the model's weights under the ckpt/ prefix.
# NOTE(review): the prefix says "bert_not_trainable", but multi_labeling was
# created with bert_trainable=True -- consider renaming it together with the
# matching load_weights path.
multi_labeling.save_weights("ckpt/bert_not_trainable")

In [None]:
# Rebuild the architecture and restore the saved weights.
# The head must use the same sigmoid activation as the trained model: with the
# default activation=None the restored model would emit raw logits, while the
# BinaryAccuracy (0.5 threshold) and BinaryCrossentropy (from_logits=False)
# metrics used below expect probabilities.
loaded_model = create_classifier(3, MAX_LENGTH, activation="sigmoid")
loaded_model.load_weights("ckpt/bert_not_trainable")

# Re-run the test evaluation to confirm the restored model matches.
for inputs in test_dataset.batch(BATCHSIZE):
    pred = loaded_model(
        [inputs["input_ids"], inputs["attention_mask"], inputs["token_type_ids"]]
    )

    acc_metrics_obj.update_state(inputs["labels"], pred)
    loss_metrics_obj.update_state(inputs["labels"], pred)

print(f"accuracy for test dataset : {acc_metrics_obj.result().numpy():.4f}\n")
print(f"loss for test dataset : {loss_metrics_obj.result().numpy():.4f}\n")

acc_metrics_obj.reset_state()
loss_metrics_obj.reset_state()

# TensorBoardのためのログを出力させる

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Open TensorBoard inline, pointed at the logs/ directory.
%tensorboard --logdir logs