# 設定

In [None]:
%%capture
!pip install transformers ipadic fugashi

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from transformers import BertJapaneseTokenizer, TFBertModel

In [None]:
# Model identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
# Token length that every text is padded / truncated to when it is tokenized.
MAX_LENGTH = 256

# データセット

In [None]:
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar -zxf ldcc-20140209.tar.gz

# ベクトル化

In [None]:
# Collect every "<category>/<name>-<id>.txt" file under ./text.
# Sorted so that the row order of everything derived from this list does not
# depend on whatever order the glob happens to return.
filepaths = sorted(tf.io.gfile.glob("./text/*/*-*.txt"))

In [None]:
# Unique category names, taken from the directory component of each path
# ("./text/<category>/<file>"). Sorted because plain set iteration order for
# strings can differ between kernel restarts, which would shuffle the subplots.
category = sorted({filepath.split("/")[2] for filepath in filepaths})

In [None]:
# Load the Japanese BERT tokenizer and the TensorFlow BERT model for MODEL_NAME.
# NOTE(review): from_pretrained presumably fetches the files on first use — confirm network access.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
model = TFBertModel.from_pretrained(MODEL_NAME)

[CLS]トークンの出力、もしくは各トークンの出力の平均を文章ベクトルにする。

In [None]:
# Number of articles sent through BERT per forward pass. The last, possibly
# shorter, batch is handled by the same loop, so the corpus size does not have
# to be a multiple of this value (and may even be smaller than it).
BATCH_SIZE = 500
HIDDEN_SIZE = model.config.hidden_size


def read_article_body(path):
    """Return the content of one article file without its first three lines.

    NOTE(review): the three skipped lines are presumably the per-article header
    of the corpus format (URL / timestamp / title) — confirm against the data.
    """
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 — confirm.
    with open(path, "r", encoding="utf-8") as f:
        return "".join(f.readlines()[3:])


def embed_texts(texts):
    """Run one batch of texts through BERT and return two sentence vectors each.

    Args:
        texts: Non-empty list of strings.

    Returns:
        Tuple ``(pooled, mean)`` of NumPy arrays, one row per input text:
        ``pooled`` is the model's ``pooler_output``; ``mean`` is the average of
        ``last_hidden_state`` over the positions where the attention mask is 1.
    """
    encode = tokenizer(
        texts, max_length=MAX_LENGTH,
        padding="max_length", truncation=True,
        return_tensors="tf"
    )
    output = model(**encode)

    # (batch, seq, 1) float mask; broadcasting over the hidden axis replaces
    # the explicit tf.tile and yields the same sums.
    mask = tf.cast(tf.expand_dims(encode["attention_mask"], 2), tf.float32)
    avg = (
        tf.math.reduce_sum(output.last_hidden_state * mask, 1)
        / tf.math.reduce_sum(mask, 1)
    )

    # NOTE(review): in Hugging Face BERT, pooler_output is the [CLS] state after
    # the pooler's dense + tanh layer, not last_hidden_state[:, 0] — confirm
    # this is the intended "[CLS]" vector.
    return output.pooler_output.numpy(), avg.numpy()


# Category label of each file, taken from "./text/<category>/<file>".
labels = [filepath.split("/")[2] for filepath in filepaths]
sentence_vectors_cls = np.zeros((len(filepaths), HIDDEN_SIZE))
sentence_vectors_avg = np.zeros((len(filepaths), HIDDEN_SIZE))

for start in range(0, len(filepaths), BATCH_SIZE):
    end = min(start + BATCH_SIZE, len(filepaths))
    print(f"{start}-{end}")

    texts = [read_article_body(path) for path in filepaths[start:end]]
    cls_vectors, avg_vectors = embed_texts(texts)

    sentence_vectors_cls[start:end] = cls_vectors
    sentence_vectors_avg[start:end] = avg_vectors

In [None]:
def plot_compressed_features(vectors, categories, paths):
    """Draw one scatter panel per category with that category highlighted.

    Every panel shows all rows of ``vectors`` in gray and overlays, in red, the
    rows whose path has the category name as one of its path components.

    Args:
        vectors: 2-D array supporting ``vectors[:, 0]`` / ``vectors[:, 1]`` and
            integer-list row indexing; row ``i`` belongs to ``paths[i]``.
        categories: Iterable of category names; one panel is drawn per name.
        paths: Sequence of "/"- or "\\"-separated file paths aligned with the
            rows of ``vectors``.
    """
    categories = list(categories)

    # Three panels per row; add rows as needed so that no category is dropped
    # (nine categories give the original 3x3 grid at 10x8 inches).
    n_cols = 3
    n_rows = max(1, -(-len(categories) // n_cols))  # ceiling division
    fig, axes = plt.subplots(
        ncols=n_cols, nrows=n_rows,
        figsize=(10, 8 * n_rows / 3), squeeze=False
    )
    axes = axes.ravel()

    # Split each path once. Matching on whole components avoids false hits
    # that a substring test on the full path could produce.
    path_parts = [set(path.replace("\\", "/").split("/")) for path in paths]

    for ax, ctg in zip(axes, categories):
        ax.scatter(vectors[:, 0], vectors[:, 1], c="gray", alpha=0.6, s=5)

        indices = [i for i, parts in enumerate(path_parts) if ctg in parts]

        ax.scatter(vectors[indices, 0], vectors[indices, 1], c="red", s=5)
        ax.set_title(ctg)

    # Hide panels left over when the category count is not a multiple of 3.
    for ax in axes[len(categories):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# PCA(主成分分析)

In [None]:
# Project both sets of sentence vectors onto their first two principal components.
# A fixed random_state keeps the result identical across runs in case
# scikit-learn selects a randomized SVD solver for this input size.
PCA_SEED = 42
cls_pca = PCA(n_components=2, random_state=PCA_SEED).fit_transform(sentence_vectors_cls)
avg_pca = PCA(n_components=2, random_state=PCA_SEED).fit_transform(sentence_vectors_avg)

In [None]:
# PCA projection of the pooler-output ([CLS]) vectors, one panel per category.
plot_compressed_features(
    vectors=cls_pca,
    categories=category,
    paths=filepaths,
)

In [None]:
# PCA projection of the mean-pooled token vectors, one panel per category.
plot_compressed_features(
    vectors=avg_pca,
    categories=category,
    paths=filepaths,
)

# t-SNE

In [None]:
# Embed both sets of sentence vectors in 2-D with t-SNE.
# t-SNE is stochastic, so the seed is fixed to make the plots reproducible
# under Restart & Run All.
TSNE_SEED = 42
cls_tsne = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(sentence_vectors_cls)
avg_tsne = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(sentence_vectors_avg)

In [None]:
# t-SNE embedding of the pooler-output ([CLS]) vectors, one panel per category.
plot_compressed_features(
    vectors=cls_tsne,
    categories=category,
    paths=filepaths,
)

In [None]:
# t-SNE embedding of the mean-pooled token vectors, one panel per category.
plot_compressed_features(
    vectors=avg_tsne,
    categories=category,
    paths=filepaths,
)

t-SNEの特徴量抽出の方が、PCAよりもカテゴリごとに分けられているように見える。

t-SNEでは、[CLS]トークンでもトークンの平均でも違いはなさそう。