In [1]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertForPreTraining, modeling
from pyknp import Juman
import os
import json
import numpy as np

### ●環境変数設定

In [2]:
os.environ["pytorch_model"] = "./Japanese_L-12_H-768_A-12_E-30_BPE/pytorch_model.bin"
os.environ["vocab_txt"] = "./Japanese_L-12_H-768_A-12_E-30_BPE/vocab.txt"
os.environ["bert_config"] = "./Japanese_L-12_H-768_A-12_E-30_BPE/bert_config.json"

In [3]:
with open(os.environ["bert_config"], "r") as f:
    jsonData = json.load(f)
    print(json.dumps(jsonData, sort_keys=True, indent=4))

{
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "type_vocab_size": 2,
    "vocab_size": 32006
}


### ●Bert設定

In [4]:
config = modeling.BertConfig(
    attention_probs_dropout_prob=jsonData["attention_probs_dropout_prob"],
    hidden_act=jsonData["hidden_act"],
    hidden_dropout_prob=jsonData["hidden_dropout_prob"],
    hidden_size=jsonData["hidden_size"],
    initializer_range=jsonData["initializer_range"],
    intermediate_size=jsonData["intermediate_size"],
    max_position_embeddings=jsonData["max_position_embeddings"],
    num_attention_heads=jsonData["num_attention_heads"],
    num_hidden_layers=jsonData["num_hidden_layers"],
    type_vocab_size=jsonData["type_vocab_size"],
    vocab_size_or_config_json_file=jsonData["vocab_size"],
)

In [5]:
# mask語予測と隣接文予測をするためのBERTモデル
model = BertForPreTraining(config=config)
model.load_state_dict(torch.load(os.environ['pytorch_model']))

# do_lower_case=False, do_basic_tokenize=False
tokenizer = BertTokenizer(os.environ["vocab_txt"], do_lower_case=False, do_basic_tokenize=False)

# model.eval()

### ●juman++でトークン化

In [6]:
class JumanTokenizer():
    def __init__(self):
        self.juman = Juman()

    def tokenize(self, text):
        result = self.juman.analysis(text)
        return [mrph.midasi for mrph in result.mrph_list()]

## ●マスク語予測
### masked_lm_labels: 
* masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. 
* All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]

In [7]:
def predict_mask(text, tag_bert_tokens, mask_id):
    true_tokens = []
    for i in mask_id:
        true_tokens.append(tag_bert_tokens[i]) 
        tag_bert_tokens[i] = "[MASK]"
    print("# tokens: \n{}\n".format(tag_bert_tokens))

    ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)

    # ベクトルを作成する
    tokens_tensor = torch.tensor(ids).reshape(1, -1)
    
    with torch.no_grad():
        output, _ = model(tokens_tensor)

    # 上位10トークンの確率
    print("# true: \n{}\n".format(true_tokens))
    print("# predicted: \n")
    for i in mask_id:
        logits = output[0][i].sort()[0]
        predicted_ids = output[0][i].sort()[1]

        predicted_mask = [tokenizer.ids_to_tokens[i.item()] for i in output[0][i].argsort()[-15:]][::-1]

        m = nn.Softmax(dim=-1)
        pp = ["{:.2f}%".format(i*100) for i in m(logits).sort()[0][-15:]][::-1]

        print(["{}: {}".format(i, j) for i, j in zip(predicted_mask, pp)])
        print()

In [8]:
text1 = """
わたしはコンピュータ関係の仕事をしていますが、あなたは？
"""
text2 = """
わたしは居酒屋で働いています。
"""

# 文章数
cnt = 1

In [9]:
juman_tokenizer = JumanTokenizer()
if cnt == 1:
    text2 = ""
    tokens = juman_tokenizer.tokenize(text1)
    # print("juman++: {}\n".format(tokens))

    bert_tokens = tokenizer.tokenize(" ".join(tokens))
    # print("BertTokenizer: {}\n".format(bert_tokens))

    tag_bert_tokens = ["[CLS]"] + bert_tokens[:126] + ["[SEP]"]
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

elif cnt == 2:
    tokens1 = juman_tokenizer.tokenize(text1)
    tokens2 = juman_tokenizer.tokenize(text2)
    # print("juman++: {} {}".format(tokens1, tokens2))

    bert_tokens1 = tokenizer.tokenize(" ".join(tokens1))
    bert_tokens2 = tokenizer.tokenize(" ".join(tokens2))
    # print("BertTokenizer: {} {}".format(bert_tokens1, bert_tokens2))

    tag_bert_tokens = (
        ["[CLS]"] + bert_tokens1[:126] + ["[SEP]"] + bert_tokens2[:126] + ["[SEP]"]
    )
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

Analysis is done ignoring "\n".


tokens: 
['[CLS]', 'わたし', 'は', 'コンピュータ', '関係', 'の', '仕事', 'を', 'して', 'い', 'ます', 'が', '、', 'あなた', 'は', '？', '[SEP]']

[[0, '[CLS]'], [1, 'わたし'], [2, 'は'], [3, 'コンピュータ'], [4, '関係'], [5, 'の'], [6, '仕事'], [7, 'を'], [8, 'して'], [9, 'い'], [10, 'ます'], [11, 'が'], [12, '、'], [13, 'あなた'], [14, 'は'], [15, '？'], [16, '[SEP]']]


In [10]:
mask_id = [13]
print("●juman++BPE")
predict_mask(text1 + text2, tag_bert_tokens, mask_id)

●juman++BPE
# tokens: 
['[CLS]', 'わたし', 'は', 'コンピュータ', '関係', 'の', '仕事', 'を', 'して', 'い', 'ます', 'が', '、', '[MASK]', 'は', '？', '[SEP]']

# true: 
['あなた']

# predicted: 

['それ: 19.58%', '私: 11.35%', 'わたし: 8.60%', 'これ: 4.91%', '彼女: 3.32%', 'あなた: 3.21%', '仕事: 2.87%', '名前: 1.50%', '理由: 1.16%', '答え: 1.15%', '彼: 1.06%', '僕: 0.99%', '詳細: 0.85%', '中身: 0.79%', '結果: 0.76%']



## ●隣接文予測
### next_sentence_label: 
* next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 
* 0 => next sentence is the continuation, 
* 1 => next sentence is a random sentence.

In [11]:
text1 = """
梅雨の話をしましょう？
"""
text2 = """
ワールドカップでベスト４に入ります。
"""

In [12]:
juman_tokenizer = JumanTokenizer()
tokens1 = juman_tokenizer.tokenize(text1)
tokens2 = juman_tokenizer.tokenize(text2)
# print("juman++: {} {}".format(tokens1, tokens2))

bert_tokens1 = tokenizer.tokenize(" ".join(tokens1))
bert_tokens2 = tokenizer.tokenize(" ".join(tokens2))
# print("BertTokenizer: {} {}".format(bert_tokens1, bert_tokens2))

tag_bert_tokens = (
    ["[CLS]"] + bert_tokens1[:126] + ["[SEP]"] + bert_tokens2[:126] + ["[SEP]"]
)

print("text1: {}\ntext2: {}".format(text1, text2))
print("tokens: \n{}\n".format(tag_bert_tokens))
# print("len: {}\n".format(len(tag_bert_tokens)))

ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)
# print("idを振る: {}\n".format(ids))

# ベクトルを作成する
tokens_tensor = torch.tensor(ids).reshape(1, -1)
# print(tokens_tensor)

with torch.no_grad():
    output, label = model(tokens_tensor)
    
m = nn.Softmax(dim=-1)
pp = ["{:.2f}%".format(i*100) for i in m(label[0])]

print("next sentence: {}\nrandom sentence: {}\n".format(pp[0], pp[1]))
print("second-text is random next: {}\n".format(bool(np.argmax(pp))))

Analysis is done ignoring "\n".


text1: 
梅雨の話をしましょう？

text2: 
ワールドカップでベスト４に入ります。

tokens: 
['[CLS]', '梅雨', 'の', '話', 'を', 'し', 'ましょう', '？', '[SEP]', 'ワールドカップ', 'で', 'ベスト', '４', 'に', '入り', 'ます', '。', '[SEP]']

next sentence: 84.28%
random sentence: 15.72%

second-text is random next: False



Analysis is done ignoring "\n".
