In [1]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertForPreTraining, modeling
from pyknp import Juman
import os
import json
import numpy as np

### ●環境変数設定

In [2]:
os.environ["pytorch_model"] = "./mecab-PyTorch/pytorch_model.bin"
os.environ["vocab_txt"] = "./mecab-PyTorch/vocab.txt"
os.environ["bert_config"] = "./mecab-PyTorch/bert_config.json"

In [3]:
with open(os.environ["bert_config"], "r") as f:
    jsonData = json.load(f)
    print(json.dumps(jsonData, sort_keys=True, indent=4))

{
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "type_vocab_size": 2,
    "vocab_size": 32005
}


### ●Bert設定

In [4]:
config = modeling.BertConfig(
    attention_probs_dropout_prob=jsonData["attention_probs_dropout_prob"],
    hidden_act=jsonData["hidden_act"],
    hidden_dropout_prob=jsonData["hidden_dropout_prob"],
    hidden_size=jsonData["hidden_size"],
    initializer_range=jsonData["initializer_range"],
    intermediate_size=jsonData["intermediate_size"],
    max_position_embeddings=jsonData["max_position_embeddings"],
    num_attention_heads=jsonData["num_attention_heads"],
    num_hidden_layers=jsonData["num_hidden_layers"],
    type_vocab_size=jsonData["type_vocab_size"],
    vocab_size_or_config_json_file=jsonData["vocab_size"],
)

In [5]:
# mask語予測と隣接文予測をするためのBERTモデル
model = BertForPreTraining(config=config)
model.load_state_dict(torch.load(os.environ['pytorch_model']))

# do_lower_case=False, do_basic_tokenize=False
tokenizer = BertTokenizer(os.environ["vocab_txt"], do_lower_case=False, do_basic_tokenize=False)

# model.eval()

### ●MeCab + NEologでトークン化

In [6]:
import MeCab
class MecabTokenizer():
    def __init__(self):
        self.mecab = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        self.mecab.parse("")

    def tokenize(self, text):
        node = self.mecab.parseToNode(text)
        result = []
        while node:
            result.append(node.surface)
            node = node.next
        return result

## ●マスク語予測
### masked_lm_labels: 
* masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. 
* All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]

In [7]:
def predict_mask(text, tag_bert_tokens, mask_id):
    true_tokens = []
    for i in mask_id:
        true_tokens.append(tag_bert_tokens[i]) 
        tag_bert_tokens[i] = "[MASK]"
    print("# tokens: \n{}\n".format(tag_bert_tokens))

    ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)

    # ベクトルを作成する
    tokens_tensor = torch.tensor(ids).reshape(1, -1)
    
    with torch.no_grad():
        output, _ = model(tokens_tensor)

    # 上位10トークンの確率
    print("# true: \n{}\n".format(true_tokens))
    print("# predicted: \n")
    for i in mask_id:
        logits = output[0][i].sort()[0]
        predicted_ids = output[0][i].sort()[1]

        predicted_mask = [tokenizer.ids_to_tokens[i.item()] for i in output[0][i].argsort()[-15:]][::-1]

        m = nn.Softmax(dim=-1)
        pp = ["{:.2f}%".format(i*100) for i in m(logits).sort()[0][-15:]][::-1]

        print(["{}: {}".format(i, j) for i, j in zip(predicted_mask, pp)])
        print()

In [8]:
text1 = """
包丁とは、調理に使う刃物のことであり、AI、ロボット、焼き鳥など多くの種類がある。
"""
text2 = """
あなたは？
"""

# 文書数
cnt = 1

In [9]:
Mecab_Tokenizer = MecabTokenizer()
if cnt == 1:
    text2 = ""
    tokens = Mecab_Tokenizer.tokenize(text1)
    # print("juman++: {}\n".format(tokens))

    bert_tokens = tokenizer.tokenize(" ".join(tokens))
    # print("BertTokenizer: {}\n".format(bert_tokens))

    tag_bert_tokens = ["[CLS]"] + bert_tokens[:126] + ["[SEP]"]
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

elif cnt == 2:
    tokens_a = Mecab_Tokenizer.tokenize(text1)
    tokens_b = Mecab_Tokenizer.tokenize(text2)
    tokens_a = tokenizer.tokenize(" ".join(tokens_a))
    tokens_b = tokenizer.tokenize(" ".join(tokens_b))
    tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
    tokens_b = tokens_b + ['[SEP]']
    tag_bert_tokens = tokens_a + tokens_b
    
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

tokens: 
['[CLS]', '包丁', 'と', 'は', '、', '調理', 'に', '使う', '刃物', 'の', 'こと', 'で', 'あり', '、', 'AI', '、', 'ロボット', '、', '焼き鳥', 'など', '多く', 'の', '種類', 'が', 'ある', '。', '[SEP]']

[[0, '[CLS]'], [1, '包丁'], [2, 'と'], [3, 'は'], [4, '、'], [5, '調理'], [6, 'に'], [7, '使う'], [8, '刃物'], [9, 'の'], [10, 'こと'], [11, 'で'], [12, 'あり'], [13, '、'], [14, 'AI'], [15, '、'], [16, 'ロボット'], [17, '、'], [18, '焼き鳥'], [19, 'など'], [20, '多く'], [21, 'の'], [22, '種類'], [23, 'が'], [24, 'ある'], [25, '。'], [26, '[SEP]']]


In [10]:
mask_id = [16]
print("●mecab")
predict_mask(text1 + text2, tag_bert_tokens, mask_id)

●mecab
# tokens: 
['[CLS]', '包丁', 'と', 'は', '、', '調理', 'に', '使う', '刃物', 'の', 'こと', 'で', 'あり', '、', 'AI', '、', '[MASK]', '、', '焼き鳥', 'など', '多く', 'の', '種類', 'が', 'ある', '。', '[SEP]']

# true: 
['ロボット']

# predicted: 

['[UNK]: 47.00%', '包丁: 12.39%', '焼き鳥: 2.20%', '鍋: 1.37%', '刃物: 1.09%', '箸: 0.94%', 'フライパン: 0.93%', 'ナイフ: 0.90%', '餃子: 0.88%', 'カッター: 0.85%', '豆腐: 0.80%', '寿司: 0.69%', 'きゅうり: 0.67%', '天ぷら: 0.63%', '揚げ物: 0.55%']



## ●隣接文予測
### next_sentence_label: 
* next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 
* 0 => next sentence is the continuation, 
* 1 => next sentence is a random sentence.

In [11]:
text1 = """
包丁とは、調理に使う刃物のことである。
"""
text2 = """
出刃包丁、刺身包丁、菜切り包丁など、多くの種類がある。
"""

In [12]:
Mecab_Tokenizer = MecabTokenizer()
tokens1 = Mecab_Tokenizer.tokenize(text1)
tokens2 = Mecab_Tokenizer.tokenize(text2)
# print("juman++: {} {}".format(tokens1, tokens2))

bert_tokens1 = tokenizer.tokenize(" ".join(tokens1))
bert_tokens2 = tokenizer.tokenize(" ".join(tokens2))
# print("BertTokenizer: {} {}".format(bert_tokens1, bert_tokens2))

tag_bert_tokens = (
    ["[CLS]"] + bert_tokens1[:126] + ["[SEP]"] + bert_tokens2[:126] + ["[SEP]"]
)

print("text1: {}\ntext2: {}".format(text1, text2))
print("tokens: \n{}\n".format(tag_bert_tokens))
# print("len: {}\n".format(len(tag_bert_tokens)))

ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)
# print("idを振る: {}\n".format(ids))

# ベクトルを作成する
tokens_tensor = torch.tensor(ids).reshape(1, -1)
# print(tokens_tensor)

with torch.no_grad():
    output, label = model(tokens_tensor)
    
m = nn.Softmax(dim=-1)
pp = ["{:.2f}%".format(i*100) for i in m(label[0])]

print("next sentence: {}\nrandom sentence: {}\n".format(pp[0], pp[1]))
print("second-text is random next: {}\n".format(bool(np.argmax(pp))))

text1: 
包丁とは、調理に使う刃物のことである。

text2: 
出刃包丁、刺身包丁、菜切り包丁など、多くの種類がある。

tokens: 
['[CLS]', '包丁', 'と', 'は', '、', '調理', 'に', '使う', '刃物', 'の', 'こと', 'で', 'ある', '。', '[SEP]', '[UNK]', '、', '[UNK]', '、', '[UNK]', 'など', '、', '多く', 'の', '種類', 'が', 'ある', '。', '[SEP]']

next sentence: 20.38%
random sentence: 79.62%

second-text is random next: True

