In [1]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from pyknp import Juman
import os
import json
import numpy as np

### ●Bert設定

In [2]:
# mask語予測と隣接文予測をするためのBERTモデル
model = BertModel.from_pretrained('bert-base-uncased')

# do_lower_case=False, do_basic_tokenize=False
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# model.eval()

## ●マスク語予測
### masked_lm_labels: 
* masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. 
* All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]

In [3]:
def predict_mask(text, tag_bert_tokens, mask_id):
    true_tokens = []
    for i in mask_id:
        true_tokens.append(tag_bert_tokens[i]) 
        tag_bert_tokens[i] = "[MASK]"
    print("# tokens: \n{}\n".format(tag_bert_tokens))

    ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)

    # ベクトルを作成する
    tokens_tensor = torch.tensor(ids).reshape(1, -1)
    
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]
        print(predictions[mask_id])

    # 上位10トークンの確率
    print("# true: \n{}\n".format(true_tokens))
    print("# predicted: \n")
    for i in mask_id:
        scores, predicted_indexes = torch.topk(predictions[mask_id], k=5)
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indexes[0].tolist())
        print("# predicted_tokens: \n{}\n".format(predicted_tokens))

In [4]:
text1 = """
the man went to the store."""
text2 = """
あなたは？
"""

# 文書数
cnt = 1

In [5]:
if cnt == 1:
    text2 = ""
    bert_tokens = tokenizer.tokenize(text1)

    tag_bert_tokens = ["[CLS]"] + bert_tokens[:126] + ["[SEP]"]
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

elif cnt == 2:
    tokens_a = Mecab_Tokenizer.tokenize(text1)
    tokens_b = Mecab_Tokenizer.tokenize(text2)
    tokens_a = tokenizer.tokenize(" ".join(tokens_a))
    tokens_b = tokenizer.tokenize(" ".join(tokens_b))
    tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
    tokens_b = tokens_b + ['[SEP]']
    tag_bert_tokens = tokens_a + tokens_b
    
    print("tokens: \n{}\n".format(tag_bert_tokens))

    print([[i, j] for i, j in zip(range(len(tag_bert_tokens)), tag_bert_tokens)])

tokens: 
['[CLS]', 'the', 'man', 'went', 'to', 'the', 'store', '.', '[SEP]']

[[0, '[CLS]'], [1, 'the'], [2, 'man'], [3, 'went'], [4, 'to'], [5, 'the'], [6, 'store'], [7, '.'], [8, '[SEP]']]


In [6]:
mask_id = [6]
predict_mask(text1 + text2, tag_bert_tokens, mask_id)

# tokens: 
['[CLS]', 'the', 'man', 'went', 'to', 'the', '[MASK]', '.', '[SEP]']

tensor([[-3.8511, -3.7515, -3.8620,  ..., -3.4319, -3.8656, -3.8376]])
# true: 
['store']

# predicted: 

# predicted_tokens: 
['door', 'window', 'bathroom', 'kitchen', 'bed']



## ●隣接文予測
### next_sentence_label: 
* next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 
* 0 => next sentence is the continuation, 
* 1 => next sentence is a random sentence.

In [11]:
text1 = """
包丁とは、調理に使う刃物のことである。
"""
text2 = """
出刃包丁、刺身包丁、菜切り包丁など、多くの種類がある。
"""

In [12]:
Mecab_Tokenizer = MecabTokenizer()
tokens1 = Mecab_Tokenizer.tokenize(text1)
tokens2 = Mecab_Tokenizer.tokenize(text2)
# print("juman++: {} {}".format(tokens1, tokens2))

bert_tokens1 = tokenizer.tokenize(" ".join(tokens1))
bert_tokens2 = tokenizer.tokenize(" ".join(tokens2))
# print("BertTokenizer: {} {}".format(bert_tokens1, bert_tokens2))

tag_bert_tokens = (
    ["[CLS]"] + bert_tokens1[:126] + ["[SEP]"] + bert_tokens2[:126] + ["[SEP]"]
)

print("text1: {}\ntext2: {}".format(text1, text2))
print("tokens: \n{}\n".format(tag_bert_tokens))
# print("len: {}\n".format(len(tag_bert_tokens)))

ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)
# print("idを振る: {}\n".format(ids))

# ベクトルを作成する
tokens_tensor = torch.tensor(ids).reshape(1, -1)
# print(tokens_tensor)

with torch.no_grad():
    output, label = model(tokens_tensor)
    
m = nn.Softmax(dim=-1)
pp = ["{:.2f}%".format(i*100) for i in m(label[0])]

print("next sentence: {}\nrandom sentence: {}\n".format(pp[0], pp[1]))
print("second-text is random next: {}\n".format(bool(np.argmax(pp))))

text1: 
包丁とは、調理に使う刃物のことである。

text2: 
出刃包丁、刺身包丁、菜切り包丁など、多くの種類がある。

tokens: 
['[CLS]', '包丁', 'と', 'は', '、', '調理', 'に', '使う', '刃物', 'の', 'こと', 'で', 'ある', '。', '[SEP]', '[UNK]', '、', '[UNK]', '、', '[UNK]', 'など', '、', '多く', 'の', '種類', 'が', 'ある', '。', '[SEP]']

next sentence: 20.38%
random sentence: 79.62%

second-text is random next: True

