In [1]:
import sys
import os

sys.path.append("modules")
from keras_bert import load_trained_model_from_checkpoint
from keras.utils import plot_model

import sentencepiece as spm
import codecs
import numpy as np
import copy
import logging
import random

Using TensorFlow backend.


## 環境変数設定

In [2]:
# Register the locations of the pretrained BERT files as process environment
# variables; the following cells read them back through os.environ.
os.environ.update(
    {
        "config_path": "./keras_model/bert_config.json",
        "checkpoint_path": "./keras_model/model.ckpt-1400000",
        "dict_path": "./keras_model/wiki-ja.vocab",
        "wiki_model": "./keras_model/wiki-ja.model",
    }
)

In [3]:
# Read back the file locations stored in the environment.
config_path, checkpoint_path, dict_path = (
    os.environ[key] for key in ("config_path", "checkpoint_path", "dict_path")
)
# maxlen sizes the (1, maxlen) input arrays built by the prediction functions.
# bert_dim is not referenced by any other cell visible in this notebook.
maxlen, bert_dim = 512, 768

## modelのロード

In [4]:
# Build the Keras model from the checkpoint. Later cells call
# model.predict([indices, segments, masks]) and read output [0] for masked-token
# scores and output [1] for the next-sentence scores.
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
)
# Uncomment to inspect the layers:
# model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


## SentencePieceProcessor

In [5]:
# Load the SentencePiece model registered under os.environ["wiki_model"].
# Load(...) is left as the last expression so the cell displays its return value.
wiki_model_path = os.environ["wiki_model"]
sp = spm.SentencePieceProcessor()
sp.Load(wiki_model_path)

True

In [6]:
# Build piece -> id from the vocab file: the first whitespace-separated field of
# each line is the piece, and its id is the number of pieces stored so far.
token_dict = dict()
with codecs.open(dict_path, "r", "utf8") as reader:
    for line in reader:
        fields = line.split()
        token = fields[0]
        token_dict[token] = len(token_dict)

# Inverse mapping (id -> piece), used to turn predicted ids back into text.
token_dict_rev = dict(zip(token_dict.values(), token_dict.keys()))

### マスク単語予測

In [7]:
def predict_masks(tokens, masks_id=None, mask_rate=0.15, ranknum=10):
    """Mask selected tokens and print the model's top candidates for each mask.

    Relies on the notebook globals ``model``, ``sp``, ``token_dict_rev`` and
    ``maxlen``.

    Args:
        tokens: Sequence of piece strings (including special tokens such as
            "[CLS]" / "[SEP]"). The caller's sequence is NOT modified.
        masks_id: Positions in ``tokens`` to replace with "[MASK]". When None,
            ``int(len(tokens) * mask_rate)`` positions are sampled at random
            from index 2 onward.
        mask_rate: Fraction of the tokens to mask when ``masks_id`` is None.
        ranknum: Maximum number of ranked candidates printed per masked position.

    Returns:
        None. All results are written to stdout.
    """
    # Choose the mask positions at random unless the caller fixed them.
    if masks_id is None:
        # Sampling starts at index 2, i.e. the first two positions are never
        # masked. NOTE(review): the final "[SEP]" can still be sampled here --
        # confirm whether that is intended.
        masks_id = sorted(
            random.sample(range(2, len(tokens)), k=int(len(tokens) * mask_rate))
        )
    else:
        masks_id = list(masks_id)

    print("# tokens: \n{}\n".format(tokens))

    # Work on a copy so the caller's list keeps its original tokens; otherwise
    # re-running the calling cell would report "[MASK]" as the true label.
    label = [tokens[i] for i in masks_id]
    masked_tokens = list(tokens)
    for i in masks_id:
        masked_tokens[i] = "[MASK]"
    print("# tokens:\n{}\n".format(masked_tokens))
    print("# true:{}".format(label))

    # Model inputs: piece ids, segment ids (all zero) and a 0/1 flag per masked
    # position. All three are sized by maxlen so they stay consistent.
    indices = np.zeros((1, maxlen), dtype=np.float32)
    segments = np.zeros((1, maxlen), dtype=np.float32)
    masks = np.zeros((1, maxlen), dtype=int)
    for i in masks_id:
        masks[0][i] = 1

    # Convert pieces to ids, falling back to "<unk>" if the lookup fails.
    for t, token in enumerate(masked_tokens):
        try:
            indices[0, t] = sp.piece_to_id(token)
        except Exception:
            logging.warning(f"{token} is unknown.")
            indices[0, t] = sp.piece_to_id("<unk>")
    vector = model.predict([indices, segments, masks])[0]

    # For every masked position, list the highest-scoring vocabulary ids.
    for position, true_token in zip(masks_id, label):
        scores = vector[0][position]
        # argsort is ascending; reverse it and keep the best `ranknum` ids.
        best_ids = np.argsort(scores, axis=-1)[::-1][:ranknum]
        report = "# true: {}\n".format(true_token)
        for rank, token_id in enumerate(best_ids, start=1):
            # Predicted piece and its score as a percentage.
            report += "{}: {} ({:.2f}) ".format(
                rank, token_dict_rev[int(token_id)], scores[token_id] * 100
            )
        report += "\n"
        print(report)

In [8]:
# Sample inputs for masked-word prediction. Each string starts and ends with a
# newline, exactly as a triple-quoted block would.
test_sentence_1 = "\nわたしはコンピュータ関係の仕事をしていますが、あなたは？\n"
test_sentence_2 = "\nわたしは居酒屋で働いています。\n"

# Number of sentences to tokenize in the next cell (1 or 2).
c_num = 1

In [9]:
# Tokenize one or two sample sentences into "[CLS] s1 [SEP] (s2 [SEP])".
# Any other value of c_num leaves `tokens` untouched.
if c_num == 1 or c_num == 2:
    sentences = (
        [test_sentence_1] if c_num == 1 else [test_sentence_1, test_sentence_2]
    )
    print(("text: \n" + "{}" * len(sentences) + "\n").format(*sentences))

    ## tokenize with sentencepiece
    tokens = ["[CLS]"]
    for sentence in sentences:
        tokens += sp.encode_as_pieces(sentence)
        tokens.append("[SEP]")

    # The two-sentence case additionally prints the plain token list first.
    if c_num == 2:
        print("tokens: \n{}\n".format(tokens))
    print("tokens: \n{}\n".format(list(enumerate(tokens))))

text: 

わたしはコンピュータ関係の仕事をしていますが、あなたは？


tokens: 
[(0, '[CLS]'), (1, '▁'), (2, 'わたし'), (3, 'は'), (4, 'コンピュータ'), (5, '関係の'), (6, '仕事を'), (7, 'し'), (8, 'ています'), (9, 'が'), (10, '、'), (11, 'あなた'), (12, 'は'), (13, '?'), (14, '[SEP]')]



In [10]:
print("●SentencePiece")
# Mask a single fixed position; in the token listing printed by the previous
# cell for the default input, index 11 is the piece 'あなた'.
mask_positions = [11]
predict_masks(tokens, masks_id=mask_positions)

●SentencePiece
# tokens: 
['[CLS]', '▁', 'わたし', 'は', 'コンピュータ', '関係の', '仕事を', 'し', 'ています', 'が', '、', 'あなた', 'は', '?', '[SEP]']

# tokens:
['[CLS]', '▁', 'わたし', 'は', 'コンピュータ', '関係の', '仕事を', 'し', 'ています', 'が', '、', '[MASK]', 'は', '?', '[SEP]']

# true:['あなた']
# true: あなた
1: 今 (9.46) 2: これから (8.42) 3: 次 (3.45) 4: どう (2.82) 5: 具体的に (2.19) 6: いつか (2.01) 7: 結局 (1.61) 8: そこに (1.60) 9: そのために (1.49) 10: 意味 (1.47) 



# ●隣接文予測: check whether the two sentences are consecutive

In [11]:
def Next_Sentence_Prediction(test_sentence_1, test_sentence_2):
    """Print the model's next-sentence scores for a pair of sentences.

    The pair is packed as "[CLS] first [SEP] second [SEP]", converted to piece
    ids and fed to the notebook-global ``model``; the second model output is
    printed as the "next sentence" / "random sentence" scores. Relies on the
    notebook globals ``model``, ``sp`` and ``maxlen``.

    Args:
        test_sentence_1: Text of the first sentence.
        test_sentence_2: Text of the candidate following sentence.

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: If the packed pair is longer than ``maxlen`` tokens. No
            truncation is performed.
    """
    print("### first-text:\n{}\n".format(test_sentence_1))
    print("### second-text:\n{}\n".format(test_sentence_2))

    ## pack: "[CLS] first [SEP]" followed by "second [SEP]"
    first_packed_tokens = ["[CLS]"] + list(sp.encode_as_pieces(test_sentence_1)) + ["[SEP]"]
    second_packed_tokens = list(sp.encode_as_pieces(test_sentence_2)) + ["[SEP]"]
    tokens = first_packed_tokens + second_packed_tokens

    print("### tokens:\n{}\n".format(tokens))

    first_len = len(first_packed_tokens)
    second_len = len(second_packed_tokens)
    if first_len + second_len > maxlen:
        # Fail with a clear message instead of an opaque numpy broadcast error.
        raise ValueError(
            "packed input has {} tokens, which exceeds maxlen={}".format(
                first_len + second_len, maxlen
            )
        )

    # Segment ids distinguish the sentences: 0 for the first, 1 for the second.
    segments = np.zeros((1, maxlen), dtype=np.float32)
    segments[0, first_len : first_len + second_len] = 1

    ## ids: falls back to "<unk>" if the lookup fails
    indices = np.zeros((1, maxlen), dtype=np.float32)
    for t, token in enumerate(tokens):
        try:
            indices[0, t] = sp.piece_to_id(token)
        except Exception:
            print(f"{token} is unknown.")
            indices[0, t] = sp.piece_to_id("<unk>")

    ## mask positions: nothing is masked here; sized by maxlen like the other inputs
    masks = np.zeros((1, maxlen), dtype=int)

    ## the second model output ([1]) holds the two next-sentence scores
    print("### 結果:")
    predicts = model.predict([indices, segments, masks])[1][0]
    print("next sentence: {}\nrandom sentence: {}\n".format(predicts[0], predicts[1]))
    # argmax == 1 means the "random sentence" score is the larger one.
    print("●second-text is random next: {}\n".format(bool(np.argmax(predicts))))

In [12]:
# Sentence pair for the next-sentence check. Each string starts and ends with a
# newline, exactly as a triple-quoted block would.
test_sentence_1 = "\n梅雨の話をしましょう？\n"
test_sentence_2 = "\nワールドカップでベスト４に入ります。\n"

In [13]:
# Visual separator and memo header, then score the sentence pair defined above.
separator = "-------------------------------------------------------------------------------------"
print(separator)
print("### memo\n\n")

Next_Sentence_Prediction(test_sentence_1, test_sentence_2)

-------------------------------------------------------------------------------------
### memo


### first-text:

梅雨の話をしましょう？


### second-text:

ワールドカップでベスト４に入ります。


### tokens:
['[CLS]', '▁', '梅', '雨', 'の', '話を', 'し', 'ましょう', '?', '[SEP]', '▁', 'ワールドカップ', 'で', 'ベスト', '4', 'に入り', 'ます', '。', '[SEP]']

### 結果:
next sentence: 0.011785382404923439
random sentence: 0.9882146120071411

●second-text is random next: True

