In [82]:
import codecs
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

In [65]:
# Hyper-parameters shared by the cells below.
SEQ_LEN = 64  # used as seq_len for the model loader and max_len for the tokenizer
BATCH_SIZE = 32  # not referenced by the cells visible in this notebook section
OUTPUT_LAYER_NUM = 4  # forwarded as output_layer_num to the model loader
LEARNING_RATE = 1e-5  # learning rate given to the gradient-descent optimizer

# Pretrained Japanese BERT files; every path is the directory prefix plus a file name.
PRETRAINED_PATH = 'bert-master/Japanese_L-12_H-768_A-12_E-30_BPE/'
CONFIG_PATH = '{}bert_config.json'.format(PRETRAINED_PATH)
CHECKPOINT_PATH = '{}bert_model.ckpt'.format(PRETRAINED_PATH)
VOCAB_PATH = '{}vocab.txt'.format(PRETRAINED_PATH)

In [7]:
# Load the pretrained BERT checkpoint as a Keras model; only predict() is
# called on it in the cells visible here.
model = load_trained_model_from_checkpoint(
  CONFIG_PATH,
  CHECKPOINT_PATH,
  seq_len=SEQ_LEN,
  output_layer_num=OUTPUT_LAYER_NUM,
  # NOTE(review): presumably these two flags drop the pre-training heads and
  # freeze the weights -- confirm against the keras-bert documentation.
  training=False,
  trainable=False,
)

In [50]:
# Print the model's symbolic input and output tensors (name, shape, dtype).
print(model.inputs, model.outputs)

[<tf.Tensor 'Input-Token_2:0' shape=(?, 64) dtype=float32>, <tf.Tensor 'Input-Segment_2:0' shape=(?, 64) dtype=float32>] [<tf.Tensor 'Encoder-Output_1/concat:0' shape=(?, 64, 3072) dtype=float32>]


In [125]:
# Model definition: a keras.Model wrapper around an already-loaded BERT model.
class JPNFeatureExtractor(keras.Model):
  """Thin `keras.Model` that forwards its input through a wrapped BERT model.

  Args:
    bert_model: the callable model applied to the input inside `call`.
  """

  def __init__(self, bert_model):
    super().__init__()
    # Keep a reference so `call` can delegate to it.
    self.bert_model = bert_model

  def call(self, x):
    """Return the wrapped model's output for `x`, without further processing."""
    return self.bert_model(x)

In [120]:
# prepare token->idx dictionary
def make_token_dict(vocab_path):
  """Build a token -> index mapping from a one-token-per-line vocabulary file.

  A token's index is its zero-based line number in the file. Using the line
  number (instead of the current dictionary size) keeps every index aligned
  with the vocabulary file even if two lines normalize to the same token; in
  that case the later line's index is kept for the shared key.

  Surrounding whitespace is stripped from each line, except that a line
  holding exactly one space is preserved as the token ' ' regardless of the
  line terminator ('\n', '\r\n', or none on the final line).

  Args:
    vocab_path: path to a UTF-8 encoded vocabulary file.

  Returns:
    dict mapping each token string to its line index.
  """
  token_dict = {}
  # Built-in open() in text mode breaks lines only on newline characters.
  with open(vocab_path, 'r', encoding='utf8') as reader:
    for index, line in enumerate(reader):
      # Drop only the line terminator first so a lone-space token survives.
      raw = line.rstrip('\r\n')
      token = raw if raw == ' ' else raw.strip()
      token_dict[token] = index
  return token_dict

In [121]:
# Build the vocabulary lookup, then spot-check the indices of a few tokens.
token_dict = make_token_dict(VOCAB_PATH)
for probe in ('[CLS]', '##ｏｓ', '好調な'):
  print(token_dict[probe])

2
2451
32005


In [122]:
# Sample Japanese sentences that the following cells tokenize and feed to the model.
texts = ['日本語テスト。', '明日の天気はどうですか？']

In [123]:
# Encode every text into a token-id sequence and a segment-id sequence,
# each limited/padded via max_len=SEQ_LEN.
tokenizer = Tokenizer(token_dict, cased=True)
ids = []
segments = []
for text in texts:
  # Deliberately not named `id`: a global loop variable with that name would
  # shadow the builtin id() for every later cell in the notebook.
  token_ids, segment_ids = tokenizer.encode(text, max_len=SEQ_LEN)
  ids.append(token_ids)
  segments.append(segment_ids)
print(ids, '\n', segments)

[[2, 29, 97, 156, 3003, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 1503, 29, 5, 866, 1482, 9, 5272, 12323, 856, 1566, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] 
 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [126]:
# Wrap the loaded BERT model in the subclassed extractor and compile it.
feature_extractor = JPNFeatureExtractor(model)
feature_extractor.compile(
  optimizer=tf.compat.v1.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE),
  loss='categorical_crossentropy',
  metrics=[],
)

In [None]:
# Run the wrapped extractor on the encoded texts: token ids first, segment ids second.
result = feature_extractor.predict([ids, segments], verbose=True)

In [115]:
# Compile the loaded BERT model directly, then run it on the encoded texts
# (token ids first, segment ids second).
model.compile(
  optimizer=tf.compat.v1.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE),
  loss='categorical_crossentropy',
  metrics=[],
)
result = model.predict([ids, segments], verbose=True)



In [116]:
# Display the shape of the array returned by predict().
result.shape

(2, 64, 3072)

In [None]:
# Build the subclassed model so that summary() has concrete shapes to report.
# The wrapped model is fed two inputs (token ids and segment ids), so build()
# needs one (batch, SEQ_LEN) shape per input. A flat list of ints such as
# [64, 64] is interpreted as a single input shape, not as two inputs.
feature_extractor.build(input_shape=[(None, SEQ_LEN), (None, SEQ_LEN)])
feature_extractor.summary()

In [139]:
# Functional-API wrapper: feed two fresh Input tensors (token ids, segment ids)
# through the loaded BERT model and expose the result as a new Model.
# NOTE(review): "'Node' object has no attribute 'output_masks'" is typical of
# mixing standalone `keras` layers with `tf.keras`; keras_bert presumably needs
# the TF_KERAS=1 environment variable set before it is imported -- confirm.
inputs_ids = keras.Input(shape=(SEQ_LEN, ))
inputs_segments = keras.Input(shape=(SEQ_LEN, ))
# Call the model exactly once: its return value is a tensor, which cannot be
# called again with the inputs.
extracted = model([inputs_ids, inputs_segments])
test_model = keras.Model(inputs=[inputs_ids, inputs_segments], outputs=extracted)

AttributeError: 'Node' object has no attribute 'output_masks'