In [None]:
!pip install mecab-python3
!pip install unidic
!python -m unidic download
!pip install fugashi
!pip install ipadic

In [None]:
!pip install transformers

青空文庫から夏目漱石の
「それから」
「こころ」
「夢十夜」
をダウンロードしてくる

In [None]:
!wget https://www.aozora.gr.jp/cards/000148/files/773_ruby_5968.zip
!unzip -O sjjs /content/773_ruby_5968.zip
!wget https://www.aozora.gr.jp/cards/000148/files/56143_ruby_50824.zip
!unzip -O sjjs  /content/56143_ruby_50824.zip
!wget https://www.aozora.gr.jp/cards/000148/files/799_ruby_6024.zip
!unzip -O sjjs 799_ruby_6024.zip

In [None]:
!apt install nkf

In [None]:
# Convert the three novels to UTF-8 (-w) in place (--overwrite);
# the files are read back as UTF-8 when the corpus is loaded.
!nkf -w --overwrite kokoro.txt sorekara.txt yume_juya.txt

In [None]:
# Concatenate the three novels into a single training corpus file.
!cat kokoro.txt sorekara.txt yume_juya.txt > train.txt

In [None]:
from transformers import TFBertModel
from transformers import BertJapaneseTokenizer


# One checkpoint name for both objects so the tokenizer and the encoder always match.
PRETRAINED_MODEL = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# Pretrained BERT encoder (TensorFlow) and its Japanese tokenizer.
bert = TFBertModel.from_pretrained(PRETRAINED_MODEL)
tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL)

In [None]:
import MeCab
import numpy as np
import tensorflow as tf
import os

In [None]:
# Read the whole corpus as one string with every line break removed.
# NOTE(review): the files are the "ruby" editions; any header/footer text and ruby
# markup they contain is not stripped here -- confirm whether that is intended.
with open('train.txt', 'r', encoding='utf-8') as f:
  text = ''.join(f.read().split('\n'))

# Segment into words: "-Owakati" makes MeCab emit whitespace-separated tokens.
mecab = MeCab.Tagger("-Owakati")
text = mecab.parse(text).split()

# Sorted vocabulary plus lookup tables in both directions.
# The names say "char", but the units are MeCab tokens.
vocab = sorted(set(text))
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# The corpus encoded as an array of vocabulary indices.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [None]:
# Number of tokens in each model input sequence.
seq_length = 128

# Build the training examples and targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Each chunk holds seq_length + 1 tokens so that an input and a target of seq_length
# tokens each can be cut from it; a shorter final chunk is dropped.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(split_input_target)

In [None]:
# Sanity check: decode the first three (input, target) pairs back into text.
for input_example, target_example in dataset.take(3):
    input_words = idx2char[input_example.numpy()]
    target_words = idx2char[target_example.numpy()]
    print(f'Input data: {"".join(input_words)!r}')
    print(f'Target data: {"".join(target_words)!r}')

ラベルのサイズ：（バッチサイズ、文の長さ）

出力のサイズ：（バッチサイズ、文の長さ、ボキャブラリーサイズ）

In [None]:
BATCH_SIZE = 64

# Number of sequences held in the shuffle buffer.
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of re-wrapping `dataset`, so re-running
# this cell does not batch an already batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Model input: a batch of variable-length id sequences.
input_ids = tf.keras.layers.Input(shape=(None, ), dtype='int32', name='input_ids')
inputs = [input_ids]

# BERT is frozen; only the Dense head below is trained.
# NOTE(review): the ids fed in are indices into the MeCab-derived `vocab`, not ids from
# the BERT tokenizer, so they do not line up with the pretrained embedding table (and
# must stay below its size) -- confirm this is intended.
bert.trainable = False
x = bert(inputs)

# First model output; presumably the per-token hidden states, matching the
# (batch, sequence length, vocabulary size) output shape noted above.
out = x[0]

# Per-position logits over the corpus vocabulary (no activation: the loss uses from_logits=True).
Y = tf.keras.layers.Dense(len(vocab))(out)

# Save the weights at the end of every epoch as ./training_checkpoints/ckpt_<epoch>.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

model = tf.keras.Model(inputs=inputs, outputs=Y)
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and unnormalised logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# NOTE(review): a learning rate of 1e-7 moves the freshly initialised head very little
# over 5 epochs -- confirm the value is intentional.
model.compile(loss=loss,
              optimizer=tf.keras.optimizers.Adam(1e-7))

model.fit(dataset,epochs=5, callbacks=[checkpoint_callback])

In [None]:
def generate_text(model, start_string, num_generate=30, temperature=1.0, max_context=512):
  """Generate text by repeatedly sampling the next token from a trained model.

  Args:
    model: callable mapping a (1, seq_len) batch of vocabulary ids to logits of
      shape (1, seq_len, vocab_size).
    start_string: non-empty sequence of seed tokens; each must be a key of `char2idx`.
    num_generate: number of tokens to sample.
    temperature: divisor applied to the logits before sampling. Lower values give
      more predictable text, higher values more surprising text.
    max_context: maximum number of most recent tokens fed to the model per step.
      The default assumes a 512-position encoder -- TODO confirm for the model in use.

  Returns:
    The seed tokens followed by the generated tokens, joined into one string.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if a seed token is not in the vocabulary.
  """
  if len(start_string) == 0:
    raise ValueError('start_string must contain at least one token')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Convert the seed tokens to vocabulary ids (vectorise).
  token_ids = [char2idx[s] for s in start_string]

  # Generated tokens are collected here.
  text_generated = []

  # The model carries no state from one call to the next (so there is nothing to
  # reset), which means the full history must be fed back in at every step. Feeding
  # only the last sampled token, as a stateful RNN loop would, discards the context.
  for _ in range(num_generate):
    # Batch of one, limited to the most recent `max_context` tokens.
    input_eval = tf.expand_dims(token_ids[-max_context:], 0)
    predictions = model(input_eval)

    # Drop the batch dimension: (seq_len, vocab_size).
    predictions = tf.squeeze(predictions, 0)

    # Sample the next token from the categorical distribution at the last position.
    last_logits = predictions[-1:] / temperature
    predicted_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())

    # Extend the context with the sampled token for the next step.
    token_ids.append(predicted_id)
    text_generated.append(idx2char[predicted_id])

  return (''.join(start_string) + ''.join(text_generated))

In [None]:
# Seed text for generation.
text = '私は'
# Split the seed into tokens the same way the training corpus was tokenised.
mecab = MeCab.Tagger("-Owakati")
text = mecab.parse(text).split()
# Last expression of the cell: displays the seed followed by the generated tokens.
generate_text(model, text)

In [None]:
import tensorflow as tf

In [None]:
# NOTE(review): `input_target` is redefined in the next two cells with a different
# label; a single parameterised helper would avoid the shadowing.
def input_target(chunk):
    """Pair a line of text with the one-hot class label [1, 0, 0]."""
    label = tf.constant([1, 0, 0], dtype=tf.float32)
    return chunk, label

# One element per line of kokoro.txt, each paired with its label.
kokoro = tf.data.TextLineDataset('kokoro.txt').map(input_target)

In [None]:
def input_target(chunk):
    """Pair a line of text with the one-hot class label [0, 1, 0]."""
    label = tf.constant([0, 1, 0], dtype=tf.float32)
    return chunk, label

# One element per line of sorekara.txt, each paired with its label.
sorekara = tf.data.TextLineDataset('sorekara.txt').map(input_target)

In [None]:
def input_target(chunk):
    """Pair a line of text with the one-hot class label [0, 0, 1]."""
    label = tf.constant([0, 0, 1], dtype=tf.float32)
    return chunk, label

# One element per line of yume_juya.txt, each paired with its label.
yume_juya = tf.data.TextLineDataset('yume_juya.txt').map(input_target)

In [None]:
# Chain the three labelled datasets end to end: all kokoro lines, then sorekara, then
# yume_juya. The result is ordered by class until it is shuffled.
train_dataset = kokoro.concatenate(sorekara).concatenate(yume_juya)

def tokenize_map_fn(tokenizer):
    """Build a `Dataset.map` function that tokenises text with a pretrained tokenizer.

    Args:
        tokenizer: object whose `encode_plus(text, add_special_tokens=True)` returns a
            mapping containing an "input_ids" entry.

    Returns:
        A function `(text, label) -> (input_ids, label)` where `input_ids` is an int32
        vector and `label` is passed through as float32.
    """
    def _tokenize(text_a, label):
        # Runs eagerly inside tf.py_function, so the tensor's bytes can be read and decoded.
        encoded = tokenizer.encode_plus(
            text_a.numpy().decode('utf-8'),
            add_special_tokens=True,
        )
        return encoded["input_ids"], label

    def _map_fn(text,label):
        out = tf.py_function(_tokenize, inp=[text, label], Tout=(tf.int32, tf.float32))
        token_ids, label_out = out[0], out[1]
        # tf.py_function drops static shape information; restore it so later slicing,
        # padding and Keras see a rank-1 id vector and the label's original shape.
        token_ids.set_shape([None])
        label_out.set_shape(label.shape)
        return (token_ids, label_out)

    return _map_fn


# Tokenise every line into input ids; the label passes through unchanged.
train_dataset = train_dataset.map(tokenize_map_fn(tokenizer))
# Keep at most the first 128 ids of each example.
train_dataset = train_dataset.map(lambda x, y : (x[:128], y))
# Batch 64 examples, padding the ids to length 128; labels have 3 components.
train_dataset = train_dataset.padded_batch(64, padded_shapes=([128], [3]))

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# `train_dataset` is already batched and its examples are ordered novel by novel, so
# shuffling it directly would only reorder batches that each hold a single class.
# Unbatch, shuffle the individual examples, then batch again so classes are mixed.
dataset = train_dataset.unbatch().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Model input: a batch of id sequences.
input_ids = tf.keras.layers.Input(shape=(None, ), dtype='int32', name='input_ids')
inputs = [input_ids]

# BERT is frozen; only the two Dense layers below are trained.
# NOTE(review): no attention mask is passed, so padded positions are not masked out
# -- confirm this is acceptable.
bert.trainable = False
x = bert(inputs)

# Second model output; presumably the pooled per-sequence representation -- TODO confirm.
out = x[1]

# Classification head: hidden layer, then a softmax over the three novels.
fully_connected = tf.keras.layers.Dense(256, activation='relu')(out)
Y = tf.keras.layers.Dense(3, activation='softmax')(fully_connected)

# Separate directory from the language-model cell, which writes
# ./training_checkpoints/ckpt_<epoch>; sharing it would overwrite those files.
checkpoint_dir = './training_checkpoints_classifier'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

model = tf.keras.Model(inputs=inputs, outputs=Y)
def loss(labels, logits):
  """Categorical cross-entropy between one-hot labels and the model's softmax output.

  Despite its name, `logits` holds probabilities here because the last layer applies
  softmax; the loss is therefore called without from_logits.
  """
  return tf.keras.losses.categorical_crossentropy(labels, logits)

# NOTE(review): a learning rate of 1e-7 moves the freshly initialised head very little
# over 5 epochs -- confirm the value is intentional.
model.compile(loss=loss,
              optimizer=tf.keras.optimizers.Adam(1e-7))

model.fit(dataset,epochs=5, callbacks=[checkpoint_callback])

In [None]:

text = '楽しい勉強でした。'

# Encode the sentence on its own, as the training lines were encoded. Passing `text`
# a second time would encode it as a sentence pair with itself, which the classifier
# never saw during training.
encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True
        )
# Add the batch dimension expected by the model.
inputs = tf.expand_dims(encoded["input_ids"],0)
# Class probabilities, ordered as the one-hot labels: [kokoro, sorekara, yume_juya].
res = model.predict_on_batch(inputs)
res