<a href="https://colab.research.google.com/github/s99436q/RNN/blob/master/Novel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Data

In [0]:
#from google.colab import files
#uploaded = files.upload()
!ls

novel2.txt  sample_data


In [0]:
%tensorflow_version 2.x
%load_ext tensorboard
import tensorflow as tf
print(tf.__version__)


2.2.0-rc4


In [0]:
text=open("novel2.txt", encoding="utf-8").read()
print(f"天龍八部小說共有 {len(text)} 中文字")
print(f"包含了 {len(set(text))} 個獨一無二的字")
print()
print(text[100031:100099])

天龍八部小說共有 1266445 中文字
包含了 4401 個獨一無二的字

凶神惡煞’南海鱷神之上，必定是個狠惡可怖之極的人物，那知居然頗有姿色，不由得又向她瞧了幾眼。葉二孃向她嫣然一笑，木婉清全身一顫，只覺她這


# Tokenize Data

In [0]:
import numpy as np
# 初始化一個以字為單位的 Tokenizer
#tokenizer = tf.keras.preprocessing.text.Tokenizer(
#    num_words=None,char_level=True,filters=''
#)
    
# tokenizer 看一遍天龍八部全文，將每個新出現的字加入字典並將中文字轉成對應的數字索引
#tokenizer.fit_on_texts(text)
#text_as_int = tokenizer.texts_to_sequences([text])[0] #return text=[[...]]

words=set(text)
char2index = {u:i for i, u in enumerate(words)}
index2char = np.array(words)

text_as_int = np.array([char2index[c] for c in text])
index2char
text_as_int

array([ 544, 2955, 1713, ..., 3515,  888, 1392])

# Prepare training data


In [0]:
SEQ_LENGTH = 100
BATCH_SIZE = 128 
steps_per_epoch = len(text_as_int) // SEQ_LENGTH

# transfer text_as_int to tensor
characters = tf.data.Dataset.from_tensor_slices(text_as_int)
# cut the novel to small pieces of seqs
sequences = characters.batch(SEQ_LENGTH + 1,drop_remainder=True)

# each seq to input/output pairs
def build_seq_pairs(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

# 1.文本擷取出來的序列套用上面定義的函式，拆成兩個數字序列作為輸入／輸出序列
# 2.再將得到的所有數據隨機打亂順序
# 3.最後再一次拿出 BATCH_SIZE（128）筆數據,作為模型一次訓練步驟的所使用的資料
ds = sequences\
    .map(build_seq_pairs)\
    .shuffle(steps_per_epoch)\
    .batch(BATCH_SIZE, 
           drop_remainder=True)  


# Build the Model


In [0]:
# 超參數
EMBEDDING_DIM = 512
RNN_UNITS = 1024
num_words=len(set(text))
def build_model(EMBEDDING_DIM,RNN_UNITS,num_words,batch_size):
      # 使用 keras 建立一個非常簡單的 LSTM 模型
      model = tf.keras.Sequential()

      # 詞嵌入層: 將每個索引數字對應到一個高維空間的向量
      model.add(
          tf.keras.layers.Embedding(
              input_dim=num_words, 
              output_dim=EMBEDDING_DIM,
              batch_input_shape=[
                  batch_size, None]
      ))

      # LSTM 層:負責將序列數據依序讀入並做處理
      model.add(
          tf.keras.layers.LSTM(
          units=RNN_UNITS, 
          return_sequences=True, 
          stateful=True, 
          recurrent_initializer='glorot_uniform'
      ))

      # 全連接層:負責model每個中文字出現的可能性
      model.add(
          tf.keras.layers.Dense(
              num_words))

      model.summary()
      return model
model=build_model(EMBEDDING_DIM,RNN_UNITS,num_words,BATCH_SIZE)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 512)          2253312   
_________________________________________________________________
lstm (LSTM)                  (128, None, 1024)         6295552   
_________________________________________________________________
dense (Dense)                (128, None, 4401)         4511025   
Total params: 13,059,889
Trainable params: 13,059,889
Non-trainable params: 0
_________________________________________________________________


# Set Model Parameter

In [0]:
# 超參數，決定模型一次要更新的步伐有多大
LEARNING_RATE = 0.001

# 定義模型預測結果跟正確解答之間的差異
# 因為全連接層沒使用 activation func, from_logits= True 
def loss(y_true, y_pred):
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True)

model.compile(
    optimizer=tf.keras.optimizers.Adam(
        learning_rate=LEARNING_RATE), 
    loss=loss
)


# Train Model

In [0]:
import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,save_weights_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3),
]

#!rm -rf ./logs/
!rm -rf ./training_checkpoints
 
EPOCHS=20
history = model.fit(
    ds,# 前面使用 tf.data 建構的資料集
    epochs=EPOCHS, 
    callbacks=[callbacks]
)

Epoch 1/20
Epoch 2/20
 5/97 [>.............................] - ETA: 22s - loss: 5.5114

# Predict

###Retore the lastes Checkpoints

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)

###Build a new Model

In [0]:
p_model=build_model(EMBEDDING_DIM, RNN_UNITS, num_words,batch_size=1)
p_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
p_model.build(tf.TensorShape([1,None]))

###Generate text

In [0]:
def generate_text(p_model,start_string):
  num_generate=150
  #convert string to nums (vectorizing)
  input_val=[char2index[c] for c in start_string]
  input_val=tf.expand_dims(input_val,0)
  print(len(input_val[0]),len(start_string))
  text_generated=[]
  temperature=1.0
  model.reset_states()
  for i in range(num_generate):
    prediction=p_model(input_val)
    prediction=tf.squeeze(prediction,0)
    prediction/=temperature
    
    predict_id=tf.random.categorical(prediction,num_samples=1)[-1,0].numpy()
    input_val=tf.expand_dims([predict_id],0) 
    text_generated+=[index2char[predict_id]]

  return start_string+''.join(text_generated)

In [0]:
content="段譽卻仍是抬起了頭望著她，見那少女雙腳蕩啊蕩的，似乎這麼坐樑上甚是好玩，問道：“是你救我的麼？"

print(generate_text(p_model,start_string=content))

# Measurements & Visualizations

In [0]:
!ls ./logs/fit

In [0]:
%tensorboard --logdir logs/fit/