<a href="https://colab.research.google.com/github/s99436q/RNN/blob/master/novels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Data

###Load data

In [0]:
from google.colab import drive
import os
%tensorflow_version 2.x
%load_ext tensorboard
import tensorflow as tf

drive.mount('/content/gdrive/')
os.chdir("/content/gdrive/My Drive/Colab Notebooks")
text=open("novel2.txt", encoding="utf-8").read()


print()
print('Current drive:',os.getcwd())
print('Tensoeflow version:',tf.__version__)
print(f"天龍八部小說共有 {len(text)} 中文字,{len(set(text))} 個獨一無二的字")
#print(text[10331:10555])



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).

Current drive: /content/gdrive/My Drive/Colab Notebooks
Tensoeflow version: 2.2.0
天龍八部小說共有 1266445 中文字,4401 個獨一無二的字


### Prepare data


In [0]:


#intial tokenizor 
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=None,char_level=True,filters='')
#read the text
tokenizer.fit_on_texts(text)
#transfer text to int
text_as_int = tokenizer.texts_to_sequences([text])[0] #return text=[[...]]
#transfer text_as_int to tensor
text = tf.data.Dataset.from_tensor_slices(text_as_int)
#make mappinf between char and index
idx2char=tokenizer.index_word
char2idx=tokenizer.word_index
len_text=len(text_as_int)
vocab_size=len(set(text_as_int))

# each seq to input/output pairs
def build_pairs(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

def dataset(batch_size,seq_len,steps_per_epoch):
    #divide the novel text to small pieces of seqs
    seqs = text.batch(seq_len + 1,drop_remainder=True)
    #generate input/output pairs for each seq, shffle them, batch them
    
    ds=seqs.map(build_pairs).batch(batch_size,drop_remainder=True)
    
    return ds



In [0]:
idx2char


{1: '，',
 2: ' ',
 3: '。',
 4: '不',
 5: '\n',
 6: '一',
 7: '的',
 8: '“',
 9: '”',
 10: '是',
 11: '道',
 12: '了',
 13: '：',
 14: '人',
 15: '我',
 16: '你',
 17: '這',
 18: '他',
 19: '大',
 20: '來',
 21: '之',
 22: '說',
 23: '中',
 24: '？',
 25: '在',
 26: '得',
 27: '下',
 28: '上',
 29: '子',
 30: '有',
 31: '那',
 32: '到',
 33: '段',
 34: '也',
 35: '手',
 36: '去',
 37: '…',
 38: '便',
 39: '出',
 40: '心',
 41: '麼',
 42: '見',
 43: '個',
 44: '聲',
 45: '自',
 46: '身',
 47: '只',
 48: '然',
 49: '過',
 50: '無',
 51: '好',
 52: '時',
 53: '要',
 54: '譽',
 55: '又',
 56: '著',
 57: '頭',
 58: '她',
 59: '老',
 60: '！',
 61: '、',
 62: '峰',
 63: '想',
 64: '知',
 65: '聽',
 66: '向',
 67: '如',
 68: '可',
 69: '小',
 70: '將',
 71: '為',
 72: '已',
 73: '當',
 74: '阿',
 75: '師',
 76: '此',
 77: '卻',
 78: '起',
 79: '什',
 80: '和',
 81: '們',
 82: '功',
 83: '後',
 84: '兩',
 85: '而',
 86: '但',
 87: '神',
 88: '都',
 89: '生',
 90: '叫',
 91: '容',
 92: '王',
 93: '天',
 94: '笑',
 95: '正',
 96: '三',
 97: '蕭',
 98: '以',
 99: '十',
 100: '氣',
 101: '

# Set Model


### Build model

In [0]:
def build_model(embedding_dim,lstm_units,vocab_size,batch_size):
      #Make a LSTM model
      model = tf.keras.Sequential()
      #Word embedding: map a chinese char to high dimension vectors
      model.add(
        tf.keras.layers.Embedding(
          input_dim=vocab_size, 
          output_dim=embedding_dim,
          batch_input_shape=[batch_size, None]
        )
      )

      #LSTM: process the sequence of data
      model.add(
        tf.keras.layers.LSTM(
          units=lstm_units, 
          return_sequences=True, 
          stateful=True, 
          recurrent_initializer='glorot_uniform'
        )
      )

      #Dense layer: predict the probability of each chinese char
      model.add(
        tf.keras.layers.Dense(vocab_size)
      )

      #model.summary()
      return model


### Compile and fit model

In [0]:
#loss function and optimizer
def loss(y_true, y_pred):
    return tf.keras.losses.sparse_categorical_crossentropy(
          y_true, y_pred, from_logits=True
    ) #becase our model return logits, so set from_logits flag

def compile_model(model,learning_rate):
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), 
        loss=loss
    )
def fit_model(model,x,epochs,callbacks): model.fit(x=x,epochs=epochs,callbacks=[callbacks])

###Checkpoints

In [0]:
def set_modelID(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs):  
    return f'{lr}seq{seq_len}batch{batch_size}_embed{embedding_dim}unit{lstm_units}_epoch{epochs}'

def path_to_checkpoint(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs):
    model_id=set_modelID(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs)
    checkpoint="./train/"+model_id+"/checkpoint"
    print (checkpoint)
    return checkpoint

def set_callbacks(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs):
    log_dir ="./logs/"+set_modelID(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs)
    ckpt=path_to_checkpoint(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs)

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
        tf.keras.callbacks.ModelCheckpoint(filepath=ckpt,save_weights_only=True
                                           ,mode='min',monitor='loss'
                                           ,save_best_only=True,verbose=1),
      
        tf.keras.callbacks.EarlyStopping(monitor='loss', patience=8),
    ]

    return callbacks



# Train Model


In [0]:
def call_model(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs):
    steps_per_epoch=len_text//seq_len

    model=build_model(embedding_dim,lstm_units,vocab_size,batch_size)
    compile_model(model,lr)

    callbacks=set_callbacks(lr, seq_len, batch_size,embedding_dim,lstm_units,epochs)
    fit_model(model,ds,epochs,callbacks)
    
   

# Predict

In [0]:
def call_predict(embedding_dim,lstm_units,checkpoint_path,temperature,num_generate,start_string):
    
    model=build_model(embedding_dim,lstm_units,vocab_size,batch_size=1)
    model.load_weights(checkpoint_path)
    model.build(tf.TensorShape([1,None]))
    generate_text(model,temperature,start_string,num_generate)
    

In [0]:
def generate_text(model,temperature,start_string,num_generate):
  #convert string to nums (vectorizing)
  input_val=[char2idx[c] for c in start_string] 
  input_val=tf.expand_dims(input_val,0)
  text_generated=[]
  
  model.reset_states()
  for i in range(num_generate):
    predict=model(input_val)
    predict=tf.squeeze(predict,0)
    predict/=temperature
    predict_id=tf.random.categorical(predict,num_samples=1)[-1,0].numpy()
    input_val=tf.expand_dims([predict_id],0) 
    text_generated+=[idx2char[predict_id]]
  print(start_string+''.join(text_generated))

# Run

In [0]:
content='虛竹又說起已將丁春秋交給了少林寺戒律院看管，每年端午和重陽兩節，少林寺僧給他服食靈鷲宮的藥丸，以解他生死符時發生時的苦楚\
，他生死懸於人手，料來不敢為非作歹。蕭峰拊掌大笑，說道：“二弟，你為武林中除去一個大害。這丁春秋在佛法陶治之下，將來能逐步化去他的戾氣，\
亦未可知。”虛竹愀然不樂，說道：“我想在少林寺出家，師祖、師父他們卻趕了我出來。這丁春秋傷天害理，作惡多端，卻能在少林寺清修，怎地我和他二人\
苦樂的業報如此不同？”蕭峰微微一笑，說道：“二弟，你羨慕丁老怪，丁老怪可更加千倍萬倍的羨慕你了。你身為靈鷲宮主人，統率三十六洞洞主、七十二島島主，'

#content=open("content.txt", encoding="utf-8").read()
len(content)

271

In [0]:
#Auto
#lr,seq_len,batch_size,embedding_dim,lstm_units,epochs,temperature,num_generate
paras=[0.001,100,128,512,1024,30]
alter=[0.002,10,256,256,512,60]
for i in range(0):
    parameters=paras[:]
    parameters[i]=alter[i]
    run(*parameters,0.6,160)
  

In [0]:
#Manually
def run(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs,temperature,num_generate):
    
    ckpt=path_to_checkpoint(lr, seq_len, batch_size, embedding_dim, lstm_units, epochs)
   
    #call_model(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs)
   
    call_predict(embedding_dim,lstm_units,ckpt,temperature,num_generate,content)



def keepRun(lr,seq_len,batch_size,embedding_dim,lstm_units,epochs,temperature,num_generate):
    ckpt=path_to_checkpoint(0.001, seq_len, batch_size, embedding_dim, lstm_units, 400)
    model=build_model(embedding_dim,lstm_units,vocab_size,batch_size)
    compile_model(model,lr)
    model.load_weights(ckpt)
    ds=dataset(batch_size,seq_len,steps_per_epoch=len_text//seq_len)
    callbacks=set_callbacks(lr, seq_len, batch_size,embedding_dim,lstm_units,epochs)
    fit_model(model,ds,epochs-400,callbacks)




#keepRun(0.001,1000,128,512,1024,500,0.6,160)
#print(len(content))
run(0.001,1000,128,512,1024,500,0.6,250)

./train/0.001seq1000batch128_embed512unit1024_epoch500/checkpoint
虛竹又說起已將丁春秋交給了少林寺戒律院看管，每年端午和重陽兩節，少林寺僧給他服食靈鷲宮的藥丸，以解他生死符時發生時的苦楚，他生死懸於人手，料來不敢為非作歹。蕭峰拊掌大笑，說道：“二弟，你為武林中除去一個大害。這丁春秋在佛法陶治之下，將來能逐步化去他的戾氣，亦未可知。”虛竹愀然不樂，說道：“我想在少林寺出家，師祖、師父他們卻趕了我出來。這丁春秋傷天害理，作惡多端，卻能在少林寺清修，怎地我和他二人苦樂的業報如此不同？”蕭峰微微一笑，說道：“二弟，你羨慕丁老怪，丁老怪可更加千倍萬倍的羨慕你了。你身為靈鷲宮主人，統率三十六洞洞主、七十二島島主，七十二島，成名金器，哪裡還有什麼打人？”

    那老僧哈哈大笑，說道：“啟稟主公，請移方禮，自是來領教。”緣根都道：“此人怎樣？”

    那老僧道：“是，是。只是那條腿是不是的。”說著左手一揮，在他膝頭之上，突然間一股涼風撲向他後心，正是李秋水。

    公冶乾道：“我……我在你身上，將這枚寶石指環給我宰了。”

    眾人都是一驚，但見一個老者，都是一隻大毒蛇)，一個踉蹌之，當下也有奔行。

    段譽叫道：“喂，你別玩！”

    那女郎聽了這句話，口中卻聽見了，還道是個什麼


# Visaualization and Measurement


In [0]:
%tensorboard --logdir logs/

In [0]:
#!pip install tensorflowjs
#!tensorflowjs_converter --input_format keras \
#                       ./my_model.h5 \
#                       ./tfjs_target_dir
#%%javascript
#import * as tf from '@tensorflow/tfjs';
#const model = await tf.loadLayersModel('./tfjs_target_dir/model.json');