<a href="https://colab.research.google.com/github/s99436q/RNN/blob/master/novels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Data

### Load data

In [19]:
%tensorflow_version 2.x
%load_ext tensorboard

import tensorflow as tf
print(tf.__version__)

from google.colab import files
#uploaded = files.upload()
# Read the whole novel into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
with open("novel2.txt", encoding="utf-8") as novel_file:
    text = novel_file.read()
# Report total character count and number of distinct characters.
print(f"天龍八部小說共有 {len(text)} 中文字,{len(set(text))} 個獨一無二的字")
# Show a short excerpt as a sanity check that the file decoded correctly.
print(text[100031:100099])

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
2.2.0-rc4
rmdir: failed to remove 'training_checkpoints/': Directory not empty
天龍八部小說共有 1266445 中文字,4401 個獨一無二的字
凶神惡煞’南海鱷神之上，必定是個狠惡可怖之極的人物，那知居然頗有姿色，不由得又向她瞧了幾眼。葉二孃向她嫣然一笑，木婉清全身一顫，只覺她這


### Prepare data


In [0]:
# Character-level tokenizer: no vocabulary cap and no characters filtered out.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None,
    char_level=True,
    filters='',
)
# Build the character vocabulary from the raw novel text.
tokenizer.fit_on_texts(text)

# Lookup tables in both directions between characters and their integer ids.
char2idx=tokenizer.word_index
idx2char=tokenizer.index_word

# texts_to_sequences returns one id list per input text; a single text is
# passed in, so the first (only) element is the encoded novel.
text_as_int = tokenizer.texts_to_sequences([text])[0]
# NOTE: `text` is rebound here from the raw string to a tf.data.Dataset of ids,
# so this cell cannot be re-run without first re-running the loading cell.
text = tf.data.Dataset.from_tensor_slices(text_as_int)

# each seq to input/output pairs
def build_pairs(chunk):
    """Split one sequence into an (input, target) pair.

    The input is `chunk` without its last element and the target is `chunk`
    without its first element, i.e. the target is the input shifted one
    position ahead.
    """
    return chunk[:-1], chunk[1:]

def dataset(batch_size,seq_len,steps_per_epoch):
    """Build the ordered and the shuffled training datasets.

    Reads the notebook-level `text` dataset, cuts it into chunks of
    `seq_len + 1` elements, and maps each chunk through `build_pairs`.

    Args:
        batch_size: number of (input, target) pairs per batch.
        seq_len: length of each input sequence (chunks hold one extra element
            so that the target can be shifted by one).
        steps_per_epoch: used as the shuffle buffer size of the shuffled dataset.

    Returns:
        A tuple `(ordered, shuffled)`; both drop incomplete chunks and batches.
    """
    chunk_len = seq_len + 1
    pairs = text.batch(chunk_len,drop_remainder=True).map(build_pairs)
    ordered = pairs.batch(batch_size,drop_remainder=True)
    shuffled = pairs.shuffle(steps_per_epoch).batch(batch_size,drop_remainder=True)
    return ordered,shuffled

# Set Model


### Build model

In [0]:
def build_model(embedding_dim,lstm_units,vocab_size,batch_size):
    """Build an Embedding -> stateful LSTM -> Dense Sequential model.

    Args:
        embedding_dim: size of each character embedding vector.
        lstm_units: number of units in the LSTM layer.
        vocab_size: embedding input dimension and width of the output layer.
        batch_size: fixed batch size (required because the LSTM is stateful);
            the sequence length is left unspecified.

    Returns:
        The uncompiled model. Its summary is printed as a side effect.
    """
    # Maps each character id to a dense vector.
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # Processes the sequence and emits an output at every timestep.
    recurrent = tf.keras.layers.LSTM(
        units=lstm_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # One score per vocabulary entry; no activation is applied.
    projection = tf.keras.layers.Dense(vocab_size)

    model = tf.keras.Sequential([embedding, recurrent, projection])
    model.summary()
    return model


### Compile model

In [0]:
#loss function and optimizer
def loss(y_true, y_pred):
    """Sparse categorical cross-entropy between integer targets and raw logits.

    `from_logits=True` is set because the model's final Dense layer applies no
    softmax.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(y_true, y_pred, from_logits=True)

def compile_model(model,learning_rate):
    """Compile `model` in place with an Adam optimizer and the notebook's `loss`.

    Args:
        model: the Keras model to compile.
        learning_rate: learning rate handed to the Adam optimizer.
    """
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss=loss)


### Set checkpoints

In [0]:
def set_checkpoints(lr,seq_len,batch_size,embedding_dim,lstm_units):
    """Create the training callbacks and the checkpoint path for one configuration.

    The hyper-parameters are combined into a model id that names both the
    checkpoint file prefix and the TensorBoard log directory.

    Args:
        lr: learning rate.
        seq_len: input sequence length.
        batch_size: training batch size.
        embedding_dim: embedding vector size.
        lstm_units: number of LSTM units.

    Returns:
        A tuple `(checkpoint_filepath, callbacks)` where `callbacks` is the list
        of TensorBoard, ModelCheckpoint and EarlyStopping callbacks.
    """
    model_id=f'{lr}seq{seq_len}batch{batch_size}_embed{embedding_dim}unit{lstm_units}'

    checkpoint_filepath='./training_checkpoints/' + model_id
    log_dir = "logs/fit/" + model_id

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1),
        # Monitor the training loss: the model is fitted without validation
        # data, so 'loss' is the only logged metric. A monitor name that is
        # never logged makes save_best_only skip every save, leaving no
        # weights to load afterwards.
        tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
              save_weights_only=True,save_best_only=True,
              monitor='loss',mode='min'),
        tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3),
    ]

    return checkpoint_filepath,callbacks

### Fit model


In [0]:
def fit_model(x,epochs,callbacks,model=None):
    """Train a model on `x` and return the result of its `fit` call.

    Args:
        x: training data, passed straight through to `fit`.
        epochs: number of epochs to train for.
        callbacks: a single callback, a list/tuple of callbacks, or None.
            A list is passed through flat rather than nested in another list.
        model: the model to train. When omitted, the notebook-level global
            `model` is used, which keeps existing three-argument calls working.

    Returns:
        Whatever `model.fit` returns (for Keras models, the History object).

    Raises:
        NameError: if `model` is omitted and no global `model` exists.
    """
    if model is None:
        # The parameter shadows the global of the same name, so look it up
        # explicitly in the module namespace.
        try:
            model = globals()["model"]
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    if callbacks is None:
        callback_list = None
    elif isinstance(callbacks, (list, tuple)):
        callback_list = list(callbacks)
    else:
        callback_list = [callbacks]

    return model.fit(x=x,epochs=epochs,callbacks=callback_list)

# Train Model

### Set parameters

In [0]:
lr=0.001
seq_len=100
batch_size=128
embedding_dim=512
lstm_units=1024
epochs=3
# Keras Tokenizer ids start at 1 (0 is reserved), so the largest id equals the
# number of distinct characters. The Embedding input dimension and the output
# layer width must be "largest id + 1", otherwise the highest id is out of range.
vocab_size=max(text_as_int, default=0)+1
# Passed to dataset(), where it is used as the shuffle buffer size.
steps_per_epoch=len(text_as_int)//seq_len
data_chuck=0 # 0 selects the non-shuffled dataset, 1 the shuffled one
temperature=0.6
num_generate=150

### Training 

In [5]:
# Build and compile a fresh model for the configured hyper-parameters.
model=build_model(
    embedding_dim=embedding_dim,
    lstm_units=lstm_units,
    vocab_size=vocab_size,
    batch_size=batch_size,
)
compile_model(model=model,learning_rate=lr)

# dataset() returns (non-shuffled, shuffled); data_chuck picks one by index.
data=dataset(batch_size=batch_size,seq_len=seq_len,steps_per_epoch=steps_per_epoch)
checkpoint_filepath,callbacks=set_checkpoints(
    lr=lr,
    seq_len=seq_len,
    batch_size=batch_size,
    embedding_dim=embedding_dim,
    lstm_units=lstm_units,
)
fit_model(x=data[data_chuck],epochs=epochs,callbacks=callbacks)

NameError: ignored

# Predict

### Restore the latest checkpoint

In [0]:
#tf.train.latest_checkpoint(checkpoint_dir)



### Generate text

In [0]:
def generate_text(model,temperature,start_string,num_generate):
  """Generate text by sampling from `model` one character at a time.

  The model is primed with `start_string`; each sampled character id is then
  fed back as the next single-step input. Relies on the notebook-level
  `char2idx` and `idx2char` lookup tables.

  Args:
    model: model called on a (1, length) batch of character ids; its states
      are reset before generation starts.
    temperature: divisor applied to the logits before sampling.
    start_string: seed text; every character must be a key of `char2idx`.
    num_generate: number of sampling steps to run.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    KeyError: if `start_string` contains a character missing from `char2idx`.
  """
  input_val=[char2idx[c] for c in start_string] #convert string to ids (vectorizing)
  input_val=tf.expand_dims(input_val,0)

  text_generated=[]

  model.reset_states()
  for _ in range(num_generate):
    logits=model(input_val)
    # Only the last timestep predicts the next character, so sample from that
    # row alone instead of sampling every timestep and discarding the rest.
    last_logits=logits[0,-1:,:]/temperature

    predict_id=int(tf.random.categorical(last_logits,num_samples=1)[0,0].numpy())
    input_val=tf.expand_dims([predict_id],0)
    # An id without a character mapping (e.g. the reserved id 0) can still be
    # sampled; emit nothing for it rather than failing with a KeyError.
    text_generated.append(idx2char.get(predict_id,''))

  return start_string+''.join(text_generated)

### Build a new model

In [0]:
# Rebuild the same architecture with a batch size of 1 for text generation,
# then restore the weights saved during training.
model=build_model(
    embedding_dim=embedding_dim,
    lstm_units=lstm_units,
    vocab_size=vocab_size,
    batch_size=1,
)
model.load_weights(checkpoint_filepath)
model.build(input_shape=tf.TensorShape([1,None]))

### Write Novel

In [0]:
# Seed passage that the model continues.
start_string="段譽卻仍是抬起了頭望著她，見那少女雙腳蕩啊蕩的，似乎這麼坐樑上甚是好玩，問道：“是你救我的麼？"

print(
    generate_text(
        model=model,
        temperature=temperature,
        start_string=start_string,
        num_generate=num_generate,
    )
)

# Measurements & Visualizations

In [0]:
!ls ./logs/fit

In [0]:
%tensorboard --logdir logs/fit/