In [1]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Input, Masking
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.sequence import pad_sequences

from module.conf import PROJECT_DIR

import tensorflow as tf
import numpy as np
import random
import sys
import io

In [None]:
def build_data(text, Tx = 40, stride = 3):
    """
    Slice the corpus into fixed-length windows, each paired with the character that follows it.

    Arguments:
    text -- string, the corpus to scan
    Tx -- sequence length, number of characters in one training example
    stride -- number of characters the window advances between consecutive examples

    Returns:
    X -- list of training examples (strings of length Tx)
    Y -- list of training labels (the single character right after each window)
    """

    # Every start offset leaves room for a full window plus its label character.
    window_starts = range(0, len(text) - Tx, stride)

    X = [text[start: start + Tx] for start in window_starts]
    Y = [text[start + Tx] for start in window_starts]

    print('number of training examples:', len(X))

    return X, Y

In [None]:
def vectorization(X, Y, n_x, char_indices, Tx = 40):
    """
    One-hot encode the training windows and their labels as boolean arrays.

    Arguments:
    X -- list of training examples, each a sequence of characters
    Y -- list of training labels, one character per example
    n_x -- integer, size of the one-hot axis
    char_indices -- dictionary mapping each character to its one-hot position
    Tx -- integer, sequence length

    Returns:
    x -- boolean array of shape (m, Tx, n_x)
    y -- boolean array of shape (m, n_x)
    """

    n_examples = len(X)
    x = np.zeros((n_examples, Tx, n_x), dtype=np.bool_)
    y = np.zeros((n_examples, n_x), dtype=np.bool_)

    for row in range(n_examples):
        # Mark the character seen at each time-step of this example.
        for step, symbol in enumerate(X[row]):
            x[row, step, char_indices[symbol]] = True
        # Mark the character that follows the window.
        y[row, char_indices[Y[row]]] = True

    return x, y

In [None]:
def sample(preds, temperature=1.0):
    """
    Sample an index from a probability array after temperature re-scaling.

    Arguments:
    preds -- 1-D array-like of probabilities
    temperature -- divisor applied to the log-probabilities before re-normalising;
                   values below 1 sharpen the distribution, values above 1 flatten it

    Returns:
    out -- integer index drawn from the re-scaled distribution, in range(len(preds))
    """
    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is expected for zero-probability entries (exp maps it back to 0),
    # so the divide-by-zero warning is silenced for this step only.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but keeps
    # exp() from underflowing every entry when the temperature is small.
    exp_preds = np.exp(logits - np.max(logits))
    probs = exp_preds / np.sum(exp_preds)
    # A single multinomial trial yields a one-hot vector; its hot position is the sample.
    # The index range comes from preds itself rather than from a module-level vocabulary.
    one_hot = np.random.multinomial(1, probs, 1)
    return int(np.argmax(one_hot))
    
def on_epoch_end(epoch, logs=None):
    """
    No-op end-of-epoch hook, kept so it can be wired into a LambdaCallback.

    The per-epoch text generation that was previously sketched here is
    implemented by generate_output(), which is run on demand instead of
    after every epoch.

    Arguments:
    epoch -- index of the epoch that just finished (unused)
    logs -- metrics reported for the epoch (unused)

    Returns:
    None
    """
    return None

In [None]:
def generate_output():
    """
    Prompt for the opening of a poem and stream 400 generated characters to stdout.

    Relies on the module-level names Tx, chars, char_indices, indices_char and
    model being defined by the data/model loading cell, and on sample().

    Returns:
    None -- the poem is written directly to sys.stdout
    """
    usr_input = input("Write the beginning of your poem, the Shakespeare machine will complete it. Your input is: ")
    # Left-pad the prompt with '0' up to Tx characters, then keep only the last
    # Tx characters so an over-long prompt cannot index past the Tx time-steps
    # of x_pred.
    sentence = ('{0:0>' + str(Tx) + '}').format(usr_input).lower()[-Tx:]

    # Report the visible GPUs before the poem starts so the line does not land
    # in the middle of the generated text.
    print(f"device:{tf.config.list_physical_devices('GPU')}")

    sys.stdout.write("\n\nHere is your poem: \n\n")
    sys.stdout.write(usr_input)
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        for _ in range(400):
            x_pred = np.zeros((1, Tx, len(chars)))

            for t, char in enumerate(sentence):
                # '0' is the padding symbol; characters missing from the
                # corpus vocabulary are likewise left as all-zero time-steps
                # instead of raising KeyError on user input.
                index = char_indices.get(char)
                if char != '0' and index is not None:
                    x_pred[0, t, index] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, temperature = 1.0)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()

In [None]:
print("Loading text data...")
# Read the whole corpus lower-cased; the context manager closes the file handle.
with io.open('./shakespeare.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
#print('corpus length:', len(text))

Tx = 40
# Character vocabulary and the two lookup tables between characters and indices.
chars = sorted(list(set(text)))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}
#print('number of unique characters in the corpus:', len(chars))

print("Creating training set...")
X, Y = build_data(text, Tx, stride = 3)
print("Vectorizing training set...")
# Tx is passed explicitly so the arrays stay in step with build_data if Tx is
# changed above, instead of relying on vectorization's default of 40.
x, y = vectorization(X, Y, n_x = len(chars), char_indices = char_indices, Tx = Tx)
print("Loading model...")
model = load_model('./models/model_shakespeare_kiank_350_epoch.h5')

In [None]:
# Train for one more epoch inside the current distribution strategy's scope,
# with on_epoch_end attached as the end-of-epoch hook.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
strategy = tf.distribute.get_strategy()
with strategy.scope():
    # Show the GPUs TensorFlow can see before fitting.
    print(tf.config.list_physical_devices('GPU'))
    model.fit(
        x,
        y,
        batch_size=128,
        epochs=1,
        callbacks=[print_callback],
    )

In [None]:
# Generate a poem from a typed prompt using the model currently bound to `model`.
# Run this cell again to try different inputs without having to re-train the model.
generate_output()

In [None]:
# Load the saved Keras model from the models directory under PROJECT_DIR for inspection.
model_sample: tf.keras.Model = load_model(PROJECT_DIR + '/models/model_shakespeare_kiank_350_epoch.h5')
# Print the layer-by-layer summary of the loaded model.
model_sample.summary()
# Last expression of the cell, so its return value is what the cell displays.
# NOTE(review): presumably the input signature the model was saved with — confirm.
model_sample.save_spec()

In [None]:
# List the GPUs TensorFlow can see, from inside the current distribution strategy's scope.
strategy = tf.distribute.get_strategy()
with strategy.scope():
    print(tf.config.list_physical_devices('GPU'))

In [67]:
# Raw corpus file, read by pre_data().
org_file = PROJECT_DIR + "/data/lstm/truyen_kieu_data.txt"
# Cleaned corpus file, written by pre_data() and read back by load_data().
pre_file = PROJECT_DIR + "/data/lstm/truyen_kieu_data_pre.txt"

def pre_data(src_path=None, dst_path=None) -> None:
    """
    Clean the raw corpus into one lower-cased, letters-only line per input line.

    Blank input lines are skipped. Within each remaining line every
    whitespace-separated token is reduced to its alphabetic characters; tokens
    that become empty (digits, punctuation) are dropped so the output never
    contains doubled spaces. Both files are handled as UTF-8, matching how the
    other corpus in this notebook is read.

    Arguments:
    src_path -- path of the raw text file; defaults to the module-level org_file
    dst_path -- path of the cleaned file to (over)write; defaults to the module-level pre_file

    Returns:
    None
    """
    source = org_file if src_path is None else src_path
    target = pre_file if dst_path is None else dst_path

    with open(file=source, mode="rt", encoding="utf-8") as i_f, \
            open(file=target, mode="wt", encoding="utf-8") as o_f:
        for line in i_f:
            if not line.strip():
                continue
            # Keep only the letters of each token, then discard emptied tokens.
            letters_only = ("".join(filter(str.isalpha, word)) for word in line.split())
            cleaned = " ".join(word for word in letters_only if word).lower()
            o_f.write(f"{cleaned}\n")
    return

def load_data(path=None) -> tuple:
    """
    Read the cleaned corpus and return its word counts and its lines joined in pairs.

    Blank lines are ignored. Consecutive non-blank lines are joined two at a
    time with " | ". Whatever is still pending when the file ends is emitted as
    well, so the final pair is not lost; if the file has an odd number of
    non-blank lines the last entry is a single line without a separator.

    Arguments:
    path -- path of the cleaned text file; defaults to the module-level pre_file

    Returns:
    rs -- dict mapping each whitespace-separated word to its number of occurrences
    ls -- list of strings, each holding two consecutive lines joined by " | "
    """
    source = pre_file if path is None else path
    rs: dict = {}
    ls: list = []
    pending: list = []

    with open(file=source, mode="rt", encoding="utf-8") as i_f:
        for raw_line in i_f:
            line = raw_line.strip()
            if not line:
                continue
            pending.append(line)
            if len(pending) == 2:
                ls.append(" | ".join(pending))
                pending = []
            for word in line.split():
                rs[word] = rs.get(word, 0) + 1

    # Flush an unpaired trailing line instead of silently dropping it.
    if pending:
        ls.append(" | ".join(pending))
    return rs, ls

# Write the cleaned corpus to pre_file, then read it back as word counts and coupled lines.
pre_data()
bag_of_words, coupled_lines = load_data()
# Add the "|" separator to the vocabulary, with the number of coupled lines as its count.
bag_of_words["|"] = len(coupled_lines)
# coupled_lines
# values = [*bag_of_word.values()]
# sum(values)
# Number of distinct vocabulary entries; shown as the cell's output.
len(bag_of_words)

2394

In [69]:
# Word-level LSTM model: SEQ_LEN token ids in, a softmax over VOCAB_SIZE classes out.
# NOTE(review): 2394 matches the len(bag_of_words) displayed by the previous cell —
# confirm the two are meant to stay in sync if the corpus changes.
VOCAB_SIZE = 2394
SEQ_LEN = 15

model: tf.keras.Model = tf.keras.Sequential(name="LSTM-RNN")
# Named `input_layer` rather than `input` so the builtin input(), which
# generate_output() calls, is not shadowed once this cell has run.
# The shape is written as a one-element tuple, the form Keras documents.
input_layer = tf.keras.layers.Input(shape=(SEQ_LEN,))
embedding_1 = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=SEQ_LEN)
# Forward and backward LSTMs use different widths (160 and 240 units).
bidrect_1 = tf.keras.layers.Bidirectional(
    layer=tf.keras.layers.LSTM(units=160, return_sequences=True, go_backwards=False),
    backward_layer=tf.keras.layers.LSTM(units=240, return_sequences=True, go_backwards=True),
)
bidrect_2 = tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=64, return_sequences=True))
dropout = tf.keras.layers.Dropout(rate=0.2)
# No return_sequences here, so this layer reduces the sequence to a single vector.
lstm = tf.keras.layers.LSTM(units=128)
output_1 = tf.keras.layers.Dense(units=VOCAB_SIZE * 4, activation="relu")
output_2 = tf.keras.layers.Dense(units=VOCAB_SIZE, activation="softmax")

for layer in (
    input_layer,
    embedding_1,
    bidrect_1,
    bidrect_2,
    dropout,
    lstm,
    output_1,
    output_2,
):
    model.add(layer)

model.summary()

Model: "LSTM-RNN"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 128)           306432    
                                                                 
 bidirectional_2 (Bidirectio  (None, 15, 400)          539200    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 15, 128)          238080    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 15, 128)           0         
                                                                 
 lstm_7 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 9576)              123