<h1>LSTM Model for Encouragement Generator</h1>
CMPU 365
Jason Lee, Nhan Nguyen

We consulted and adapted code from the following sources while building this model:
- https://stackabuse.com/python-for-nlp-neural-machine-translation-with-seq2seq-in-keras/
- https://keras.io/examples/nlp/lstm_seq2seq/#run-inference-sampling
- https://towardsdatascience.com/word-level-english-to-marathi-neural-machine-translation-using-seq2seq-encoder-decoder-lstm-model-1a913f2dc4a7

In [3]:
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import csv
from CleanText import clean_text

In [5]:
# Selects how the decoder's training targets are represented:
#   "one-hot" -> a softmax distribution over the vocabulary
#   "embed"   -> word-embedding vectors (any value other than "one-hot"
#                takes the embedding branch in the cells below)
decoder_method = "one-hot"

<h2>Text Preprocessing</h2>
The `clean_text` function is adapted from https://towardsdatascience.com/how-to-implement-seq2seq-lstm-model-in-keras-shortcutnlp-6f355f3e5639

In [12]:
# Sentinel tokens wrapped around every comment.
start_char = "<START>"
end_char = "<END>"

posts = []
comments_output = []
comments_input = []

# Each row in the CSV is of the form (Comment, Post).
with open('../splitted_data_3.csv', 'r', newline='') as csv_file:
    for row in csv.reader(csv_file):
        post_tokens = clean_text(row[1])
        posts.append(" ".join(post_tokens))

        comment_tokens = [start_char] + clean_text(row[0]) + [end_char]
        # Decoder input keeps <START> and drops <END>; the decoder target is
        # the same sentence shifted one token left (drops <START>, keeps <END>).
        comments_input.append(" ".join(comment_tokens[:-1]))
        comments_output.append(" ".join(comment_tokens[1:]))

<h3>Tokenize the sentences</h3>

In [13]:
# Generate all unique words in the dataset
all_text = posts + comments_output + comments_input
all_words = {word for sentence in all_text for word in sentence.split()}

# Tokenize the sentences.
# filters='' and lower=False leave tokens exactly as the cleaning step produced
# them; in particular the angle brackets of <START>/<END> are not stripped.
vocab_size = len(all_words)
tokenizer = Tokenizer(num_words=vocab_size+1, filters='', lower=False)
tokenizer.fit_on_texts(all_text)
word_to_index = tokenizer.word_index

# Persist the vocabulary so the same word <-> index mapping can be reloaded.
# newline='' is required by the csv module; without it, platforms that
# translate "\n" on write end up with a blank line after every row.
with open("word_to_index.csv", "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["word", "index"])
    writer.writeheader()
    writer.writerows(
        {"word": word, "index": index} for word, index in word_to_index.items()
    )

index_to_word = {index: word for word, index in word_to_index.items()}

# Convert every sentence into its sequence of integer word indices.
posts_sequence = tokenizer.texts_to_sequences(posts)
comments_output_sequence = tokenizer.texts_to_sequences(comments_output)
comments_input_sequence = tokenizer.texts_to_sequences(comments_input)


<h3>Pad and Truncate Sentences</h3>

In [14]:
# Posts: the target length is the midpoint between the longest and the median
# post. Longer posts lose their tail (truncating='post'); shorter ones are
# padded with pad_sequences' default settings.
post_lengths = [len(seq) for seq in posts_sequence]
max_post_len = max(post_lengths)
median_post_length = np.median(post_lengths)
post_len = int(round((max_post_len + median_post_length) / 2))
padded_post_sequences = pad_sequences(posts_sequence, maxlen=post_len, truncating='post')
print(post_len)

# Comments: the target length is the midpoint between the longest and the
# average decoder-target comment.
comment_lengths = [len(seq) for seq in comments_output_sequence]
avg_comment_len = np.average(comment_lengths)
max_comment_len = max(comment_lengths)
comment_len = int(round((max_comment_len + avg_comment_len) / 2))

# Decoder inputs are simply cut to comment_len, then right-padded.
comments_input_sequence = [seq[:comment_len] for seq in comments_input_sequence]
padded_comment_input_sequences = pad_sequences(
    comments_input_sequence, maxlen=comment_len, padding='post'
)

# Over-long decoder targets are cut one token short so that <END> can be
# re-appended; every target therefore still terminates with the end token.
comments_output_sequence = [
    seq[:comment_len - 1] + [word_to_index[end_char]] if len(seq) > comment_len else seq
    for seq in comments_output_sequence
]
padded_comment_output_sequences = pad_sequences(
    comments_output_sequence, maxlen=comment_len, padding='post'
)

251


<h2>Build Model</h2>

<h3>Embedding Layer</h3>

In [None]:
# Embedding Layer
# Load the pre-trained 200-dimensional GloVe vectors into a word -> vector map.
# The encoding is given explicitly so the read does not depend on the
# platform's default codec.
embeddings_dictionary = {}
with open('./glove6B/glove.6B.200d.txt', 'r', encoding='utf-8') as glove:
    for line in glove:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# The embedding width has to be decided BEFORE the matrix is allocated.
# "embed" mode appends 3 indicator dimensions (<START>, <END>, padding) to the
# GloVe vector so those special tokens get distinct target vectors.
glove_dim = 200
if decoder_method == "one-hot":
    embedding_dim = glove_dim
    embeddings_file_name = "embeddings_tokenized_one-hot.txt"
else:
    embedding_dim = glove_dim + 3
    embeddings_file_name = "embeddings_tokenized_embed.txt"

# Row i holds the vector of the word with tokenizer index i; words that are
# missing from GloVe keep an all-zero row.
embeddings_tokenized = np.zeros((vocab_size+1, embedding_dim))
if decoder_method == "one-hot":
    for word, i in word_to_index.items():
        embedding_vector = embeddings_dictionary.get(word)
        if embedding_vector is not None:
            embeddings_tokenized[i] = embedding_vector
else:
    glove_zeros = np.zeros(glove_dim)
    for word, i in word_to_index.items():
        if word == start_char:
            embeddings_tokenized[i] = np.append(glove_zeros, [1, 0, 0])
        elif word == end_char:
            embeddings_tokenized[i] = np.append(glove_zeros, [0, 1, 0])
        else:
            embedding_vector = embeddings_dictionary.get(word)
            # The GloVe lookup result is only consulted for ordinary words, so
            # the special-token rows set above are never overwritten.
            if embedding_vector is not None:
                embeddings_tokenized[i] = np.append(embedding_vector, [0, 0, 0])

    # Index 0 is never assigned to a word by the tokenizer; it is the value
    # pad_sequences fills with by default, so it gets the padding indicator.
    embeddings_tokenized[0] = np.append(glove_zeros, [0, 0, 1])

# Save the matrix in NumPy's binary .npy format (read it back with np.load).
# Writing through the open handle stores it under exactly embeddings_file_name.
with open(embeddings_file_name, "wb") as embeddings_file:
    np.save(embeddings_file, embeddings_tokenized)

# Frozen encoder-side embedding initialised from the matrix built above.
embedding_layer = Embedding(vocab_size+1, embedding_dim, embeddings_initializer=Constant(embeddings_tokenized), input_length=post_len, trainable=False)

<h3>Decoder Output</h3>

In [None]:
# Build the decoder training targets, one row of comment_len time steps per sample.
if decoder_method == "one-hot":
    # One-hot encoding of the output: a 1 at the word's vocabulary index for
    # every time step (padding positions mark index 0).
    decoder_targets = np.zeros((len(posts), comment_len, vocab_size+1), dtype='float32')
    time_steps = np.arange(comment_len)
    for sample_idx, token_ids in enumerate(padded_comment_output_sequences):
        decoder_targets[sample_idx, time_steps, token_ids] = 1
else:
    # Embedding targets: each time step holds the embedding row of its word.
    decoder_targets = np.zeros((len(posts), comment_len, embedding_dim), dtype='float32')
    for sample_idx, token_ids in enumerate(padded_comment_output_sequences):
        decoder_targets[sample_idx] = embeddings_tokenized[token_ids]

<h3>Construct Network</h3>

In [None]:
# Hyperparameters
# Both decoder modes share the optimizer; the remaining settings depend on
# whether the network predicts a vocabulary distribution or an embedding vector.
optimizer = "rmsprop"
if decoder_method == "one-hot":
    epochs, latent_dim = 500, 512
    activation = "softmax"
    loss_function = "categorical_crossentropy"
else:
    # No `activation` is defined here: the embedding-mode output layer is
    # built without one.
    epochs, latent_dim = 800, 200
    loss_function = "cosine_similarity"

In [None]:
# Defining the encoder
# The encoder reads a padded post (post_len word indices), maps it through the
# frozen pre-trained embedding layer and summarises it with a single LSTM.
encoder_inputs = Input(shape=(post_len,))
enc_emb = embedding_layer(encoder_inputs)
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Discard text output
# Only the two final states are kept; they seed the decoder's initial state.
encoder_states = [state_h, state_c]

In [None]:
# Defining the decoder

# The decoder consumes the <START>-prefixed comment (comment_len word indices)
# and starts from the encoder's final states.
decoder_inputs = Input(shape=(comment_len,))

# Different embedding layer depending on decoder method
if decoder_method != "one-hot":
    # Embedding mode: reuse the frozen pre-built embedding matrix.
    dec_emb_layer = Embedding(
        vocab_size+1,
        embedding_dim,
        embeddings_initializer=Constant(embeddings_tokenized),
        input_length=comment_len,
        trainable=False,
    )
else:
    # One-hot mode: a fresh embedding of width latent_dim with default settings.
    dec_emb_layer = Embedding(vocab_size+1, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)

# return_sequences=True yields an output for every time step; the returned
# inner states are not needed during training and are discarded.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

In [None]:
# Output projection applied to the decoder LSTM output at every time step:
#   one-hot -> probability distribution over all words in the vocabulary
#   embed   -> the individual dimensions of the target embedding vector
decoder_dense = (
    Dense(vocab_size+1, activation=activation)
    if decoder_method == "one-hot"
    else Dense(embedding_dim)
)
# NOTE(review): this rebinds decoder_outputs, so re-running this cell without
# re-running the decoder cell stacks the dense layer on its own output.
decoder_outputs = decoder_dense(decoder_outputs)

<h3>Fit model to data</h3>

In [None]:
# Assemble the training model: (post, <START>-prefixed comment) -> decoder targets.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

# NOTE(review): epochs is hard-coded to 1 here, so the `epochs` value chosen in
# the hyperparameter cell is not used — confirm whether that is intentional.
training_inputs = [padded_post_sequences, padded_comment_input_sequences]
r = model.fit(
    x=training_inputs,
    y=decoder_targets,
    batch_size=1,
    epochs=1,
    validation_split=0.1,
)

# Save the trained model under a name that records the decoder mode.
if decoder_method == "one-hot":
    model_name = "LSTM_One_Hot"
else:
    model_name = "LSTM_Embed"
model.save(model_name)

In [None]:
from keras.utils import plot_model

# Write a diagram of the network, annotated with layer names and tensor
# shapes, to an image file.
plot_model(
    model,
    to_file='../LSTM/model.png',
    show_shapes=True,
    show_layer_names=True,
)