# Text Generation with LSTM

![books](./books.png)


This project generates a new book in the style of Dan Brown's novels using a Long Short-Term Memory (LSTM) recurrent neural network.

The network is trained on six books:
* Angels and Demons
* Inferno
* The Lost Symbol
* The Da Vinci Code
* Deception Point
* Digital Fortress

In [126]:
import numpy as np
import pickle
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.utils import np_utils
import io
from os import listdir, mkdir
from re import sub, findall

## Prepare data

- read all files into `text`, lowercased and stripped
- clean the data (remove non-ASCII characters and some non-word characters)
- encode each character as an integer
- use one-hot encoding for the output

In [103]:
# Configuration constants shared by the cells below.
TXT_FILE_PATH = "txt"        # directory containing the raw book .txt files
PICKLE_FILE_PATH = "pickle"  # directory where the encoded training data is saved
# Characters kept during cleaning: word characters (\w), spaces, "!" and "?".
# Inside a character class "|" is a literal pipe, not alternation, so it must
# not be used as a separator there or pipe characters would survive cleaning.
CHARACTER_REGEX = r"[\w !?]*"
SEQUENCE_LENGTH = 100        # number of preceding characters per training sample

In [131]:
# Collect the book files in sorted order. listdir() returns names in an
# arbitrary order, so without sorting "the first book" is not well defined.
# endswith() is used so that names merely containing ".txt" are not picked up.
file_names = sorted(fn for fn in listdir(TXT_FILE_PATH) if fn.endswith(".txt"))

# Number of books to read. Kept at 1 (the previous hard-coded slice) so that a
# training run stays feasible; set to None to read every book in TXT_FILE_PATH.
MAX_BOOKS = 1

books = []
for fn in file_names[:MAX_BOOKS]:
    # Non-ASCII characters are discarded further down anyway, so undecodable
    # bytes are dropped here instead of aborting the whole load.
    with open(f"{TXT_FILE_PATH}/{fn}", "r", encoding="utf-8", errors="ignore") as f:
        books.append(f.read().lower())

# Join with a newline (turned into a space below) so the last word of one book
# does not fuse with the first word of the next.
text = "\n".join(books)

# Cleaning: flatten newlines, keep only the characters allowed by
# CHARACTER_REGEX, then drop anything that is not ASCII.
text = text.replace("\n", " ")
text = "".join(findall(CHARACTER_REGEX, text))
text = text.encode("ascii", errors='ignore').decode()

# Vocabulary: each distinct character gets its index in sorted order.
chars = sorted(set(text))
char_int = {c: i for i, c in enumerate(chars)}
char_length = len(chars)
text_length = len(text)

print(f"Number of characters ; {char_length}")
print(f"Text lengths: {text_length}")

Number of characters ; 40
Text lengths: 790267


In [132]:
# Translate the whole corpus to integer ids once, then cut the training
# windows out of the encoded sequence.
encoded_text = [char_int[ch] for ch in text]
window_starts = range(text_length - SEQUENCE_LENGTH)

# Each sample holds SEQUENCE_LENGTH consecutive ids (the preceding characters);
# its label is the id of the character that immediately follows the window.
pre_x = [encoded_text[start:start + SEQUENCE_LENGTH] for start in window_starts]
pre_y = [encoded_text[start + SEQUENCE_LENGTH] for start in window_starts]
print(len(pre_x))

790167


In [133]:
# Save the encoded samples and labels to PICKLE_FILE_PATH.
# Create the output directory if it does not exist yet. Attempting the mkdir
# and tolerating FileExistsError also works when PICKLE_FILE_PATH is a nested
# or absolute path, which a name lookup in listdir(".") cannot detect.
try:
    mkdir(PICKLE_FILE_PATH)
except FileExistsError:
    pass

with io.open(f"{PICKLE_FILE_PATH}/data_x.pickle", "wb") as f:
    pickle.dump(pre_x, f)
with io.open(f"{PICKLE_FILE_PATH}/data_y.pickle", "wb") as f:
    pickle.dump(pre_y, f)

In [134]:
n_patterns = len(pre_x)
# we want [samples, time steps, features]
# Build the array directly as float32 so that no int64/float64 copies of the
# full data set are materialised along the way.
X_data = np.asarray(pre_x, dtype=np.float32).reshape(n_patterns, SEQUENCE_LENGTH, 1)
# Scale the integer character ids by the vocabulary size.
X_data = X_data / float(char_length)
# Pin the one-hot width to the vocabulary size. Without num_classes the width
# is inferred from max(pre_y) + 1, which is too small whenever the
# highest-indexed character never occurs as a target.
Y_data = np_utils.to_categorical(pre_y, num_classes=char_length)

## Build the LSTM model

In [135]:
# Network: one LSTM layer over the character window, dropout, a dense hidden
# layer, and a softmax over the character classes in Y_data.
model = Sequential()
model.add(LSTM(512, input_shape=(X_data.shape[1], X_data.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dense(Y_data.shape[1], activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer=Adam())

# Save the weights whenever the training loss improves, so an interrupted or
# crashed run does not lose the epochs that were already completed.
checkpoint = ModelCheckpoint(
    "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5",
    monitor="loss",
    save_best_only=True,
    mode="min",
    verbose=1,
)

model.fit(X_data, Y_data, epochs=20, batch_size=128, callbacks=[checkpoint])

Epoch 1/20
117376/790167 [===>..........................] - ETA: 1:30:33 - loss: 2.7938

KeyboardInterrupt: 