# Data Preparation

In [5]:
# importing required libraries
import requests
import string
import tensorflow as tf
import re

Downloading the book, turning it into text format and displaying its first 20 lines to see what it consists of.

In [6]:
# Getting the text form the internet
response = requests.get("https://www.gutenberg.org/files/61262/61262-0.txt",)
# turing the file into text and splitting it at every new line
data = response.text.split('\n')
# printing the first 20 lines of the raw text
data[:20]

['ï»¿The Project Gutenberg EBook of Poirot Investigates, by Agatha Christie\r',
 '\r',
 'This eBook is for the use of anyone anywhere in the United States and most\r',
 'other parts of the world at no cost and with almost no restrictions\r',
 'whatsoever.  You may copy it, give it away or re-use it under the terms of\r',
 'the Project Gutenberg License included with this eBook or online at\r',
 "www.gutenberg.org.  If you are not located in the United States, you'll have\r",
 'to check the laws of the country where you are located before using this ebook.\r',
 '\r',
 'Title: Poirot Investigates\r',
 '\r',
 'Author: Agatha Christie\r',
 '\r',
 'Release Date: January 28, 2020 [EBook #61262]\r',
 '[Last updated: June 20, 2022]\r',
 '\r',
 'Language: English\r',
 '\r',
 'Character set encoding: UTF-8\r',
 '\r']

Creating a function to clean the ebook by removing all unnecessary characters and to also separate the text into individual word tokens

In [7]:
# Function for cleaning the ebook and separating the text into individual word tokens
def clean (document):

    #only keeping the lines that correspond to the text of the book
    document = document[112:7600]
    # list of lines converted into text
    document = " ".join(document)
    # turning the text into lowercase letters only 
    document = document.lower()
    # removing carriage return characters
    document = document.replace('\r', '') 
    # removing all other extra spaces
    document = re.sub('\s+', ' ', document)
    # removing special characters
    document = re.sub(r'[^\x00-\x7f]', r'', document) 
    # Removing the punctuation
    table = str.maketrans('', '', string.punctuation)
    document = document.translate(table)

    # setting each "word" to be a token by spliting the text based on whitespace
    tokens = document.split()

    return tokens

Getting my text data cleaned using the function I created above and previewing my tokenized words from my text

In [8]:
tokens = clean(data)
# printing the first 50 words of the text after cleaning
print(tokens[:50])

['i', 'was', 'standing', 'at', 'the', 'window', 'of', 'poirots', 'rooms', 'looking', 'out', 'idly', 'on', 'the', 'street', 'below', 'thats', 'queer', 'i', 'ejaculated', 'suddenly', 'beneath', 'my', 'breath', 'what', 'is', 'mon', 'ami', 'asked', 'poirot', 'placidly', 'from', 'the', 'depths', 'of', 'his', 'comfortable', 'chair', 'deduce', 'poirot', 'from', 'the', 'following', 'facts', 'here', 'is', 'a', 'young', 'lady', 'richly']


In [None]:
# Total number of words
all_words = len(tokens)
print("Total words: ", all_words)

Total words:  52355


In [9]:
# unique words
unq_words = len(set(tokens))
print("Unique words: ", unq_words)

Unique words:  6277


Creating a list of multiple 41-word sequences that will be used to train the model. The first 40 words in each sequence are for training and the last one is for validating the prediciton

In [10]:
# 41 words to be used in the model, 
# the first 40 words are for training and the last one for validating the prediciton
length = 40 + 1
sentence_set = []

for i in range(length, len(tokens)):
    # taking a sequence of words for each chunk of 41 words in the whole text
    sequence = tokens[i-length:i]
    # joining the individual tokens (words) into a line (string)
    line = ' '.join(sequence)
    # adding my sentence (line of 41 words) into my "lines" list
    sentence_set.append(line)

print(len(sentence_set))

52312


In [11]:
# previewing some of the lines
sentence_set[:3]

['i was standing at the window of poirots rooms looking out idly on the street below thats queer i ejaculated suddenly beneath my breath what is mon ami asked poirot placidly from the depths of his comfortable chair deduce poirot from',
 'was standing at the window of poirots rooms looking out idly on the street below thats queer i ejaculated suddenly beneath my breath what is mon ami asked poirot placidly from the depths of his comfortable chair deduce poirot from the',
 'standing at the window of poirots rooms looking out idly on the street below thats queer i ejaculated suddenly beneath my breath what is mon ami asked poirot placidly from the depths of his comfortable chair deduce poirot from the following']

# Building my Model

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
import numpy as np

Preparing x and y for the words

In [21]:
# initializing the tokenizer object
tokenizer = Tokenizer()
# using tokenizer to turn every unique word into a unique integer
tokenizer.fit_on_texts(sentence_set)
# Creating a list of integer values for all our sentences
sequences_int = tokenizer.texts_to_sequences(sentence_set)
sequences_int = np.array(sequences_int)
# selecting the first 40 columns a my predictor variabls 
X = sequences_int[:, :-1] ### (there are 30 integers representing words)
# selecting the last (41st) column as my response variable 
y = sequences_int[:, -1] ### (there are 30 integers representing words)

# Turning my response variable into one-hot encoding
y = to_categorical(y, num_classes= len(tokenizer.word_index)+1)

In [None]:
model = Sequential()
# amount of unique words as inputs and amount of required predicted words as output
model.add(Embedding(unq_words+1, 50, input_length = X.shape[1]))
model.add(LSTM(64, return_sequences = True))
model.add(LSTM(128))
model.add(Dense(128, activation='relu'))
# output layers with softmax for probability of each predicted word
model.add(Dense(unq_words+1, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 40, 50)            313900    
                                                                 
 lstm_2 (LSTM)               (None, 40, 64)            29440     
                                                                 
 lstm_3 (LSTM)               (None, 128)               98816     
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dense_3 (Dense)             (None, 6278)              809862    
                                                                 
Total params: 1,268,530
Trainable params: 1,268,530
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compiling the model with adam optimizer and fitting it.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
lstm_history = model.fit(X, y, batch_size = 128, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [None]:
!mkdir models
model.save("./models/lang-model.h5")

# Testing the model (text from the book)

Loading the pre-trained model for testing.

In [None]:
model = tf.keras.models.load_model("./models/lang-model.h5")

Creating a text generation function

In [23]:
# function takes in the model for prediction, the tokenizer (dictionary), the piece of text to be used 
# for prediction and the number of words we want to predict 
def text_gen(model, tokenizer, given_text, num_of_words):
  # text predicted
  final_text = []
  # For each word we want to generate
  for i in range(num_of_words):
    # encoding the text of the first 50 words into ints
    token = tokenizer.texts_to_sequences([given_text])[0]
    # If text is larger than 50 words we keep the last 50 words
    token = pad_sequences([token], maxlen = X.shape[1], truncating='pre')
    # Predicting the probabilities for each word
    y_pred = model.predict(token).argmax(axis=1)

    predicted = ''
    # Finding the index for the predicted word
    for word, index in tokenizer.word_index.items():
      # in the intiger of the predicted word matches an integer index of a word in the dictionary
      if index == y_pred:
        # then the predicted word is assigned the string found in the dictionary at that index
        predicted = word
        break
    # adding the predicted word down into the given text so that the next word can be predicted
    given_text += " "+ predicted
    # also adding the predicted word into a new list of strings that will formulate the final predicted sentence
    final_text.append(predicted)
  # returning the predicted sentence
  return " ".join(final_text)

Previewing a chuck of text that will be used to predict a small sentence using my model

In [24]:
sentence_set[276]

'different i replied rather feebly ah sacr cried poirot is it that you expect her to promenade herself in the streets of london in a cowboy hat or with bare feet and a bunch of curls as an irish colleen always'

Below we can see the predicted senteced for the above given text

In [25]:
text_gen(model, tokenizer, sentence_set[276], 20)



'with you it is the nonessentials remember the case of the dancer valerie saintclair i shrugged my shoulders slightly annoyed'

Now lets look at the actual text that followed the piece of text that was used to make the prediction.

In [26]:
sentence_set[315]

'colleen always with you it is the nonessentials remember the case of the dancer valerie saintclair i shrugged my shoulders slightly annoyed but console yourself mon ami said poirot calming down all cannot be as hercule poirot i know it well'

# Testing the model (custom text)

**You can change the text below to anything you like. You can also adjust the number of words you want the model to produce** 

In [31]:
# The piece of text to be given to the model for prediction
my_text = "The gentleman approach the server at the local tavern and waited"
# setting how many words to predict
response_len = 50

In [32]:
text_gen(model, tokenizer, my_text, response_len)



'not by with he got down poirot ran nimbly and in such another minutes all the timefor each flat of course in doubt class of disappearance are you late he is met my uncle about halfpast his violet capacity the geniuses his mouth mrs robinson touched having if after many'