In [15]:
import pandas as pd  
import numpy as np 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [234]:

with open('data.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()

In [235]:
text 

['\n',
 "Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n",
 '\n',
 'This eBook is for the use of anyone anywhere at no cost and with\n',
 'almost no restrictions whatsoever.  You may copy it, give it away or\n',
 're-use it under the terms of the Project Gutenberg License included\n',
 'with this eBook or online at www.gutenberg.net\n',
 '\n',
 '\n',
 'Title: The Adventures of Sherlock Holmes\n',
 '\n',
 'Author: Arthur Conan Doyle\n',
 '\n',
 'Release Date: November 29, 2002 [EBook #1661]\n',
 'Last Updated: May 20, 2019\n',
 '\n',
 'Language: English\n',
 '\n',
 'Character set encoding: UTF-8\n',
 '\n',
 '*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***\n',
 '\n',
 '\n',
 '\n',
 'Produced by an anonymous Project Gutenberg volunteer and Jose Menendez\n',
 '\n',
 '\n',
 '\n',
 'cover\n',
 '\n',
 '\n',
 '\n',
 'The Adventures of Sherlock Holmes\n',
 '\n',
 '\n',
 '\n',
 'by Arthur Conan Doyle\n',
 '\n',
 '\n',
 '\n',
 'Conten

In [14]:
def preprocesstext(text):
    text = text.lower() 
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = set(stopwords.words('english')) 
    words = text.split()  
    words = [word for word in words if word not in stop_words]  
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]  
    cleaned_text = ' '.join(words)
    return cleaned_text

In [None]:
cleansentence=[preprocesstext(sentence) for sentence in text]

In [225]:
cleansentence

['',
 'project gutenberg adventure sherlock holmes arthur conan doyle',
 '',
 'ebook use anyone anywhere cost',
 'almost restriction whatsoever may copy give away',
 'reuse term project gutenberg license included',
 'ebook online wwwgutenbergnet',
 '',
 '',
 'title adventure sherlock holmes',
 '',
 'author arthur conan doyle',
 '',
 'release date november ebook',
 'last updated may',
 '',
 'language english',
 '',
 'character set encoding utf',
 '',
 'start project gutenberg ebook adventure sherlock holmes',
 '',
 '',
 '',
 'produced anonymous project gutenberg volunteer jose menendez',
 '',
 '',
 '',
 'cover',
 '',
 '',
 '',
 'adventure sherlock holmes',
 '',
 '',
 '',
 'arthur conan doyle',
 '',
 '',
 '',
 'content',
 '',
 '',
 'scandal bohemia',
 'ii redheaded league',
 'iii case identity',
 'iv boscombe valley mystery',
 'v five orange pip',
 'vi man twisted lip',
 'vii adventure blue carbuncle',
 'viii adventure speckled band',
 'ix adventure engineer thumb',
 'x adventure noble b

In [18]:
corpus = ' '.join(cleansentence)
corpus




In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

In [25]:
sequences = []
for sentence in cleansentence:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(token_list)):
        sequence = token_list[:i+1]  
        sequences.append(sequence)
        
sequences

[[57, 241],
 [57, 241, 274],
 [57, 241, 274, 49],
 [57, 241, 274, 49, 3],
 [57, 241, 274, 49, 3, 359],
 [57, 241, 274, 49, 3, 359, 1929],
 [57, 241, 274, 49, 3, 359, 1929, 1930],
 [856, 170],
 [856, 170, 282],
 [856, 170, 282, 1931],
 [856, 170, 282, 1931, 1138],
 [452, 2930],
 [452, 2930, 2931],
 [452, 2930, 2931, 11],
 [452, 2930, 2931, 11, 428],
 [452, 2930, 2931, 11, 428, 93],
 [452, 2930, 2931, 11, 428, 93, 40],
 [2932, 342],
 [2932, 342, 57],
 [2932, 342, 57, 241],
 [2932, 342, 57, 241, 555],
 [2932, 342, 57, 241, 555, 1932],
 [856, 1933],
 [856, 1933, 4150],
 [2318, 274],
 [2318, 274, 49],
 [2318, 274, 49, 3],
 [4151, 359],
 [4151, 359, 1929],
 [4151, 359, 1929, 1930],
 [4152, 640],
 [4152, 640, 4153],
 [4152, 640, 4153, 856],
 [52, 2933],
 [52, 2933, 11],
 [2934, 927],
 [429, 147],
 [429, 147, 4154],
 [429, 147, 4154, 4155],
 [581, 57],
 [581, 57, 241],
 [581, 57, 241, 856],
 [581, 57, 241, 856, 274],
 [581, 57, 241, 856, 274, 49],
 [581, 57, 241, 856, 274, 49, 3],
 [1653, 2935

In [26]:
max_sequence_length = max([len(seq) for seq in sequences])  
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

In [28]:
X = padded_sequences[:, :-1] 
y = padded_sequences[:, -1]

In [29]:
X

array([[   0,    0,    0, ...,    0,    0,   57],
       [   0,    0,    0, ...,    0,   57,  241],
       [   0,    0,    0, ...,   57,  241,  274],
       ...,
       [   0,    0,    0, ..., 7695, 2924, 7696],
       [   0,    0,    0, ..., 2924, 7696,  246],
       [   0,    0,    0, ..., 7696,  246,  244]])

In [30]:
y

array([ 241,  274,   49, ...,  246,  244, 1428])

In [31]:
y = to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.utils import to_categorical

In [35]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [44]:
model.fit(X, y, epochs=30, batch_size=64)


Epoch 1/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 41ms/step - accuracy: 0.0187 - loss: 7.5933
Epoch 2/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 43ms/step - accuracy: 0.0211 - loss: 7.5213
Epoch 3/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.0225 - loss: 7.4371
Epoch 4/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.0243 - loss: 7.3017
Epoch 5/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.0262 - loss: 7.2003
Epoch 6/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 40ms/step - accuracy: 0.0272 - loss: 7.0811
Epoch 7/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.0308 - loss: 6.9316
Epoch 8/30
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 38ms/step - accuracy: 0.0322 - loss: 6.8166
Epoch 9/30
[1m607/607[

<keras.src.callbacks.history.History at 0x23b03971160>

In [70]:
def predict_next_word(model, tokenizer, text, max_sequence_length):
    text = preprocesstext(text)   
    token_list = tokenizer.texts_to_sequences([text])[0]   
    token_list = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')   
    predicted_probabilities = model.predict(token_list, verbose=0)   
    predicted_word_index = np.argmax(predicted_probabilities, axis=-1)   
    predicted_word = tokenizer.index_word[predicted_word_index[0]]   
    return predicted_word

In [72]:
input_text = "what clue could you have as to his"
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_length)
print(f"Next word prediction: {predicted_word}")

Next word prediction: identity


In [67]:
model.save('next_word_prediction_model.h5')




In [75]:
def predict_top_next_words(model, tokenizer, text, max_sequence_length, top_n=5):
    text = preprocesstext(text) 
    token_list = tokenizer.texts_to_sequences([text])[0]  
    token_list = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')  
    predicted_probabilities = model.predict(token_list, verbose=0)
    predicted_word_indices = np.argsort(predicted_probabilities[0])[::-1][:top_n]
    predicted_words = [tokenizer.index_word[index] for index in predicted_word_indices]
    
    return predicted_words

In [76]:
input_text = "what clue could you have as to his"
top_n_predictions = predict_top_next_words(model, tokenizer, input_text, max_sequence_length, top_n=5)
print(f"Top 5 predicted words: {top_n_predictions}")


Top 5 predicted words: ['identity', 'control', 'forgiven', 'quartering', 'bordered']
