In [1]:
import os
import pickle

import numpy as np
import tensorflow
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# Open the raw corpus as UTF-8 text for reading.
file = open('data.txt', mode='r', encoding="utf8")

In [3]:
# Read every line of the corpus (trailing newlines are kept) and close the
# handle as soon as reading finishes, so it is not leaked for the rest of the
# kernel session. Re-running this cell without re-opening the file now raises
# instead of silently producing an empty list.
with file:
    lines = list(file)

In [4]:
# Concatenate all lines into one string separated by single spaces.
# One join is enough: the previous loop rebuilt this same string once per
# line, which made the cell quadratic in the corpus size.
data = ' '.join(lines)

In [5]:
# Remove line breaks, the byte-order mark and curly double quotes, one
# character at a time in the same order as before.
for unwanted in ('\n', '\r', '\ufeff', '“', '”'):
    data = data.replace(unwanted, '')

In [6]:
# Collapse every run of whitespace into a single space and trim both ends.
data = ' '.join(data.split())

In [7]:
# Preview the first 1000 characters of the cleaned text.
data[:1000]

"THE ADVENTURES OF SHERLOCK HOLMES Arthur Conan Doyle Table of contents A Scandal in Bohemia The Red-Headed League A Case of Identity The Boscombe Valley Mystery The Five Orange Pips The Man with the Twisted Lip The Adventure of the Blue Carbuncle The Adventure of the Speckled Band The Adventure of the Engineer's Thumb The Adventure of the Noble Bachelor The Adventure of the Beryl Coronet The Adventure of the Copper Beeches A SCANDAL IN BOHEMIA Table of contents Chapter 1 Chapter 2 Chapter 3 CHAPTER I To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false positio

In [8]:
# Fit a word-level tokenizer on the whole corpus, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[data])

In [9]:
# Persist the fitted tokenizer so the same word-to-id mapping can be reloaded
# later. The with-block guarantees the file is flushed and closed; the
# previous inline open() left the handle dangling.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [10]:
# Encode the corpus as token ids. One document goes in, so exactly one id
# list comes back and is unpacked directly.
[sequence_data] = tokenizer.texts_to_sequences([data])

In [11]:
# Peek at the first 20 token ids of the encoded corpus.
sequence_data[:20]

[1,
 1561,
 5,
 129,
 34,
 647,
 4498,
 4499,
 226,
 5,
 1562,
 6,
 827,
 7,
 871,
 1,
 234,
 462,
 648,
 6]

In [12]:
# Total number of tokens in the encoded corpus.
len(sequence_data)

105879

In [13]:
# One slot per known word plus one extra, since the ids in word_index appear
# to start at 1 (id 0 is presumably never assigned to a word - confirm
# against the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)

In [14]:
# Display the vocabulary size computed in the previous cell.
vocab_size

8200

In [15]:
# Slide a 4-token window over the corpus: each window holds three context
# tokens followed by the token that comes next.
sequence = [
    sequence_data[end - 3:end + 1]
    for end in range(3, len(sequence_data))
]

In [16]:
# Number of 4-token windows that were built.
len(sequence)

105876

In [17]:
# Turn the list of windows into a 2-D NumPy array (one window per row).
sequence = np.asarray(sequence)

In [18]:
# Display the array of token windows.
sequence

array([[   1, 1561,    5,  129],
       [1561,    5,  129,   34],
       [   5,  129,   34,  647],
       ...,
       [  28,    1, 8198, 8199],
       [   1, 8198, 8199, 3187],
       [8198, 8199, 3187, 3186]])

In [19]:
# Longest window length; used as the padding target in the next cell.
max_len = max(map(len, sequence))

In [20]:
# Left-pad ('pre') every window to max_len tokens. pad_sequences is imported
# in the notebook's import cell rather than here, so all dependencies are
# declared in one place.
padded_input_sequences = pad_sequences(sequence, maxlen=max_len, padding='pre')

In [21]:
# Display the padded windows.
padded_input_sequences

array([[   1, 1561,    5,  129],
       [1561,    5,  129,   34],
       [   5,  129,   34,  647],
       ...,
       [  28,    1, 8198, 8199],
       [   1, 8198, 8199, 3187],
       [8198, 8199, 3187, 3186]], dtype=int32)

In [22]:
# Inputs: every column except the last one of each padded window.
# Note: X is reassigned from `sequence` in a later cell.
X = padded_input_sequences[..., :-1]

In [23]:
# Targets: the last column of each padded window.
# Note: y is reassigned from `sequence` in a later cell.
y = padded_input_sequences[..., -1]

In [24]:
# Split each 4-token window into its three context tokens (X) and the token
# to predict (y). `sequence` is already a 2-D NumPy array, so column slicing
# replaces the per-row Python loop and yields the same values.
X = sequence[:, :3]
y = sequence[:, 3]

In [25]:
# Materialise inputs and targets as independent NumPy arrays.
X, y = np.array(X), np.array(y)

In [26]:
# Shape of the model inputs: (number of samples, context tokens per sample).
X.shape

(105876, 3)

In [27]:
# Shape of the integer targets, before one-hot encoding.
y.shape

(105876,)

In [28]:
# One-hot encode the target ids with one column per vocabulary id.
# This overwrites y, so the cell is not safe to run twice in a row.
y = to_categorical(y, vocab_size)

In [29]:
# Show the first five one-hot target rows.
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Building the LSTM Model

In [30]:
# Stacked-LSTM next-word model: a fixed window of context token ids goes in,
# a probability for every vocabulary id comes out.
model = Sequential()
# Declare the real input length from X instead of a hard-coded value. The
# previous `input_length = 56` did not match the 3-token windows in X.
model.add(tensorflow.keras.Input(shape=(X.shape[1],), dtype="int32"))
model.add(Embedding(vocab_size, 100))
# return_sequences=True hands the full per-step output to the second LSTM.
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation='relu'))
# Softmax over the vocabulary matches the one-hot targets in y.
model.add(Dense(vocab_size, activation="softmax"))



In [31]:
# Print the layer-by-layer summary of the model.
model.summary()

In [32]:
# NOTE(review): this save runs before model.compile()/model.fit() in the
# following cell, so the file written here holds the model as constructed,
# not a trained one.
model.save('nextword_lstm_model.h5')



In [33]:
# Train the next-word classifier: categorical cross-entropy against the
# one-hot targets, optimised with Adam.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

# Keep the History object so the per-epoch loss can be inspected afterwards.
history = model.fit(X, y, epochs=2, batch_size=64)

# Save after training so the file on disk holds the fitted weights. The only
# other save in this notebook runs before compile/fit, so without this line
# the trained model was never persisted.
model.save('nextword_lstm_model.h5')

Epoch 1/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 24ms/step - loss: 6.6086
Epoch 2/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 24ms/step - loss: 5.6731


<keras.src.callbacks.history.History at 0x7cf399613070>