DATA EXPLORATION

In [4]:
import os
from pathlib import Path

import numpy as np
from keras.utils import set_random_seed
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

# Location of the training corpus. Set the SHERLOCK_DATASET_PATH environment
# variable to run this notebook on another machine; when it is unset the
# original local path is used, so existing setups keep working unchanged.
DATASET_PATH = Path(
    os.environ.get(
        "SHERLOCK_DATASET_PATH",
        r"C:\Users\Mega Computer\Desktop\Data Science Internship\DL\Next word predictor\Sherlock Holmes Dataset.txt",
    )
)

# Load the dataset as a single lower-cased string
text = DATASET_PATH.read_text(encoding='utf-8').lower()

# Tokenize the text: the vocabulary is fitted on the whole corpus as one document
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Convert text to one flat sequence of integer word ids
sequences = tokenizer.texts_to_sequences([text])[0]

# Define sequence length (number of context words fed to the model)
SEQUENCE_LENGTH = 5  # You can adjust this value

# Each training pair needs SEQUENCE_LENGTH context tokens plus one target
# token; fail loudly instead of silently producing empty, mis-shaped arrays.
if len(sequences) <= SEQUENCE_LENGTH:
    raise ValueError(
        f"Corpus has {len(sequences)} tokens; need more than "
        f"SEQUENCE_LENGTH={SEQUENCE_LENGTH} to build training pairs."
    )

# Number of output classes, computed once.
# NOTE(review): the +1 presumably accounts for word ids starting at 1 with
# id 0 left for padding (Keras Tokenizer convention) - confirm.
vocab_size = len(tokenizer.word_index) + 1

# Create input-output pairs: each row of X is a sliding window of
# SEQUENCE_LENGTH consecutive word ids, and the matching target is the id
# of the word that immediately follows that window.
X = np.array(
    [sequences[i - SEQUENCE_LENGTH:i] for i in range(SEQUENCE_LENGTH, len(sequences))]
)
# One-hot encode the targets (every token from position SEQUENCE_LENGTH on).
Y = to_categorical(sequences[SEQUENCE_LENGTH:], num_classes=vocab_size)

DATA PREPROCESSING, MODEL ARCHITECTURE & TRAINING

In [18]:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Seed the random number generators so weight initialisation and batch
# shuffling are repeatable from one run of this cell to the next.
SEED = 42
set_random_seed(SEED)

# Example text
# NOTE: from here on `text`, `tokenizer`, `sequences`, `X` and `Y` refer to
# this short excerpt; any values built from the full dataset file in an
# earlier cell are rebound by the assignments below.
text = "His manner was not effusive. It seldom was; but he was glad, I think, to see me. With hardly a word spoken, but with a kindly eye, he waved me to an armchair, threw across his case of cigars, and indicated a spirit case and a gasogene in the corner. Then he stood before the fire and looked me over in his singular introspective fashion."

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

# Set sequence length (number of context words per training sample)
SEQUENCE_LENGTH = 5

# Number of output classes, computed once and reused for the one-hot
# targets, the embedding table and the softmax layer so they cannot drift.
vocab_size = len(tokenizer.word_index) + 1

# Create input-output pairs: a window of SEQUENCE_LENGTH word ids is the
# input and the id of the word right after the window is the target.
X = np.array(
    [sequences[i - SEQUENCE_LENGTH:i] for i in range(SEQUENCE_LENGTH, len(sequences))]
)
Y = to_categorical(sequences[SEQUENCE_LENGTH:], num_classes=vocab_size)

# Debugging prints
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)
print("Number of classes:", vocab_size)

# Define the model: embedding -> single LSTM -> softmax over the vocabulary.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# the argument, and the sequence length is taken from X when fit() is called.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50))
model.add(LSTM(100, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model (categorical cross-entropy matches the one-hot targets in Y)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model, holding out 10% of the samples for validation
history = model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

Shape of X: (60, 5)
Shape of Y: (60, 47)
Number of classes: 47
Epoch 1/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 3s 333ms/step - accuracy: 0.0000e+00 - loss: 3.8502 - val_accuracy: 0.0000e+00 - val_loss: 3.8519
Epoch 2/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step - accuracy: 0.0721 - loss: 3.8445 - val_accuracy: 0.0000e+00 - val_loss: 3.8539
Epoch 3/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 88ms/step - accuracy: 0.1385 - loss: 3.8384 - val_accuracy: 0.0000e+00 - val_loss: 3.8561
Epoch 4/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 37ms/step - accuracy: 0.1424 - loss: 3.8346 - val_accuracy: 0.0000e+00 - val_loss: 3.8585
Epoch 5/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 37ms/step - accuracy: 0.2068 - loss: 3.8273 - val_accuracy: 0.0000e+00 - val_loss: 3.8610
Epoch 6/10
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 51ms/step - accuracy: 0.1860 - loss: 3.8229 - val_accuracy: 0.0000e+00 - val_loss: 3.8637
Epoch 7/10
[1m2/2[0m [32m

EVALUATION

In [None]:
def predict_next_word(model, tokenizer, text_seq, sequence_length):
    # Tokenize the input sequence
    input_seq = tokenizer.texts_to_sequences([text_seq])[0]
    input_seq = pad_sequences([input_seq], maxlen=sequence_length, truncating='pre')
    
    # Predict the next word
    predicted_word_index = np.argmax(model.predict(input_seq), axis=-1)
    
    # Convert the predicted index back to a word
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word

# Example usage: ask the trained model for the word that follows a phrase
# taken from the example text it was trained on.
input_text = "With hardly a word "
predicted_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text_seq=input_text,
    sequence_length=SEQUENCE_LENGTH,
)
print(f"Predicted next word: {predicted_word}")