In [1]:
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from transformers import pipeline
import os


In [2]:
# Load and preprocess dataset
DATASET_PATH = 'metamorphosis_clean.txt'
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    corpus = f.read().lower().split("\n")

In [3]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

In [4]:
# Create input sequences
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])


In [5]:
# Pad sequences
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = np.array(y)  # No need for one-hot encoding with sparse categorical loss

In [6]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

model = Sequential([
    Embedding(input_dim=total_words, output_dim=128, input_length=max_sequence_length - 1),  # reduced from 256
    Bidirectional(LSTM(128, return_sequences=True)),  # reduced LSTM units
    Dropout(0.2),
    Bidirectional(LSTM(64)),  # reduced from 128
    Dropout(0.2),
    Dense(total_words, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [7]:
# Train model
epochs = 70
model.fit(X, y, epochs=epochs, verbose=1)

Epoch 1/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 39ms/step - accuracy: 0.0478 - loss: 6.4009
Epoch 2/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.0579 - loss: 5.7630
Epoch 3/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.0752 - loss: 5.6053
Epoch 4/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.0937 - loss: 5.4235
Epoch 5/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.1148 - loss: 5.2248
Epoch 6/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 42ms/step - accuracy: 0.1258 - loss: 5.0896
Epoch 7/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 42ms/step - accuracy: 0.1388 - loss: 4.9914
Epoch 8/70
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.1452 - loss: 4.8469
Epoch 9/70
[1m632/632[

<keras.src.callbacks.history.History at 0x1b485bf76b0>

In [31]:
# Train model
epochs = 20
model.fit(X, y, epochs=epochs, verbose=1)

Epoch 1/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 37ms/step - accuracy: 0.6319 - loss: 1.4588
Epoch 2/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.6256 - loss: 1.4786
Epoch 3/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.6358 - loss: 1.4079
Epoch 4/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 34ms/step - accuracy: 0.6462 - loss: 1.3853
Epoch 5/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.6492 - loss: 1.3887
Epoch 6/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.6585 - loss: 1.3422
Epoch 7/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 36ms/step - accuracy: 0.6654 - loss: 1.3208
Epoch 8/20
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 55ms/step - accuracy: 0.6587 - loss: 1.3234
Epoch 9/20
[1m632/632[

<keras.src.callbacks.history.History at 0x1b49764c560>

In [33]:
# Train model
epochs = 10
model.fit(X, y, epochs=epochs, verbose=1)

Epoch 1/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 38ms/step - accuracy: 0.7245 - loss: 1.0656
Epoch 2/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.7190 - loss: 1.0563
Epoch 3/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.7349 - loss: 1.0202
Epoch 4/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.7328 - loss: 1.0101
Epoch 5/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.7406 - loss: 0.9756
Epoch 6/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.7417 - loss: 0.9908
Epoch 7/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.7433 - loss: 0.9544
Epoch 8/10
[1m632/632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 38ms/step - accuracy: 0.7459 - loss: 0.9457
Epoch 9/10
[1m632/632[

<keras.src.callbacks.history.History at 0x1b497b91d30>

In [34]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Accuracy: {val_accuracy:.2%}")

[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8895 - loss: 0.4898
Validation Accuracy: 89.33%


In [35]:
# Save model and tokenizer
model.save('bilstm_model.h5')
with open('tokenizer1.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("✅ Training completed! Model and tokenizer saved.")




✅ Training completed! Model and tokenizer saved.


In [36]:
# Load BiLSTM Model
bilstm_model = load_model('bilstm_model.h5')
print("✅ BiLSTM model loaded successfully!")




✅ BiLSTM model loaded successfully!


In [37]:
# Load tokenizer
with open('tokenizer1.pkl', 'rb') as f:
    tokenizer = pickle.load(f)
print("✅ Tokenizer loaded successfully!")

✅ Tokenizer loaded successfully!


In [38]:
# Load BERT fill-mask pipeline
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [39]:
# Function to store only the predicted sentence in a .txt file
# -----------------------------------------------------------
def store_user_data(predicted_text, filename='metamorphosis_clean.txt'):
    """
    Append only the predicted text to a text file.
    Each prediction is stored on a new line.
    """
    with open(filename, "a", encoding="utf-8") as f:
        f.write(predicted_text + "\n")

In [40]:
# Load dataset vocabulary
if os.path.exists(DATASET_PATH):
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset_words = set(f.read().split())
else:
    dataset_words = set()

In [41]:
# Inappropriate words filter
BAD_WORDS = {"damn", "hell", "shit", "fuck", "bitch", "bastard", "ass", "asshole", "dumbass", "jackass", 
             "motherfucker", "cock", "piss", "crap", "slut", "whore", "dick", "cunt", "nigger", 
             "retard", "faggot", "twat", "wanker", "moron", "idiot", "stupid"}

# Ensure valid words
def is_valid_word(word):
    return word.lower() not in BAD_WORDS

In [42]:
def predict_next_word_bilstm(text, top_k=5):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length-1, padding='pre')
    # Get the full probability distribution from the model
    predictions = bilstm_model.predict(padded_sequence)[0]
    
    # Sort indices by probability (highest first)
    sorted_indices = np.argsort(predictions)[::-1]
    
    # Loop over top_k predictions and return the first valid word
    for idx in sorted_indices[:top_k]:
        predicted_word = tokenizer.index_word.get(idx, None)
        if predicted_word and is_valid_word(predicted_word):
            return predicted_word
    # If none of the top_k are valid, return a filtered indicator
    return "[filtered]"


In [43]:
def predict_next_word_bert(text):
    masked_text = text + " [MASK]."
    predictions = fill_mask(masked_text)
    for pred in predictions:
        token_str = pred['token_str'].strip()
        # Remove leading subword indicators if present
        if token_str.startswith("##"):
            token_str = token_str[2:]
        if is_valid_word(token_str):
            return token_str
    return "[filtered]"


In [44]:
def predict_next_word(text):
    words = text.split()
    last_word = words[-1] if words else ""
    if last_word in dataset_words:
        return predict_next_word_bilstm(text)
    else:
        new_word = predict_next_word_bert(text)
        if new_word != "[filtered]":
            predicted_sentence = text + " " + new_word
            dataset_words.update(predicted_sentence.split())
            with open(DATASET_PATH, 'a', encoding='utf-8') as f:
                f.write("\n" + predicted_sentence.strip())  # Save entire sentence
        return new_word


In [45]:
# Predict multiple words
def Predict_Next_Words(text, num_words):
    predicted_sentence = text
    for _ in range(num_words):
        next_word = predict_next_word(predicted_sentence)
        predicted_sentence += " " + next_word.strip()
    return predicted_sentence

In [46]:
# Test the model
if __name__ == "__main__":
    input_text = "The book was"
    num_predictions = 5
    result = Predict_Next_Words(input_text, num_predictions)
    print(f"\n🔹 Input: {input_text}\n✅ Predicted Sentence: {result}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 534ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step

🔹 Input: The book was
✅ Predicted Sentence: The book was effects of what's been going


In [47]:
# Take user input
user_input = input("Enter a starting phrase: ")
num_words_to_predict = input("Enter the number of words to predict: ")

# Ensure the second input is converted to an integer
try:
	num_words_to_predict = int(num_words_to_predict)
	predicted_sentence = Predict_Next_Words(user_input, num_words_to_predict)
	print(f"\n🔹 Input: {user_input}\n✅ Predicted Sentence: {predicted_sentence}")
except ValueError:
	print("❌ Error: Please enter a valid number for the number of words to predict.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step

🔹 Input: It is raining 
✅ Predicted Sentence: It is raining  again with
