In [14]:
# Install necessary libraries
!pip install keras keras-preprocessing tensorflow spacy pydantic
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Import libraries

In [15]:
import spacy
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from random import randint
from pickle import dump, load

In [16]:
# Read file
def read_file(filepath):
    with open(filepath, 'r') as f:
        return f.read()

# Tokenize and clean text
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623

def separate_punc(text):
    return [token.text.lower() for token in nlp(text) if token.text.isalpha()]

text = read_file('moby_dick_four_chapters.txt')
tokens = separate_punc(text)

In [17]:
# Create sequences
seq_len = 25
sequences = [
    tokens[i-seq_len:i]
    for i in range(seq_len, len(tokens))
]

# Tokenize sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
encoded_sequences = np.array(tokenizer.texts_to_sequences(sequences))

# Prepare input and target
X, y = encoded_sequences[:, :-1], encoded_sequences[:, -1]
y = to_categorical(y, num_classes=len(tokenizer.word_counts) + 1)


In [18]:
# Build the model
def create_model(vocab_size, seq_len):
    model = Sequential([
        Embedding(vocab_size, 25, input_length=seq_len),
        LSTM(150, return_sequences=True),
        LSTM(150),
        Dense(150, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = create_model(len(tokenizer.word_counts) + 1, seq_len)
model.summary()

In [24]:
# Train the model
model.fit(X, y, batch_size=128, epochs= 10, verbose=1)

# Save the model and tokenizer
model.save('model1.h5')
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Epoch 1/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 199ms/step - accuracy: 0.1033 - loss: 4.5954
Epoch 2/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 240ms/step - accuracy: 0.1060 - loss: 4.5453
Epoch 3/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 187ms/step - accuracy: 0.1041 - loss: 4.5137
Epoch 4/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 230ms/step - accuracy: 0.1025 - loss: 4.4955
Epoch 5/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 218ms/step - accuracy: 0.1075 - loss: 4.4655
Epoch 6/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 168ms/step - accuracy: 0.1079 - loss: 4.4364
Epoch 7/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 247ms/step - accuracy: 0.1049 - loss: 4.4192
Epoch 8/10
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 179ms/step - accuracy: 0.1075 - loss: 4.3967
Epoch 9/10
[1m87/87[0m [32m━━



In [23]:
# Train the model
model.fit(X, y, batch_size=16, epochs= 10, verbose=1)

# Save the model and tokenizer
model.save('model2.h5')
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Epoch 1/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 34ms/step - accuracy: 0.0684 - loss: 5.7085
Epoch 2/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 43ms/step - accuracy: 0.0713 - loss: 5.5436
Epoch 3/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 83ms/step - accuracy: 0.0786 - loss: 5.4406
Epoch 4/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 63ms/step - accuracy: 0.0829 - loss: 5.3230
Epoch 5/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 69ms/step - accuracy: 0.0795 - loss: 5.2265
Epoch 6/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 69ms/step - accuracy: 0.0884 - loss: 5.1216
Epoch 7/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 97ms/step - accuracy: 0.0921 - loss: 5.0208
Epoch 8/10
[1m696/696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 73ms/step - accuracy: 0.0921 - loss: 4.8991
Epoch 9/10
[1m696/696[



In [27]:
from keras.models import load_model

# Load the models
model1 = load_model('model1.h5')
model2 = load_model('model2.h5')

# Example: Evaluate on test data (X_test, y_test)
model1_results = model1.evaluate(X, y, verbose=0)
model2_results = model2.evaluate(X, y, verbose=0)

print(f"Model 1 Results: Loss = {model1_results[0]}, Accuracy = {model1_results[1]}")
print(f"Model 2 Results: Loss = {model2_results[0]}, Accuracy = {model2_results[1]}")



Model 1 Results: Loss = 4.306352138519287, Accuracy = 0.11404453963041306
Model 2 Results: Loss = 4.615689277648926, Accuracy = 0.10326867550611496


In [20]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """
    INPUTS:
    model : Trained model
    tokenizer : Tokenizer fit on text data
    seq_len : Length of training sequences
    seed_text : Initial text to start generating
    num_gen_words : Number of words to generate
    """
    output_text = []  # Final output words
    input_text = seed_text  # Initial seed text

    for _ in range(num_gen_words):
        # Encode the input text
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad the encoded sequence to match the sequence length
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict probabilities and find the index of the word with the highest probability
        pred_word_encoded = model.predict(pad_encoded, verbose=0)
        pred_word_ind = np.argmax(pred_word_encoded, axis=1)[0]

        # Map the predicted index to the corresponding word
        pred_word = tokenizer.index_word.get(pred_word_ind, '')

        # Update the input text with the predicted word and append to output
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    # Return the generated words as a single string
    return ' '.join(output_text)



In [21]:
# Generate and explore text
random_seed = randint(0, len(sequences))
seed_text = ' '.join(sequences[random_seed])
print(f"Seed Text: {seed_text}\n")
generated_text = generate_text(model, tokenizer, seq_len, seed_text, num_gen_words=50)
print(f"Generated Text: {generated_text}")


Seed Text: ten inches thick in a hard asphaltic weary for me when i struck my foot against the flinty projections because from hard remorseless service the

Generated Text: bed of the little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little little
