In [1]:
# Install necessary libraries
!pip install keras keras-preprocessing tensorflow spacy pydantic
!python -m spacy download en_core_web_sm

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by 

In [None]:
from google.colab import files
uploaded = files.upload()

# Import libraries

In [2]:
import spacy
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from random import randint
from pickle import dump, load

In [3]:
# Read file
def read_file(filepath):
    with open(filepath, 'r') as f:
        return f.read()

# Tokenize and clean text
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
nlp.max_length = 1198623

def separate_punc(text):
    return [token.text.lower() for token in nlp(text) if token.text.isalpha()]

text = read_file('Harry_Potter_1_Chapter1.txt')
tokens = separate_punc(text)



In [4]:
# Create sequences
seq_len = 25
sequences = [
    tokens[i-seq_len:i]
    for i in range(seq_len, len(tokens))
]

# Tokenize sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
encoded_sequences = np.array(tokenizer.texts_to_sequences(sequences))

# Prepare input and target
X, y = encoded_sequences[:, :-1], encoded_sequences[:, -1]
y = to_categorical(y, num_classes=len(tokenizer.word_counts) + 1)


In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

def create_model(vocab_size, seq_len, lstm_units):
    model = Sequential([
        Embedding(vocab_size, 50, input_length=seq_len),
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        Dense(256, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

vocab_size = len(tokenizer.word_index) + 1

# Small Model
small_model = create_model(vocab_size, seq_len, lstm_units=50)

# Big Model
big_model = create_model(vocab_size, seq_len, lstm_units=100)




In [6]:
# Small Model
small_model.fit(X, y, batch_size=128, epochs=20)

# Save the small model
small_model.save('harry_potter_small_model.h5')

Epoch 1/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 49ms/step - accuracy: 0.0274 - loss: 6.8791
Epoch 2/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.0415 - loss: 6.0097
Epoch 3/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.0415 - loss: 5.9436
Epoch 4/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.0440 - loss: 5.9617
Epoch 5/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.0482 - loss: 5.9070
Epoch 6/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.0350 - loss: 5.8867
Epoch 7/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 74ms/step - accuracy: 0.0390 - loss: 5.8902
Epoch 8/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.0436 - loss: 5.7748
Epoch 9/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━



In [7]:
# Big Model
big_model.fit(X, y, batch_size=128, epochs=40)

# Save the big model
big_model.save('harry_potter_big_model.h5')

Epoch 1/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 118ms/step - accuracy: 0.0395 - loss: 6.7864
Epoch 2/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 155ms/step - accuracy: 0.0370 - loss: 6.0267
Epoch 3/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 104ms/step - accuracy: 0.0439 - loss: 5.9236
Epoch 4/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 115ms/step - accuracy: 0.0393 - loss: 5.9630
Epoch 5/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 117ms/step - accuracy: 0.0483 - loss: 5.8778
Epoch 6/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 102ms/step - accuracy: 0.0474 - loss: 5.8156
Epoch 7/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 146ms/step - accuracy: 0.0414 - loss: 5.7363
Epoch 8/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 105ms/step - accuracy: 0.0422 - loss: 5.6604
Epoch 9/40
[1m36/36[0m [32m━━━━━━━━━━



In [9]:
from keras.models import load_model

# Load the models
model1 = load_model('/content/harry_potter_small_model.h5')
model2 = load_model('/content/harry_potter_big_model.h5')

# Evaluate on test data (X_test, y_test)
model1_results = model1.evaluate(X, y, verbose=0)
model2_results = model2.evaluate(X, y, verbose=0)

print(f"Model 1 Results: Loss = {model1_results[0]}, Accuracy = {model1_results[1]}")
print(f"Model 2 Results: Loss = {model2_results[0]}, Accuracy = {model2_results[1]}")



Model 1 Results: Loss = 4.9745259284973145, Accuracy = 0.06706244498491287
Model 2 Results: Loss = 3.785808563232422, Accuracy = 0.1312664896249771


In [10]:
from keras.models import load_model
from random import randint

def generate_text(model, tokenizer, seq_len, seed_text, num_words):
    input_text = seed_text
    output_text = []

    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        padded = pad_sequences([encoded], maxlen=seq_len, truncating='pre')
        pred_index = np.argmax(model.predict(padded), axis=1)[0]
        pred_word = tokenizer.index_word.get(pred_index, '')
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)


compare the smaler lstm_unit vs bigger lstm_unit

In [12]:
# Seed Text
seed_index = randint(0, len(sequences))
seed_text = ' '.join(sequences[seed_index])

# Generate text
small_model_text = generate_text(small_model, tokenizer, seq_len, seed_text, 500)
big_model_text = generate_text(big_model, tokenizer, seq_len, seed_text, 500)

# Save the generated chapters
with open('harry_potter_small_model.txt', 'w', encoding='utf-8') as f:
    f.write(small_model_text)

with open('harry_potter_big_model.txt', 'w', encoding='utf-8') as f:
    f.write(big_model_text)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30

In [13]:
with open('harry_potter_small_model.txt', 'r', encoding='utf-8') as f:
    small_model_chapter = f.read()

with open('harry_potter_big_model.txt', 'r', encoding='utf-8') as f:
    big_model_chapter = f.read()


In [None]:
from keras.layers import Dropout

# adding dropout layer
def create_model_adding_drop_out_layer(vocab_size, seq_len, lstm_units=150):
    model = Sequential([
        Embedding(vocab_size, 100, input_length=seq_len),
        LSTM(lstm_units, return_sequences=True),
        Dropout(0.2),
        LSTM(lstm_units),
        Dropout(0.2),
        Dense(lstm_units, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
model3 = create_model_adding_drop_out_layer(vocab_size, seq_len, lstm_units = 100)

In [None]:
# Model3
model3.fit(X, y, batch_size=128, epochs=40)

# Save the model3
model3.save('harry_potter_model3.h5')

increase lstm_units

In [None]:
model4 = create_model_adding_drop_out_layer(vocab_size, seq_len, lstm_units = 200)

In [None]:
# Model3
model4.fit(X, y, batch_size=128, epochs=40)

# Save the model3
model4.save('harry_potter_model3.h5')

In [None]:
# Generate text
model3 = generate_text(small_model, tokenizer, seq_len, seed_text, 500)
model4 = generate_text(big_model, tokenizer, seq_len, seed_text, 500)

# Save the generated chapters
with open('harry_potter_model3.txt', 'w', encoding='utf-8') as f:
    f.write(small_model_text)

with open('harry_potter_model4.txt', 'w', encoding='utf-8') as f:
    f.write(big_model_text)


Let's modify the model to include dropout layers and increase the number of epochs from 40 to 100:

In [None]:
# Model3
model4.fit(X, y, batch_size=128, epochs=1003)

# Save the model3
model4.save('harry_potter_model3.h5')