In [None]:
# https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

# Load and preprocess text
def load_data(file_path):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    # Read mode is the default for open(); the context manager closes the handle.
    with open(file_path, encoding='utf-8') as source:
        return source.read()

file_path = "hp_1.txt"  # Ensure you have this file in your Colab or local directory
text = load_data(file_path).lower()

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across "Restart & Run All" (GPU kernels may still add small noise).
tf.keras.utils.set_random_seed(42)

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# oov_token: a word not seen while fitting the tokenizer is mapped to <OOV>
# later on instead of being silently dropped.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])  # builds the word -> integer id mapping (word_index)
total_words = len(tokenizer.word_index) + 1  # +1 because id 0 is reserved for padding

# Convert the whole text into one long list of word ids
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence contains 50 words

if len(tokens) <= seq_length:
    raise ValueError(
        f"Need more than {seq_length} tokens to build a training window, "
        f"got {len(tokens)}"
    )

# Slide a window of (seq_length + 1) ids over the text:
#   first seq_length ids -> model input
#   last id              -> label the model tries to predict
# Every window already has exactly seq_length + 1 ids, so no padding is needed.
window_size = seq_length + 1
input_sequences = np.array(
    [tokens[end - window_size:end] for end in range(window_size, len(tokens) + 1)],
    dtype=np.int32,
)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Labels stay as integer word ids. The sparse loss below computes the same
# cross-entropy as one-hot labels would, without materialising a dense matrix
# that holds total_words values for every single training window.

# Build the Simple RNN model
model = Sequential([
    # Word embeddings; the sequence length is inferred from the data
    # (the input_length argument is deprecated in Keras 3).
    Embedding(input_dim=total_words, output_dim=64),
    # 256 hidden units; return_sequences=False -> only the final hidden state
    # after reading the whole sequence is passed on.
    SimpleRNN(256, return_sequences=False),
    Dense(256, activation='relu'),  # Fully Connected Layer
    Dense(total_words, activation='softmax')  # One probability per vocabulary id
])

# Compile the model ('accuracy' works with integer labels under the sparse loss)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=30, batch_size=128)

# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """Greedily extend ``seed_text`` one word at a time with the trained model.

    Relies on the notebook globals ``tokenizer``, ``model``, ``seq_length`` and
    ``pad_sequences``.

    Args:
        seed_text: Prompt the generated words are appended to.
        next_words: Number of words to generate.

    Returns:
        The prompt followed by ``next_words`` space-separated predicted words.
    """
    pieces = [seed_text]
    for _step in range(next_words):
        # Re-encode everything produced so far and left-pad it to the model's
        # input width.
        context = " ".join(pieces)
        model_input = pad_sequences(
            [tokenizer.texts_to_sequences([context])[0]],
            maxlen=seq_length,
            padding='pre',
        )

        # Greedy decoding: always take the highest-probability vocabulary id.
        best_id = np.argmax(model.predict(model_input, verbose=0))

        # Ids without a word (e.g. the padding id 0) are rendered as "<OOV>".
        pieces.append(tokenizer.index_word.get(best_id, "<OOV>"))
    return " ".join(pieces)

# Continue a short prompt with the trained RNN (default length: 50 words)
print(generate_text(seed_text="harry looked at"))




Epoch 1/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 175ms/step - accuracy: 0.0413 - loss: 6.9476
Epoch 2/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 174ms/step - accuracy: 0.0633 - loss: 6.2668
Epoch 3/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 184ms/step - accuracy: 0.0958 - loss: 5.8224
Epoch 4/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 180ms/step - accuracy: 0.1183 - loss: 5.4685
Epoch 5/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 178ms/step - accuracy: 0.1323 - loss: 5.1892
Epoch 6/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 176ms/step - accuracy: 0.1442 - loss: 4.9743
Epoch 7/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 175ms/step - accuracy: 0.1536 - loss: 4.7675
Epoch 8/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 177ms/step - accuracy: 0.1620 - loss: 4.5828
Epoch 9/

The model learns local patterns, not long-term dependencies

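- Simple RNNs struggle with long-range dependencies: the hidden state is rewritten at every step and gradients vanish over long sequences, so information from the early words of a 50-word window has largely faded by the time the next word is predicted.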
- This is why the text seems grammatically okay but lacks deeper context.

The model generates phrases based on probabilities

- It predicts the most likely next word given the past words.
- It does not understand meaning but follows statistical patterns.
- It captures the writing style but lacks coherence.

Words appear logically related but do not form a strong narrative.
The model does not truly "understand" the book; it just mimics word usage.

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Function to load dataset
def load_data(file_path):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    # Read mode is the default for open(); the context manager closes the handle.
    with open(file_path, encoding='utf-8') as source:
        return source.read()

# Load Harry Potter book text
file_path = "hp_1.txt"
text = load_data(file_path).lower()

# Seed Python, NumPy and TensorFlow so weight initialisation, batch shuffling and
# NumPy-based sampling are repeatable across "Restart & Run All"
# (GPU kernels may still add small noise).
tf.keras.utils.set_random_seed(42)

# Tokenize the text: build the word -> integer id mapping; unseen words map to <OOV>
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 because id 0 is reserved for padding

# Convert the whole text into one long list of word ids
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence will have 50 words

if len(tokens) <= seq_length:
    raise ValueError(
        f"Need more than {seq_length} tokens to build a training window, "
        f"got {len(tokens)}"
    )

# Slide a window of (seq_length + 1) ids over the text: the first seq_length ids
# are the input (X) and the last id is the label (y). Every window already has
# exactly seq_length + 1 ids, so no padding is needed.
window_size = seq_length + 1
input_sequences = np.array(
    [tokens[end - window_size:end] for end in range(window_size, len(tokens) + 1)],
    dtype=np.int32,
)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Labels stay as integer word ids. The sparse loss below computes the same
# cross-entropy as one-hot labels would, without materialising a dense matrix
# that holds total_words values for every single training window.

# LSTM Model
model = Sequential([
    # The sequence length is inferred from the data
    # (the input_length argument is deprecated in Keras 3).
    Embedding(input_dim=total_words, output_dim=100),
    LSTM(256, return_sequences=True),  # First LSTM layer: emits one state per time step
    LSTM(256),  # Second LSTM layer: keeps only the final state
    Dense(total_words, activation='softmax')  # One probability per vocabulary id
])

# Compile the model ('accuracy' works with integer labels under the sparse loss)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, batch_size=128)

# Function to Generate Text
def generate_text(seed_text, next_words=50, temperature=1.0):
    """Extend ``seed_text`` by sampling words from the trained model.

    Relies on the notebook globals ``tokenizer``, ``model``, ``seq_length`` and
    ``pad_sequences``.

    Args:
        seed_text: Prompt the generated words are appended to.
        next_words: Number of sampling steps to run.
        temperature: Sampling temperature. Each probability ``p`` is re-weighted
            to ``p ** (1 / temperature)`` before sampling, so values below 1
            sharpen the distribution and values above 1 flatten it. ``0`` means
            greedy decoding (always the most probable word).

    Returns:
        The prompt followed by one space-separated word per step. An id with no
        entry in ``tokenizer.index_word`` contributes an empty word.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        # Work in float64: repeating this arithmetic in the model's float32
        # output underflows to an all-zero distribution at small temperatures.
        probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        if temperature == 0:
            # Limit of temperature -> 0: all mass on the most probable id.
            predicted_index = int(np.argmax(probs))
        else:
            # Same distribution as softmax(log(p) / temperature), written as a
            # power so zero probabilities need no log() and stay exactly zero.
            # Dividing by the maximum first keeps the largest weight at 1.
            weights = (probs / probs.max()) ** (1.0 / temperature)
            predicted_index = int(
                np.random.choice(len(weights), p=weights / weights.sum())
            )

        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word

    return seed_text

# Sample a 50-word continuation of a short prompt from the trained LSTM
print(generate_text(seed_text="harry looked at", next_words=50, temperature=0.7))




Epoch 1/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m609s[0m 955ms/step - accuracy: 0.0444 - loss: 7.0546
Epoch 2/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 952ms/step - accuracy: 0.0545 - loss: 6.3644
Epoch 3/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 954ms/step - accuracy: 0.0850 - loss: 6.0058
Epoch 4/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m622s[0m 954ms/step - accuracy: 0.1083 - loss: 5.6874
Epoch 5/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 953ms/step - accuracy: 0.1193 - loss: 5.4484
Epoch 6/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m627s[0m 960ms/step - accuracy: 0.1261 - loss: 5.2622
Epoch 7/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 955ms/step - accuracy: 0.1343 - loss: 5.1024
Epoch 8/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 959ms/step - accuracy: 0.1428 - loss: 4.9089
Epoch 9/

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
import numpy as np

# Function to load dataset
def load_data(file_path):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.
    """
    # Read mode is the default for open(); the context manager closes the handle.
    with open(file_path, encoding='utf-8') as source:
        return source.read()

# Load Harry Potter book text
file_path = "hp_1.txt"
text = load_data(file_path).lower()

# Seed Python, NumPy and TensorFlow so weight initialisation, batch shuffling and
# NumPy-based sampling are repeatable across "Restart & Run All"
# (GPU kernels may still add small noise).
tf.keras.utils.set_random_seed(42)

# Tokenize the text: build the word -> integer id mapping; unseen words map to <OOV>
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # +1 because id 0 is reserved for padding

# Convert the whole text into one long list of word ids
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence will have 50 words

if len(tokens) <= seq_length:
    raise ValueError(
        f"Need more than {seq_length} tokens to build a training window, "
        f"got {len(tokens)}"
    )

# Slide a window of (seq_length + 1) ids over the text: the first seq_length ids
# are the input (X) and the last id is the label (y). Every window already has
# exactly seq_length + 1 ids, so no padding is needed.
window_size = seq_length + 1
input_sequences = np.array(
    [tokens[end - window_size:end] for end in range(window_size, len(tokens) + 1)],
    dtype=np.int32,
)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Labels stay as integer word ids. The sparse loss below computes the same
# cross-entropy as one-hot labels would, without materialising a dense matrix
# that holds total_words values for every single training window.

# GRU Model
model = Sequential([
    # The sequence length is inferred from the data
    # (the input_length argument is deprecated in Keras 3).
    Embedding(input_dim=total_words, output_dim=100),
    GRU(256, return_sequences=True),  # First GRU layer: emits one state per time step
    GRU(256),  # Second GRU layer: keeps only the final state
    Dense(total_words, activation='softmax')  # One probability per vocabulary id
])

# Compile the model ('accuracy' works with integer labels under the sparse loss)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=20, batch_size=128)

# Function to Generate Text
def generate_text(seed_text, next_words=50, temperature=1.0):
    """Extend ``seed_text`` by sampling words from the trained model.

    Relies on the notebook globals ``tokenizer``, ``model``, ``seq_length`` and
    ``pad_sequences``.

    Args:
        seed_text: Prompt the generated words are appended to.
        next_words: Number of sampling steps to run.
        temperature: Sampling temperature. Each probability ``p`` is re-weighted
            to ``p ** (1 / temperature)`` before sampling, so values below 1
            sharpen the distribution and values above 1 flatten it. ``0`` means
            greedy decoding (always the most probable word).

    Returns:
        The prompt followed by one space-separated word per step. An id with no
        entry in ``tokenizer.index_word`` contributes an empty word.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=seq_length, padding='pre')

        # Work in float64: repeating this arithmetic in the model's float32
        # output underflows to an all-zero distribution at small temperatures.
        probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        if temperature == 0:
            # Limit of temperature -> 0: all mass on the most probable id.
            predicted_index = int(np.argmax(probs))
        else:
            # Same distribution as softmax(log(p) / temperature), written as a
            # power so zero probabilities need no log() and stay exactly zero.
            # Dividing by the maximum first keeps the largest weight at 1.
            weights = (probs / probs.max()) ** (1.0 / temperature)
            predicted_index = int(
                np.random.choice(len(weights), p=weights / weights.sum())
            )

        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word

    return seed_text

# Sample a 50-word continuation of a short prompt from the trained GRU
print(generate_text(seed_text="harry looked at", next_words=50, temperature=0.7))


Epoch 1/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 863ms/step - accuracy: 0.0410 - loss: 7.2053
Epoch 2/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 853ms/step - accuracy: 0.0453 - loss: 6.6918
Epoch 3/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 853ms/step - accuracy: 0.0889 - loss: 6.0476
Epoch 4/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m559s[0m 849ms/step - accuracy: 0.1282 - loss: 5.4514
Epoch 5/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 854ms/step - accuracy: 0.1482 - loss: 4.9923
Epoch 6/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m560s[0m 851ms/step - accuracy: 0.1716 - loss: 4.5540
Epoch 7/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 853ms/step - accuracy: 0.2031 - loss: 4.1501
Epoch 8/20
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m540s[0m 853ms/step - accuracy: 0.2495 - loss: 3.7553
Epoch 9/