In [8]:
!pip install tensorflow
!pip install kaggle
!pip install numpy



In [11]:
# Reinstall tensorflow and numpy to ensure compatibility
!pip install --upgrade --force-reinstall tensorflow numpy
!pip install kaggle

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (



In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
import numpy as np

In [2]:
# Dataset: the Harry Potter books from Kaggle
# url - https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books
def load_data(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its full contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

# Path of the text file for the first book.
file_path = "/content/01 Harry Potter and the Sorcerers Stone.txt"

# Read the book, then lower-case it so the rest of the notebook works on
# case-folded text.
text = load_data(file_path)
text = text.lower()

In [3]:
# Tokenize the text
# NOTE(review): oov_token is an empty string; presumably a placeholder such as
# '<OOV>' was intended — confirm before changing it, since the token is part of
# the fitted vocabulary.
tokenizer = Tokenizer(oov_token='')
tokenizer.fit_on_texts([text])
# +1 so that index 0 (the value pad_sequences pads with) is a valid class id.
total_words = len(tokenizer.word_index) + 1


# Convert the whole text to one flat list of token ids
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50

# At least seq_length + 1 tokens are needed to form a single training window.
if len(tokens) <= seq_length:
    raise ValueError(
        f"Need more than {seq_length} tokens to build a training window, "
        f"got {len(tokens)}."
    )

# Slide a window of (seq_length + 1) tokens over the text, one token at a time:
#   first seq_length tokens (input): used for training the model.
#   last token (target): the label the model tries to predict.
# Every window already holds exactly seq_length + 1 tokens, so no padding is
# required. int32 is the dtype pad_sequences produces by default; np.array
# copies the read-only strided view into a regular array.
input_sequences = np.array(
    np.lib.stride_tricks.sliding_window_view(
        np.asarray(tokens, dtype=np.int32), seq_length + 1
    )
)

#print(input_sequences[0])

# Split inputs/targets:
# after this X will have inputs and y will have label for those inputs
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels , note- there are other ways for
# encoding like pre-trained word2vec encoding and so on.
# This materialises a dense (num_samples, total_words) matrix, which is the
# format the categorical_crossentropy loss expects.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [4]:
class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    The inputs are projected to queries, keys and values with three Dense
    layers, split into ``num_heads`` heads of size ``embed_dim // num_heads``,
    attended per head, merged back together and passed through a final Dense
    layer. No mask is applied, so every position attends to every position.

    Args:
        embed_dim: Width of the Q/K/V projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim`` evenly.
        **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``).

    Raises:
        ValueError: If ``embed_dim`` is not a positive multiple of ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        # Without this check a bad combination only surfaces later as an
        # opaque reshape error inside split_heads.
        if num_heads <= 0 or embed_dim <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be a positive multiple of "
                f"num_heads ({num_heads})."
            )
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.projection_dim = embed_dim // num_heads

        self.query_dense = Dense(embed_dim) # Q Determines "what to focus on"
        self.key_dense = Dense(embed_dim) # K Acts as "labels" to be matched with queries
        self.value_dense = Dense(embed_dim) # V Holds the actual information

        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(softmax(Q K^T / sqrt(projection_dim)) V, attention weights)``."""
        scores = tf.matmul(query, key, transpose_b=True)
        # Scale in the scores' own dtype so the division also works when the
        # layer does not compute in float32.
        scores /= tf.math.sqrt(tf.cast(self.projection_dim, scores.dtype))

        attention_probs = tf.nn.softmax(scores, axis=-1)

        return tf.matmul(attention_probs, value), attention_probs

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch_size, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply attention to ``inputs``, a ``[query, key, value]`` triple.

        Returns a tensor whose last dimension is ``embed_dim``.
        """
        query, key, value = inputs
        batch_size = tf.shape(query)[0] # (batch_size, seq_len, embed_dim)

        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        attention, _ = self.attention(query, key, value)
        # Move the head axis back next to the feature axis before merging heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

        return self.combine_heads(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(MultiHeadAttention, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
        })
        return config

In [5]:
class TransformerBlock(Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        embed_dim: Feature width of the input and output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate used after both sub-layers.
        **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so get_config can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last dimension is ``embed_dim``.
            training: Forwarded to both dropout layers. ``None`` (the default)
                leaves the decision to the dropout layers themselves.

        Returns:
            A tensor with the same shape as ``inputs``.
        """
        attn_output = self.att([inputs, inputs, inputs])
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output) # Residual Connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output) # Residual Connection

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

In [6]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position table, i.e. the longest
            sequence length that has a position embedding.
        vocab_size: Number of rows in the token table.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to the base ``Layer`` (e.g. ``name``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Kept so get_config can report the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the position embeddings."""
        seq_len = tf.shape(x)[-1] # length of the input sequence actually passed in
        positions = tf.range(start=0, limit=seq_len, delta=1) # Generate [0, 1, 2, ..., seq_len-1]
        positions = self.pos_emb(positions) # Each position index is mapped to a trainable embedding of shape (seq_len, embed_dim)
        x = self.token_emb(x) # Each token ID in x is mapped to an embedding of shape (batch_size, seq_len, embed_dim)
        # The position embeddings broadcast over the batch dimension.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [7]:
# Model Parameters
embed_dim = 128  # Embedding size
num_heads = 4    # Number of attention heads
ff_dim = 512     # Feed-forward layer size
maxlen = seq_length # here it is 50 defined above

# total_words is 6663 for this text (it is the size of the last printed shape
# below): the tokenizer's vocabulary size + 1.

# Build the model
inputs = tf.keras.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_words, embed_dim)
x = embedding_layer(inputs)
print(x.shape)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Pass training=None instead of hard-coding True into the graph, so that Keras
# chooses the mode from the call context (dropout active in fit(), inactive in
# predict()).
x = transformer_block(x, training=None)
print(x.shape)
# Keep only the representation of the last position; it is used to predict the
# next token.
x = x[:, -1, :]
print(x.shape)
# Probability distribution over the whole vocabulary.
x = Dense(total_words, activation="softmax")(x)
print(x.shape)
model = tf.keras.Model(inputs=inputs, outputs=x)

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()


(None, 50, 128)
(None, 50, 128)
(None, 128)
(None, 6663)


In [8]:
# Train on the (X, y) pairs for 10 epochs with mini-batches of 32 samples;
# the value returned by fit() is kept in `history`.
history = model.fit(X, y, batch_size=32, epochs=10)

Epoch 1/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 108ms/step - accuracy: 0.0833 - loss: 6.5134
Epoch 2/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 108ms/step - accuracy: 0.1568 - loss: 5.1033
Epoch 3/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 109ms/step - accuracy: 0.2123 - loss: 4.2781
Epoch 4/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m271s[0m 107ms/step - accuracy: 0.2546 - loss: 3.6552
Epoch 5/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 108ms/step - accuracy: 0.3189 - loss: 3.0999
Epoch 6/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 108ms/step - accuracy: 0.3955 - loss: 2.6064
Epoch 7/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 108ms/step - accuracy: 0.4786 - loss: 2.1765
Epoch 8/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 108ms/step - accuracy: 0.5480 - loss:

In [9]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` one word at a time.

    Uses the notebook-level ``tokenizer`` and ``model``. On every step the
    current text is tokenised, left-padded/truncated to
    ``max_sequence_len - 1`` tokens, and the most probable next word is
    appended.

    Args:
        seed_text: Prompt to continue.
        next_words: Maximum number of words to append.
        max_sequence_len: Window length including the predicted token, so the
            model receives ``max_sequence_len - 1`` tokens.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # One sample is predicted, so flatten to a single probability vector.
        probabilities = np.asarray(predicted).reshape(-1)
        # Index 0 is the padding value and has no entry in index_word; leave it
        # out of the argmax so it can never be selected (it would raise
        # KeyError on lookup).
        next_id = int(np.argmax(probabilities[1:])) + 1

        predicted_word = tokenizer.index_word.get(next_id)
        if predicted_word is None:
            # No word is mapped to this id: stop instead of raising KeyError.
            break
        seed_text += " " + predicted_word
    return seed_text

# Generate text
# Extend the seed by 50 words. max_sequence_len is seq_length + 1 because
# generate_text pads the prompt to max_sequence_len - 1 tokens.
seed_text = "harry looked at"
generated_text = generate_text(seed_text, next_words=50, max_sequence_len=seq_length + 1)
# Length of the generated string in characters (not words).
print(len(generated_text))

276


In [10]:
# Show the seed text followed by the generated continuation.
print(generated_text)



harry looked at the troll blood in the wooden club hermione did to the ground and then started throwing the apart from the castle overhead most boring class was standing on their way across the hall for the last word harry stood and ron didn’t soothe ron at all in a good kick
