In [2]:
! pip install tensorflow



In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

In [4]:
# Load and preprocess text
def load_data(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Location of the training corpus. Ensure this file is present in your Colab or local directory.
file_path = "/content/sample_data/Book1.txt"

# Read the whole book, then lower-case it.
raw_text = load_data(file_path)
text = raw_text.lower()

In [5]:
# Show only the beginning of the corpus; echoing the whole book floods the cell output
# and bloats the saved notebook.
text[:500]



In [12]:
# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# oov_token='<oov>' is the Out-Of-Vocabulary token: a word that was not seen while fitting
# is replaced with '<oov>' later on, so unknown words are handled instead of being ignored.
tokenizer = Tokenizer(oov_token='<oov>')

# Analyze the input text and build the word index (mapping of words to unique integers).
# The whole book is passed as a single document.
tokenizer.fit_on_texts([text])

# Vocabulary size used by the embedding and output layers.
# NOTE(review): the extra 1 presumably accounts for index 0, which the word index does not
# use - confirm against the Tokenizer documentation.
total_words = 1 + len(tokenizer.word_index)

In [13]:
# Display the vocabulary size, i.e. len(tokenizer.word_index) + 1 as computed in the tokenization cell.
total_words

6785

In [14]:
# Convert text to sequences
seq_length = 50  # Each input sequence contains 50 words

# texts_to_sequences maps each input text to a list of numbers based on the word index;
# a single text is passed in, so its encoding is element 0 of the result.
encoded_texts = tokenizer.texts_to_sequences([text])
tokens = encoded_texts[0]

input_sequences = []  # container for the token windows built in the following cell

In [15]:
# Slide a window of seq_length + 1 tokens over the corpus: the first seq_length tokens of
# each window are the model input and the last token is the word to predict.
# The list is rebuilt in a single expression instead of appended to, which keeps this cell
# idempotent: re-running it neither duplicates the windows nor fails after input_sequences
# has been turned into a NumPy array by the padding cell.
input_sequences = [
    tokens[end - seq_length:end + 1]
    for end in range(seq_length, len(tokens))
]

In [19]:
# Sanity check: every window is the slice tokens[i - seq_length:i + 1], i.e. seq_length + 1
# items (the 50 context words plus the word that follows them).
len(input_sequences[50])

51

In [20]:
# Pad sequences and split inputs/targets.
# After this cell X holds the inputs and y holds the label for each input.

# Left-pad every window to seq_length + 1 entries and store the result as a NumPy array.
padded_windows = pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre')
input_sequences = np.array(padded_windows)

X = input_sequences[:, :-1]  # all columns except the last one: the context words
y = input_sequences[:, -1]   # last column: the word to predict

In [21]:
# One-hot encode the labels. Note: there are other ways of encoding, such as pre-trained
# word2vec encodings.
# The rank check keeps this cell idempotent: once y is already a 2-D one-hot matrix,
# re-running the cell must not encode it a second time.
# NOTE(review): the one-hot matrix holds len(y) * total_words values; using integer labels
# with 'sparse_categorical_crossentropy' in the compile cell would avoid that memory cost.
if y.ndim == 1:
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the Simple RNN model.
# The input shape is declared with an explicit Input layer rather than Embedding's
# `input_length` argument, which is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(seq_length,), dtype="int32"),  # seq_length word indices per sample
    Embedding(input_dim=total_words, output_dim=64),  # Word embeddings
    SimpleRNN(256, return_sequences=False),  # RNN Layer
    Dense(256, activation='relu'),  # Fully Connected Layer
    Dense(total_words, activation='softmax'),  # Output Layer: one probability per vocabulary index
])



In [24]:
# SimpleRNN(256): 256 is the number of hidden units (the size of the hidden state vector).
# return_sequences=False: the RNN returns only the final hidden state after processing
# the entire sequence.

# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model
model.fit(X, y, batch_size=128, epochs=10)

Epoch 1/10


KeyboardInterrupt: 

In [23]:
# Function to generate text using RNN
def generate_text(seed_text, next_words=50, temperature=None):
    """Extend ``seed_text`` with words predicted one at a time by the trained model.

    Args:
        seed_text: Starting text; it is tokenized with the notebook's ``tokenizer``.
        next_words: Number of prediction steps to run.
        temperature: ``None`` (default) or a value <= 0 selects the most probable word at
            every step (greedy decoding). A positive value samples the next word from the
            predicted distribution rescaled by that temperature, which helps to avoid the
            short repeating loops greedy decoding tends to fall into.

    Returns:
        ``seed_text`` followed by the generated words, separated by single spaces.
    """
    for _ in range(next_words):
        # Re-encode the running text and left-pad it to the seq_length positions the model
        # expects (pad_sequences also trims longer inputs down to maxlen).
        tokenized_input = tokenizer.texts_to_sequences([seed_text])[0]
        tokenized_input = pad_sequences([tokenized_input], maxlen=seq_length, padding='pre')

        # The batch contains a single sequence, so take row 0; copy to float64 so the
        # distribution can be modified and renormalized safely.
        predicted_probs = np.array(model.predict(tokenized_input, verbose=0)[0], dtype=np.float64)

        # Index 0 is the padding value and has no entry in tokenizer.index_word, so it must
        # never be chosen as the next word.
        predicted_probs[0] = 0.0

        if temperature is None or temperature <= 0:
            predicted_index = int(np.argmax(predicted_probs))
        else:
            # Temperature scaling in log space; subtracting the maximum keeps exp() stable.
            scaled = np.log(np.maximum(predicted_probs, 1e-12)) / temperature
            weights = np.exp(scaled - scaled.max())
            weights[0] = 0.0
            weights /= weights.sum()
            predicted_index = int(np.random.choice(len(weights), p=weights))

        predicted_word = tokenizer.index_word.get(predicted_index, "")

        seed_text += " " + predicted_word
    return seed_text

# Generate text using the trained model, starting from a short seed phrase
seed_phrase = "harry looked at"
print(generate_text(seed_phrase))

harry looked at the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he the door and ” he
