In [1]:
import numpy as np
from tensorflow.keras.models import Sequential # building neural networks layer by layer sequentially.
from tensorflow.keras.layers import LSTM, Dense, Embedding
# dense - Connects every neuron in the previous layer to every neuron in the next layer.
# embedding - converts integer to vectors
from tensorflow.keras.preprocessing.text import Tokenizer
#The Tokenizer class in Keras is used to vectorize text into integer sequences 
#that can be used as input to neural network models
from tensorflow.keras.preprocessing.sequence import pad_sequences
# This is useful when working with text data where sentences can have different lengths.

In [10]:
# Training corpus: nursery rhymes, tongue twisters and proverbs, one sentence per entry
text_data = [
    "The quick brown fox jumped over the lazy dog.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
    "The itsy bitsy spider crawled up the water spout.",
    "Humpty Dumpty sat on a wall, Humpty Dumpty had a great fall.",
    "Jack and Jill went up the hill to fetch a pail of water.",
    "Mary had a little lamb whose fleece was white as snow.",
    "Baa baa black sheep, have you any wool?",
    "Twinkle, twinkle little star, how I wonder what you are.",
    "Roses are red, violets are blue, sugar is sweet, and so are you.",
    "I scream, you scream, we all scream for ice cream!",
    "The early bird gets the worm.",
    "The grass is always greener on the other side.",
    "When it rains, it pours.",
    "A penny saved is a penny earned.",
    "The pen is mightier than the sword.",
    "Laughter is the best medicine.",
    "Waste not, want not.",
    "A picture is worth a thousand words.",
    "Absence makes the heart grow fonder.",
    "Out of sight, out of mind.",
    "The grass isn't always greener on the other side.",
    "Don't put all your eggs in one basket.",
    "A leopard can't change its spots.",
    "Time flies when you're having fun.",
    "It isn't over until the fat lady sings.",
    "The best things in life are free.",
    "It takes one to know one.",
    "Beggars can't be choosers.",
    "Fortune favors the bold.",
    "The squeaky wheel gets the grease.",
    "You can catch more flies with honey than with vinegar.",
    "The journey of a thousand miles begins with a single step.",
    "All's well that ends well.",
    "It's always darkest before the dawn.",
    "A picture paints a thousand words.",
    "When the cat's away, the mice will play.",
    "The early bird gets the worm.",
    "Rome wasn't built in a day.",
    "If at first you don't succeed, try, try again.",
    "Fool me once, shame on you; fool me twice, shame on me.",
    "Two wrongs don't make a right.",
    "Easy come, easy go.",
    "You can't teach an old dog new tricks.",
    "Don't bite the hand that feeds you.",
    "You can lead a horse to water but you can't make it drink.",
    "Don't count your chickens before they hatch.",
    "Necessity is the mother of invention.",
    "Better late than never.",
    "Look before you leap.",
    "Two peas in a pod.",
    "Appearances can be deceiving.",
    "Curiosity killed the cat.",
    "It's not over till the fat lady sings.",
    "The more things change, the more they stay the same.",
    "Actions speak louder than words.",
    "It never rains but pours.",
    "Truth will out.",
    "No man is an island.",
    "The best laid plans of mice and men often go awry.",
    "A drowning man will clutch at a straw.",
    "An eye for an eye only ends up making the whole world blind.",
    "The road to hell is paved with good intentions.",
    "Where there's a will, there's a way.",
    "Still waters run deep.",
    "Variety is the spice of life.",
    "History repeats itself.",
    "The devil is in the details.",
    "When life gives you lemons, make lemonade.",
]

# Fit a tokenizer on the corpus so every distinct word gets an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# Number of distinct words plus one; this value is used below as the
# Embedding input size and as the number of output units
total_words = 1 + len(tokenizer.word_index)
print(total_words)  # distinct word count + 1

286


In [11]:
# Build the training examples: for every sentence, emit each prefix of its
# token-id list that is at least two tokens long (ids[:2], ids[:3], ...,
# the full list). In each prefix the last id is the word to predict and the
# ids before it are the context.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(text_data)
    for end in range(2, len(encoded) + 1)
]

In [12]:
# Bring every prefix to the length of the longest one
max_sequence_len = max(map(len, input_sequences))

# padding='pre' puts the filler at the front, so the real tokens (and the
# target word in the last position) stay right-aligned
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [13]:
# Split every padded row into model input and target.
# X: all columns except the last -> the context word ids, a 2-D array of
#    shape (num_sequences, max_sequence_len - 1)
# y: the last column only -> the id of the word to predict, a 1-D array of
#    shape (num_sequences,)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [16]:
# Assemble the next-word prediction network as a stack of three layers
model = Sequential([
    # Maps each integer word id to a 50-dimensional vector.
    # total_words is the size of the id range; input_length is the length
    # of every context sequence in X.
    Embedding(total_words, 50, input_length=max_sequence_len - 1),
    # LSTM (Long Short-Term Memory) layer with 100 units that reads the
    # embedded sequence
    LSTM(100),
    # One output unit per word id; softmax turns the scores into a
    # probability distribution over the vocabulary, e.g.
    # softmax([2.1, 0.8, 1.5]) ~= [0.549, 0.150, 0.301]
    # (a higher score gets a higher probability)
    Dense(total_words, activation='softmax'),
])

In [18]:
# Configure the model for training:
#   optimizer - adam
#   loss      - sparse categorical crossentropy (the targets in y are
#               integer word ids, not one-hot vectors)
#   metrics   - accuracy is reported during training
# Overall flow: integer-encoded context -> Embedding -> LSTM -> softmax over
# the vocabulary, trained to predict the next word.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Fit the network on the (context, next word) pairs.
# epochs  - how many full passes over the training data to make; more epochs
#           means more training.
# verbose - log detail: 0 = silent, 1 = progress bar, 2 = one line per epoch.
model.fit(
    x=X,
    y=y,
    epochs=100,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x24ac1c78e50>

In [20]:
# Function to generate the next word
def generate_next_word(seed_text):
    """Predict the single most likely word to follow ``seed_text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``max_sequence_len``
    created in the earlier cells, so those cells must have been run first.

    Args:
        seed_text: The text to continue, as a plain string.

    Returns:
        The predicted next word as a string, or "" when the predicted index
        has no entry in the tokenizer's vocabulary.
    """
    # Encode the seed text as a list of integer word ids
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]

    # Left-pad to the same length as the training inputs
    model_input = pad_sequences([token_ids], maxlen=max_sequence_len - 1, padding='pre')

    # One row of probabilities per input sequence; exactly one row here
    predicted_probs = model.predict(model_input, verbose=0)

    # Take the arg-max of that single row explicitly rather than of the
    # flattened batch output
    predicted_index = int(np.argmax(predicted_probs[0]))

    # index_word is the tokenizer's id -> word mapping (the inverse of
    # word_index), so a direct lookup replaces scanning the whole vocabulary
    return tokenizer.index_word.get(predicted_index, "")
# Note: this returns only one word; call it repeatedly on the growing text to generate a sequence word by word.

In [24]:
# Take user input and generate next words
# Prompt once for a seed sentence and print it with the predicted next word.
# A single prompt is all this cell needs, so no loop is used.
user_input = input("Enter a sentence: ")
next_word = generate_next_word(user_input)
generated_text = user_input + " " + next_word
print("Generated sentence:", generated_text)


Enter a sentence: its better if you stay
Generated sentence: its better if you stay the


In [None]:
# Samples
# An embedding maps each word to a vector, for example (made-up 2-D values):
# dog -> [0.2, 0.6]
# cat -> [0.3, 0.4]
# car -> [0.7, 0.2]
# bike -> [0.1, 0.8]

# --- Tokenizer example ---
from tensorflow.keras.preprocessing.text import Tokenizer

# The demo uses its own sample_* names so that running this cell does not
# overwrite the `text_data` and `tokenizer` that generate_next_word relies on.
sample_texts = ["The cat is cute", "The dog is hairy", "The bird can sing"]

sample_tokenizer = Tokenizer()

# Fit the tokenizer on the sample sentences
sample_tokenizer.fit_on_texts(sample_texts)

# Convert each sentence to a list of integer word ids
sample_sequences = sample_tokenizer.texts_to_sequences(sample_texts)

print(sample_sequences)
print(sample_tokenizer.word_index)

# Ids are assigned by descending word frequency (ties keep first-seen order),
# so the expected output is:
# [[1, 3, 2, 4], [1, 5, 2, 6], [1, 7, 8, 9]]
# {'the': 1, 'is': 2, 'cat': 3, 'cute': 4, 'dog': 5, 'hairy': 6, 'bird': 7, 'can': 8, 'sing': 9}

# --- pad_sequences example ---
from tensorflow.keras.preprocessing.sequence import pad_sequences

sample_ragged = [[1, 2, 3], [1, 2], [1]]

# With the defaults, both padding and truncation happen at the front, so the
# expected output is:
# [[2 3]
#  [1 2]
#  [0 1]]
sample_padded = pad_sequences(sample_ragged, maxlen=2)
print(sample_padded)