In [1]:
# Q2. Import libraries and define dataset
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
import numpy as np


In [2]:
# Raw training corpus, held as one multi-line string literal.
# NOTE(review): the "... (rest of faqs text) ..." line is part of the literal as
# written here - presumably a placeholder for elided FAQ text; confirm the full
# corpus is pasted in before training.
text_data = """About the Program
What is the course fee for Data Science Mentorship Program (DSMP 2023)
The course follows a monthly subscription model where you have to make monthly payments of Rs 799/month.
... (rest of faqs text) ...
Discussion on Job hunting strategies"""

In [3]:

# Learn the word -> integer-id vocabulary from the corpus.
# fit_on_texts takes a list of documents, so the corpus is wrapped in a
# single-element list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[text_data])

In [4]:
# Turn every corpus line into growing token-id prefixes: a line with ids
# [a, b, c] contributes [a, b] and [a, b, c]. The last id of each prefix is the
# word to predict; everything before it is the context.
input_sequences = []
for line in text_data.split('\n'):
    token_ids = tokenizer.texts_to_sequences([line])[0]
    # Prefixes of length 2..len(token_ids); lines with fewer than two tokens add nothing.
    input_sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

# Left-pad every prefix to the longest one so the target word always sits in
# the final column.
max_len = max(len(seq) for seq in input_sequences)
padded_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Context = all columns but the last; target = last column, one-hot encoded.
# The extra class (+1) leaves room for id 0, which pad_sequences uses as filler.
X = padded_sequences[:, :-1]
y = to_categorical(
    padded_sequences[:, -1],
    num_classes=len(tokenizer.word_index) + 1,
)

In [5]:
# Number of distinct ids the model must handle: one per known word, plus one
# for id 0 (the padding value), matching the num_classes used for y.
vocab_size = 1 + len(tokenizer.word_index)

In [6]:
# Seed Python, NumPy and the TensorFlow backend so weight initialisation and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Next-word model: token ids -> 100-d embeddings -> two stacked LSTMs ->
# softmax over the vocabulary.
model = Sequential()
# Declare the input shape with an explicit Input layer. Embedding's
# `input_length` argument is deprecated (Keras 3 ignores it with a warning),
# which otherwise leaves the model unbuilt until build()/fit() is called.
model.add(tf.keras.Input(shape=(max_len - 1,), dtype="int32"))
model.add(Embedding(vocab_size, 100))
# return_sequences=True so the second LSTM receives the full sequence of
# hidden states rather than only the last one.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))



In [7]:
# Multi-class classification against the one-hot targets in y.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Build for (batch, max_len-1) inputs so the summary below can list output
# shapes and parameter counts.
model.build(input_shape=(None, max_len - 1))

model.summary()

In [8]:

# Train on every (context, next-word) pair. Keep the returned History so the
# per-epoch loss/accuracy stay available for inspection or plotting, and so
# the cell does not end with a bare History object repr as its output.
history = model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100
2/2 - 4s - 2s/step - accuracy: 0.0541 - loss: 3.6121
Epoch 2/100
2/2 - 0s - 52ms/step - accuracy: 0.0811 - loss: 3.5971
Epoch 3/100
2/2 - 0s - 49ms/step - accuracy: 0.0811 - loss: 3.5856
Epoch 4/100
2/2 - 0s - 50ms/step - accuracy: 0.0811 - loss: 3.5698
Epoch 5/100
2/2 - 0s - 50ms/step - accuracy: 0.0541 - loss: 3.5454
Epoch 6/100
2/2 - 0s - 51ms/step - accuracy: 0.0541 - loss: 3.5059
Epoch 7/100
2/2 - 0s - 57ms/step - accuracy: 0.0541 - loss: 3.4498
Epoch 8/100
2/2 - 0s - 53ms/step - accuracy: 0.0541 - loss: 3.4110
Epoch 9/100
2/2 - 0s - 50ms/step - accuracy: 0.0541 - loss: 3.4219
Epoch 10/100
2/2 - 0s - 53ms/step - accuracy: 0.0541 - loss: 3.3788
Epoch 11/100
2/2 - 0s - 57ms/step - accuracy: 0.1081 - loss: 3.3216
Epoch 12/100
2/2 - 0s - 53ms/step - accuracy: 0.1351 - loss: 3.2855
Epoch 13/100
2/2 - 0s - 51ms/step - accuracy: 0.1622 - loss: 3.2608
Epoch 14/100
2/2 - 0s - 53ms/step - accuracy: 0.1622 - loss: 3.2162
Epoch 15/100
2/2 - 0s - 54ms/step - accuracy: 0.1351 - loss

<keras.src.callbacks.history.History at 0x2a57399a550>

In [9]:
# Q3. Next-word prediction for a given input
import time


In [10]:
# Seed phrase whose next word the trained model is asked to predict.
input_text = "what is the fee"

In [11]:
# Encode the prompt exactly like the training contexts: token ids, left-padded
# (and left-truncated if too long) to max_len-1.
token_text = tokenizer.texts_to_sequences([input_text])[0]
padded_token_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')

# One prompt in, so take row 0 of the prediction: a distribution over the
# vocabulary ids.
probabilities = model.predict(padded_token_text, verbose=0)[0]
predicted_index = int(np.argmax(probabilities))

# Direct id -> word lookup. Id 0 is the padding value and has no word, so fall
# back to an empty string; this guarantees predicted_word is always bound
# (the previous scan over word_index left it undefined or stale in that case).
predicted_word = tokenizer.index_word.get(predicted_index, "")

In [12]:
# Report the prompt and the model's single-word continuation, one per line.
print(
    f"Input text: '{input_text}'",
    f"Next predicted word: '{predicted_word}'",
    sep="\n",
)

Input text: 'what is the fee'
Next predicted word: 'fee'


In [13]:
# Q3 modified. Generate 5 words sequentially
# Seed phrase; the generation loop below appends each predicted word to it,
# so re-run this cell first to start again from the bare prompt.
input_text = "what is the fee"

In [14]:
# Greedy generation: predict the most likely next word, append it to the
# prompt, and feed the extended prompt back in, up to 5 times.
for _ in range(5):
    # Re-encode the growing prompt; pad_sequences keeps only the last
    # max_len-1 tokens once the prompt outgrows the training context length.
    token_text = tokenizer.texts_to_sequences([input_text])[0]
    padded_token_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')

    # Single prompt -> row 0 holds the distribution over vocabulary ids.
    probabilities = model.predict(padded_token_text, verbose=0)[0]
    predicted_index = int(np.argmax(probabilities))

    # Direct id -> word lookup instead of scanning word_index on every step.
    next_word = tokenizer.index_word.get(predicted_index)
    if next_word is None:
        # Id 0 (padding) has no word. The prompt would be unchanged, so every
        # remaining iteration would repeat the same prediction; stop instead.
        break
    input_text += " " + next_word

In [15]:
# Show the seed phrase together with the words appended by the generation loop.
print("Generated text:", input_text)

Generated text: what is the fee fee for data science mentorship
