In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [8]:
with open ("/content/drive/MyDrive/Book dataset.txt","r",encoding="utf8") as myfile:
  mytext=myfile.read()

In [9]:
mytext

'March 07, 2024\nRemarks of President Joe Biden — State of the Union Address As Prepared for Delivery\nHome\nBriefing Room\nSpeeches and Remarks\nThe United States Capitol\n\n###\n\nGood evening. \n\nMr. Speaker. Madam Vice President. Members of Congress. My Fellow Americans. \n\nIn January 1941, President Franklin Roosevelt came to this chamber to speak to the nation. \n\nHe said, “I address you at a moment unprecedented in the history of the Union.” \n\nHitler was on the march. War was raging in Europe. \n\nPresident Roosevelt’s purpose was to wake up the Congress and alert the American people that this was no ordinary moment.   \n\nFreedom and democracy were under assault in the world. \n\nTonight I come to the same chamber to address the nation. \n\nNow it is we who face an unprecedented moment in the history of the Union. \n\nAnd yes, my purpose tonight is to both wake up this Congress, and alert the American people that this is no ordinary moment either. \n\nNot since President L

In [10]:
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])
total_words = len(mytokenizer.word_index)+1

In [11]:
mytokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'in': 6,
 'i': 7,
 'for': 8,
 'that': 9,
 'we': 10,
 'is': 11,
 'it': 12,
 'my': 13,
 'our': 14,
 'more': 15,
 'america': 16,
 'on': 17,
 'are': 18,
 'you': 19,
 'american': 20,
 'not': 21,
 'this': 22,
 'will': 23,
 'but': 24,
 'with': 25,
 'all': 26,
 'at': 27,
 'have': 28,
 'now': 29,
 'people': 30,
 'so': 31,
 'president': 32,
 'as': 33,
 'no': 34,
 'by': 35,
 'americans': 36,
 'down': 37,
 'future': 38,
 'was': 39,
 'they': 40,
 'can': 41,
 'want': 42,
 'from': 43,
 'home': 44,
 'up': 45,
 'it’s': 46,
 'than': 47,
 'their': 48,
 'has': 49,
 'here': 50,
 'be': 51,
 'i’ve': 52,
 'tonight': 53,
 'who': 54,
 'when': 55,
 'because': 56,
 'i’m': 57,
 'history': 58,
 'or': 59,
 'pay': 60,
 'year': 61,
 'do': 62,
 'make': 63,
 'tax': 64,
 'world': 65,
 'an': 66,
 'been': 67,
 'years': 68,
 'get': 69,
 'that’s': 70,
 '000': 71,
 'us': 72,
 'fair': 73,
 'families': 74,
 'act': 75,
 'care': 76,
 'predecessor': 77,
 'me': 78,
 'like': 79,
 '

In [16]:
my_input_sequences = []
for line in mytext.split('\n'):
  #print(line)
  token_list = mytokenizer.texts_to_sequences([line])[0]
  #print(token_list)
  for i in range(1, len(token_list)):
    my_n_gram_sequences = token_list[:i+1]
    my_input_sequences.append(my_n_gram_sequences)
    #print(my_n_gram_sequences)

In [15]:
#padding
max_sequence_len = max([len(seq) for seq in my_input_sequences])
input_sequences = np.array(pad_sequences(my_input_sequences, maxlen=max_sequence_len,padding='pre'))

In [17]:
input_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
       270, 744, 482], dtype=int32)

In [18]:
X = input_sequences[:,:-1]
y = input_sequences[:,-1]

In [19]:
X[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
       270, 744], dtype=int32)

In [20]:
y

array([ 744,  482,    4, ...,  150,   14, 1623], dtype=int32)

In [21]:
X

array([[  0,   0,   0, ...,   0,   0, 270],
       [  0,   0,   0, ...,   0, 270, 744],
       [  0,   0,   0, ...,   0,   0, 483],
       ...,
       [  0,   0,   0, ...,   0, 338, 382],
       [  0,   0,   0, ..., 338, 382, 150],
       [  0,   0,   0, ..., 382, 150,  14]], dtype=int32)

In [22]:
y

array([ 744,  482,    4, ...,  150,   14, 1623], dtype=int32)

In [23]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes= total_words))

In [24]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [25]:
model = Sequential()
model.add(Embedding(total_words , 100, input_length = max_sequence_len-1 ))
model.add(LSTM(150))
model.add(Dense(total_words,activation='softmax'))
print(model.summary())



None


In [26]:
model.compile( optimizer='adam',loss = 'categorical_crossentropy', metrics= ['accuracy'])
model.fit(X,y,epochs = 100, verbose = 1)

Epoch 1/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 120ms/step - accuracy: 0.0428 - loss: 6.8652
Epoch 2/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 108ms/step - accuracy: 0.0405 - loss: 6.2692
Epoch 3/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 109ms/step - accuracy: 0.0552 - loss: 6.1377
Epoch 4/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 102ms/step - accuracy: 0.0704 - loss: 5.9425
Epoch 5/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 116ms/step - accuracy: 0.0731 - loss: 5.7669
Epoch 6/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.0804 - loss: 5.5984
Epoch 7/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 113ms/step - accuracy: 0.1013 - loss: 5.3265
Epoch 8/100
[1m190/190[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 109ms/step - accuracy: 0.1149 - loss: 5.1603
Epoch 9/

<keras.src.callbacks.history.History at 0x7ad0df0ac8c0>

In [30]:
input_text = input("Enter the text: ")
predict_next_words= int(input("Enter how may words needs to be predicted: "))

for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0] #current sewntense into numbers
    print(token_list)
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre') #padding
    predicted = np.argmax(model.predict(token_list), axis=-1) #picks the index of hoighest prob
    output_word = ""
    for word, index in mytokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word

print(input_text)

Enter the text: Hi
Enter how may words needs to be predicted: 8
[]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[13]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[13, 328]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[13, 328, 11]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
[13, 328, 11, 89]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[13, 328, 11, 89, 642]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[13, 328, 11, 89, 642, 1063]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[13, 328, 11, 89, 642, 1063, 397]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
Hi my administration is also eliminating title insurance fees
