# Import all Packages

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.layers import Dropout

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

# Load the text file

In [2]:
# Read the whole training text. The context manager closes the file handle as
# soon as the read finishes instead of leaving it open for the kernel's lifetime.
# NOTE(review): this relies on the platform default text encoding; pass an
# explicit encoding= once the file's actual encoding is confirmed.
with open('text_file.txt') as text_file:
    data = text_file.read()

# One corpus entry per line of the file (blank lines become empty strings).
corpus = data.split('\n')

# Tokenizing the data

In [3]:
# Fit a word-level tokenizer on every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Mapping from each word seen in the corpus to its integer id.
word_index = tokenizer.word_index

# Vocabulary size used for the embedding and output layers. The extra 1
# accounts for id 0, which the tokenizer never assigns to a word and which
# the padded sequences use as filler.
total_words = 1 + len(word_index)


# Create input sequences using list of tokens

In [4]:
# For every corpus line, emit each prefix of its token ids that has at least
# two tokens: the leading tokens will be the model input and the final token
# the word to predict. Lines with fewer than two known tokens contribute nothing.
input_seq = [
    line_tokens[:prefix_len]
    for line_tokens in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(line_tokens) + 1)
]


# Pad Sequences 

In [5]:
# The longest n-gram prefix fixes the common width of the padded matrix.
max_seq_len = max(map(len, input_seq))

# Left-pad with zeros so the token to predict always sits in the last column.
padded_sequences = pad_sequences(input_seq, maxlen=max_seq_len, padding='pre')
input_seq = np.array(padded_sequences)
input_seq

array([[   0,    0,    0, ...,    0,   51,   12],
       [   0,    0,    0, ...,   51,   12,   96],
       [   0,    0,    0, ...,   12,   96, 1217],
       ...,
       [   0,    0,    0, ...,    0,   47,  105],
       [   0,    0,    0, ...,   47,  105,  138],
       [   0,    0,    0, ...,  105,  138,  184]], dtype=int32)

# Create predictors and labels

In [6]:
# The last column of each padded row is the word to predict; every column
# before it is the model input.
labels = input_seq[:, -1]
xs = input_seq[:, :-1]

# One-hot encode the target ids across the whole vocabulary so they can be
# used with a categorical cross-entropy loss.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

# Build and train the model

In [7]:
# Next-word classifier: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential()
# Each input row holds max_seq_len-1 token ids (every column except the label).
model.add(Embedding(total_words, 100, input_length=max_seq_len-1))
model.add(Bidirectional(LSTM(150)))
# One output unit per vocabulary id, matching the one-hot width of `ys`.
model.add(Dense(total_words, activation='softmax'))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras optimizers.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(xs, ys, epochs=10, verbose=1)
model.summary()


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 100)           269000    
_________________________________________________________________
bidirectional (Bidirectional (None, 300)               301200    
_________________________________________________________________
dense (Dense)                (None, 2690)              809690    
Total params: 1,379,890
Trainable params: 1,379,890
Non-trainable params: 0
_________________________________________________________________


# Generate text by repeatedly predicting the next word

In [9]:
seed_text = " Help me"
next_words = 100

# Greedy generation: on every step, encode the text produced so far, ask the
# model for the most likely next word id, and append that word to the text.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad (or trim) to the exact input width the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    # `Sequential.predict_classes` was removed from Keras; taking the argmax of
    # the softmax output yields the same class id.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # Reverse lookup through the tokenizer's id -> word map. An id with no word
    # (such as the padding id 0) contributes an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)

 Help me bundle it was in her bay and god ill them as our bow fought the swell he love the better still kept the water round me bubbling i neer may round a cask lonely glen to a month of the bright may long are in my heart i we might well times as the leaves are green grow i love until the ship sailing all entangled like gone and the green the dear too and never down like the wild fair of the valley may many many who may many who is it lave gaiety all the rakes of mallow
