In [3]:
import random
import pickle

import numpy as np
from pypdf import PdfReader
from nltk.tokenize import RegexpTokenizer

from tensorflow.keras.models import Sequential,load_model 
from tensorflow.keras.layers import LSTM,Dense,Activation 
from tensorflow.keras.optimizers import RMSprop

In [7]:
def extract_pdf_text(path, start_page=7, end_page=117):
    """Extract the text of a range of pages from a PDF as one string.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file, passed straight to ``PdfReader``.
    start_page : int, optional
        Zero-based index of the first page to read (default 7).
    end_page : int, optional
        Zero-based index one past the last page to read (default 117).
        Together with ``start_page`` this is used as a slice on
        ``reader.pages``.

    Returns
    -------
    str
        The text of the selected pages joined with single spaces.
    """
    reader = PdfReader(path)
    pages = reader.pages[start_page:end_page]

    # `or ""` keeps the join from failing if a page yields no text (None).
    return " ".join((page.extract_text() or "") for page in pages)
    
# Extract the corpus once and keep a plain-text copy on disk for inspection.
text = extract_pdf_text("constitution.pdf")
# The context manager closes the file even if the write fails; an explicit
# UTF-8 encoding avoids UnicodeEncodeError on platforms whose default codec
# cannot represent every character in the extracted text.
with open("constitution.text", "w", encoding="utf-8") as file:
    file.write(text)
    

In [None]:
# Split the lower-cased corpus into runs of word characters (regex \w+),
# which drops punctuation and whitespace.
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(text.lower())
# Show only a short preview; echoing the full token list floods the cell output.
tokens[:20]

In [13]:
# Sorted array of the distinct tokens; a token's position is its vocabulary id.
unique_tokens = np.unique(tokens)
# Reverse lookup: token -> position in `unique_tokens`.
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [14]:
# Number of consecutive tokens fed to the model as context.
n_words = 10

# Every run of `n_words` consecutive tokens is one training input ...
input_words = [
    tokens[start:start + n_words]
    for start in range(len(tokens) - n_words)
]
# ... and the token immediately after that run is its target.
output_words = [
    tokens[start + n_words]
    for start in range(len(tokens) - n_words)
]

In [19]:
# One-hot containers, all False to start with:
#   X[sample, position, vocab_id] -> which token sits at each window position
#   Y[sample, vocab_id]           -> which token is the target for the sample
X = np.zeros(
    shape=(len(input_words), n_words, len(unique_tokens)),
    dtype=bool,
)
Y = np.zeros(
    shape=(len(output_words), len(unique_tokens)),
    dtype=bool,
)


In [28]:
# Fill the one-hot tensors: X marks the token at each position of the input
# window, Y marks the token that follows the window.
for i, (words, next_word) in enumerate(zip(input_words, output_words)):

    for j, word in enumerate(words):

        X[i, j, unique_token_index[word]] = 1

    # The label must be the *following* token (output_words[i]). Reusing the
    # inner-loop variable `word` here would label each sample with the last
    # word of its own input window.
    # If X/Y were filled before, re-run the allocation cell first so stale
    # bits from an earlier run do not remain set.
    Y[i, unique_token_index[next_word]] = 1

In [31]:
# Two stacked LSTM layers followed by a softmax over the vocabulary.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
    LSTM(128),
    Dense(len(unique_tokens)),
    Activation("softmax"),
])

In [32]:
# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# The returned History object is the cell's displayed value.
model.fit(
    X,
    Y,
    epochs=10,
    batch_size=128,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e52b106340>

In [33]:
# Persist the trained model to the file constitutionModel.h5.
model.save("constitutionModel.h5")

In [34]:
# Rebind `model` to the copy read back from constitutionModel.h5.
model = load_model("constitutionModel.h5")

In [38]:
def predict_next_words(input_text,n):
    """Return the vocabulary indices of the ``n`` most probable next words.

    Parameters
    ----------
    input_text : str
        Context phrase. It is lower-cased and split on whitespace; only the
        last ``n_words`` words are used because that is the width of the
        model's input window.
    n : int
        How many candidate indices to return. Clamped to the range
        ``1 .. len(predictions)``.

    Returns
    -------
    numpy.ndarray
        ``n`` indices into ``unique_tokens``, in no particular order.

    Raises
    ------
    KeyError
        If a word of ``input_text`` is not in ``unique_token_index``.
    """
    # Keep only the most recent words so a long phrase cannot index past the
    # end of the (1, n_words, vocab) input array.
    words = input_text.lower().split()[-n_words:]

    X = np.zeros((1,n_words,len(unique_tokens)))

    for i,word in enumerate(words):
        X[0,i,unique_token_index[word]] = 1

    predictions = model.predict(X)[0]

    # argpartition with a negative kth puts the n largest values in the last
    # n slots. (A positive kth followed by [n:] would instead keep everything
    # except the n smallest.)
    n = max(1, min(int(n), len(predictions)))
    return np.argpartition(predictions,-n)[-n:]

In [39]:
# Query the model with a seed phrase and n=5; the result is an array of
# indices into `unique_tokens`.
possible_words = predict_next_words("in every constituency in which a poll",5)

In [None]:
# Translate each returned vocabulary index back into its token.
next_words = [unique_tokens[idx] for idx in possible_words]


In [43]:
def generate_text(input_text,text_length,creativity=3):
    """Extend ``input_text`` by ``text_length`` generated words.

    Each step feeds the most recent ``n_words`` tokens to
    ``predict_next_words`` and appends one of the returned candidates,
    chosen uniformly at random.

    Parameters
    ----------
    input_text : str
        Seed phrase; it is kept verbatim at the start of the result.
    text_length : int
        Number of words to append.
    creativity : int, optional
        Number of candidates requested from ``predict_next_words`` per step
        (default 3); larger values give more varied output.

    Returns
    -------
    str
        The seed words followed by the generated words, space-separated.
    """
    word_sequence = input_text.split()

    # Tokenize the seed once and extend the token list as words are added,
    # rather than re-tokenizing the whole sequence on every step.
    context_tokens = tokenizer.tokenize(input_text.lower())

    for _ in range(text_length):

        # Always condition on the latest n_words tokens, including the words
        # generated so far.
        sub_sequence = " ".join(context_tokens[-n_words:])
        try:

            candidates = predict_next_words(sub_sequence,creativity)
            choice = str(unique_tokens[random.choice(candidates)])

        except (KeyError, IndexError):

            # Out-of-vocabulary context word or no candidates: fall back to a
            # random vocabulary word. Any other error is left to propagate.
            choice = str(random.choice(unique_tokens))

        word_sequence.append(choice)
        context_tokens.extend(tokenizer.tokenize(choice.lower()))

    return " ".join(word_sequence)

# Append 10 generated words to the seed phrase, requesting 100 candidates per
# step (creativity=100), then display the resulting string.
# NOTE(review): `next_words` is also assigned a list of tokens in an earlier
# cell; here it is rebound to a single string.
next_words=generate_text("in every constituency in which a poll",10,100)
next_words

'in every constituency in which a poll consulate adjournment right 2 available whenever functions vacancies secretaries 26'