In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import  Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import nltk
import re

In [3]:
data_path = '/content/drive/MyDrive/Colab Notebooks/Sentence Auto-Completion/data.txt'

with open(data_path, 'r') as f:
    data = f.read()

# Cleaning Data

In [4]:
def remove_special_char(text):
  text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

  text = re.sub(' +' , ' ' ,text)
  return text

In [5]:
def pre_process_data(data):
  sentences = data.split('\n')
  for i in range(len(sentences)):
    sentences[i] = remove_special_char(sentences[i])
  sentences = [s.strip() for s in sentences]
  sentences = [sentence for sentence in sentences if len(sentence) > 0]

  tokenized = []
  for i in sentences:
    i = i.lower()
    tokenized.append(i)
  return tokenized

In [6]:
tokenized_data  = pre_process_data(data[:500000])

In [7]:
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tokenized_data)

In [8]:
total_words = len(tokenizer.word_index) + 1

In [9]:
total_words

6120

In [10]:
input_sequence = []

for line in tokenized_data:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequence.append(n_gram_sequence)

In [11]:
input_sequence[:100]

[[218, 1569],
 [218, 1569, 211],
 [218, 1569, 211, 5],
 [218, 1569, 211, 5, 12],
 [218, 1569, 211, 5, 12, 37],
 [218, 1569, 211, 5, 12, 37, 4],
 [218, 1569, 211, 5, 12, 37, 4, 15],
 [218, 1569, 211, 5, 12, 37, 4, 15, 55],
 [218, 1569, 211, 5, 12, 37, 4, 15, 55, 175],
 [2681, 10],
 [2681, 10, 2],
 [2681, 10, 2, 1249],
 [2681, 10, 2, 1249, 2682],
 [2681, 10, 2, 1249, 2682, 12],
 [2681, 10, 2, 1249, 2682, 12, 37],
 [2681, 10, 2, 1249, 2682, 12, 37, 711],
 [25, 437],
 [25, 437, 310],
 [25, 437, 310, 211],
 [25, 437, 310, 211, 712],
 [25, 437, 310, 211, 712, 19],
 [25, 437, 310, 211, 712, 19, 22],
 [25, 437, 310, 211, 712, 19, 22, 67],
 [25, 437, 310, 211, 712, 19, 22, 67, 2175],
 [25, 437, 310, 211, 712, 19, 22, 67, 2175, 10],
 [25, 437, 310, 211, 712, 19, 22, 67, 2175, 10, 2],
 [25, 437, 310, 211, 712, 19, 22, 67, 2175, 10, 2, 1009],
 [2683, 33],
 [2683, 33, 32],
 [2683, 33, 32, 2681],
 [2683, 33, 32, 2681, 10],
 [2683, 33, 32, 2681, 10, 2],
 [2683, 33, 32, 2681, 10, 2, 711],
 [2683, 33, 

In [12]:
max_sentence_len = max([len(i) for i in input_sequence])

In [13]:
max_sentence_len

16

In [14]:
import pickle

with open("/content/drive/MyDrive/Colab Notebooks/Sentence Auto-Completion/input_sequence_5000000.pkl", "wb") as f:
    pickle.dump(input_sequence, f)

In [15]:
input_sequence = np.array(pad_sequences(input_sequence , maxlen=max_sentence_len , padding='pre'))

In [16]:
np.savez_compressed('/content/drive/MyDrive/Colab Notebooks/Sentence Auto-Completion/cleaned_data_5000000.npz', input_sequence=input_sequence)

In [17]:
x = input_sequence[:,:-1]

In [18]:
y = input_sequence[:, -1]

In [19]:
ys = tf.keras.utils.to_categorical (y, num_classes = total_words)

In [21]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y ,test_size=0.2 , random_state=42)

#Model Traning

In [23]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sentence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words , activation='softmax'))
adam = Adam(learning_rate=0.01)
model.compile(loss ='sparse_categorical_crossentropy',optimizer = adam , metrics  = ['accuracy'] )

history  = model.fit(x_train , y_train , epochs = 10)

Epoch 1/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 114ms/step - accuracy: 0.0950 - loss: 6.2584
Epoch 2/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 114ms/step - accuracy: 0.1519 - loss: 5.1463
Epoch 3/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 114ms/step - accuracy: 0.1751 - loss: 4.6490
Epoch 4/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 114ms/step - accuracy: 0.1998 - loss: 4.2622
Epoch 5/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 114ms/step - accuracy: 0.2195 - loss: 3.9994
Epoch 6/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 114ms/step - accuracy: 0.2308 - loss: 3.8376
Epoch 7/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 114ms/step - accuracy: 0.2412 - loss: 3.6886
Epoch 8/10
[1m2002/2002[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 114ms/step - accuracy: 0.2524 - loss:

In [24]:
def predict_top_five_words(model, tokenizer, seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    top_five_indexes = np.argsort(predicted[0])[::-1][:5]
    top_five_words = []
    for index in top_five_indexes:
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                top_five_words.append(word)
                break
    return top_five_words


In [27]:
from IPython.display import HTML

def predict_top_five_words(model, tokenizer, seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sentence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    top_five_indexes = np.argsort(predicted[0])[::-1][:5]
    top_five_words = []
    for index in top_five_indexes:
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                top_five_words.append(word)
                break
    return top_five_words

def predict_and_display_top_five_words(seed_text, model, tokenizer):
    top_five_words = predict_top_five_words(model, tokenizer, seed_text)
    heading_app = f"<h1>Sentence AutoCompletion App With Five Outputs</h1>"
    output_text = f"<ul>{''.join([f'<li>{seed_text} {word}</li>' for word in top_five_words])}</ul>"
    javascript_code = f"""
    <script>
        var newWindow = window.open("", "_blank");
        newWindow.document.write('<html><head><title>Top Five Words</title></head><body>{heading_app} <br> <hr> {output_text}</body></html>');
    </script>
    """
    return HTML(javascript_code)




In [48]:
seed_text = "I have"
predict_and_display_top_five_words(seed_text, model, tokenizer)

In [50]:

# Save the model
model.save('/content/drive/MyDrive/Colab Notebooks/Sentence Auto-Completion/my_model.keras')

# Save the tokenizer
with open('/content/drive/MyDrive/Colab Notebooks/Sentence Auto-Completion/tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)