In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [2]:
path = "Dataset.txt"
data = open(path,"r",encoding='utf-8').read().lower()

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data= tokenizer.texts_to_sequences([data])[0]

encoded_data

[4789,
 145,
 4790,
 1,
 1020,
 4,
 128,
 34,
 45,
 611,
 2235,
 2236,
 30,
 1021,
 15,
 23,
 1,
 275,
 4,
 394,
 2237,
 21,
 51,
 1676,
 2,
 18,
 572,
 51,
 3398,
 3399,
 13,
 75,
 817,
 10,
 213,
 10,
 124,
 63,
 2238,
 275,
 10,
 262,
 1,
 480,
 4,
 1,
 145,
 130,
 655,
 2239,
 18,
 30,
 1021,
 63,
 2240,
 21,
 1093,
 130,
 3400,
 2666,
 1,
 1020,
 4,
 128,
 34,
 4791,
 611,
 2235,
 2236,
 4792,
 1022,
 4793,
 4794,
 4795,
 1021,
 2241,
 139,
 3401,
 75,
 3402,
 4796,
 3403,
 1094,
 573,
 243,
 4797,
 4798,
 1469,
 722,
 4,
 30,
 145,
 130,
 1021,
 1,
 1020,
 4,
 128,
 34,
 1928,
 45,
 50,
 3404,
 145,
 130,
 2667,
 2,
 3405,
 3406,
 1677,
 1,
 1020,
 4,
 128,
 34,
 45,
 611,
 2235,
 2236,
 1929,
 6,
 5,
 885,
 8,
 940,
 2668,
 1,
 248,
 481,
 691,
 2669,
 5,
 113,
 4,
 2242,
 3407,
 1,
 774,
 1470,
 522,
 3408,
 1,
 312,
 1023,
 941,
 3409,
 1,
 58,
 18,
 1,
 1095,
 942,
 3410,
 1,
 612,
 4,
 1,
 448,
 1471,
 3411,
 1,
 612,
 4,
 1,
 1930,
 886,
 3412,
 1,
 612,
 4,
 1,
 3413,
 692

In [4]:
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 8932


In [5]:
sequences = list()
for i in range(1, len(encoded_data)):
	sequence = encoded_data[i-1:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 111252


In [6]:
sequences

[[4789, 145],
 [145, 4790],
 [4790, 1],
 [1, 1020],
 [1020, 4],
 [4, 128],
 [128, 34],
 [34, 45],
 [45, 611],
 [611, 2235],
 [2235, 2236],
 [2236, 30],
 [30, 1021],
 [1021, 15],
 [15, 23],
 [23, 1],
 [1, 275],
 [275, 4],
 [4, 394],
 [394, 2237],
 [2237, 21],
 [21, 51],
 [51, 1676],
 [1676, 2],
 [2, 18],
 [18, 572],
 [572, 51],
 [51, 3398],
 [3398, 3399],
 [3399, 13],
 [13, 75],
 [75, 817],
 [817, 10],
 [10, 213],
 [213, 10],
 [10, 124],
 [124, 63],
 [63, 2238],
 [2238, 275],
 [275, 10],
 [10, 262],
 [262, 1],
 [1, 480],
 [480, 4],
 [4, 1],
 [1, 145],
 [145, 130],
 [130, 655],
 [655, 2239],
 [2239, 18],
 [18, 30],
 [30, 1021],
 [1021, 63],
 [63, 2240],
 [2240, 21],
 [21, 1093],
 [1093, 130],
 [130, 3400],
 [3400, 2666],
 [2666, 1],
 [1, 1020],
 [1020, 4],
 [4, 128],
 [128, 34],
 [34, 4791],
 [4791, 611],
 [611, 2235],
 [2235, 2236],
 [2236, 4792],
 [4792, 1022],
 [1022, 4793],
 [4793, 4794],
 [4794, 4795],
 [4795, 1021],
 [1021, 2241],
 [2241, 139],
 [139, 3401],
 [3401, 75],
 [75, 3402

In [7]:
sequences = np.array(sequences)
X, y = sequences[:,0],sequences[:,1]

In [8]:
y = to_categorical(y, num_classes=vocab_size)

In [9]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             89320     
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 8932)              455532    
                                                                 
Total params: 557,052
Trainable params: 557,052
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.fit(X, y, epochs=10)

Epoch 1/10


In [None]:
def generate_seq(model, tokenizer, enter_text, n_pred):  
	in_text, result = enter_text, enter_text
	for _ in range(n_pred):
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = np.array(encoded)
		predict = model.predict(encoded)
		yhat=np.argmax(predict,axis=1)
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		in_text, result = out_word, result + ' ' + out_word
	return result

In [None]:
print(generate_seq(model, tokenizer, 'door', 1))


door and
