# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout, Bidirectional
from keras.layers.core import RepeatVector
from keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
import pickle
import heapq
import string

# Loading Data

In [2]:
# Read the raw corpus, one list entry per line (newlines are kept).
# The context manager closes the file handle even if reading fails.
with open("data.txt", "r", encoding="utf8") as file:
    lines = list(file)

# Data Preprocessing

### Cleaning Data

In [3]:
# Removing unwanted characters
# A single join builds the whole corpus; looping over the lines and
# re-joining on every iteration would repeat the same work each time.
data = ' '.join(lines)
# Drop line breaks and the UTF-8 byte-order mark left over from the file.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:500]

" Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle  This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.  You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net   Title: The Adventures of Sherlock Holmes  Author: Arthur Conan Doyle  Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019  Language: English  Charact"

In [4]:
# Replace every ASCII punctuation mark with a single space
translator = dict.fromkeys(map(ord, string.punctuation), ' ')
data = data.translate(translator)
data[:500]

' Project Gutenberg s The Adventures of Sherlock Holmes  by Arthur Conan Doyle  This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever   You may copy it  give it away or re use it under the terms of the Project Gutenberg License included with this eBook or online at www gutenberg net   Title  The Adventures of Sherlock Holmes  Author  Arthur Conan Doyle  Release Date  November 29  2002  EBook  1661  Last Updated  May 20  2019  Language  English  Charact'

In [5]:
# Removing repeated words
# dict.fromkeys keeps the first occurrence of every word in its original
# order and de-duplicates in linear time; testing membership against a
# growing list is quadratic in the number of words.
text = list(dict.fromkeys(data.split()))
data = ' '.join(text)
data[:500]

data


'Project Gutenberg s The Adventures of Sherlock Holmes by Arthur Conan Doyle This eBook is for the use anyone anywhere at no cost and with almost restrictions whatsoever You may copy it give away or re under terms License included this online www gutenberg net Title Author Release Date November 29 2002 EBook 1661 Last Updated May 20 2019 Language English Character set encoding UTF 8 START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES SHERLOCK HOLMES Produced an anonymous volunteer Jose Menendez '

### Tokenizing Characters

In [6]:
# Character vocabulary (sorted for a stable ordering) plus lookups in both
# directions: character -> index and index -> character.
chars = sorted(set(data))
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [7]:
# Number of distinct characters in the corpus
chars_size = len(chars)
print("Total unique characters are: ", chars_size)

Total unique characters are:  76


In [8]:
# Slide a window of SEQUENCE_LENGTH characters over the corpus, moving
# `step` characters at a time. Each window is paired with the single
# character that immediately follows it.
SEQUENCE_LENGTH = 60
step = 3
window_starts = range(0, len(data) - SEQUENCE_LENGTH, step)
sentences = [data[start:start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [data[start + SEQUENCE_LENGTH] for start in window_starts]
print ('Total input sentences: ', len(sentences))

Total input sentences:  25018


In [9]:
# Generating features and labels
x = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for j, char in enumerate(sentence):
        x[i, j, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
print("Features: \n", x[0])
print("\nLabels: \n", y[0])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


Features: 
 [[False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]

Labels: 
 [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False]


# Bidirectional LSTM

## Building Model 

In [63]:
# One LSTM layer over the one-hot character windows, followed by a softmax
# over the character vocabulary. Both layers put an L1 penalty on their kernels.
# NOTE(review): this section is titled "Bidirectional LSTM", but no
# Bidirectional wrapper is applied here — confirm which one is intended.
model = Sequential([
    LSTM(128, input_shape=(SEQUENCE_LENGTH, chars_size), kernel_regularizer='l1'),
    Dense(chars_size, activation='softmax', kernel_regularizer='l1'),
])

In [64]:
# Printing model summary: one row per layer with its output shape and
# parameter count, followed by the totals
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 128)               104960    
                                                                 
 dense_8 (Dense)             (None, 76)                9804      
                                                                 
Total params: 114,764
Trainable params: 114,764
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Training model
# `learning_rate` is the supported keyword for Keras optimizers; the older
# `lr` alias is deprecated.
optimizer = Adam(learning_rate=0.01)
# reduce = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, min_lr=0.001, verbose = 1)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
# validation_split=0.05 reserves 5% of the samples for validation.
# Only the per-epoch metrics dict (`.history`) is kept.
history = model.fit(x, y, validation_split=0.05, batch_size=64, epochs=50, shuffle=True).history

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

KeyboardInterrupt: 

In [None]:
# Saving model
# The trained network and its metrics are bound to `model` and `history`
# by the cells above; the file names are kept unchanged.
model.save('bi_lstm_model.h5')
# The context manager flushes and closes the pickle file.
with open('bi_lstm_history.p', 'wb') as history_file:
    pickle.dump(history, history_file)

## Evaluating Model

In [None]:
# Accuracy per epoch: training curve first, then validation curve
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history[metric_name])
plt.title('Model Accuracy for Bidirectional LSTM')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc= 'upper left')

In [None]:
# Loss per epoch: training curve first, then validation curve
for metric_name in ('loss', 'val_loss'):
    plt.plot(history[metric_name])
plt.title('Model Loss for Bidirectional LSTM')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc= 'upper left')

## Making Prediction

In [20]:
# Interactive prediction with the character-level `model` trained above.
# The model takes one-hot windows of shape (SEQUENCE_LENGTH, chars_size),
# so the typed text is encoded the same way as the training data.


def _one_hot_window(seed):
    """Encode the last SEQUENCE_LENGTH characters of `seed` for the model.

    Returns a boolean array of shape (1, SEQUENCE_LENGTH, chars_size).
    Seeds shorter than SEQUENCE_LENGTH are left-padded with all-zero rows,
    and characters missing from `char_indices` are left as all-zero rows
    instead of raising KeyError.
    """
    window = seed[-SEQUENCE_LENGTH:]
    encoded = np.zeros((1, SEQUENCE_LENGTH, chars_size), dtype=bool)
    offset = SEQUENCE_LENGTH - len(window)
    for position, char in enumerate(window):
        index = char_indices.get(char)
        if index is not None:
            encoded[0, offset + position, index] = True
    return encoded


def _predict_completion(seed, max_chars=40):
    """Greedily extend `seed` one character at a time until a word ends.

    Picks the most probable next character at every step and stops at the
    first space that follows at least one non-space predicted character,
    or after `max_chars` characters. Returns only the predicted text.
    """
    completion = ""
    for _ in range(max_chars):
        probabilities = model.predict(_one_hot_window(seed + completion), verbose=0)[0]
        next_char = indices_char[int(np.argmax(probabilities))]
        if next_char == " " and completion.strip():
            break
        completion += next_char
    return completion


while True:
    text = input("Enter the words: ")
    # Typing "Stop" ends the session.
    if text == "Stop":
        break
    text += _predict_completion(text)
    print(text)

Enter the words: Stop


# LSTM

In [32]:
# Two stacked LSTM layers, a dense ReLU layer, and a softmax over the
# character vocabulary.
model = Sequential([
    # return_sequences=True passes the output of every timestep to the next LSTM.
    LSTM(128, input_shape=(SEQUENCE_LENGTH, len(chars)), return_sequences=True),
    # Only the first layer declares input_shape; this layer takes its input
    # shape from the layer above.
    LSTM(128),
    Dense(1000, activation="relu"),
    Dense(chars_size, activation="softmax"),
])

In [33]:
# Printing model summary: one row per layer with its output shape and
# parameter count, followed by the totals
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_9 (LSTM)               (None, 60, 128)           104960    
                                                                 
 lstm_10 (LSTM)              (None, 128)               131584    
                                                                 
 dense_9 (Dense)             (None, 1000)              129000    
                                                                 
 dense_10 (Dense)            (None, 76)                76076     
                                                                 
Total params: 441,620
Trainable params: 441,620
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Callbacks passed to the training run below.
# TensorBoard writes its event files under `logdir`.
logdir = 'logsnextword'
tensorboard_Visualization = TensorBoard(log_dir=logdir)
# Watch the training loss: after 3 epochs without improvement, multiply the
# learning rate by 0.2, with 0.0001 as the lower bound.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

In [None]:
# Compile with Adam; `learning_rate` is the supported keyword, the older
# `lr` alias is deprecated.
model.compile(loss = "categorical_crossentropy", optimizer = Adam(learning_rate=0.001), metrics = ['accuracy'])
# Train with the learning-rate scheduler and TensorBoard logging attached.
# Only the per-epoch metrics dict (`.history`) is kept.
history = model.fit(x, y, validation_split = 0.05, epochs = 150, batch_size = 128, callbacks = [reduce, tensorboard_Visualization]).history

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150

In [None]:
# Saving model
# The trained network and its metrics are bound to `model` and `history`
# by the cells above; the file names are kept unchanged.
model.save('lstm_model.h5')
# The context manager flushes and closes the pickle file.
with open('lstm_history.p', 'wb') as history_file:
    pickle.dump(history, history_file)

In [None]:
# OLD: earlier word-level (Tokenizer-based) experiments, kept for reference.

In [6]:
# Creating and saving tokenizer for predict function
tokenizer = Tokenizer()
# Fit the word index on the whole corpus, passed as a single document.
tokenizer.fit_on_texts([data])
# The context manager flushes and closes the pickle file.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [7]:
# Number of output classes for the word-level model.
# Presumably the extra slot covers index 0, which padding uses — TODO confirm.
total_words = 1 + len(tokenizer.word_index)
print("Total number of words: ", total_words)

Total number of words:  8925


In [47]:
# Converting words to numerical values
token_list = tokenizer.texts_to_sequences([data])[0]

# Creating input sequences that will be used to train model.
# For every block of `step` tokens, emit the growing prefixes that start one
# token before the block: lengths 2, 3, ..., step + 1.
input_sequences = []
step = 5
for i in range(1, len(token_list), step):
    # Clamp the end of the block to the token list. Without the clamp, a
    # short final block would append the same truncated slice several times.
    for j in range(i, min(i + step, len(token_list))):
        input_sequences.append(token_list[i-1:j+1])
print(input_sequences[:5])
print("Total input sequences are: ", len(input_sequences))

[[72, 2], [72, 2, 73], [72, 2, 73, 3], [72, 2, 73, 3, 4], [72, 2, 73, 3, 4, 5]]
Total input sequences are:  9600


In [48]:
# Left-pad every sequence with zeros so all rows have the length of the
# longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
input_sequences[0]

array([ 0,  0,  0,  0, 72,  2])

In [49]:
# The last column of each padded row is the label; the columns before it
# are the features
x_words = input_sequences[:, :-1]
label_words = input_sequences[:, -1]
print("Features: \n", x_words[0])
print("\nLabels: \n", label_words[0])

Features: 
 [ 0  0  0  0 72]

Labels: 
 2


In [50]:
# One-hot encode the integer labels: one column per word index
y_words = to_categorical(label_words, num_classes=total_words)
print("Labels: \n", y_words[0])

Labels: 
 [0. 0. 1. ... 0. 0. 0.]


In [35]:
# Sanity check: the first sample has label 2, so column 2 of its one-hot
# row should be 1
y_words[0, 2]

1.0

In [None]:
# OLD2: earlier character-level experiments (39-character windows), kept for reference.

In [125]:
# Character vocabulary (sorted for a stable ordering) plus lookups in both
# directions: character -> index and index -> character
chars = sorted(set(data))
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = dict(enumerate(chars))
print ("Unique Characters: ", len(chars))

Unique Characters:  97


In [142]:
# Slide a window of SEQUENCE_LENGTH characters over the corpus, moving
# `step` characters at a time. Each window is paired with the single
# character that immediately follows it.
SEQUENCE_LENGTH = 39
step = 3
window_starts = range(0, len(data) - SEQUENCE_LENGTH, step)
sentences = [data[start:start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [data[start + SEQUENCE_LENGTH] for start in window_starts]
print ('Training Samples: ',len(sentences))

Training Samples:  42038


In [143]:
# Show the first five training windows
sentences[:5]

["Project Gutenberg's The Adventures of Sherlock Holmes, by Ar",
 "ject Gutenberg's The Adventures of Sherlock Holmes, by Arthu",
 "t Gutenberg's The Adventures of Sherlock Holmes, by Arthur C",
 "utenberg's The Adventures of Sherlock Holmes, by Arthur Cona",
 "nberg's The Adventures of Sherlock Holmes, by Arthur Conan D"]

In [144]:
# Show the target character for each of the first five windows
next_chars[:5]

['t', 'r', 'o', 'n', 'o']

In [145]:
# Corpus length in characters
len(data)

126173

In [101]:
# Show the first ten labels.
# NOTE(review): `labels` is not assigned in any visible cell; this looks
# like leftover kernel state — confirm, or use `label_words`.
labels[:10]

array([4789,    1, 1020,    4,  128,   34,   45,  611, 2235, 2236])

In [113]:
# Spot-check one entry of the label matrix.
# NOTE(review): the `y` built in the visible cells has len(chars) columns,
# so index 1020 presumably refers to an earlier word-level label matrix —
# confirm before re-running.
y[2][1020]

1.0

In [46]:
# Build n-gram training sequences line by line: for every line, emit each
# prefix of its token list that has at least two tokens.
total_words = len(tokenizer.word_index) + 1
print("Total number of words: ", total_words)
input_sequences = []
for line in lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )
print("Total input sequences: ", len(input_sequences))

Total number of words:  8931
Total input sequences:  101619
