# <p style="text-align: center;"> Characters and Words Prediction </p>
<p style="text-align: center;">Using LSTM | Recurrent Neural Networks</p>

In [None]:
!pip install -U tensorflow-gpu

In [0]:
import numpy as np
import sys
import tensorflow as tf
from tensorflow import keras
from operator import itemgetter
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

### Reading text file

In [0]:
import gzip
import urllib
dataurl="http://www.gutenberg.org/cache/epub/11/pg11.txt"
urllib.request.urlretrieve(dataurl, "wonderland.txt.gz")
with gzip.open('wonderland.txt.gz') as f:
    data=f.read()

<div class="alert alert-block alert-warning">
    1. Characters Prediction
</div>

### Extract the Unique Characters

In [252]:
# 1.
Data = str(data.decode(encoding='utf-8'))
print('Length of text: {} characters'.format(len(Data)))
Uniq = sorted(set(list(Data)))
print('{} unique characters'.format(len(Uniq)))
chars = []
for key in Uniq:
    chars.append([key,Data.count(key)])

Length of text: 167516 characters
86 unique characters


### Sorting Unique Characters into indexed Dictionary

In [253]:
# 2.
char_to_int = {}
for i in range(len(chars)):
    char_to_int[chars[i][0]] = i
int_to_char = dict(enumerate(Uniq))
print(char_to_int)

{'\n': 0, '\r': 1, ' ': 2, '!': 3, '"': 4, '#': 5, '$': 6, '%': 7, "'": 8, '(': 9, ')': 10, '*': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, ';': 27, '?': 28, '@': 29, 'A': 30, 'B': 31, 'C': 32, 'D': 33, 'E': 34, 'F': 35, 'G': 36, 'H': 37, 'I': 38, 'J': 39, 'K': 40, 'L': 41, 'M': 42, 'N': 43, 'O': 44, 'P': 45, 'Q': 46, 'R': 47, 'S': 48, 'T': 49, 'U': 50, 'V': 51, 'W': 52, 'X': 53, 'Y': 54, 'Z': 55, '[': 56, ']': 57, '_': 58, 'a': 59, 'b': 60, 'c': 61, 'd': 62, 'e': 63, 'f': 64, 'g': 65, 'h': 66, 'i': 67, 'j': 68, 'k': 69, 'l': 70, 'm': 71, 'n': 72, 'o': 73, 'p': 74, 'q': 75, 'r': 76, 's': 77, 't': 78, 'u': 79, 'v': 80, 'w': 81, 'x': 82, 'y': 83, 'z': 84, '\ufeff': 85}


### Input Sequences ( length = 100 - window size = 1 )

In [254]:
# 3.
Seq = []
for i in range(0,len(Data)-101):
    seq = Data[i:i+100]
    Seq.append([seq, Data[i+100]])
print(Seq[:3])

[["\ufeffProject Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use", ' '], ["Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use ", 'o'], ["roject Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use o", 'f']]


### Represinting Sequences as Integers & Encoding Target Character

In [255]:
# 4.
dataX, dataY = [], []
def Compare(word):
    res = 0
    for letter in word:
        if letter in char_to_int:
            res += char_to_int[letter]
    return res

def Comparex(word):
    res = []
    for letter in word:
        if letter in char_to_int:
            res.append(char_to_int[letter])
    return res

for i in range(len(Seq)):
    letter = [0 for _ in range(len(char_to_int))]
    letter[Compare(Seq[i][1])] = 1
    dataX.append(Comparex(Seq[i][0]))
    dataY.append(letter)
dataX = np.array(dataX)
dataY = np.array(dataY)

batch_size = 180
dataX = tf.reshape(dataX, [167415, 100,1])
print(np.shape(dataX))
dataY = tf.reshape(dataY, [167415, 86])
print(np.shape(dataY))

(167415, 100, 1)
(167415, 86)


### Simple LSTM Model

In [256]:
# 5.

Chlen = len(Uniq)
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(100, input_shape=[None,1]),
    tf.keras.layers.Dense(Chlen, activation="softmax"), 
    ])
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_17 (LSTM)               (None, 100)               40800     
_________________________________________________________________
dense_17 (Dense)             (None, 86)                8686      
Total params: 49,486
Trainable params: 49,486
Non-trainable params: 0
_________________________________________________________________


In [257]:
# 6.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Model.h5")
history = model.fit(dataX, dataY, batch_size=180, epochs=10, callbacks=[checkpoint_cb])

Train on 167415 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Converting back to text & Testing

In [259]:
# 7.
Str = str()
test = tf.dtypes.cast(tf.reshape(dataX[1], [1,100,1]), tf.float32)
lastChar = model.predict_classes(test)
print('{} : is the Predicted Character'.format([int_to_char[lastChar[0]]]))
lastInputs = [int_to_char[x[0]] for x in list(np.asarray(dataX[1]))]
for x in lastInputs:
  Str += x
Str += int_to_char[lastChar[0]]
print('Converted input integers: {}'.format([Str]))
TestSeq = np.asarray(dataX[1]).flatten().reshape(100,1)
for i in range(300):
  test = tf.reshape(TestSeq, [1,len(TestSeq),1])
  test = tf.dtypes.cast(test, tf.float32)
  x = model.predict_classes(test)
  TestSeq = np.append(TestSeq, x)
PredictedChars = [int_to_char[x] for x in list((TestSeq).flatten())]
PredStrs = str()
for x in PredictedChars:
  PredStrs += x
print('Predicted 300 Characters: {}'.format([PredStrs]))

['t'] : is the Predicted Character
Converted input integers: ["Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use t"]
Predicted 300 Characters: ["Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll\r\n\r\nThis eBook is for the use to toe tooe to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane to the sane "]


<div class="alert alert-block alert-warning">
    2. Words Prediction
</div>

### PreProcessing

In [263]:
# 1. Removing symbols and splitting text into words
import re
Text = Data.lower()
Text = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|',r'',Text)
Text = Text.split()
print('Length Words: {}'.format(len(Text)))
# 2. Tekonizing text
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(Text)
# 3. Unique words
Uniqtxt = set(Text)
word_to_char = dict(enumerate(Uniqtxt))
print('Length Unique words: {}'.format(len(Uniqtxt)))
print('\n       Words: {}'.format(Text[:300]))
print('Unique words: {}'.format(Uniqtxt))

[encoded] = np.array(tokenizer.texts_to_sequences([Text])) - 1
print('\nConverted words: {}'.format(tokenizer.texts_to_sequences([Text[:10]])))
# 4. Making a Training and Testing sequence
Seqtxt = []
for i in range(0,len(Text)-101):
    seq = Text[i:i+100]
    Seqtxt.append([seq, Text[i+100]])
print('Training words[0]: {}'.format(Seqtxt[0][0]))
print('Target   words[0]: {}'.format([Seqtxt[0][1]]))
# 5. Encoding Sequences & Preparing sequence to fit into the Model
[encoded] = np.array(tokenizer.texts_to_sequences([Text])) - 1

tekonized = tokenizer.texts_to_sequences(Text)

dic = {}
for i in range(len(Text)):
  dic[Text[i]] = tekonized[i][0]
# 6.
textX, textY = [], []
for i in range(len(Seqtxt)):
  textX.append([dic[Seqtxt[i][0][x]] for x in range(len(Seqtxt[0][0]))])
  textY.append(dic[Seqtxt[i][1]])

textX = np.array(textX)
textY = np.array(textY)
print('\nTraining Shape: {}'.format(textX.shape))
print('Target   Shape: {}'.format(textY.shape))

textX = tf.reshape(textX, [26737, 100,1])
print('\nInput  Shape : {}'.format(np.shape(textX)))
textY = tf.reshape(textY, [26737, 1])
print('Labels Shape: {}'.format(np.shape(textY)))

Length Words: 26838
Length Unique words: 5141

       Words: ['project', 'gutenbergs', 'alices', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carrollthis', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'withalmost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'orreuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'includedwith', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorgtitle', 'alices', 'adventures', 'in', 'wonderlandauthor', 'lewis', 'carrollposting', 'date', 'june', '25', '2008', 'ebook', '11release', 'date', 'march', '1994last', 'updated', 'december', '20', '2011language', 'english', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'alices', 'adventures', 'in', 'wonderland', 'alices', 'adventures', 'in', 'wonderlandlewis', 'carrollthe', 'millennium', 'fulcrum', 'edition', '30chapter', 'i', 'down', 'the', 'rabbitholealice', 'was', 'beginn

### Simple LTSM Model

In [264]:
# 7. 
Chlen = len(Uniqtxt)
xmodel = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(100, input_shape=[None,1]),
    tf.keras.layers.Dense(Chlen, activation="softmax"), 
    ])
xmodel.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
xmodel.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_18 (LSTM)               (None, 100)               40800     
_________________________________________________________________
dense_18 (Dense)             (None, 5141)              519241    
Total params: 560,041
Trainable params: 560,041
Non-trainable params: 0
_________________________________________________________________


In [265]:
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("Modeltxt.h5")
history = xmodel.fit(textX, textY, batch_size=180, epochs=10, callbacks=[checkpoint_cb])

Train on 26737 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [266]:
test = tf.dtypes.cast(tf.reshape(textX[0], [1,100,1]), tf.float32)
lastChar = model.predict_classes(test)
lastChar

array([2])

### Converting back to text & Testing

In [270]:
Str = str()
test = tf.dtypes.cast(tf.reshape(textX[1], [1,100,1]), tf.float32)
lastChar = xmodel.predict_classes(test)
print('{} : is the Predicted Character'.format([word_to_char[lastChar[0]]]))
lastInputs = [word_to_char[x[0]] for x in list(np.asarray(textX[1]))]
for x in lastInputs:
  Str += ' ' + x
Str += word_to_char[lastChar[0]]
print('Converted input integers: {}'.format([Str]))
TestSeq = np.asarray(textX[1]).flatten().reshape(100,1)
for i in range(300):
  test = tf.reshape(TestSeq, [1,len(TestSeq),1])
  test = tf.dtypes.cast(test, tf.float32)
  x = xmodel.predict_classes(test)
  TestSeq = np.append(TestSeq, x)
PredictedChars = [word_to_char[x] for x in list((TestSeq).flatten())]
PredStrs = str()
for x in PredictedChars:
  PredStrs += ' ' + x
print('Predicted 300 Characters: {}'.format([PredStrs]))

['muchnessdid'] : is the Predicted Character
Converted input integers: [' arithmeticambitiondistraction spokeunimportant 4 neat glanced unimportant queenfirst watched law attached silent first withina roared hundred thinkyoure submittedto rules wheres theleaves mystery rules saw twinkling filled lived cutting worksunless creatures worksunless firsthold aminute worksunless witha first thebreadandbutterjust roared first pennyworth 1 sizes happy hatteralice law atteatime glassfrom submittedto ofconversation spokeunimportant 4 neat verycarefully queenfirst sortnext decidedly growingtooyes ebooks beganin law rather decidedly stretching sobs strictliability welltake pattern goingdown ou nose roared hatteralice pennyworth 1 law spokeunimportant 4 neat glanced spokeunimportant 4 neat walks ishould banquetwhat brave across topsof means couldthe first fur shark yawned inthis rightly busy executiononce roared postedwith unimportantmuchnessdid']
Predicted 300 Characters: [' arithmeticambitiondistr