In [3]:
# !pip install numpy
# !pip install tensorflow

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding
import requests

In [2]:
# instantiate model
model = keras.Sequential()
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# the output of SimpleRNN will be a 2d tensor of shape (batch_size, 128)
model.add(layers.SimpleRNN(128))
# add an additional hidden layer
model.add(layers.Dense(10))

# view architecture
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          64000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 128)               24704     
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
Total params: 89,994
Trainable params: 89,994
Non-trainable params: 0
_________________________________________________________________


2021-12-09 07:40:54.172766: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# LSTM network example
model = keras.Sequential()
# add an embedding layer
model.add(layers.Embedding(input_dim=1000,output_dim=64))
# add a LSTM layer
model.add(layers.LSTM(128))
# add a dense layer
model.add(layers.Dense(10))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
Total params: 164,106
Trainable params: 164,106
Non-trainable params: 0
_________________________________________________________________


In [4]:
# load text
url = "https://raw.githubusercontent.com/bloominstituteoftechnology/data-science-practice-datasets/main/unit_4/sherlock.txt"
res = requests.get(url)
text = res.text
text = text.replace('\r\n',' ')

In [5]:
# encode data as chars
# find unique chars
chars = list(set(text))
# lookup tables
char_int = {c:i for i, c in enumerate(chars)}
int_char = {i:c for i, c in enumerate(chars)}

print('The number of unique characters in the text:', len(chars))

The number of unique characters in the text: 91


In [6]:
# create the sequence data
maxlen = 40
step = 5

# encode the characters using the lookup tables
encoded = [char_int[c] for c in text]

# init empty lists to hold the sequences
sequences = [] # each element is 40 chars long
next_char = [] # one element for each sequence

# loop through the entire text
for i in range(0, len(encoded) - maxlen, step):
    sequences.append(encoded[i:i+maxlen])
    next_char.append(encoded[i+maxlen])
    
print('sequences:', len(sequences))

sequences: 54974


In [7]:
# pad sequences so all are equal
seq = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=40)
# create x & y, create arrays of zeros (False)
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sequences), len(chars)), dtype=bool)
# turn on the location (set to True) when the character is present
for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i,t,char] = 1
    y[i,next_char[i]] = 1

In [8]:
model = Sequential()
model.add(Embedding(output_dim=64, input_dim=len(chars)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [12]:
# fit model
model.fit(seq, y, batch_size=32, epochs=5, verbose=2)

Epoch 1/5
1718/1718 - 21s - loss: 2.1951 - 21s/epoch - 12ms/step
Epoch 2/5
1718/1718 - 23s - loss: 2.0643 - 23s/epoch - 14ms/step
Epoch 3/5
1718/1718 - 24s - loss: 1.9871 - 24s/epoch - 14ms/step
Epoch 4/5
1718/1718 - 24s - loss: 1.9271 - 24s/epoch - 14ms/step
Epoch 5/5
1718/1718 - 23s - loss: 1.8757 - 23s/epoch - 14ms/step


<keras.callbacks.History at 0x7fcad036b820>

In [13]:
def generate_text(model, seed, length):
    encoded = [char_int[c] for c in seed]
    generated = ''
    generated += seed
    model.reset_states()
    start_index = 0
    
    for _ in range(length):
        sample = encoded[start_index:start_index+10]
        sample = np.array(sample)
        sample = np.expand_dims(sample, 0)
        
        pred = model.predict(sample)
        pred = tf.squeeze(pred, 0)
        next_char = np.argmax(pred)
        encoded.append(next_char)
        generated += int_char[next_char]
        
        start_index += 1
    return generated

In [14]:
# set seed text which model will use to generate predicted text
seed_text = "I have no data yet it is a capital mistake to theorise before one has data insensibly one begins to twist facts to suit theories"

generate_text(model, seed_text, 400)

'I have no data yet it is a capital mistake to theorise before one has data insensibly one begins to twist facts to suit theoriesmothrtoa hn tn t sonenetltant re th the r ne te ore tf  had hothrtn trt nee af  ae hng ah thes  tarh  ah ttptethe r nd er e  rttaoahothohu     e r  aeatheahe coooathrtf  thotmav tau e  etheeao  t  tm dhaoamneahe  tmhte tm eahoear e aeooohnttvsmeoan     er rommmmvtetmsdd e r  aoumnn e  atmheu enehhtttvtmvhernldtmhtmheIeetdde rr damheeerthehnre dd tvhsdnnn    eeernr tmmmmveteueeeeeor eteetvteoms    '

In [15]:
# train with more epochs
model.fit(seq, y, batch_size=32, epochs=10, verbose=2)

Epoch 1/10
1718/1718 - 24s - loss: 1.8326 - 24s/epoch - 14ms/step
Epoch 2/10
1718/1718 - 24s - loss: 1.7923 - 24s/epoch - 14ms/step
Epoch 3/10
1718/1718 - 24s - loss: 1.7558 - 24s/epoch - 14ms/step
Epoch 4/10
1718/1718 - 24s - loss: 1.7192 - 24s/epoch - 14ms/step
Epoch 5/10
1718/1718 - 25s - loss: 1.6856 - 25s/epoch - 14ms/step
Epoch 6/10
1718/1718 - 23s - loss: 1.6532 - 23s/epoch - 13ms/step
Epoch 7/10
1718/1718 - 24s - loss: 1.6204 - 24s/epoch - 14ms/step
Epoch 8/10
1718/1718 - 25s - loss: 1.5898 - 25s/epoch - 14ms/step
Epoch 9/10
1718/1718 - 24s - loss: 1.5595 - 24s/epoch - 14ms/step
Epoch 10/10
1718/1718 - 25s - loss: 1.5311 - 25s/epoch - 14ms/step


<keras.callbacks.History at 0x7fcae07b8a90>

In [17]:
generate_text(model, seed_text, 400)

'I have no data yet it is a capital mistake to theorise before one has data insensibly one begins to twist facts to suit theoriesborhiwoa tt ws tmdaspssrltant ne hh she smnedoee r  anl miv boytiws tr  nle afe ae hng ah this htattieth tuttethenddnd eueonirrirwCa the   e eea rg oowwaaaao caaan usdoeaImdymaneoei sna ahattyy t ost dtaoamn ahes saorhecdh ahr en  r     hvgdt nsrgggarpherlI dbvnrceotndn rrrrdhrmgrdIs ewst’   an  t natetlrrde   bho t aaant e t    ahi derte an aavgt emm mmaeaeaooee eldeeer o soue eomn e  oeieeaaaaee'