In [1]:
import numpy as np
from jinja2 import optimizer
from tensorflow import keras
from keras_preprocessing.text import Tokenizer

In [2]:
tokenizer = Tokenizer(char_level=True)
file_name = './paradise_lost.txt'
text_1 = open(file_name, 'r').read().lower()
print('Total number of characters in the book: ', len(text_1))

Total number of characters in the book:  456649


In [5]:
# Create mapping between unique characters to integers and reverse 
tokenizer.fit_on_texts(text_1)
char_index = tokenizer.word_index
print('Found %s unique characters. ' % len(char_index))
print('Char to integer dictionary: ', char_index)

index_char = dict(enumerate(char_index.keys()))
print('Integer to char dictionary: ', index_char)

Found 40 unique characters. 
Char to integer dictionary:  {' ': 1, 'e': 2, 't': 3, 'o': 4, 'a': 5, 'h': 6, 'n': 7, 'i': 8, 's': 9, 'r': 10, 'd': 11, 'l': 12, '\n': 13, 'u': 14, ',': 15, 'm': 16, 'f': 17, 'w': 18, 'g': 19, 'c': 20, 'p': 21, 'b': 22, 'y': 23, 'v': 24, '’': 25, '_': 26, 'k': 27, ';': 28, '.': 29, ':': 30, 'j': 31, 'x': 32, '?': 33, 'q': 34, '-': 35, 'z': 36, '&': 37, '(': 38, ')': 39, '!': 40}
Integer to char dictionary:  {0: ' ', 1: 'e', 2: 't', 3: 'o', 4: 'a', 5: 'h', 6: 'n', 7: 'i', 8: 's', 9: 'r', 10: 'd', 11: 'l', 12: '\n', 13: 'u', 14: ',', 15: 'm', 16: 'f', 17: 'w', 18: 'g', 19: 'c', 20: 'p', 21: 'b', 22: 'y', 23: 'v', 24: '’', 25: '_', 26: 'k', 27: ';', 28: '.', 29: ':', 30: 'j', 31: 'x', 32: '?', 33: 'q', 34: '-', 35: 'z', 36: '&', 37: '(', 38: ')', 39: '!'}


In [6]:
# Creating input Tensor and Output Vectors
char_len = len(text_1)
seq_lenth = 5
data_X = []
data_y = []
for i in range(0, char_len - seq_lenth, 1):
    input_seq = text_1[i:i + seq_lenth]
    output_seq = text_1[i + seq_lenth]
    data_X.append([char_index[char] for char in input_seq])
    data_y.append(char_index[output_seq])
n_patterns = len(data_X)
print('Total number of patterns', n_patterns)

# Print first 10 elements in data_X
print(data_X[:10])
print(data_y[:10])

Total number of patterns 456644
[[1, 21, 5, 10, 5], [21, 5, 10, 5, 11], [5, 10, 5, 11, 8], [10, 5, 11, 8, 9], [5, 11, 8, 9, 2], [11, 8, 9, 2, 1], [8, 9, 2, 1, 12], [9, 2, 1, 12, 4], [2, 1, 12, 4, 9], [1, 12, 4, 9, 3]]
[11, 8, 9, 2, 1, 12, 4, 9, 3, 13]


In [7]:
# Reshaping data - to create numpy arrays
X = np.reshape(data_X, (n_patterns, seq_lenth, 1))
# Normalizing data - by dividing each element of the array by the number of unique characters in the book
X = X / len(char_index)
# One-hot encoding for y vector
y = keras.utils.to_categorical(data_y)
## Printing first 3 elements
print(X[:3])
print(y[:3])

[[[0.025]
  [0.525]
  [0.125]
  [0.25 ]
  [0.125]]

 [[0.525]
  [0.125]
  [0.25 ]
  [0.125]
  [0.275]]

 [[0.125]
  [0.25 ]
  [0.125]
  [0.275]
  [0.2  ]]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [12]:
print('Shape of X :', X.shape)
print('Shape of first element: ', X.shape[1])
print('Shape of second element: ', X.shape[2])

Shape of X : (456644, 5, 1)
Shape of first element:  5
Shape of second element:  1


In [13]:
# Building Recurrent Neural Networks using LSTM (GRU can be used as a substitute for LSTM)
model = keras.Sequential([
    keras.layers.LSTM((256), return_sequences=False, input_shape=(X.shape[1], X.shape[2])),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(y.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy')
print(model.summary())

Metal device set to: Apple M1


2022-02-13 13:10:34.692307: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-13 13:10:34.693446: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 256)               264192    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 41)                10537     
                                                                 
Total params: 274,729
Trainable params: 274,729
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# Save the model
filepath='weights-improvement-{epoch}-{loss:.4f}.hdf5'
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# Fit the model
model.fit(X, y, epochs=30, batch_size=128, callbacks=callbacks_list)

2022-02-13 13:19:55.457616: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/30


2022-02-13 13:19:55.903727: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-13 13:19:56.103746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-13 13:19:57.591248: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 00001: loss improved from inf to 2.90667, saving model to weights-improvement-1-2.9067.hdf5
Epoch 2/30
Epoch 00002: loss improved from 2.90667 to 2.77500, saving model to weights-improvement-2-2.7750.hdf5
Epoch 3/30
Epoch 00003: loss improved from 2.77500 to 2.71400, saving model to weights-improvement-3-2.7140.hdf5
Epoch 4/30
Epoch 00004: loss improved from 2.71400 to 2.65903, saving model to weights-improvement-4-2.6590.hdf5
Epoch 5/30
Epoch 00005: loss improved from 2.65903 to 2.60842, saving model to weights-improvement-5-2.6084.hdf5
Epoch 6/30
Epoch 00006: loss improved from 2.60842 to 2.55391, saving model to weights-improvement-6-2.5539.hdf5
Epoch 7/30
Epoch 00007: loss improved from 2.55391 to 2.48816, saving model to weights-improvement-7-2.4882.hdf5
Epoch 8/30
Epoch 00008: loss improved from 2.48816 to 2.41076, saving model to weights-improvement-8-2.4108.hdf5
Epoch 9/30
Epoch 00009: loss improved from 2.41076 to 2.32491, saving model to weights-improvement-9-2.3249.hdf

<keras.callbacks.History at 0x16b7f64f0>

In [38]:
# Loading the weights file
model.load_weights('weights-improvement-30-1.7436.hdf5')
print('Total Number of Patterns :', len(data_X))

start = np.random.randint(50, 100)
print('Starting Random Number: ', start)
pattern = data_X[start]
print([''.join(index_char[value]) for value in pattern])

Total Number of Patterns : 456644
Starting Random Number:  53
['i', 'e', 'f', 's', '\n']


In [39]:
# Generate Characters
txt_fl = []

for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / len(char_index)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = index_char[index].rstrip('\n\r')
    seq_in = [index_char[value] for value in pattern]
    # Print result
    txt_fl.append(result)
    pattern.append(index)
    pattern = pattern[1: len(pattern)]

print(''.join(txt_fl))

trraieeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtseeiofrheevbsrnftiedatm,shhitktiotepthdttdedift;shiuoyegedriuectrtse
