### Language Modelling using Stacked LSTM

In [1]:
import requests
art_of_war = requests.get('https://raw.githubusercontent.com/futuremojo/nlp-demystified/main/datasets/art_of_war.txt').text

art_of_war[:300]

'1. Sun Tzŭ said: The art of war is of vital importance to the State.\n\n2. It is a matter of life and death, a road either to safety or to\nruin. Hence it is a subject of inquiry which can on no account be\nneglected.\n\n3. The art of war, then, is governed by five constant factors, to be\ntaken into accou'

In [2]:
from keras_preprocessing.text import Tokenizer
tok=Tokenizer(char_level=True)
tok.fit_on_texts([art_of_war])

In [3]:
print(f'character vocab size={len(tok.word_index)}')
#this is much smaller than what would have happeded if we tokenized words
print(tok.word_index)

character vocab size=56
{' ': 1, 'e': 2, 't': 3, 'i': 4, 'n': 5, 'o': 6, 'a': 7, 's': 8, 'r': 9, 'h': 10, 'l': 11, 'd': 12, 'u': 13, '\n': 14, 'c': 15, 'f': 16, 'm': 17, 'y': 18, 'g': 19, 'w': 20, '.': 21, 'p': 22, 'b': 23, ',': 24, 'v': 25, 'k': 26, '1': 27, ';': 28, '2': 29, '3': 30, '4': 31, '_': 32, '(': 33, ')': 34, '5': 35, '’': 36, '-': 37, 'q': 38, '6': 39, 'x': 40, ':': 41, '7': 42, '0': 43, '8': 44, '9': 45, 'j': 46, 'z': 47, '—': 48, 'ŭ': 49, '?': 50, '!': 51, 'œ': 52, '“': 53, '”': 54, 'ü': 55, '‘': 56}


In [4]:
seq=tok.texts_to_sequences([art_of_war])[0]
print(f'text length={len(seq)}')
print(seq)

text length=61054
[27, 21, 1, 8, 13, 5, 1, 3, 47, 49, 1, 8, 7, 4, 12, 41, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 1, 4, 8, 1, 6, 16, 1, 25, 4, 3, 7, 11, 1, 4, 17, 22, 6, 9, 3, 7, 5, 15, 2, 1, 3, 6, 1, 3, 10, 2, 1, 8, 3, 7, 3, 2, 21, 14, 14, 29, 21, 1, 4, 3, 1, 4, 8, 1, 7, 1, 17, 7, 3, 3, 2, 9, 1, 6, 16, 1, 11, 4, 16, 2, 1, 7, 5, 12, 1, 12, 2, 7, 3, 10, 24, 1, 7, 1, 9, 6, 7, 12, 1, 2, 4, 3, 10, 2, 9, 1, 3, 6, 1, 8, 7, 16, 2, 3, 18, 1, 6, 9, 1, 3, 6, 14, 9, 13, 4, 5, 21, 1, 10, 2, 5, 15, 2, 1, 4, 3, 1, 4, 8, 1, 7, 1, 8, 13, 23, 46, 2, 15, 3, 1, 6, 16, 1, 4, 5, 38, 13, 4, 9, 18, 1, 20, 10, 4, 15, 10, 1, 15, 7, 5, 1, 6, 5, 1, 5, 6, 1, 7, 15, 15, 6, 13, 5, 3, 1, 23, 2, 14, 5, 2, 19, 11, 2, 15, 3, 2, 12, 21, 14, 14, 30, 21, 1, 3, 10, 2, 1, 7, 9, 3, 1, 6, 16, 1, 20, 7, 9, 24, 1, 3, 10, 2, 5, 24, 1, 4, 8, 1, 19, 6, 25, 2, 9, 5, 2, 12, 1, 23, 18, 1, 16, 4, 25, 2, 1, 15, 6, 5, 8, 3, 7, 5, 3, 1, 16, 7, 15, 3, 6, 9, 8, 24, 1, 3, 6, 1, 23, 2, 14, 3, 7, 26, 2, 5, 1, 4, 5, 3, 6, 1, 7, 15, 15,

In [5]:
# a sanity check
print(tok.sequences_to_texts([seq[:15]]))    

['1 .   s u n   t z ŭ   s a i d']


Our training data is currently one long sequence which we'll need to segment into training examples. To do this, we'll use the Tensorflow Data API which makes it easy to build preprocessing pipelines by chaining operations together.
To use this API, we need to first convert our vectorized corpus (which is a plain Python list) into a Dataset object, which we can do using the from_tensor_slices method. This takes our vectorized corpus and returns a sequence of tensors, one tensor for each integer. We'll then be able to perform operations on this Dataset object to prep our data.

In [6]:
import tensorflow as tf
slices=tf.data.Dataset.from_tensor_slices(seq)

In [7]:
type(slices)

tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset

Now we need to create our training data. Unlike other problems here we don't have labelled data. Also labelled data does not make sense here.<br>
Till now we have the entire text as list of integers representing tokens. 
The format is similar to
[1,2,3,4,5,6,7,8,9,10]<br>
We create moving windows of a size say 5 (input timestep), set the shift=1 (window size=6) and split the text into smaller arrays<br>
[1,2,3,4,5,6],  [2,3...7], [3,4,..8], [4,5...9] and [5,6...10]<br>
Now for each array the X_label and y_label will be element and its subsequent element like <br>
1 -> 2 <br>
2 -> 3<br>
3 -> 4 <br>
4 -> 5<br>
5 -> 6<br>
and the same is repeated for each arrays.

In [8]:
input_timesteps=100
window_size=input_timesteps+1
windows=slices.window(window_size, shift=1, drop_remainder=True)
#drop remainder ignores window sizes less than 101

In [9]:
print(windows)#dataset of datasets more like 2d list
for window in windows.take(2):
    print(window)

<_WindowDataset element_spec=DatasetSpec(TensorSpec(shape=(), dtype=tf.int32, name=None), TensorShape([]))>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>
<_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>


But our model won't accept dataset of datasets. It'll accept only tensors, so we need to extract the tensors from each window. To do that, we'll use flat_map which will flatten the dataset of datasets into a single dataset of elements. But because we want to retain our segmented sequences, we'll also pass in a batch function to maintain the segments (otherwise, we'll just get back one large tensor representing our whole corpus).

In [10]:
dataset=windows.flat_map(lambda window: window.batch(window_size))
#now we have a dataset of tensors where each tensor is 101 dimensional

In [11]:
for tensor in dataset.take(2):
    print(tensor)

tf.Tensor(
[27 21  1  8 13  5  1  3 47 49  1  8  7  4 12 41  1  3 10  2  1  7  9  3
  1  6 16  1 20  7  9  1  4  8  1  6 16  1 25  4  3  7 11  1  4 17 22  6
  9  3  7  5 15  2  1  3  6  1  3 10  2  1  8  3  7  3  2 21 14 14 29 21
  1  4  3  1  4  8  1  7  1 17  7  3  3  2  9  1  6 16  1 11  4 16  2  1
  7  5 12  1 12], shape=(101,), dtype=int32)
tf.Tensor(
[21  1  8 13  5  1  3 47 49  1  8  7  4 12 41  1  3 10  2  1  7  9  3  1
  6 16  1 20  7  9  1  4  8  1  6 16  1 25  4  3  7 11  1  4 17 22  6  9
  3  7  5 15  2  1  3  6  1  3 10  2  1  8  3  7  3  2 21 14 14 29 21  1
  4  3  1  4  8  1  7  1 17  7  3  3  2  9  1  6 16  1 11  4 16  2  1  7
  5 12  1 12  2], shape=(101,), dtype=int32)


In [12]:
batch_size=32
batches=dataset.batch(batch_size)

In [13]:
for batch in batches.take(2):
    print(batch)

tf.Tensor(
[[27 21  1 ... 12  1 12]
 [21  1  8 ...  1 12  2]
 [ 1  8 13 ... 12  2  7]
 ...
 [ 7  9  1 ...  2  3 18]
 [ 9  1  4 ...  3 18  1]
 [ 1  4  8 ... 18  1  6]], shape=(32, 101), dtype=int32)
tf.Tensor(
[[ 4  8  1 ...  1  6  9]
 [ 8  1  6 ...  6  9  1]
 [ 1  6 16 ...  9  1  3]
 ...
 [ 1  8  3 ... 23 46  2]
 [ 8  3  7 ... 46  2 15]
 [ 3  7  3 ...  2 15  3]], shape=(32, 101), dtype=int32)


In [30]:
xy_batches = batches.map(lambda batch: (batch[:, :-1], batch[:, 1:]))
#this is the input and target as discussed earlier

for x,y in xy_batches.take(1):
    print('Input',x)
    print('Target',y)
    print()

Input tf.Tensor(
[[27 21  1 ...  5 12  1]
 [21  1  8 ... 12  1 12]
 [ 1  8 13 ...  1 12  2]
 ...
 [ 7  9  1 ... 16  2  3]
 [ 9  1  4 ...  2  3 18]
 [ 1  4  8 ...  3 18  1]], shape=(32, 100), dtype=int32)
Target tf.Tensor(
[[21  1  8 ... 12  1 12]
 [ 1  8 13 ...  1 12  2]
 [ 8 13  5 ... 12  2  7]
 ...
 [ 9  1  4 ...  2  3 18]
 [ 1  4  8 ...  3 18  1]
 [ 4  8  1 ... 18  1  6]], shape=(32, 100), dtype=int32)



Now we'll one hot encode the inputs but not the targets. We're not using embeddings for the input. We can, but since this is a character model with just a few dozen possible choices, we can get away with one-hot encoding. There's also no reason to think a particular letter should be closer to another in vector space as we would want in a word-level model.

In [31]:
num_tokens=len(tok.word_index)+1 

xy_batches=xy_batches.map(lambda inputs, labels: (tf.one_hot(inputs, num_tokens), labels))

In [32]:
for batch in xy_batches.take(1):
    print('x1:', batch[0][0])
    print('y1:', batch[1][0])

x1: tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]], shape=(100, 57), dtype=float32)
y1: tf.Tensor(
[21  1  8 13  5  1  3 47 49  1  8  7  4 12 41  1  3 10  2  1  7  9  3  1
  6 16  1 20  7  9  1  4  8  1  6 16  1 25  4  3  7 11  1  4 17 22  6  9
  3  7  5 15  2  1  3  6  1  3 10  2  1  8  3  7  3  2 21 14 14 29 21  1
  4  3  1  4  8  1  7  1 17  7  3  3  2  9  1  6 16  1 11  4 16  2  1  7
  5 12  1 12], shape=(100,), dtype=int32)


In [34]:
#since our model will be a bit complex we add an optimization step
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [35]:
from keras import Sequential
from keras.layers import LSTM, Dense
model=Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(None, num_tokens)))
model.add(LSTM(128, return_sequences=True, input_shape=(None, num_tokens)))
model.add(Dense(num_tokens, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
model.fit(xy_batches, epochs=50)

Epoch 1/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 55ms/step - accuracy: 0.2407 - loss: 2.7803
Epoch 2/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 50ms/step - accuracy: 0.4160 - loss: 1.9771
Epoch 3/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 50ms/step - accuracy: 0.4790 - loss: 1.7489
Epoch 4/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 66ms/step - accuracy: 0.5190 - loss: 1.5980
Epoch 5/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 103ms/step - accuracy: 0.5529 - loss: 1.4792
Epoch 6/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 53ms/step - accuracy: 0.5793 - loss: 1.3760
Epoch 7/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 53ms/step - accuracy: 0.6075 - loss: 1.2874
Epoch 8/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 121ms/step - accuracy: 0.6297 - loss: 1.2074


<keras.src.callbacks.history.History at 0x2a68e5e96d0>

In [41]:
model.save('art_of_war.keras')

In [42]:
from keras.models import load_model
model=load_model('art_of_war.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


In [74]:
def generate_text(model, tokenizer, seed, num_chars, temperature=1.0):
    text=seed
    for _ in range(num_chars):
        text_input_seq=tokenizer.texts_to_sequences([text[-input_timesteps:]])[0]
        text_input_one_hot=tf.one_hot(text_input_seq, num_tokens)

        pred=model.predict(tf.expand_dims(text_input_one_hot, axis=0))[0, -1, :]#we only want the last character
        preds=tf.math.log(pred)/temperature

        next_char=tf.random.categorical(tf.expand_dims(preds, axis=0), num_samples=1)
        next_char=tokenizer.sequences_to_texts([next_char.numpy()][0])[0]

        text+=next_char
    
    return text

In [79]:
print(generate_text(model, tok, "It's time to release the Kraken when", num_chars=300, temperature=1.0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24

In [80]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    #saving tokenizer

In [82]:
with open('model.pickle', 'wb') as handle: 
    pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    #saving model