## Text generation with Char RNN

In [None]:
import numpy as np
import tensorflow as tf

# Loading text data

In [2]:
# Fetch the Shakespeare corpus; get_file stores it locally and returns the path.
shakespear_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file(fname='shakespear.txt',
                                   origin=shakespear_url)

In [3]:
filepath

'/home/vi0/.keras/datasets/shakespear.txt'

In [4]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned so the result does not depend on the platform's
# default locale.
with open(filepath, encoding="utf-8") as f:
    shakespear_txt = f.read()

In [5]:
shakespear_txt[:80]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.'

In [None]:
# Character-level tokenizer: the text is lower-cased, then split so that
# every single character becomes its own token.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize='lower',
    split='character',
)

# Tokenizing and encoding at character level

In [7]:
text_vec_layer.adapt([shakespear_txt])

In [8]:
text_vec_layer.get_vocabulary()

['',
 '[UNK]',
 ' ',
 'e',
 't',
 'o',
 'a',
 'i',
 'h',
 's',
 'r',
 'n',
 '\n',
 'l',
 'd',
 'u',
 'm',
 'y',
 'w',
 ',',
 'c',
 'f',
 'g',
 'b',
 'p',
 ':',
 'k',
 'v',
 '.',
 "'",
 ';',
 '?',
 '!',
 '-',
 'j',
 'q',
 'x',
 'z',
 '3',
 '&',
 '$']

In [9]:
encoded = text_vec_layer([shakespear_txt])[0]

In [10]:
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [11]:
# Codes 0 and 1 are reserved by the vectorization layer for padding and unknown
# characters, so real characters start at 2. Shift every ID down by 2 so the
# character IDs start at 0.
# The shifted IDs are recomputed from the raw text (instead of `encoded -= 2`)
# so that re-running this cell never applies the shift twice.
encoded = text_vec_layer([shakespear_txt])[0] - 2

# Vocabulary size without the two reserved entries.
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

In [12]:
n_tokens

39

# Dataset windowing with a single char shift

In [26]:
ds = tf.data.Dataset.from_tensor_slices(encoded)

In [27]:
ds = ds.window(5, shift=1)

In [None]:
# Scratch note (kept as a comment because the names are undefined and the
# cell would raise NameError on Run All):
# [[ [vx, vy], [feats] ], [], []]

In [None]:
# Scratch note (not valid Python, kept as a comment so the notebook can run
# top to bottom):
# [
# 0 [0,..., 12],
# [0,..., 12]
# ...
# 31 ...
# ]

In [28]:
# Peek at the first three windows; each window is iterated element by element.
for sample in ds.take(3):
    print('\n')
    for element in sample:
        print(element)



tf.Tensor(19, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)


tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)


tf.Tensor(8, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(2, shape=(), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(18, shape=(), dtype=int64)


In [29]:
# Collapse each window (a nested dataset of scalars) into a single tensor.
# The inner batch size equals the window size used in ds.window(5, ...), so
# every window becomes exactly one tensor.
ds = ds.flat_map(lambda window_ds: window_ds.batch(5))
# Group the flattened windows in pairs.
ds = ds.batch(2)

In [30]:
# Peek at the first three batches; iterating a batch yields its windows.
for sample in ds.take(3):
    print('\n')
    for window in sample:
        print(window)



tf.Tensor([19  5  8  7  2], shape=(5,), dtype=int64)
tf.Tensor([5 8 7 2 0], shape=(5,), dtype=int64)


tf.Tensor([ 8  7  2  0 18], shape=(5,), dtype=int64)
tf.Tensor([ 7  2  0 18  5], shape=(5,), dtype=int64)


tf.Tensor([ 2  0 18  5  2], shape=(5,), dtype=int64)
tf.Tensor([ 0 18  5  2  5], shape=(5,), dtype=int64)


In [18]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a long token sequence into batched (input, target) windows.

    Args:
        sequence: 1-D sequence of token IDs.
        length: Number of tokens in each input window. Each target window is
            the input window shifted one token ahead.
        shuffle: If True, shuffle the windows before batching.
        seed: Seed forwarded to the shuffle.
        batch_size: Number of windows per batch.

    Returns:
        A tf.data.Dataset yielding (inputs, targets) pairs; both have `length`
        tokens per row.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    # length + 1 tokens per window: the extra token supplies the last target.
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    # Each window is a nested dataset; batch it into a single tensor.
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    ds = ds.batch(batch_size)
    # Inputs drop the last token, targets drop the first one.
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

# Preparing train, val, test datasets

In [22]:
length = 100
tf.random.set_seed(42)

In [23]:
# Split the encoded text in order: the first 1,000,000 tokens for training,
# the next 60,000 for validation, and everything after that for testing.
# Only the training windows are shuffled.
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
# The test slice must start where validation ends; slicing [:1_060_000] would
# evaluate on the training and validation data.
test_set = to_dataset(encoded[1_060_000:], length=length)

# This model has as many outputs as tokens, so with character-level tokenization it is relatively small. With word-level tokens it would probably be intractable.

In [24]:
# Char-RNN: embed the token IDs, run a GRU over the sequence, and emit a
# softmax over the vocabulary at every time step.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(n_tokens, activation='softmax'))


In [25]:
# Targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Save the model whenever validation accuracy improves.
model_chkpt = tf.keras.callbacks.ModelCheckpoint(
    filepath='shakespear_model',
    monitor='val_accuracy',
    save_best_only=True,
)

In [27]:
# Train for 10 epochs, checkpointing the best model by validation accuracy.
# `callbacks` is documented as a list of callbacks, so the single checkpoint
# callback is wrapped in one.
history = model.fit(train_set,
                    validation_data=valid_set,
                    epochs=10,
                    callbacks=[model_chkpt])

Epoch 1/10


2023-09-24 13:05:02.270637: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2023-09-24 13:05:02.287390: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-09-24 13:05:02.349004: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff608ea8390 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-09-24 13:05:02.349032: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-09-24 13:05:02.361616: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-09-24 13:05:02.510112: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifeti

  31246/Unknown - 133s 4ms/step - loss: 1.3938 - accuracy: 0.5737

2023-09-24 13:07:08.980296: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 16739376533431514147
2023-09-24 13:07:08.980324: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 14144194180811173346
2023-09-24 13:07:08.980334: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 5286058283677343576
2023-09-24 13:07:13.303978: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 16739376533431514147
2023-09-24 13:07:13.304003: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 14144194180811173346
2023-09-24 13:07:13.304007: I tensorflow/core/framework/local_rendezvous.cc:409] Local rendezvous send item cancelled. Key hash: 8376685432050931232
2023-09-24 13:07:13.304016: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv

INFO:tensorflow:Assets written to: shakespear_model/assets


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 2/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 3/10
Epoch 4/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 5/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 6/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 7/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 8/10
Epoch 9/10


INFO:tensorflow:Assets written to: shakespear_model/assets


Epoch 10/10


In [28]:
# Previously trained model got dataset based on encoded input
# Here we allow for pure text ingestion during inference with encoding/tokenization done
# within the model

# Wrap the trained model so it accepts raw strings:
# tokenize -> shift IDs down by 2 (same shift as the training data) -> char-RNN.
final_model = tf.keras.Sequential()
final_model.add(text_vec_layer)
final_model.add(tf.keras.layers.Lambda(lambda x: x-2))
final_model.add(model)

# Direct predictions from the model

In [30]:
y_proba = final_model.predict(["To be or not to b"])



In [34]:
len("To be or not to b")

17

In [32]:
# A batch of one sentence of length 17 returned
# with a probability distribution over
# 39 possible chars/tokens
y_proba.shape

(1, 17, 39)

In [35]:
# The returned model output covers the whole sequence: one probability
# distribution per input position, each predicting the following character.
# The last position is the prediction for the single new character to append.
# We take it for inspection
y_proba[0, -1]

array([1.86649723e-11, 8.35158348e-01, 1.68423383e-11, 2.72461679e-02,
       1.22674080e-02, 1.55899301e-02, 4.27650519e-08, 4.20408917e-08,
       4.73230183e-02, 1.17821635e-07, 2.32666122e-12, 2.66743433e-02,
       3.45763862e-10, 2.94391215e-02, 1.32026353e-05, 6.28691632e-03,
       1.05333179e-07, 7.68806546e-11, 2.79281893e-12, 3.24001732e-08,
       7.74614510e-15, 1.09816233e-06, 7.41741735e-09, 4.17692964e-12,
       2.63939721e-11, 2.98858112e-08, 5.17616297e-11, 1.57950417e-08,
       1.20890901e-11, 1.99622055e-11, 7.75813441e-12, 1.21248827e-11,
       4.28895390e-08, 2.21203394e-12, 1.66966146e-11, 4.37479081e-11,
       3.57669505e-18, 1.65201681e-19, 1.76836897e-21], dtype=float32)

In [36]:
y_proba = y_proba[0, -1]

In [37]:
y_pred = tf.argmax(y_proba)

In [40]:
# Correctly predicted character
text_vec_layer.get_vocabulary()[y_pred + 2]

'e'

# Exploring policy around fixed RNN prediction

Using the NN output as a probability distribution and sampling from it.

One could also employ nucleus sampling: at each step, sample only from the smallest set of top predictions whose cumulative probability exceeds some threshold.

In [12]:
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 0, 1, 1, 1, 0, 0, 0]])>

In [52]:
def next_char(text, model, temperature=1):
    """Sample the character that follows `text` from the model's prediction.

    Args:
        text: The prompt string.
        model: A model whose `predict` accepts a list of strings and returns
            per-position probabilities over the tokens.
        temperature: Divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled next character, looked up in `text_vec_layer`'s vocabulary.
    """
    # Slice with -1: (not -1) to keep a leading axis for tf.random.categorical.
    last_step_proba = model.predict([text])[0, -1:]
    scaled_log_proba = tf.math.log(last_step_proba) / temperature
    sampled_ids = tf.random.categorical(scaled_log_proba, num_samples=1)
    char_id = sampled_ids[0, 0]
    # The vocabulary index is offset by 2 relative to the sampled ID.
    vocabulary = text_vec_layer.get_vocabulary()
    return vocabulary[char_id + 2]

In [53]:
def extend_text(text, n_chars, model, temperature=1):
    """Append `n_chars` sampled characters to `text`, one character at a time.

    Every new character is sampled by `next_char` from the text generated so
    far, so earlier samples influence later ones.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, model, temperature)
    return generated

* This network, trained on windows of 100 characters, can only learn patterns in sequences up to 100 characters long.
* Longer sequences require a larger network, a stateful network, or a more advanced architecture such as a transformer.

In [None]:
tf.random.set_seed(42)
extend_text("Thee shall not fall for whom the crown weights", 100, final_model, temperature=1)

# Stateful RNN: each training batch starts where the previous one left off, to capture longer-range correlations when generating longer sentences.

In [12]:
length = 100

In [13]:
def to_dataset_for_stateful_rnn(sequence, length):
    """Build consecutive, non-shuffled (input, target) windows with batch size 1.

    Args:
        sequence: 1-D stream of token IDs.
        length: Number of tokens in each input window.

    Returns:
        A tf.data.Dataset of (inputs, targets) pairs, where targets are the
        inputs shifted one token ahead.
    """
    window_size = length + 1

    # The ingested data is one long stream of single-token elements.
    token_ds = tf.data.Dataset.from_tensor_slices(sequence)

    # Nested dataset of windows; shift=length makes each window start where
    # the previous window's inputs ended.
    windows = token_ds.window(window_size, shift=length, drop_remainder=True)

    # Glue the elements of every window into a single tensor.
    windows = windows.flat_map(lambda window: window.batch(window_size))

    # Batch size is 1 on purpose for stateful training. Larger batches need the
    # data split into n parts, with each part feeding a fixed batch position.
    batches = windows.batch(1)

    def split_input_target(window):
        # Two sequences shifted by one token: training input and target.
        return window[:, :-1], window[:, 1:]

    return batches.map(split_input_target).prefetch(1)

## Batching ds from the solutions code:
https://github.com/ageron/handson-ml3/blob/main/16_nlp_with_rnns_and_attention.ipynb

In [None]:
# extra code – shows one way to prepare a batched dataset for a stateful RNN

def to_non_overlapping_windows(sequence, length):
    """Cut `sequence` into consecutive windows of `length + 1` tokens.

    Successive windows start `length` tokens apart, and incomplete trailing
    windows are dropped.
    """
    window_size = length + 1
    token_ds = tf.data.Dataset.from_tensor_slices(sequence)
    nested_windows = token_ds.window(window_size, shift=length,
                                     drop_remainder=True)
    # Turn each nested window dataset into one tensor.
    return nested_windows.flat_map(lambda window: window.batch(window_size))

def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):
    """Build a batched (input, target) dataset for a stateful RNN.

    The sequence is split into `batch_size` consecutive parts. Row i of every
    batch is taken from part i, so each batch row continues where the same row
    of the previous batch left off.

    Args:
        sequence: 1-D sequence of token IDs.
        length: Number of tokens in each input window.
        batch_size: Number of parts, and therefore rows per batch.

    Returns:
        A tf.data.Dataset of (inputs, targets) pairs, where targets are the
        inputs shifted one token ahead.
    """
    # Requires numpy (`np`) to be imported in the notebook's import cell.
    parts = np.array_split(sequence, batch_size)
    datasets = tuple(to_non_overlapping_windows(part, length) for part in parts)
    # Zip yields one window per part; stack them into a single batch tensor.
    ds = tf.data.Dataset.zip(datasets).map(lambda *windows: tf.stack(windows))
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

#list(to_batched_dataset_for_stateful_rnn(tf.range(20), length=3, batch_size=2))


In [14]:
# Same ordered split as before: first 1,000,000 tokens for training, the next
# 60,000 for validation, and the remainder for testing.
train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length=length)
valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length=length)
# The test slice must start where validation ends; slicing [:1_060_000] would
# evaluate on the training and validation data.
test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length=length)

In [28]:
for item in train_set.take(1):
    print(item)

(<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[19,  5,  8,  7,  2,  0, 18,  5,  2,  5, 35,  1,  9, 23, 10, 21,
         1, 19,  3,  8,  1,  0, 16,  1,  0, 22,  8,  3, 18,  1,  1, 12,
         0,  4,  9, 15,  0, 19, 13,  8,  2,  6,  1,  8, 17,  0,  6,  1,
         4,  8,  0, 14,  1,  0,  7, 22,  1,  4, 24, 26, 10, 10,  4, 11,
        11, 23, 10,  7, 22,  1,  4, 24, 17,  0,  7, 22,  1,  4, 24, 26,
        10, 10, 19,  5,  8,  7,  2,  0, 18,  5,  2,  5, 35,  1,  9, 23,
        10, 15,  3, 13]])>, <tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[ 5,  8,  7,  2,  0, 18,  5,  2,  5, 35,  1,  9, 23, 10, 21,  1,
        19,  3,  8,  1,  0, 16,  1,  0, 22,  8,  3, 18,  1,  1, 12,  0,
         4,  9, 15,  0, 19, 13,  8,  2,  6,  1,  8, 17,  0,  6,  1,  4,
         8,  0, 14,  1,  0,  7, 22,  1,  4, 24, 26, 10, 10,  4, 11, 11,
        23, 10,  7, 22,  1,  4, 24, 17,  0,  7, 22,  1,  4, 24, 26, 10,
        10, 19,  5,  8,  7,  2,  0, 18,  5,  2,  5, 35,  1,  9, 23, 10,
        15,

# Stateful RNN

In [42]:
# Stateful variant of the char-RNN. batch_input_shape pins the batch size to 1
# (matching the batch(1) training dataset) and leaves the sequence length open.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,
                                    batch_input_shape=[1, None]))
model.add(tf.keras.layers.GRU(128, return_sequences=True, stateful=True))
model.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

In [43]:
class ResetStatesCallback(tf.keras.callbacks.Callback):
    """Keras callback that resets the model's states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None to match the base Callback signature, so the
        # hook also works when it is invoked without a logs argument.
        self.model.reset_states()
        
# Save the model whenever validation accuracy improves.
# NOTE(review): this is the same path as the earlier checkpoint callback, so
# this run overwrites that saved model. Confirm this is intended.
model_chkpt = tf.keras.callbacks.ModelCheckpoint(
    filepath='shakespear_model',
    monitor='val_accuracy',
    save_best_only=True,
)

In [44]:
# Targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [45]:
# Single training epoch; states are reset at each epoch start and the best
# model by validation accuracy is checkpointed.
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=1,
    callbacks=[ResetStatesCallback(), model_chkpt],
)

   9991/Unknown - 47s 5ms/step - loss: 1.8610 - accuracy: 0.4519

2023-09-27 13:00:49.892412: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 12468002509017132553
2023-09-27 13:00:49.892444: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 13980077015368050233
2023-09-27 13:00:49.892461: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 447728031393915026
2023-09-27 13:00:51.457197: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 13980077015368050233
2023-09-27 13:00:51.457224: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 12468002509017132553
2023-09-27 13:00:51.457230: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous recv item cancelled. Key hash: 447728031393915026


INFO:tensorflow:Assets written to: shakespear_model/assets


INFO:tensorflow:Assets written to: shakespear_model/assets




In [46]:
model.reset_states()

In [47]:
for sample in valid_set.take(2):
    print(model.predict(sample[0]))

[[[3.89042281e-04 1.95684806e-02 3.70772332e-02 ... 2.49786041e-04
   3.86669650e-04 4.52092820e-04]
  [4.05543186e-02 2.30744928e-01 1.28481418e-01 ... 2.79863912e-06
   6.00172007e-06 3.82376857e-06]
  [4.47216371e-06 5.04652737e-03 3.95580716e-02 ... 1.09606644e-05
   8.55144390e-06 3.68872770e-06]
  ...
  [1.60388066e-04 3.45875919e-01 7.19036092e-04 ... 4.40044126e-08
   2.42503777e-08 4.61003928e-08]
  [7.14462459e-01 5.58487810e-02 2.06820760e-03 ... 1.26294051e-08
   2.93860825e-09 1.01698355e-07]
  [4.96623979e-05 2.45402101e-02 3.96034606e-02 ... 6.80763321e-07
   3.02264681e-07 3.66973268e-07]]]
[[[1.4775415e-07 4.4651251e-02 1.6833520e-03 ... 2.9837672e-06
   4.7187473e-06 2.7973711e-06]
  [2.6613630e-05 8.9043610e-02 5.1904931e-03 ... 2.3154482e-06
   1.6898730e-06 1.1341128e-06]
  [4.0087188e-03 1.2978478e-02 3.8150107e-03 ... 6.7156662e-07
   4.2688313e-07 2.1731264e-06]
  ...
  [1.0243160e-05 3.8950261e-01 3.2701364e-04 ... 2.7503686e-09
   4.9824593e-09 7.8358592e-10]


## Lifting the need for same sized batches as during training by creating stateless model and transferring trained weights

In [48]:
# Stateless copy of the architecture: no fixed batch size and no stateful GRU,
# so it accepts batches of any size.
model_prod = tf.keras.Sequential()
model_prod.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
model_prod.add(tf.keras.layers.GRU(128, return_sequences=True))
model_prod.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

In [49]:
model_prod.set_weights(model.get_weights())

In [50]:
# The trained model consumes already-encoded token IDs.
# Wrap it so inference can take raw text: tokenization and the ID shift by 2
# happen inside the model.
final_model = tf.keras.Sequential()
final_model.add(text_vec_layer)
final_model.add(tf.keras.layers.Lambda(lambda x: x-2))
final_model.add(model_prod)

In [54]:
extend_text

<function __main__.extend_text(text, n_chars, model, temperature=1)>

In [60]:
extend_text("king", 10, final_model)



'king to so.\nwh'

# Char-RNN learns higher level abstract notions like the sentiment of the text - OpenAI discovered a sentiment neuron in a char-rnn even without explicit sentiment labels. This was an early hint at unsupervised pretraining potential.