In [3]:
import numpy as np
import random as  rnd

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input

# Data preprocessing

## Load data

In [4]:
# Read the corpus, strip surrounding whitespace from every line and keep only
# the non-empty ones.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) on another machine.
with open('./data/shakespeare_data.txt', encoding='utf-8') as file:
  stripped_lines = (raw_line.strip() for raw_line in file)
  lines = [line for line in stripped_lines if len(line) > 0]

print(lines[:5])

["A LOVER'S COMPLAINT", 'FROM off a hill whose concave womb reworded', 'A plaintful story from a sistering vale,', 'My spirits to attend this double voice accorded,', 'And down I laid to list the sad-tuned tale;']


## Create vocabulary

In [5]:
# The whole corpus as one string; the newline separator becomes part of the vocabulary
text = '\n'.join(lines)

# Special tokens come first, followed by every distinct character in sorted order:
#   index 0 -> "[UNK]", the token for out-of-vocabulary characters
#   index 1 -> "",      the empty char reserved for padding
vocab = ["[UNK]", ""] + sorted(set(text))

print(f'{len(vocab)} words')
print(vocab)

82 words
['[UNK]', '', '\t', '\n', ' ', '!', '$', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']


## Encode sentence

In [6]:
# Very simple implementation
def encode_(string):
  """Map each character of `string` to its index in the global `vocab`.

  Raises:
    ValueError: if a character is not present in `vocab`.
  """
  return [vocab.index(char) for char in string]


def decode_(nums):
  """Map each index in `nums` back to its `vocab` entry and concatenate them."""
  # Join with the empty string: any other separator would be inserted between
  # every pair of decoded characters, so decode_(encode_(s)) would not return s.
  return ''.join([vocab[index] for index in nums])

Two issues with this implementation:
- It does not take UTF-8 encoded characters into account
- It throws an error when encountering an unknown character instead of returning the index of the UNK token

To properly handle these, use:
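- `tf.strings.unicode_split`: this decodes the UTF-8 input and splits it into individual Unicode characters
- `tf.keras.layers.StringLookup`: this maps each character to its vocabulary index and takes care of the UNK token for characters that are not in the vocabulary.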

In [7]:
def line_to_tensor(line, vocab):
  """Convert a string into a tensor of vocabulary IDs, one per character.

  Args:
    line: the text to convert.
    vocab: vocabulary list handed to `StringLookup`.

  Returns:
    The result of looking up every character of `line` in `vocab`.
  """
  # A fresh lookup layer is built from `vocab` on every call.
  char_to_id = tf.keras.layers.StringLookup(vocabulary=vocab)
  chars = tf.strings.unicode_split(line, input_encoding='UTF-8')
  return char_to_id(chars)

def text_from_ids(ids, vocab):
  """Convert vocabulary IDs back into a single joined string tensor.

  Args:
    ids: IDs to map back to vocabulary entries.
    vocab: vocabulary list handed to the inverted `StringLookup`.

  Returns:
    The looked-up tokens joined together with `tf.strings.reduce_join`.
  """
  # invert=True turns the lookup around: IDs in, vocabulary entries out.
  id_to_char = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)
  return tf.strings.reduce_join(id_to_char(ids))

In [8]:
# Round-trip a sample string through the two helpers: text -> IDs -> text
ids = line_to_tensor('hello world', vocab)
print(f'IDs: {ids}')

# Stored under its own name so that the notebook-level `text` (the corpus
# string the vocabulary was built from) is not replaced by a tensor.
decoded_text = text_from_ids(ids, vocab)
print(f'Text: {decoded_text}')

IDs: [62 59 66 66 69  4 77 69 72 66 58]
Text: b'hello world'


## Create dataset

In [9]:
# Number of lines held out from the end of the corpus for evaluation.
# Defined once so the two slices below cannot drift apart.
NUM_EVAL_LINES = 1000

train_lines = lines[:-NUM_EVAL_LINES]
eval_lines = lines[-NUM_EVAL_LINES:]

### Dataset creation procedure
1. Convert text to IDs
2. Group IDs into chunks of `seq_length + 1` (one extra ID so that the target can be the input shifted by one position)
3. Map each chunk of IDs to an (input, target) pair
4. Batch again into batches of BATCH_SIZE

In [10]:
# Step 1 on a toy example: join two short lines with a newline, convert the
# result to IDs, then expose the IDs as a dataset with one ID per element
demo_lines = ['hello world', 'generative AI']
all_ids = line_to_tensor('\n'.join(demo_lines), vocab)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
print(ids_dataset)

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>


In [11]:
# Step 2: cut the stream of IDs into fixed-size chunks.
# Each chunk holds seq_length + 1 IDs; trailing IDs that do not fill a whole
# chunk are dropped.
seq_length = 5
chunk_size = seq_length + 1
data_generator = ids_dataset.batch(chunk_size, drop_remainder=True)

In [12]:
# Show up to five chunks (fewer are printed if the dataset holds fewer)
preview = data_generator.take(5)
for seq in preview:
  print(seq)

tf.Tensor([62 59 66 66 69  4], shape=(6,), dtype=int64)
tf.Tensor([77 69 72 66 58  3], shape=(6,), dtype=int64)
tf.Tensor([61 59 68 59 72 55], shape=(6,), dtype=int64)
tf.Tensor([74 63 76 59  4 27], shape=(6,), dtype=int64)


In [13]:
def split_input_target(sequence):
  """Split a sequence into an (input, target) pair offset by one position.

  The input is the sequence without its last element and the target is the
  sequence without its first element.
  """
  return sequence[:-1], sequence[1:]

split_input_target(list('tensorflow'))

(['t', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [14]:
# Step 3: turn every chunk into an (input, target) pair
data_xy = data_generator.map(split_input_target)

# Step 4: group the pairs two at a time and show the first two batches
batches = data_xy.batch(2)
for batch in batches.take(2):
  print(batch)

(<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[62, 59, 66, 66, 69],
       [77, 69, 72, 66, 58]], dtype=int64)>, <tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[59, 66, 66, 69,  4],
       [69, 72, 66, 58,  3]], dtype=int64)>)
(<tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[61, 59, 68, 59, 72],
       [74, 63, 76, 59,  4]], dtype=int64)>, <tf.Tensor: shape=(2, 5), dtype=int64, numpy=
array([[59, 68, 59, 72, 55],
       [63, 76, 59,  4, 27]], dtype=int64)>)


In [15]:
def create_batch_dataset(lines, vocab, seq_length=20, batch_size=64,
                         shuffle_buffer=10000, seed=None):
  """Build a shuffled, batched dataset of (input, target) ID sequences.

  The lines are joined with newlines into one string, converted to IDs with
  `line_to_tensor`, cut into chunks of `seq_length + 1` IDs, and every chunk
  is split by `split_input_target` into an input (all but the last ID) and a
  target (all but the first ID).

  Args:
    lines: list of text lines.
    vocab: vocabulary list passed on to `line_to_tensor`.
    seq_length: number of IDs in each input and each target sequence.
    batch_size: number of (input, target) pairs per batch.
    shuffle_buffer: buffer size passed to `Dataset.shuffle`.
    seed: optional random seed passed to `Dataset.shuffle`; the default None
      leaves the shuffle unseeded.

  Returns:
    A `tf.data.Dataset` yielding (input, target) batches. Incomplete chunks
    and incomplete batches are dropped, so too little text yields an empty
    dataset.
  """
  single_line_data = '\n'.join(lines)

  all_ids = line_to_tensor(single_line_data, vocab)
  ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

  # One extra ID per chunk so the target can be the input shifted by one
  chunks = ids_dataset.batch(seq_length + 1, drop_remainder=True)

  # Each chunk becomes an (x, y) tuple
  dataset_xy = chunks.map(split_input_target)

  # Shuffle the pairs, then group them into batches of batch_size.
  # No prefetch is applied here.
  return (
      dataset_xy
        .shuffle(shuffle_buffer, seed=seed)
        .batch(batch_size, drop_remainder=True)
  )

In [16]:
# Training dataset: batches of 64 (input, target) pairs, 20 IDs per sequence
dataset = create_batch_dataset(
    train_lines,
    vocab,
    seq_length=20,
    batch_size=64,
)

In [17]:
# Peek at one batch: take(1) yields at most one (input, target) pair of
# batched tensors, so the loop body runs at most once.
# Each batch holds batch_size pairs.
# The loop variables avoid the name `input` so that the Python builtin of the
# same name is not shadowed for the rest of the notebook.
for input_batch, target_batch in dataset.take(1):
  print(f'Batch size: {len(input_batch)}')

  print(text_from_ids(input_batch[0], vocab))
  print(text_from_ids(target_batch[0], vocab))

Batch size: 64
tf.Tensor(b'e, we burn daylight,', shape=(), dtype=string)
tf.Tensor(b', we burn daylight, ', shape=(), dtype=string)
