In [27]:
import os
import random
import string

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.python.keras.layers import TextVectorization

%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [28]:
M_VOCAB_SZ = 20000
M_MAX_LEN = 80
M_ATT_HEADS = 2
M_DIM_EMB = 256
M_DIM_FFN = 256
T_BATCH_SIZE = 256
T_EPOCHS = 1

data_files = ["data_test_wikitext/wiki.train.tokens"]

In [29]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat([tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0)
    return tf.tile(mask, mult)


class Transformer(layers.Layer):
    def __init__(self, embedding_dim, num_att_heads, state_dims, dropout_rate=0.1):
        super(Transformer, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_att_heads = num_att_heads
        self.state_dims = state_dims
        self.dropout_rate = dropout_rate
        self.attention = layers.MultiHeadAttention(num_att_heads, embedding_dim)
        self.feed_forward = keras.Sequential([
            layers.Dense(state_dims, activation="relu"),
            layers.Dense(embedding_dim)
        ])
        self.norm1, self.norm2 = layers.LayerNormalization(epsilon=1e-6), layers.LayerNormalization(epsilon=1e-6)
        self.drop1, self.drop2 = layers.Dropout(dropout_rate), layers.Dropout(dropout_rate)

    def call(self, inputs):
        inp_shape = tf.shape(inputs)
        batch_sz, seq_len = inp_shape[0], inp_shape[1]
        causal_mask = causal_attention_mask(batch_sz, seq_len, seq_len, tf.bool)
        attention_out = self.attention(inputs, inputs, attention_mask=causal_mask)
        attention_out = self.drop1(attention_out)
        out1 = self.norm1(inputs + attention_out)
        feed_forward_out = self.feed_forward(out1)
        feed_forward_out = self.drop2(feed_forward_out)
        return self.norm2(out1 + feed_forward_out)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embedding_dim": self.embedding_dim,
            "num_att_heads": self.num_att_heads,
            "state_dims": self.state_dims,
            "dropout_rate": self.dropout_rate,
        })
        return config

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        max_len = tf.shape(x)[-1]
        pos = tf.range(start=0, limit=max_len, delta=1)
        pos = self.pos_emb(pos)
        x = self.token_emb(x)
        return x + pos

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_len": self.max_len,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [30]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, embedding_dim, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.emb_dim = tf.cast(embedding_dim, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.emb_dim) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        config = super().get_config()
        config.update({
            "emb_dim": self.emb_dim,
            "warmup_steps": self.warmup_steps,
        })
        return config

In [31]:
def create_model():
    # TODO: Remove this if we're using tokenizer
    embedding = TokenAndPositionEmbedding(M_MAX_LEN, vocab_size, M_DIM_EMB)
    transformer = Transformer(M_DIM_EMB, M_ATT_HEADS, M_DIM_FFN)

    l_input = layers.Input(shape=(M_MAX_LEN,), dtype=tf.int32)
    l_emb = embedding(l_input)
    l_trans = transformer(l_emb)
    l_output = layers.Dense(vocab_size)(l_trans)

    m = keras.Model(inputs=l_input, outputs=[l_output, l_trans])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    learning_rate = CustomSchedule(M_DIM_EMB)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    m.compile(optimizer, loss=[loss_fn, None])
    return m

In [32]:
def create_dataset(file_pth, batch_sz, buf_sz=1000, shuffle=True):
    # Shuffle the data and create batches
    if shuffle:
        random.shuffle(file_pth)
    ds = tf.data.TextLineDataset(file_pth)
    ds = ds.shuffle(buffer_size=buf_sz)
    ds = ds.batch(batch_sz)
    return ds


def create_tokenizer(dataset, max_vocab_size):
    def preprocess_txt(input_string):
        # Preprocessing for word-level model
        s1 = tf.strings.lower(input_string)
        return tf.strings.regex_replace(s1, f"([{string.punctuation}])", r" \1")

    # Vectorization of the data
    vectorize = TextVectorization(
        standardize=preprocess_txt,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=M_MAX_LEN + 1,
    )
    vectorize.adapt(dataset)
    vocab = vectorize.get_vocabulary()
    return vectorize, vocab


# Read in the data and create the dataset
dataset = create_dataset(data_files, T_BATCH_SIZE)
# Create the tokenizer
tokenizer, vocab = create_tokenizer(dataset, M_VOCAB_SZ)


def create_sequences(txt):
    txt = tf.expand_dims(txt, -1)
    txt_tok = tokenizer(txt)
    return txt_tok[:, :-1], txt_tok[:, 1:]


dataset = dataset.map(create_sequences).prefetch(tf.data.AUTOTUNE)

In [33]:
for d in dataset.take(1):
    print(d)

(<tf.Tensor: shape=(256, 80), dtype=int64, numpy=
array([[   13,   225,  3108, ...,     0,     0,     0],
       [11490,  3970,    30, ...,     2,   829,   813],
       [    0,     0,     0, ...,     0,     0,     0],
       ...,
       [    2,  1318,  8359, ...,     0,     0,     0],
       [   63,  7488,   140, ...,  7488,     7,     6],
       [    5,    38,   123, ...,    32,  1204,     7]])>, <tf.Tensor: shape=(256, 80), dtype=int64, numpy=
array([[ 225, 3108,   10, ...,    0,    0,    0],
       [3970,   30,    1, ...,  829,  813, 9256],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [1318, 8359, 8688, ...,    0,    0,    0],
       [7488,  140,  142, ...,    7,    6, 7707],
       [  38,  123,    3, ..., 1204,    7,    6]])>)


In [34]:
class TextGenerator(keras.callbacks.Callback):
    def __init__(self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = list(self.start_tokens)
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = M_MAX_LEN - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:M_MAX_LEN]
                sample_index = M_MAX_LEN - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join([self.detokenize(_) for _ in self.start_tokens + tokens_generated])
        print(f"Generated:\n{txt}\n")

In [35]:
def create_generation_callback(start_prompt, vocabulary, gen_len=100):
    # Tokenize starting prompt
    word_to_index = {word: index for index, word in enumerate(vocabulary)}
    prompt_tokens = [word_to_index.get(_, 1) for _ in start_prompt.lower().split()]
    return TextGenerator(gen_len, prompt_tokens, vocabulary)

def create_callbacks(base_dir, model, defaults: list = None):
    import tensorflow as tf
    print(f'base_dir: {base_dir}')
    dir_models = os.path.join(base_dir, model.name)
    path_csv = os.path.join(dir_models, 'history.csv')
    print("History CSV:", path_csv)
    path_ckp = os.path.join(dir_models, 'checkpoints.h5')
    print("Checkpoint:", path_ckp)
    path_tb = os.path.join(dir_models, "logs")
    tb_file_writer = tf.summary.create_file_writer(path_tb)
    callbacks = [] if defaults is None else defaults
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=path_tb, histogram_freq=1))
    callbacks.append(tf.keras.callbacks.CSVLogger(path_csv, separator=",", append=True))
    # callbacks.append(tf.keras.callbacks.ModelCheckpoint(path_ckp,
    #                                                     monitor='loss',
    #                                                     save_best_only=True,
    #                                                     mode='auto',
    #                                                     verbose=0))
    os.makedirs(dir_models, exist_ok=True)
    return callbacks, tb_file_writer

In [36]:
# Clear any logs from previous runs
!rm -rf ./logs/

In [None]:
os.makedirs("./logs", exist_ok=True)
%tensorboard --logdir logs/

In [38]:
model = create_model()
callbacks, tb_file_writer = create_callbacks("logs", model)
gen_callback = create_generation_callback("a day in the life", vocab)
callbacks.append(gen_callback)

base_dir: logs
History CSV: logs/model_2/history.csv
Checkpoint: logs/model_2/checkpoints.h5


In [39]:
# wandb.tensorboard.patch(root_logdir="logs")
# wandb.init(project='transformer')
model.fit(dataset, verbose=2, epochs=T_EPOCHS, callbacks=callbacks)

Epoch 1/30
Generated:
a day in the life since been longer he day  this  and island june  to was but and records as area between  there red be  their this   there from included but he described but be most with he  he also . area    in the day   but was  while  he there to   be  .    he he  . on day with  be day area central be day  be was with   be be was   area    area with to

144/144 - 25s - loss: 9.2713 - dense_8_loss: 9.2713 - 25s/epoch - 171ms/step
Epoch 2/30
Generated:
a day in the life  to [UNK]  and " [UNK] the [UNK]  , .  in .    . ,    <unk <unk  . to in    . and   in to  [UNK]   [UNK]  . to  in ,   and <unk   the , to . . . and  [UNK] <unk was and    the  <unk in @ in     .  @ <unk  and the ,   to and in   , <unk . <unk . 

144/144 - 25s - loss: 6.6075 - dense_8_loss: 6.6075 - 25s/epoch - 174ms/step
Epoch 3/30
Generated:
a day in the life         =                          =                                                     .        .   the  

144/144 - 25s - loss: 3.7419

<keras.callbacks.History at 0x7f5e03397cd0>

In [42]:
model.save("model_warmup.h5")

NotImplementedError: Learning rate schedule must override get_config