In [50]:
#@formatter:off
%load_ext autoreload
%autoreload 2
#@formatter:on

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
import random
import string

import tensorflow as tf
from src.Generation import GenerationCallback, Generator
from src.Model import TokenAndPositionEmbedding
from src.Model import Transformer
from src.Model import WarmupScheduler
from src import Utils
from src.Configs import ModelConfig
from src.Configs import TrainingConfig
import pickle


In [52]:
# def create_model(model_config: ModelConfig):
#     # TODO: Remove this if we're using tokenizer
#     embedding = TokenAndPositionEmbedding(model_config.M_MAX_LEN, model_config.M_VOCAB_SZ, model_config.M_DIM_EMB)
#     transformer = Transformer(model_config.M_DIM_EMB, model_config.M_ATT_HEADS, model_config.M_DIM_FFN)
#
#     l_input = tf.keras.layers.Input(shape=(model_config.M_MAX_LEN,), dtype=tf.int32)
#     l_emb = embedding(l_input)
#     l_trans = transformer(l_emb)
#     l_output = tf.keras.layers.Dense(model_config.M_VOCAB_SZ)(l_trans)
#
#     m = tf.keras.Model(inputs=l_input, outputs=[l_output, l_trans])
#     loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#     learning_rate = WarmupScheduler(model_config.M_DIM_EMB, model_config.M_WARMUP_STEPS)
#     optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
#
#     m.compile("adam", loss=[loss_fn, None])
#     return m

In [53]:
class SimpleGPT(tf.keras.layers.Layer):
    def __init__(self, model_config):
        super(SimpleGPT, self).__init__()
        self.layer_embedding = TokenAndPositionEmbedding(model_config.M_MAX_LEN, model_config.M_VOCAB_SZ,
                                                         model_config.M_DIM_EMB)
        self.transformer_block = Transformer(model_config.M_DIM_EMB, model_config.M_ATT_HEADS, model_config.M_DIM_FFN)

        self.layer_output = tf.keras.layers.Dense(model_config.M_VOCAB_SZ)

    def call(self, inputs):
        emb = self.layer_embedding(inputs)
        attention_mask = self.transformer_block(emb)
        logits = self.layer_output(attention_mask)
        return logits, attention_mask

def create_model(model_config: ModelConfig):
    inputs = tf.keras.layers.Input(shape=(model_config.M_MAX_LEN,), dtype=tf.int32)
    simple_gpt = SimpleGPT(model_config)
    logits, attention_mask = simple_gpt(inputs)
    m = tf.keras.Model(inputs=inputs, outputs=[logits, attention_mask])

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    learning_rate = WarmupScheduler(model_config.M_DIM_EMB, model_config.M_WARMUP_STEPS)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    m.compile("adam", loss=[loss_fn, None])
    return m

In [54]:
def create_dataset(file_pth, batch_sz, buf_sz=1000, shuffle=True):
    # Shuffle the data and create batches
    if shuffle:
        random.shuffle(file_pth)
    ds = tf.data.TextLineDataset(file_pth)
    ds = ds.shuffle(buffer_size=buf_sz)
    ds = ds.batch(batch_sz)
    return ds


def create_tokenizer(dataset, max_vocab_size, max_seq_len):
    def preprocess_txt(input_string):
        # Preprocessing for word-level model
        s1 = tf.strings.lower(input_string)
        return tf.strings.regex_replace(s1, f"([{string.punctuation}])", r" \1")

    # Vectorization of the data
    vectorize = tf.keras.layers.TextVectorization(
        standardize=preprocess_txt,
        max_tokens=max_vocab_size - 1,
        output_mode="int",
        output_sequence_length=max_seq_len + 1,
    )
    vectorize.adapt(dataset)
    vocab = vectorize.get_vocabulary()
    return vectorize, vocab


config_model, config_training = ModelConfig(), TrainingConfig()
# Read in the data and create the dataset
dataset = create_dataset(config_training.T_DATASET, config_training.T_BATCH_SIZE)
# Create the tokenizer
tokenizer, vocab = create_tokenizer(dataset, config_model.M_VOCAB_SZ, config_model.M_MAX_LEN)

In [55]:
for d in dataset.take(1):
    print("=" * 80)
    print(d[0])
    print("=" * 80)
    print(d[1])

tf.Tensor(b'Either you with me or bitch you on the shit list', shape=(), dtype=string)
tf.Tensor(b"Rising quick, no Fleischman's", shape=(), dtype=string)


In [56]:
def create_sequences(txt):
    txt = tf.expand_dims(txt, -1)
    txt_tok = tokenizer(txt)
    return txt_tok[:, :-1], txt_tok[:, 1:]


dataset = dataset.map(create_sequences).prefetch(tf.data.AUTOTUNE)

In [57]:
for d in dataset.take(1):
    print("=" * 80)
    print(d[0])
    print("=" * 80)
    print(d[1])

tf.Tensor(
[[   5 3318  114 ...    0    0    0]
 [  52  158 1139 ...    0    0    0]
 [ 143   10   21 ...    0    0    0]
 ...
 [1548   91    4 ...    0    0    0]
 [  32    3   17 ...    0    0    0]
 [  34   12  134 ...    0    0    0]], shape=(256, 100), dtype=int64)
tf.Tensor(
[[3318  114    2 ...    0    0    0]
 [ 158 1139  608 ...    0    0    0]
 [  10   21    4 ...    0    0    0]
 ...
 [  91    4    1 ...    0    0    0]
 [   3   17 4202 ...    0    0    0]
 [  12  134  190 ...    0    0    0]], shape=(256, 100), dtype=int64)


In [58]:
model = create_model(config_model)
callbacks, tb_file_writer = Utils.create_callbacks("logs", model)
gen_callback = GenerationCallback("i will always be", 100, config_model.M_MAX_LEN, vocab, tb_file_writer=tb_file_writer)
callbacks.append(gen_callback)

# Model Dir: logs/model_2
 - History Path: logs/model_2/history.csv
 - Checkpoint Path: logs/model_2/checkpoints.h5
 - TB Path: logs/model_2


In [59]:
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 100)]             0         
                                                                 
 simple_gpt_3 (SimpleGPT)    ((None, 100, 20000),      5416992   
                              (None, 100, 128))                  
                                                                 
Total params: 5,416,992
Trainable params: 5,416,992
Non-trainable params: 0
_________________________________________________________________


In [None]:
# wandb.tensorboard.patch(root_logdir="logs")
# wandb.init(project='transformer')
model.fit(dataset, verbose=1, epochs=config_training.T_EPOCHS, callbacks=callbacks)

Layer SimpleGPT has arguments ['self', 'model_config']
in `__init__` and therefore must override `get_config()`.

Example:

class CustomLayer(keras.layers.Layer):
    def __init__(self, arg1, arg2):
        super().__init__()
        self.arg1 = arg1
        self.arg2 = arg2

    def get_config(self):
        config = super().get_config()
        config.update({
            "arg1": self.arg1,
            "arg2": self.arg2,
        })
        return config
Epoch 1/20
     24/Unknown - 4s 132ms/step - loss: 6.8168 - simple_gpt_3_loss: 6.8168

In [None]:

model.save("model_save")
with open("model_save/vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)
with open("model_save/config_model.pkl", "wb") as f:
    pickle.dump(config_model, f)
with open("model_save/config_training.pkl", "wb") as f:
    pickle.dump(config_training, f)

In [None]:
# model.save("model_warmup.h5")
generator = Generator(model, config_model.M_MAX_LEN, vocab)
generated_txt = generator.generate("i will always be", 50)
print(generated_txt)