In [1]:
# Maximum number of token positions. Used further down to size the
# positional-encoding table and as the padded length of the generation prompt.
MAX_SEQ_LEN = 20

In [2]:
from datasets import load_dataset

# Ask the Hub for the "train" split directly instead of loading the whole
# DatasetDict and indexing into it afterwards.
ds = load_dataset("jeremygf/domains-app-alpha", split="train")

In [3]:
# Build next-token-prediction pairs from each row's `ids`:
# the input drops the last id, the target drops the first id.
ds = ds.map(lambda x: {
    'input_ids': x['ids'][:-1],
    'target_ids': x['ids'][1:]
})

# Hold out 10% for evaluation. The seed is fixed so that the split (and hence
# the validation metrics) is the same after a kernel restart.
ds_dict = ds.train_test_split(test_size=0.1, seed=42)

In [4]:
import tensorflow as tf

class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalisation over the last axis with a learned scale and shift.

    Args:
        epsilon: Small constant added to the variance before taking the
            reciprocal square root.
        **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, epsilon=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon

    def build(self, input_shape):
        # One scale (gamma) and one shift (beta) per feature on the last axis.
        feature_shape = input_shape[-1:]
        self.gamma = self.add_weight(
            name='gamma', shape=feature_shape, initializer='ones', trainable=True)
        self.beta = self.add_weight(
            name='beta', shape=feature_shape, initializer='zeros', trainable=True)
        super().build(input_shape)

    def call(self, x):
        mean = tf.reduce_mean(x, axis=-1, keepdims=True)
        centered = x - mean
        variance = tf.reduce_mean(tf.square(centered), axis=-1, keepdims=True)
        normalized = centered * tf.math.rsqrt(variance + self.epsilon)
        return self.gamma * normalized + self.beta

2024-02-22 11:08:16.417227: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-22 11:08:17.008695: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 11:08:17.008747: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 11:08:17.038513: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 11:08:17.112336: I tensorflow/core/platform/cpu_feature_guar

In [26]:
import numpy as np

def positional_encoding(length, depth):
    """Build a fixed sinusoidal positional-encoding table.

    Sines occupy the leading columns and cosines the trailing columns
    (the two halves are concatenated, not interleaved).

    Args:
        length: Number of positions (rows) to generate.
        depth: Number of columns in the returned table.

    Returns:
        A float32 tensor of shape (length, depth).
    """
    half_depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth

    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    # np.arange(depth / 2) rounds up for odd `depth`, which makes the
    # concatenation one column too wide. Trim so the width is always `depth`;
    # this is a no-op for even depths.
    pos_encoding = pos_encoding[:, :int(depth)]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [6]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(embedding_dim) plus a positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            mask_zero=True,
        )
        # Computed once for MAX_SEQ_LEN positions; sliced to the actual
        # sequence length on every call.
        self.pos_encoding = positional_encoding(length=MAX_SEQ_LEN, depth=embedding_dim)

    def call(self, x):
        seq_len = tf.shape(x)[1]
        scale = tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        embedded = self.embedding(x) * scale
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

In [7]:
class CausalSelfAttention(tf.keras.layers.Layer):
    """Causally-masked multi-head self-attention with residual add and layer norm.

    Args:
        num_heads: Number of attention heads.
        embedding_dim: Passed to MultiHeadAttention as `key_dim`.
    """

    def __init__(self, num_heads, embedding_dim):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embedding_dim)
        self.layer_norm = LayerNormalization()

    def call(self, x):
        # Query and value are both `x`; the causal mask is requested from Keras.
        attended = self.mha(query=x, value=x, use_causal_mask=True)
        return self.layer_norm(x + attended)

In [8]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block with dropout, residual connection and layer norm.

    Args:
        embedding_dim: Output width of the second dense layer. It must match
            the last dimension of the input so the residual add is valid.
        feedforward_dim: Width of the hidden (GELU) dense layer.
        dropout_rate: Dropout rate applied to the feed-forward output.
    """

    def __init__(self, embedding_dim, feedforward_dim, dropout_rate=0.1):
        super().__init__()

        self.d1 = tf.keras.layers.Dense(feedforward_dim, activation='gelu')
        self.d2 = tf.keras.layers.Dense(embedding_dim)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.layer_norm = LayerNormalization()

    def call(self, x):
        # Keep the block input in `x` so the skip connection adds the ORIGINAL
        # input; overwriting it first would compute ffn + dropout(ffn).
        ffn_output = self.d2(self.d1(x))
        ffn_output = self.dropout(ffn_output)
        x = x + ffn_output
        return self.layer_norm(x)

In [9]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention followed by a feed-forward block.

    Args:
        embedding_dim: Forwarded to both sub-layers.
        num_heads: Number of attention heads.
        feedforward_dim: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout rate of the feed-forward sub-layer.
    """

    def __init__(self, embedding_dim, num_heads, feedforward_dim, dropout_rate=0.1):
        super().__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            embedding_dim=embedding_dim,
        )
        self.ffn = FeedForward(
            embedding_dim=embedding_dim,
            feedforward_dim=feedforward_dim,
            dropout_rate=dropout_rate,
        )

    def call(self, x):
        attended = self.causal_self_attention(x=x)
        return self.ffn(attended)

In [22]:
class TransformerDecoder(tf.keras.Model):
    """Decoder-only transformer that returns per-position vocabulary logits.

    Args:
        vocab_size: Size of the token vocabulary (embedding rows and output width).
        num_layers: Number of stacked DecoderLayer blocks.
        embedding_dim: Width of the token embeddings.
        num_heads: Attention heads per decoder layer.
        intermediate_dim: Hidden width of each feed-forward sub-layer.
    """

    def __init__(self, vocab_size: int, num_layers: int, embedding_dim: int, 
        num_heads: int, intermediate_dim: int) -> None:

        super().__init__()

        self.embeddings = PositionalEmbedding(vocab_size, embedding_dim)

        self.decoders = [
            DecoderLayer(embedding_dim, num_heads, intermediate_dim)
            for _ in range(num_layers)
        ]

        # Linear projection: the outputs are consumed as raw logits
        # (the loss uses from_logits=True), so no activation may be applied.
        # A ReLU here would clamp every negative logit to zero.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs):
        x = self.embeddings(inputs)

        for layer in self.decoders:
            x = layer(x)
        return self.dense(x)

In [23]:
def masked_loss(label, pred):
    """Mean sparse categorical cross-entropy over positions whose label is non-zero.

    Args:
        label: Integer target ids; positions equal to 0 are excluded.
        pred: Predictions, treated as logits (`from_logits=True`).

    Returns:
        Sum of the unmasked per-position losses divided by the number of
        unmasked positions.
    """
    per_position = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(label, pred)

    keep = tf.cast(label != 0, dtype=per_position.dtype)

    return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)

def masked_accuracy(label, pred):
    """Fraction of non-zero-label positions where the argmax prediction is correct.

    Args:
        label: Integer target ids; positions equal to 0 are excluded.
        pred: Per-position scores; the argmax is taken over axis 2.

    Returns:
        Number of correct unmasked positions divided by the number of
        unmasked positions, as float32.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    label = tf.cast(label, predicted_ids.dtype)

    valid = label != 0
    correct = tf.logical_and(label == predicted_ids, valid)

    n_correct = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
    n_valid = tf.reduce_sum(tf.cast(valid, dtype=tf.float32))
    return n_correct / n_valid

In [15]:
from transformers import PreTrainedTokenizerFast

# Load the fast tokenizer from `tokenizer.json`; the path is relative, so the
# file must be in the notebook's working directory.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='tokenizer.json')

In [18]:
BATCH = 256

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

# Both splits are converted with exactly the same settings, so the keyword
# arguments are written once and reused.
to_tf_dataset_kwargs = dict(
    columns="input_ids",
    label_cols="target_ids",
    collate_fn=data_collator,
    batch_size=BATCH,
    shuffle=True,
)

tf_train = ds_dict['train'].to_tf_dataset(**to_tf_dataset_kwargs)
tf_test = ds_dict['test'].to_tf_dataset(**to_tf_dataset_kwargs)


In [27]:
# Training schedule.
EPOCHS = 10

# Model size.
EMBED_DIM = 256
NUM_HEADS = 4
FEED_FORWARD_DIM = 512
NUM_LAYERS = 2

# Collect the constructor arguments in one place, then build and compile.
model_config = dict(
    vocab_size=tokenizer.vocab_size,
    num_layers=NUM_LAYERS,
    embedding_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    intermediate_dim=FEED_FORWARD_DIM,
)
model = TransformerDecoder(**model_config)

model.compile(optimizer='adam', loss=masked_loss, metrics=[masked_accuracy])

In [28]:
# Train on tf_train for EPOCHS epochs, using tf_test as the validation data.
model.fit(
    tf_train, 
    validation_data=tf_test, 
    epochs=EPOCHS
)

Epoch 1/10


2024-02-22 11:20:31.401019: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-02-22 11:20:32.802727: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f58091bead0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-22 11:20:32.802777: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4050 Laptop GPU, Compute Capability 8.9
2024-02-22 11:20:32.846521: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-22 11:20:32.921814: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1708618833.016721    5229 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [33]:
def step(prompt, cache, index):
    """Sampler callback: return the logits for the token at position `index`.

    The whole `prompt` is run through the global `model` on every call and
    the row at `index - 1` is selected. `cache` is passed through unchanged
    and no hidden states are returned.

    Returns:
        A `(logits, None, cache)` tuple.
    """
    all_logits = model(prompt)
    next_token_logits = all_logits[:, index-1, :]
    return next_token_logits, None, cache

In [32]:
# Keep the raw text under its own name so `prompt` only ever refers to the
# padded id tensor handed to the sampler.
prompt_text = ''

# Unpadded token count minus one; used as the index at which sampling starts.
prompt_length = len(tokenizer.encode(prompt_text)) - 1

prompt = tokenizer.encode(
    prompt_text,
    padding='max_length',
    max_length=MAX_SEQ_LEN,
    return_tensors='tf',
)

In [62]:
import keras_nlp

# Top-p sampling with p = 0.5, starting at the end of the prompt.
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
sampled = sampler(next=step, prompt=prompt, index=prompt_length)
output_tokens = sampled[0]

# Show only the text that precedes the first end marker.
decoded = tokenizer.decode(output_tokens)
print(decoded.split('[END]')[0])

[STA] car e er w iz ard 
