In [1]:
!pip install -r requirements.txt



In [18]:
import glob, gzip, os
import numpy as np
# Allow numpy to print up to 200 characters per line before wrapping arrays.
np.set_printoptions(linewidth=200)

In [19]:
# Corpus location: the *.txt.gz files under DATA_DIR are merged into TEMP_FILE
# before the tokenizer is trained.
DATA_DIR = 'data'
TEMP_FILE = os.path.join(DATA_DIR, 'temp.txt')

# Vocabulary size requested from the BPE trainer.
VOCAB_SIZE = 5000

class SpecialTokens:
    """Namespace for the tokenizer's special tokens and their integer ids.

    Ids are the positions of the tokens in ``TOKENS``, so ``PAD`` is 0,
    ``START`` is 1, ``END`` is 2 and ``UNK`` is 3.
    """

    PAD = "[PAD]"
    START = "[STA]"
    END = "[END]"
    UNK = "[UNK]"

    # Ordered list; the index of each token is its id.
    TOKENS = [PAD, START, END, UNK]
    # (token, id) pairs in the same order as TOKENS.
    TUPLES = list(zip(TOKENS, range(len(TOKENS))))
    # token -> id lookup built from the pairs above.
    TOKEN_TO_ID = dict(TUPLES)

In [20]:
from tokenizers.trainers import BpeTrainer
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Untrained BPE model wrapped in a Tokenizer; unknown pieces map to [UNK].
# NOTE(review): end_of_word_suffix is set on the model only, not on the
# trainer below -- confirm the suffix is still in effect after training.
bpe_model = BPE(
    unk_token=SpecialTokens.UNK,
    end_of_word_suffix=SpecialTokens.END,
)
tokenizer = Tokenizer(bpe_model)

# Trainer that learns up to VOCAB_SIZE entries and reserves the special tokens.
trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=SpecialTokens.TOKENS,
    show_progress=True,
)

In [21]:
from tokenizers.pre_tokenizers import Whitespace
# Split the raw text with the Whitespace pre-tokenizer before BPE is applied.
tokenizer.pre_tokenizer = Whitespace()

In [22]:
from tokenizers.processors import TemplateProcessing
# Wrap every encoded sequence as "<start> $A <end>". The template is built from
# the SpecialTokens constants so it cannot drift from the token definitions.
tokenizer.post_processor = TemplateProcessing(
    single=f"{SpecialTokens.START} $A {SpecialTokens.END}",
    special_tokens=SpecialTokens.TUPLES,
)

In [23]:
# Merge every gzipped text file under DATA_DIR into one plain-text corpus.
# Sorting makes the concatenation order reproducible across runs/machines.
text_files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt.gz")))

# Opening once in "w" mode truncates any stale corpus from a previous run and
# also works on a fresh checkout where TEMP_FILE does not exist yet (the
# previous unconditional os.remove raised FileNotFoundError in that case).
with open(TEMP_FILE, "w") as corpus_out:
    for file in text_files:
        with gzip.open(file, "rt") as f:
            # Flatten each file to a single line: newlines become spaces,
            # carriage returns are dropped.
            corpus_out.write(f.read().replace("\n", " ").replace("\r", ""))
        

In [24]:
# Train on the merged corpus; use TEMP_FILE so the path stays in sync with the
# cell that writes it instead of repeating a hard-coded 'data/temp.txt'.
tokenizer.train([TEMP_FILE], trainer)






In [25]:
# Persist the trained tokenizer; a later cell reloads it from this same file
# through PreTrainedTokenizerFast.
tokenizer.save("tokenizer.json")

In [27]:
from datasets import load_dataset

# Load the dataset and keep only its 'train' split.
dataset = load_dataset("jeremygf/domains-app-alpha")['train']
dataset

Dataset({
    features: ['text'],
    num_rows: 534152
})

In [28]:
def _encode_row(row):
    """Tokenize one row's 'text' and return its token ids under 'ids'."""
    encoding = tokenizer.encode(row['text'])
    return {'ids': encoding.ids}


# Adds an 'ids' column alongside the existing 'text' column.
dataset = dataset.map(_encode_row)

Map:   0%|          | 0/534152 [00:00<?, ? examples/s]

In [29]:
def _shift_for_next_token(row):
    """Split one encoded row into inputs and targets offset by one position.

    'input_ids' drops the last id and 'target_ids' drops the first, so the
    target at position i is the token that follows input position i.
    """
    token_ids = row['ids']
    return {
        'input_ids': token_ids[:-1],
        'target_ids': token_ids[1:],
    }


dataset = dataset.map(_shift_for_next_token)

Map:   0%|          | 0/534152 [00:00<?, ? examples/s]

In [30]:
# Display the dataset summary to check the columns added by the map() calls.
dataset

Dataset({
    features: ['text', 'ids', 'input_ids', 'target_ids'],
    num_rows: 534152
})

In [31]:
from transformers import PreTrainedTokenizerFast

# Reload the saved tokenizer through the transformers wrapper and tell it which
# token to use for padding.
# NOTE(review): this rebinds `tokenizer` from a tokenizers.Tokenizer to a
# PreTrainedTokenizerFast, so re-running the earlier `tokenizer.encode(...).ids`
# cell after this one will presumably fail -- consider a distinct name.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='tokenizer.json')
tokenizer.pad_token = SpecialTokens.PAD

In [32]:
# Training-loop settings: BATCH is the tf.data batch size, EPOCHS is passed to
# model.fit.
BATCH = 1024
EPOCHS = 100

# Model / sequence settings. MAX_SEQ_LEN is the length given to the prompt
# packer. The remaining values are presumably passed to CharTransformer; the
# cell that constructs the model is not visible -- TODO confirm.
MAX_SEQ_LEN = 31
EMBED_DIM = 96
NUM_HEADS = 24
FEED_FORWARD_DIM = 96
NUM_LAYERS = 3

In [34]:
from transformers import DataCollatorForTokenClassification

# Collator that batches examples into TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf",
)

# Shuffled tf.data pipeline: 'input_ids' are the features, 'target_ids' the labels.
tf_dataset = dataset.to_tf_dataset(
    batch_size=BATCH,
    shuffle=True,
    columns=["input_ids"],
    label_cols=["target_ids"],
    collate_fn=data_collator,
)

In [35]:
import tensorflow as tf


In [36]:
from tensorflow.keras.layers import Embedding, Dense

class CharTransformer(tf.keras.Model):
    """Stack of transformer decoder layers mapping token ids to vocabulary logits.

    The model embeds the input ids (token + position), runs them through
    ``num_layers`` decoder blocks and projects every position onto the
    vocabulary.

    NOTE(review): ``TokenAndPositionEmbedding`` and ``TransformerDecoder`` are
    not imported in any visible cell; presumably they come from
    ``keras_nlp.layers`` -- confirm and import them before this cell.

    Args:
        vocab_size: Number of entries in the vocabulary; also the size of the
            output projection.
        sequence_length: Sequence length handed to the position embedding.
        num_layers: Number of decoder blocks to stack.
        embedding_dim: Width of the token/position embeddings.
        num_heads: Attention heads per decoder block.
        intermediate_dim: Hidden width of each block's feed-forward network.
    """

    def __init__(self, vocab_size: int, sequence_length: int, num_layers: int, embedding_dim: int,
        num_heads: int, intermediate_dim: int) -> None:

        super().__init__()

        # mask_zero=True marks id 0 as padding (the [PAD] token has id 0 in
        # SpecialTokens.TOKENS).
        self.embeddings = TokenAndPositionEmbedding(
            vocabulary_size=vocab_size,
            sequence_length=sequence_length,
            embedding_dim=embedding_dim,
            mask_zero=True,
        )

        self.decoders = [
            TransformerDecoder(
                num_heads=num_heads,
                intermediate_dim=intermediate_dim,
                dropout=0.1,
                activation='gelu'
            )
            for _ in range(num_layers)
        ]

        # Linear projection to raw logits. The previous ReLU activation clamped
        # every logit at zero, so the model could never push an unlikely token
        # below the floor; samplers and from_logits losses expect unbounded
        # values here.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs):
        """Return per-position vocabulary logits for a batch of token ids."""
        x = self.embeddings(inputs)
        for layer in self.decoders:
            x = layer(x)
        return self.dense(x)

In [42]:
# Train for EPOCHS epochs with per-batch progress output.
# NOTE(review): neither `model` nor `text_ds` is defined in any visible cell
# (the dataset built earlier is named `tf_dataset`); this only runs with
# leftover kernel state -- confirm which dataset is intended.
model.fit(text_ds, epochs=EPOCHS, verbose=1)

Epoch 1/100
Epoch 2/100


2024-02-06 09:39:03.793875: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 11649731160769717573
2024-02-06 09:39:03.794068: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 15495456782289674620
2024-02-06 09:39:03.794119: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 10014077403232570392
2024-02-06 09:39:03.794128: I tensorflow/core/framework/local_rendezvous.cc:425] Local rendezvous send item cancelled. Key hash: 5551772001302607106


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
 48/522 [=>............................] - ETA: 1:27 - loss: 2.9651 - perplexity: 73.2684

KeyboardInterrupt: 

In [113]:
# Save the trained model in the .keras format.
model.save('char_transformer_32_24h.keras')

In [37]:
# Packer that prepends the start token and pads each prompt to MAX_SEQ_LEN.
# SpecialTokens defines the padding token as PAD; the former
# SpecialTokens.PADDING attribute does not exist and raised AttributeError.
prompt_packer = StartEndPacker(
    MAX_SEQ_LEN,
    start_value=SpecialTokens.START,
    pad_value=SpecialTokens.PAD,
    return_padding_mask=False
)

In [38]:
def pack_prompt(prompt: str) -> tf.Tensor:
    """Convert a prompt into the packed id tensor used to seed generation.

    The prompt is split into UTF-8 characters, run through ``prompt_packer``
    and finally mapped to ids with ``ids_from_chars``.
    """
    split_chars = tf.strings.unicode_split(prompt, input_encoding='UTF-8')
    return ids_from_chars(prompt_packer(split_chars))

In [39]:
def nextt(prompt, cache, index):
    """Sampler callback: return the logits used to pick the token at ``index``.

    Runs the global ``model`` on the whole prompt and slices out position
    ``index - 1``. Hidden states are returned as ``None`` (only needed for
    contrastive search) and ``cache`` is passed through unchanged.
    """
    step_logits = model(prompt)[:, index - 1, :]
    return step_logits, None, cache

In [40]:
# Seed generation with an empty prompt (only the start token after packing).
prompt = ''
prompt_length = len(prompt)
prompt_tokens = pack_prompt([prompt])

# Top-p sampling with p=0.5.
sampler = keras_nlp.samplers.TopPSampler(0.5)
output_tokens = sampler(
    next=nextt,
    prompt=prompt_tokens,
    index=prompt_length+1,  # Start sampling immediately after the [STA] token.
)

# Map ids back to strings and join them into one byte string per sequence.
txt = chars_from_ids(output_tokens)
txt=tf.strings.reduce_join(txt, axis=-1).numpy()
# The label names the sampler actually used; it previously said
# "Greedy search" although a TopPSampler produced the text.
print(f"Top-p sampling generated text: \n{txt}\n")

Greedy search generated text: 
[b'[STA][END][PAD]ii[END][PAD][END][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]t[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]']

