In [50]:
from decoder import TransformerDecoder
import numpy as np
import tensorflow as tf
from AdamW import AdamW

In [51]:
# Simple tokenization scheme from https://www.tensorflow.org/text/tutorials/word2vec
def gen_vocab(corpus):
    """Build a token-to-id vocabulary from a whitespace-delimited corpus.

    The markers "<start>" and "<end>" always receive ids 0 and 1. The corpus
    is lower-cased and split on whitespace (punctuation stays attached to its
    word), and every distinct token gets the next free id in order of first
    appearance.

    Args:
        corpus: Text to build the vocabulary from.

    Returns:
        dict mapping each token string to its integer id.
    """
    vocab = {"<start>": 0, "<end>": 1}
    for word in corpus.lower().split():
        # Ids are dense from 0, so len(vocab) is always the next unused id;
        # setdefault leaves already-known tokens untouched.
        vocab.setdefault(word, len(vocab))
    return vocab



In [52]:
# Toy corpus: the single sentence that the tokenizer and vocabulary are built from.
corpus = "to be, or not to be, that is the question"

# token -> id mapping, plus its inverse (id -> token) for turning ids back into text.
vocab = gen_vocab(corpus)
inverse_vocab = {index: token for token, index in vocab.items()}

# Only the final expression of a cell is displayed, so show the vocabulary once.
vocab

{'<start>': 0,
 '<end>': 1,
 'to': 2,
 'be,': 3,
 'or': 4,
 'not': 5,
 'that': 6,
 'is': 7,
 'the': 8,
 'question': 9}

In [53]:

# Wrap the corpus in the sequence markers reserved by the vocabulary.
sentence = "<start> " + corpus + " <end>"


def tokenize(sentence):
    """Map a whitespace-delimited string to a list of vocabulary ids.

    The text is lower-cased before lookup; a word missing from `vocab`
    raises KeyError.
    """
    return [vocab[word] for word in sentence.lower().split()]


def detokenize(tokenized_text):
    """Map an iterable of vocabulary ids back to a space-joined string."""
    return ' '.join([inverse_vocab[x] for x in tokenized_text])


tokenized_text = tokenize(sentence)
# Next-token pairs: the target is the input shifted left by one position.
# np.newaxis prepends a batch axis of size 1.
input_text = np.array(tokenized_text[:-1])[np.newaxis,:]
target_text = np.array(tokenized_text[1:])[np.newaxis,:]
input_text

array([[0, 2, 3, 4, 5, 2, 3, 6, 7, 8, 9]])

In [54]:
def get_loss(labels, logits):
    """Return the mean sparse softmax cross-entropy of `logits` against `labels`.

    Args:
        labels: Integer class ids (here: target token ids).
        logits: Unnormalised scores whose last axis ranges over the classes.

    Returns:
        Scalar tensor: the per-element cross-entropy averaged over all elements.
    """
    per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits
    )
    return tf.reduce_mean(per_token_loss)

In [55]:

# Decoder for the toy task; vocab_size is tied to the corpus vocabulary built above.
model = TransformerDecoder(
    dim_model=512,
    heads=8,
    blocks=1,
    vocab_size=len(vocab),
    max_len=20,
)




In [56]:
from tqdm import trange
def train(model, optimizer, input_text, output_text, iterations, get_loss):
    """Fit `model` to a single (input, target) pair with a plain gradient loop.

    Args:
        model: Callable mapping `input_text` to logits; must expose
            `trainable_variables`.
        optimizer: Object called as `apply_gradients(grads, variables)`.
        input_text: Token ids fed to the model.
        output_text: Target token ids, passed to `get_loss` as `labels`.
        iterations: Number of optimisation steps to run.
        get_loss: Callable taking `labels=` and `logits=` and returning a
            scalar loss tensor.
    """
    bar = trange(iterations)
    for i in bar:
        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as tape:
            est_labels = model(input_text)
            # Use the targets passed in, not the notebook-global `target_text`.
            loss = get_loss(labels=output_text, logits=est_labels)

        # Differentiate after leaving the tape context so the backward pass
        # and the optimizer update are not themselves traced.
        grads = tape.gradient(loss, model.trainable_variables)

        # convert_to_tensor densifies gradients such as tf.IndexedSlices;
        # presumably the custom optimizer expects dense tensors -- confirm in AdamW.
        grads = [tf.convert_to_tensor(grad) for grad in grads]

        optimizer.apply_gradients(grads, model.trainable_variables)

        # Refresh the progress-bar text on every 10th step.
        if i%10 == (10 - 1):
            bar.set_description(f"Step {i}; Loss=> {loss.numpy():0.4f}")
            bar.refresh()
    








In [57]:
# Train for 2000 steps on the single corpus sentence using AdamW with its default arguments.
optimizer = AdamW()
train(
    model=model,
    optimizer=optimizer,
    input_text=input_text,
    output_text=target_text,
    iterations=2000,
    get_loss=get_loss,
)

  0%|          | 0/2000 [00:00<?, ?it/s]

Step 1999; Loss=> 0.0000: 100%|██████████| 2000/2000 [01:39<00:00, 20.04it/s]


In [91]:

# Greedy autoregressive decoding: start from "<start>" and keep appending the
# model's highest-scoring next token until "<end>" is produced.
END_TOKEN = vocab["<end>"]
# Hard cap so a model that never emits "<end>" cannot loop forever.
# NOTE(review): 19 keeps every sequence fed to the model within the max_len=20
# passed to TransformerDecoder -- assumes max_len bounds the input length; confirm.
MAX_NEW_TOKENS = 19

x = np.array(tokenize("<start>"))[np.newaxis,:]

for _ in range(MAX_NEW_TOKENS):
    output = model(x)

    # Best vocabulary id at every position; only the last position's
    # prediction continues the sequence.
    predicted_tokens = tf.math.argmax(output, axis=-1)
    next_token = predicted_tokens[:, -1]

    x = tf.concat([x, next_token[tf.newaxis, :]], axis=1)
    # Index the single batch element explicitly instead of calling int() on a
    # shape-(1,) tensor.
    if int(next_token[0]) == END_TOKEN:
        break
print(detokenize(np.array(x[0])))



<start> be, or not to be, that is the question <end>


In [59]:
# Debug check: display the isTrain attribute of the first decoder block.
# NOTE(review): presumably this flag switches training-only behaviour; confirm
# in decoder.py whether it should still be True while generating.
model.blocks[0].isTrain

ListWrapper([True])

In [60]:
def generate(model, prompt, max_token_gen):
    """Greedily extend `prompt` by `max_token_gen` tokens.

    Each step runs the model on the sequence so far, takes the highest-scoring
    vocabulary id at the last position, and appends it to the sequence.

    Args:
        model: Callable mapping token ids of shape (batch, seq) to scores
            whose last axis ranges over the vocabulary.
        prompt: Token ids of shape (batch, seq). Must be int64 so they can be
            concatenated with tf.math.argmax's default int64 output.
        max_token_gen: Number of tokens to append.

    Returns:
        Token ids of shape (batch, seq + max_token_gen); `prompt` itself is
        returned unchanged when `max_token_gen` is 0.
    """
    for _ in range(max_token_gen):
        output = model(prompt)

        predicted_tokens = tf.math.argmax(output, axis=-1)
        # The prediction at the LAST position is the continuation of the
        # sequence; position 0 would re-predict the second token every step.
        next_token = predicted_tokens[:, -1]

        # Shape (batch, 1): stays correct for batch sizes other than 1.
        prompt = tf.concat([prompt, next_token[:, tf.newaxis]], axis=1)

    return prompt

In [61]:
# Scratch check: one forward pass over the current sequence `x`.
output = model(x)

# Highest-scoring vocabulary id at each position of the sequence.
predicted_tokens = tf.math.argmax(output, axis=-1)
# Column 0 holds the prediction made at the first input position.
first_predicted_token = predicted_tokens[:, 0]

first_predicted_token


<tf.Tensor: shape=(1,), dtype=int64, numpy=array([2])>

# Decode from the "<start>" token for as many steps as the training target is long.
generated = generate(model, np.array(tokenize("<start>"))[np.newaxis,:], max_token_gen=target_text.shape[1])
detokenize(np.array(generated[0]))