In [1]:
# Downloading tinyshakesphere for training
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 1089k  100 1089k    0     0  1581k      0 --:--:-- --:--:-- --:--:-- 1583k


# 1. Inspecting the tinyshakespeare text file for training

In [3]:
# Inspecting the text file
with open('tinyshakespeare.txt','r') as file:
    text = file.read()

In [4]:
print(f'There are {len(text)} characters in the dataset')

There are 1115394 characters in the dataset


In [5]:
# Printing the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Identifying the number of unique characters contained in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Number of unique characters (including white space): {vocab_size}{''.join(chars)}")

Number of unique characters (including white space): 65
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# 2. Basic mapping between characters to integers

More sophisticated examples include Google's SentencePiece and OpenAI's tiktoken

In [7]:
# Assigning numbers to each characters to encode the characters to integers
ctoi = {char : num for num, char in enumerate(chars)}
encode = lambda s: [ctoi[c] for c in s]
print(encode('Shakespeare in digits'))

# Reversely, decode integers back to characters
itoc = {num : char for num, char in enumerate(chars)}
decode = lambda l : ''.join([itoc[i] for i in l])
print(decode(encode('Shakespeare in digits')))

[31, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 52, 1, 42, 47, 45, 47, 58, 57]
Shakespeare in digits


In [8]:
# Tokenizing the total text. Adapted the code to work with Tensorflow instead of pytorch
import tensorflow as tf
data = tf.convert_to_tensor(encode(text))
print(data.shape, data.dtype)
print(data[:100])

2024-09-01 20:31:49.419967: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-01 20:31:49.940419: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(1115394,) <dtype: 'int32'>
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)


In [9]:
# Train and validation split sets, with 9:1 ratio
n = int(0.9*len(data))
data_train = data[:n]
data_test = data[n:]
print(f'Length of train data : {len(data_train)}\nLength of test data : {len(data_test)}')

Length of train data : 1003854
Length of test data : 111540


In [10]:
# Starting with block_size implementation
block_size = 8
print(data_train[:block_size + 1])
x = data_train[:block_size]
y = data_train[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input : {context}, Output : {target}')

tf.Tensor([18 47 56 57 58  1 15 47 58], shape=(9,), dtype=int32)
Input : [18], Output : 47
Input : [18 47], Output : 56
Input : [18 47 56], Output : 57
Input : [18 47 56 57], Output : 58
Input : [18 47 56 57 58], Output : 1
Input : [18 47 56 57 58  1], Output : 15
Input : [18 47 56 57 58  1 15], Output : 47
Input : [18 47 56 57 58  1 15 47], Output : 58


In [11]:
## To be worked on : packaging the code with script with variables for later
# Depiction of the chunk(or in here, block)-wise transformation.
# Having varied blocksize allows the algorithm to take into account the context for inference purpose



tf.random.set_seed(1337) # To be sure to have consistent random number
batch_size = 4 # The number of independent sequences to train in parallel
block_size = 8 # The maximum context length for prediction

def get_batch(split):
    data = data_train if split == 'train' else data_test
    # Retrieving batches randomly
    ix = tf.random.uniform(shape = (batch_size,),
                           maxval = len(data) - block_size,
                           dtype = tf.int32)
    # Stacking the list of tensors
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print(xb)
print(yb)
for batch in range(batch_size):
    for block in range(block_size):
        context = xb[batch, :block+1]
        target = yb[batch, block]
        print(f'When input is {context.numpy().tolist()} the target is {target}')

tf.Tensor(
[[ 1 51 63  1 41 53 39 58]
 [39 42  0 20 47 57  1 52]
 [32 53  1 56 43 60 43 50]
 [54 39 52 63  1 54 47 43]], shape=(4, 8), dtype=int32)
tf.Tensor(
[[51 63  1 41 53 39 58  6]
 [42  0 20 47 57  1 52 39]
 [53  1 56 43 60 43 50  1]
 [39 52 63  1 54 47 43 41]], shape=(4, 8), dtype=int32)
When input is [1] the target is 51
When input is [1, 51] the target is 63
When input is [1, 51, 63] the target is 1
When input is [1, 51, 63, 1] the target is 41
When input is [1, 51, 63, 1, 41] the target is 53
When input is [1, 51, 63, 1, 41, 53] the target is 39
When input is [1, 51, 63, 1, 41, 53, 39] the target is 58
When input is [1, 51, 63, 1, 41, 53, 39, 58] the target is 6
When input is [39] the target is 42
When input is [39, 42] the target is 0
When input is [39, 42, 0] the target is 20
When input is [39, 42, 0, 20] the target is 47
When input is [39, 42, 0, 20, 47] the target is 57
When input is [39, 42, 0, 20, 47, 57] the target is 1
When input is [39, 42, 0, 20, 47, 57, 1] the targ

In [12]:
eval_iters = 200

# A function to average up the loss in multiple batches for both splits
# @tf.function : removing the code, despite slower performance as it causes an error
def estimate_loss():
    output = {}
    model.trainable = False # Setting the model to evaluation phase
    for split in ['train','val']:
        losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model.call(X,Y)
            losses.append(loss)
        output[split] = tf.reduce_mean(losses)
    model.trainable = True # Setting the model back to training phase
    return output

In [13]:
tf.random.set_seed(1337)

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = sequence length = block size
        C : Channel = number of classes = vocab size
        '''
        logits = self.token_embedding_table(idx)  # Replacing embedding to the indices

        if targets is None:
            loss = None
        else:
            # Reshaping the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64)

#            idx_next = tf.random.categorical(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
logits, loss = model.call(xb, yb)
print(logits.shape)
print(loss.numpy())

print(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))

(32, 65)
4.18583

saTf-Wz-K,?hNl?Yr:r'KUFLIH:QmLboCkI
oYwnqePrE
!zgz'T:-?ZgzxEjItgpzAQjGjM&vv.;OBdqFlQ qwcwcexWhPKs:$'


In [14]:
from tensorflow.keras.optimizers import Adam
optimizer = Adam(learning_rate = 0.001)

In [15]:
batch_size = 32
for step in tf.range(4000):
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb,yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print(f'Final Loss: {loss.numpy()}')

KeyboardInterrupt: 

In [16]:
# Generate a sequence
idx = tf.zeros((1, 1), dtype=tf.int32)
generated_sequence = model.generate(idx, max_new_tokens=1000).numpy()
print(decode(generated_sequence[0].tolist()))


3Q&aZbgN3kwxTfjlnSCgN ojKWV3&JKq,eudY&m-sOM p$oAoVGw;euzTN ABfz.dI$PmGFlEJwd'aOMdokv$kqwUpC?urj3
;XhpzNaZoaY&NH,qr$pbDL-,Wj,aj C'uo:ue?HpEuxkvzuX&eW
,XdwB!tPK,Qg'SDIDQq?,lotv'AH t;u$f;OpdPrBur,rMRNf-EA3HRtuwpebAk
zkTxFvo,$.DwM;G!XpLU!fbW$GldJJQfm gfRGdaYcDBUQx3sJSQ3Z;O IIYWeDm$zXJRSRyW&kjDy-u$s
cFFulXTYn?Lj
Dmsp.Su,jP'
Cb,E,pE'v-Uvide!jG;Z$PmByDYj:jhJJpUmYYvV-qAdIZjNNqOBN&mb;gpDxbnNlEXIZWj'IOEBtSav-ESJCbuad.cF3qu';!T&M Wya
hVs:HDPVNpQsW3m!bzpT3sW&HP&;sibkmsxfxOcE$3rd
zyoPUSBEp'LXubU.$U:VTz-Aw
:3C j:&aqVEQIJZyRDSLbP.FN3bLBUIUWh:RROiemtKg!m3WSmUIMIXq-nVMHo?$y'K-hbOO& jkIP-ww,ee?OxwsfsuXJErdmxEvYWQ3h!OjO!HYTuMVss!udMKw--IHjXe?'lqRe?:gSX
:qwqMI!UjklxEeXYjRZ 3 'xhP,HqP!QfRtdnR
xbnsuKJsJZF
U
lpZ-
GRqRiLQRtvEEFiWHC?ghkn$ODFrqDcffhDpw&,wCi-dHuEVS'k?3UhJD,ct,NuUitWn-cw
aAvPyidTAyyeRZ e:ocF.uN CYRxYTcN.&Qibd. CGcx3eTIxfoBPKSWVSf'AWuL$RNkegx ;san;r:vxIzwfKw&'sl:SBvqTXJzg:FFJUHwlx
VSQs,GmBM:qrSH?bUzu dLzXfRCz!gyE'Ncy;lpeAYHRSe&'l??,TNX'Evv,h:?:qp
 y
U,FWSOt,rSyno3P'qgqjGViwj:phjs.CuW3SMjdjKUb!3'P

### The mathematical trick self-attention

In [17]:
# Tokens learning from previous context, calculating average of all to previous tokens
B,T,C = 4,8,2
x = tf.random.uniform(shape=(B, T,C))

xbow = tf.zeros((B,T,C)) # Defining a bag of words
for b in range (B):
    for t in range (T):
        xprev = x[b, :t+1] # Batch, including the tth token
        xbow = xbow.numpy()  # Convert xbow to numpy array to support assignment
        xbow[b, t] = tf.reduce_mean(xprev, axis=0).numpy()  # Calculate mean and assign to xbow
        xbow = tf.convert_to_tensor(xbow)  # Convert back to tensor

### Vectorizing the above function

In [18]:
w = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0) # Calculating weights with matrix
w = w / tf.math.reduce_sum(w, axis = 1, keepdims = True)

xbow2 = w @ x
tf.experimental.numpy.allclose(xbow,xbow2).numpy()

True

In [19]:
# version 3: use Softmax
tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.zeros((T,T))
w = tf.where(tril == 0, float('-inf'), w) # Indicating future bow cannot communicate with the past
w = tf.nn.softmax(w, axis = -1) # normalizing the weight matrix
xbow3 = w @ x
tf.experimental.numpy.allclose(xbow,xbow3).numpy()

True

### Modifying the model

In [20]:
tf.random.set_seed(1337)
n_embed = 32

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = sequence length = block size
        C : Channel = number of classes = vocab size
        '''
        token_emb = self.token_embedding_table(idx)  # Replacing embedding to the indices
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B,T,C)
        logits = self.lm_head(x  ) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Reshaping the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64)

#            idx_next = tf.random.categorical(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
logits, loss = model.call(xb, yb)
print(logits.shape)
print(loss.numpy())

print(decode(model.generate(idx=tf.zeros((1, 1), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))


AttributeError: 'BigramLanguageModel' object has no attribute 'position_embedding_table'

In [21]:
# Video from 1:00:00. Need to recheck how self-attention functions

# Version 4: self-attention
tf.random.set_seed(1337)
B,T,C = 4, 8, 32 # batch, time, channels
x = tf.random.normal(shape=(B, T, C))

# Single head perform self-attention
head_size = 16
key = tf.keras.layers.Dense(units=head_size, use_bias=False)
query = tf.keras.layers.Dense(units=head_size, use_bias=False)
value = tf.keras.layers.Dense(units=head_size, use_bias=False)

# All tokens in all positions produce independent key and query
k = key(x) # B, T, 16
q = query(x) # B, T, 16
# Communicating key with query
w = q @ tf.transpose(k, perm=[0,2,1]) # (B, T, 16) @ (B, 16, T) -> (B,T,T)

tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.zeros((T,T))
w = tf.where(tril == 0, float('-inf'), w) # Upper triangular masking, indicating future bow cannot communicate with the past
w = tf.nn.softmax(w, axis = -1) # normalizing the weight matrix

v = value(x)
out = w @ v
#out = w @ x # K : private information

out.shape

TensorShape([4, 8, 16])

In [22]:
# Verifying whether the row sum of weights equal 1
import numpy as np
# Convert TensorFlow tensor to NumPy array
w_np = w.numpy()

# Set NumPy print options to suppress scientific notation
np.set_printoptions(suppress=True)

# Print the tensor
print(w_np.round(6))

[[1.       0.       0.       0.       0.       0.       0.       0.      ]
 [0.5      0.5      0.       0.       0.       0.       0.       0.      ]
 [0.333333 0.333333 0.333333 0.       0.       0.       0.       0.      ]
 [0.25     0.25     0.25     0.25     0.       0.       0.       0.      ]
 [0.2      0.2      0.2      0.2      0.2      0.       0.       0.      ]
 [0.166667 0.166667 0.166667 0.166667 0.166667 0.166667 0.       0.      ]
 [0.142857 0.142857 0.142857 0.142857 0.142857 0.142857 0.142857 0.      ]
 [0.125    0.125    0.125    0.125    0.125    0.125    0.125    0.125   ]]


Notes:
- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens
- Each example across batch dimension is of course processed completely independently and never 'talk' to each other
- In an 'encoder' attention block just delete the single line that does masking with trill, allowing all tokens to communicate. This block here is called a 'decoder' attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- 'Self attention' just means that the keys and values are produced from the same source as queries. In 'Cross-attention', the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- 'Scaled' attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q, K are unit variance, wei will be unit variance too and softmax will stay diffuse and not saturate too much, Illustration below

In [23]:
q = tf.random.normal((B, T, head_size))
k = tf.random.normal((B, T, head_size))

# Calculate the weights
wei = q @ tf.transpose(k, perm=[0, 2, 1]) * (head_size ** -0.5)

In [24]:
tf.math.reduce_variance(k).numpy()

0.9939072

In [25]:
tf.math.reduce_variance(q).numpy()

0.9280848

In [26]:
tf.math.reduce_variance(wei).numpy()

0.9270358

In [27]:
x.shape

TensorShape([4, 8, 32])

In [35]:
head_size = 16
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embed = 32
batch_size = 16
block_size = 8

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0, 2, 1]) * (C ** -0.5) # (B,T,C) @ (B,C,T ) -> (B,T,T))
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part
        wei = tf.nn.softmax(wei, axis = -1)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
        ])

    def call(self, x):
        return self.net(x)


class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.projection = tf.keras.layers.Dense(n_embed)

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.projection(out)
        return out

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)

    def call(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.blocks = tf.keras.Sequential([
            Block(n_embed = n_embed, n_head = 4),
            Block(n_embed = n_embed, n_head = 4),
            Block(n_embed = n_embed, n_head = 4),
            tf.keras.layers.LayerNormalization(axis=-1)
        ])
        self.ln_f = tf.keras.layers.LayerNormalization(axis=-1) # final layer normalization
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets = None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = sequence length = block size
        C : Channel = number of classes = vocab size
        '''
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        position_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T, C)
        x = token_emb + position_emb # (B,T,C)
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Reshaping the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int32)
            # append sampled index to the running sequence
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
logits, loss = model.call(xb, yb)
print(logits.shape)
print(loss.numpy())



optimizer = Adam(learning_rate)

for step in tf.range(max_iters):
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb,yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print(f'Final Loss: {loss.numpy()}')

print(decode(model.generate(idx=tf.zeros((1, block_size), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))


(32, 65)
4.4887204
Step 0: train loss 4.4479, val loss 4.4539
Step 200: train loss 2.7340, val loss 2.7087
Step 400: train loss 2.5226, val loss 2.5260
Step 600: train loss 2.4373, val loss 2.4469
Step 800: train loss 2.3774, val loss 2.3929
Step 1000: train loss 2.3474, val loss 2.3592
Step 1200: train loss 2.3163, val loss 2.3336
Step 1400: train loss 2.3013, val loss 2.2994
Step 1600: train loss 2.2497, val loss 2.2873
Step 1800: train loss 2.2492, val loss 2.2549
Step 2000: train loss 2.2281, val loss 2.2328
Step 2200: train loss 2.2227, val loss 2.2434
Step 2400: train loss 2.1980, val loss 2.2469
Step 2600: train loss 2.1934, val loss 2.2212


SystemError: Exception encountered when calling Dense.call().

[1m<built-in method TFE_Py_TapeVariableAccessed of PyCapsule object at 0x7f5133bb5740> returned a result with an exception set[0m

Arguments received by Dense.call():
  • inputs=tf.Tensor(shape=(16, 8, 32), dtype=float32)

In [36]:
print(decode(model.generate(idx=tf.zeros((1, block_size), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))










Whery the!
Pust is stuing'g in ere wose yeaks dompe?

HEDUMS.

Ther leeful, ly,
Prood,
And spruns, I
