### * While the original source code is written in Pytorch, the below code is adapted to Tensorflow.

- GPU utilization not enabled

# 1. Preparing the tinyshakespeare text file for training

In [1]:
# Downloading tinyshakesphere for training
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  2264k      0 --:--:-- --:--:-- --:--:-- 2269k


In [2]:
# Inspecting the text file
with open('tinyshakespeare.txt','r') as file:
    text = file.read()
print(f'There are {len(text)} characters in the dataset')

There are 1115394 characters in the dataset


In [3]:
# Printing the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Identifying the number of unique characters contained in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Number of unique characters (including white space): {vocab_size}{''.join(chars)}")

Number of unique characters (including white space): 65
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# 2. Basic mapping between characters to integers

Tokenizing at the character-level.

More sophisticated examples of word encoding include Google's SentencePiece and OpenAI's tiktoken

In [5]:
# Assigning numbers to each characters to encode the characters to integers
ctoi = {char : num for num, char in enumerate(chars)}
encode = lambda s: [ctoi[c] for c in s]
print(encode('Shakespeare in digits'))

# Reversely, decode integers back to characters
itoc = {num : char for num, char in enumerate(chars)}
decode = lambda l : ''.join([itoc[i] for i in l])
print(decode(encode('Shakespeare in digits')))

[31, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 52, 1, 42, 47, 45, 47, 58, 57]
Shakespeare in digits


In [6]:
# Tokenizing the total text
import tensorflow as tf
data = tf.convert_to_tensor(encode(text))
print(data.shape, data.dtype)
print(data[:100])

2024-11-10 10:13:15.895501: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-10 10:13:16.132293: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(1115394,) <dtype: 'int32'>
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)


In [7]:
# Train and validation split sets, with 9:1 ratio
n = int(0.9*len(data))
data_train = data[:n]
data_test = data[n:]
print(f'Length of train data : {len(data_train)}\nLength of test data : {len(data_test)}')

Length of train data : 1003854
Length of test data : 111540


In [8]:
# Starting with block_size implementation
block_size = 8                            # Context length
print(data_train[:block_size + 1])
x = data_train[:block_size]               # Initial block-size
y = data_train[1:block_size+1]            # Next block-size
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input : {context}, Output : {target}')

tf.Tensor([18 47 56 57 58  1 15 47 58], shape=(9,), dtype=int32)
Input : [18], Output : 47
Input : [18 47], Output : 56
Input : [18 47 56], Output : 57
Input : [18 47 56 57], Output : 58
Input : [18 47 56 57 58], Output : 1
Input : [18 47 56 57 58  1], Output : 15
Input : [18 47 56 57 58  1 15], Output : 47
Input : [18 47 56 57 58  1 15 47], Output : 58


In [9]:
## To be worked on : packaging the code with script with variables for later
# Depiction of the chunk(or in here, block)-wise transformation.
# Having varied blocksize allows the algorithm to take into account the context for inference purpose

tf.random.set_seed(1337) # For reproducibility, to be sure to have consistent random number
batch_size = 4 # The number of independent sequences to train in parallel
block_size = 8 # The maximum context length for prediction

def get_batch(split):
    '''
    Function to generate a small batch of data of inputs x and targets y
    '''

    data = data_train if split == 'train' else data_test
    # Retrieving batches randomly
    ix = tf.random.uniform(shape = (batch_size,),
                          maxval = len(data) - block_size,
                          dtype = tf.int32)
    # Stacking the list of tensors
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
for batch in range(batch_size):       # Batch dimension
    for block in range(block_size):   # Time dimension
        context = xb[batch, :block+1]
        target = yb[batch, block]
        print(f'When input is {context.numpy().tolist()} the target is {target}')

inputs:
(4, 8)
tf.Tensor(
[[ 1 51 63  1 41 53 39 58]
 [39 42  0 20 47 57  1 52]
 [32 53  1 56 43 60 43 50]
 [54 39 52 63  1 54 47 43]], shape=(4, 8), dtype=int32)
targets:
(4, 8)
tf.Tensor(
[[51 63  1 41 53 39 58  6]
 [42  0 20 47 57  1 52 39]
 [53  1 56 43 60 43 50  1]
 [39 52 63  1 54 47 43 41]], shape=(4, 8), dtype=int32)
When input is [1] the target is 51
When input is [1, 51] the target is 63
When input is [1, 51, 63] the target is 1
When input is [1, 51, 63, 1] the target is 41
When input is [1, 51, 63, 1, 41] the target is 53
When input is [1, 51, 63, 1, 41, 53] the target is 39
When input is [1, 51, 63, 1, 41, 53, 39] the target is 58
When input is [1, 51, 63, 1, 41, 53, 39, 58] the target is 6
When input is [39] the target is 42
When input is [39, 42] the target is 0
When input is [39, 42, 0] the target is 20
When input is [39, 42, 0, 20] the target is 47
When input is [39, 42, 0, 20, 47] the target is 57
When input is [39, 42, 0, 20, 47, 57] the target is 1
When input is [39,

## Basic BigramModel for training

In [None]:
tf.random.set_seed(1337)
# Hyperparameters
batch_size = 16 # Independent sequences to process in parallel
block_size = 32 # Maximum context length for prediction
max_iters = 5000
eval_interval = 500 # How often evaluate the loss
learning_rate = 1e-3
eval_iters = 200 # How many batches to use to compute loss
n_embed = 64

In [11]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

gpt_results = pd.DataFrame(columns=['Model', 'Train loss', 'Val loss', 'Time (min)', 'Text'])
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text


In [12]:
class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        logits = self.token_embedding_table(idx)  # Replacing embedding to the indices

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self(idx)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_basic = BigramLanguageModel(vocab_size)

logits, loss = model_basic.call(xb, yb)
print(logits.shape)
print(loss.numpy())

print(decode(model_basic.generate(idx=tf.zeros((1, 1), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))

(32, 65)
4.1756525

sZTe-Xz-L-?hNl;Yr:r'KUFLHH:QmLbpClI
oYwnqePrE
!zgz'U:,?ZgzxEjItfpzAQjGjM&vv.;OBdqFlP qxdwcexXhPKs:$'


### Creating an optimizer, and training the model

In [None]:
import time

def estimate_loss(model):
  '''
  Function to average up the loss in multiple batches for both splits
  '''
  output = {}
  model.training = False # Setting the model to evaluation phase
  for split in ['train','val']:
      losses = []
      for _ in range(eval_iters):
          X, Y = get_batch(split)
          logits, loss = model.call(X,Y)
          losses.append(loss)
      output[split] = tf.reduce_mean(losses)
  model.training = True # Setting the model back to training phase
  return output

def model_train(model, label):
    start_train = time.time()
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    for step in tf.range(1, max_iters+1):
        if (step % eval_interval == 0) or (step == 1):
            losses = estimate_loss(model)
            if step != 1:
                end_int = time.time()
                print(f"Step {step}\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f} | time {(end_int-start_int)//60:.0f} min {(end_int-start_int)%60:.0f} seconds")
                start_int = time.time()
            else:
                print(f"Step {step}\t\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f}")
                start_int = time.time()


        # Sample a batch of data
        xb, yb = get_batch('train')


        # Evaluate the loss and update parameters
        with tf.GradientTape() as tape:
            logits, loss = model(xb,yb)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    end_train = time.time()

    # Save result for comparison
    global gpt_results
    gpt_results = pd.concat([gpt_results,pd.DataFrame({'Model': label,
                                                       'Train loss': [round(losses['train'].numpy(),4)],
                                                       'Val loss': [round(losses['val'].numpy(),4)],
                                                       'Time (min)' : [round((end_train-start_train)/60,0)],
                                                       'Text':''})], ignore_index = True)
    print(f'Final Loss: {loss.numpy()}')

def model_generate(model):
    # Generate a sequence
    print('\n======================= Generated Sequence =======================')
    idx = tf.zeros((1, 1), dtype=tf.int32)
    generated_sequence = model.generate(idx, max_new_tokens=500).numpy()
    # Save result for comparison
    gpt_results.iloc[-1,4] = decode(generated_sequence[0].tolist())
    print(decode(generated_sequence[0].tolist()))


In [14]:
model_train(model_basic, 'Basic')
model_generate(model_basic)

Step 1		 train loss 4.1714 | val loss 4.1710
Step 500	 train loss 3.0852 | val loss 3.0929 | time 0 min 19 seconds
Step 1000	 train loss 2.7327 | val loss 2.7406 | time 0 min 19 seconds
Step 1500	 train loss 2.5981 | val loss 2.6122 | time 0 min 19 seconds
Step 2000	 train loss 2.5461 | val loss 2.5525 | time 0 min 19 seconds
Step 2500	 train loss 2.5118 | val loss 2.5228 | time 0 min 19 seconds
Step 3000	 train loss 2.4940 | val loss 2.5058 | time 0 min 19 seconds
Step 3500	 train loss 2.4790 | val loss 2.4964 | time 0 min 18 seconds
Step 4000	 train loss 2.4759 | val loss 2.4906 | time 0 min 19 seconds
Step 4500	 train loss 2.4694 | val loss 2.4922 | time 0 min 18 seconds
Step 5000	 train loss 2.4714 | val loss 2.4880 | time 0 min 17 seconds
Final Loss: 2.486629009246826


Sallenl!
ND:
Hico ixcquthed be sis Gemat wse whe atahe ase MExnd gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,
S: bjxthrhe t kechatre?

T:
THAngobe hed, f

In [15]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4714,2.488,3.0,"\nSallenl!\nND:\nHico ixcquthed be sis Gemat wse whe atahe ase MExnd gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,\nS: bjxthrhe t kechatre?\n\nT:\nTHAngobe hed, fithestha!Mour s t thes\nO y.\nMIqunerpe themo vee hilepayoouatheadingick suie t y o'te mond y ithiury s.\nMIn t ss:\nCLAHeevizeerouth sspaugh's cher s he mnd.\nLINTI, cofey with he.\nCEThe ar co t y isit h sisged, be:\n\nCand, re me, myepe mphorferg.\nHand thes hiothergenche avugo, pld hte ive"


## The mathematical trick in self-attention
Below present different ways of calculating weighted aggregation of a matrix, from beginning of the block in each batch, up to the 't'th token. The results of the four approaches are the same

In [16]:
# Tokens learning from previous context, by calculating average up to 't'th token
B,T,C = 4,8,2 # Batch, Time, Channels
x = tf.random.uniform(shape=(B, T,C))

### Version 1: Basic

In [17]:
xbow = tf.zeros((B,T,C)) # Defining a bag of words
for b in range (B):
    for t in range (T):
        xprev = x[b, :t+1] # (t, C) Batch, including the 't'th token
        xbow = xbow.numpy()  # Convert xbow to numpy array to support assignment
        xbow[b, t] = tf.reduce_mean(xprev, axis=0).numpy()  # Calculate mean and assign to xbow
        xbow = tf.convert_to_tensor(xbow)  # Convert back to tensor

### Version 2: Vectorizing


In [18]:
w = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = w / tf.math.reduce_sum(w, axis = 1, keepdims = True) # Low triangular matrix for calculating average weights

xbow2 = w @ x # (B, T, T) @ (B , T, C) --> (B, T, C)
tf.experimental.numpy.allclose(xbow,xbow2).numpy() # Checking whether xbow == xbow2

True

### Version 3: Using softmax

In [19]:
tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.zeros((T,T))
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix
xbow3 = w @ x
tf.experimental.numpy.allclose(xbow,xbow3).numpy()

True

### Version 4: Self-attention

Called self-attention as the key, query and value are generated from the same value (x)

Note that key and query weights values are different as

In [20]:
# Attention mechanism
head_size = 16
key = tf.keras.layers.Dense(units=head_size, use_bias=False)
query = tf.keras.layers.Dense(units=head_size, use_bias=False)
value = tf.keras.layers.Dense(units=head_size, use_bias=False)
k = key(x) # Weights adjusted, (B, T, 16)
q = query(x) # (B, T, 16)
w = q @ tf.transpose(k, perm=[0,2,1]) # (B, T, 16) @ (B, 16, T) -> (B, T, T), with (T, T) indicating elements compared with every element in the sequence

tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix

v = value(x)
out = w @ v # Using aggregated value instead of the raw x for dimensionality reduction, information extraction
out.shape

TensorShape([4, 8, 16])

Notes:
- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over as a set of vectors. This is why we need to positionally encode tokens
- Each example across batch dimension is of course processed completely independently and never 'talk' to each other
- In an 'encoder' attention block (w = tf.where(tril == 0, float('-inf'), w))code can be omitted, allowing all tokens to communicate. This block here is called a 'decoder' attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- 'Self attention' just means that the keys and values are produced from the same source as queries. In 'Cross-attention', the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- 'Scaled' attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q, K are unit variance, wei will be unit variance too and softmax will stay diffuses and not saturate too much, Illustration below

## Modified BigramModel with self-attention

In [21]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_sa = BigramLanguageModel(vocab_size)

In [22]:
model_train(model_sa,'Self-attention')
model_generate(model_sa)

Step 1		 train loss 4.1776 | val loss 4.1773
Step 500	 train loss 2.6195 | val loss 2.6206 | time 0 min 37 seconds
Step 1000	 train loss 2.4951 | val loss 2.4892 | time 0 min 38 seconds
Step 1500	 train loss 2.4367 | val loss 2.4345 | time 0 min 38 seconds
Step 2000	 train loss 2.4046 | val loss 2.4170 | time 0 min 37 seconds
Step 2500	 train loss 2.3776 | val loss 2.3989 | time 0 min 37 seconds
Step 3000	 train loss 2.3649 | val loss 2.3776 | time 0 min 37 seconds
Step 3500	 train loss 2.3506 | val loss 2.3590 | time 0 min 37 seconds
Step 4000	 train loss 2.3480 | val loss 2.3641 | time 0 min 37 seconds
Step 4500	 train loss 2.3382 | val loss 2.3596 | time 0 min 35 seconds
Step 5000	 train loss 2.3219 | val loss 2.3593 | time 0 min 37 seconds
Final Loss: 2.232243061065674


't gho,
Theel mse aplpalint oturaverd uts thel sut thouctre odes amy sabjute, hen n vewnoth arig ar fareatref by tlati cant,
Hilds ushirs Anos spor yof ss deen mos pan to ord
Thimbewratat alinofr ken rshes;
Tor the

## Multi-headed attention

In [23]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out
# ================================================================== #
class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out
# ================================================================== #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
# ================================================================== #
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# ================================================================== #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ma = BigramLanguageModel(vocab_size)

In [24]:
model_train(model_ma, 'Muti-headed attention')
model_generate(model_ma)

Step 1		 train loss 4.1765 | val loss 4.1763
Step 500	 train loss 2.6213 | val loss 2.6239 | time 1 min 4 seconds
Step 1000	 train loss 2.4862 | val loss 2.4858 | time 1 min 4 seconds
Step 1500	 train loss 2.3611 | val loss 2.3737 | time 1 min 4 seconds
Step 2000	 train loss 2.2999 | val loss 2.3091 | time 1 min 4 seconds
Step 2500	 train loss 2.2534 | val loss 2.2677 | time 1 min 5 seconds
Step 3000	 train loss 2.2155 | val loss 2.2393 | time 1 min 3 seconds
Step 3500	 train loss 2.1847 | val loss 2.2114 | time 1 min 4 seconds
Step 4000	 train loss 2.1588 | val loss 2.1882 | time 1 min 4 seconds
Step 4500	 train loss 2.1428 | val loss 2.1840 | time 1 min 5 seconds
Step 5000	 train loss 2.1259 | val loss 2.1754 | time 1 min 6 seconds
Final Loss: 2.245828151702881


NUSI of go fort he al; buento s be I qoums, wey's tongmbou wan thy uty to will, bracre lown I on keim:
Goll Mo I wertdeear.

HARSIIO:
Bve sut isper ary hostar! My
Tarl frand hou then gongere doos hot grew,
sonotsits ET lelly

## Feed Forward

In [None]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out

# [==================================================================
class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(n_embed),
            tf.keras.layers.ReLU(),
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens

    def call(self, x):
        x = self.sa_head(x)
        x = self.ffwd(x)
        return x
# ==================================================================] #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ff = BigramLanguageModel(vocab_size)

In [28]:
model_train(model_ff, 'Feed forward')
model_generate(model_ff)

Step 1		 train loss 4.1734 | val loss 4.1737
Step 500	 train loss 3.2908 | val loss 3.3321 | time 3 min 4 seconds
Step 1000	 train loss 2.6202 | val loss 2.5990 | time 2 min 57 seconds
Step 1500	 train loss 2.4269 | val loss 2.4403 | time 2 min 56 seconds
Step 2000	 train loss 2.3191 | val loss 2.3228 | time 2 min 60 seconds
Step 2500	 train loss 2.2473 | val loss 2.2738 | time 2 min 58 seconds
Step 3000	 train loss 2.2166 | val loss 2.2448 | time 2 min 59 seconds
Step 3500	 train loss 2.1664 | val loss 2.1989 | time 2 min 57 seconds
Step 4000	 train loss 2.1573 | val loss 2.2054 | time 2 min 58 seconds
Step 4500	 train loss 2.1012 | val loss 2.1699 | time 2 min 57 seconds
Step 5000	 train loss 2.0814 | val loss 2.1486 | time 2 min 55 seconds
Final Loss: 2.1662659645080566


Patere will tly wllicived dowecernns ag of shanf:
Obk whey noudty aikee marsak os mreth
awer. Ae pOutlet areens fopn iletle than lrw, nap hon opk trrrn, amis shic ahares brene' as i i Se c
TA wM ruysewsivee
Afsance

## Optimization

1) residual

2) pre-layer norm (different from the original paper)>> make more series 3

In [29]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.projection = tf.keras.layers.Dense(n_embed)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
        out = self.projection(out)
# +++++++++++++++++++++++++++++++++++++++++++++++++++
        return out

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)
# ++++++++++

    def call(self, x):
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            tf.keras.layers.LayerNormalization(axis=-1),
            ])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_opt = BigramLanguageModel(vocab_size)

In [30]:
model_train(model_opt, 'Optimized')
model_generate(model_opt)

Step 1		 train loss 4.3069 | val loss 4.3016
Step 500	 train loss 2.4556 | val loss 2.4734 | time 4 min 17 seconds
Step 1000	 train loss 2.2670 | val loss 2.2721 | time 4 min 19 seconds
Step 1500	 train loss 2.1325 | val loss 2.1606 | time 4 min 17 seconds
Step 2000	 train loss 2.0531 | val loss 2.1201 | time 3 min 59 seconds
Step 2500	 train loss 1.9920 | val loss 2.0822 | time 4 min 12 seconds
Step 3000	 train loss 1.9317 | val loss 2.0300 | time 4 min 7 seconds
Step 3500	 train loss 1.8914 | val loss 1.9814 | time 3 min 57 seconds
Step 4000	 train loss 1.8375 | val loss 1.9612 | time 4 min 2 seconds
Step 4500	 train loss 1.7975 | val loss 1.9403 | time 4 min 1 seconds
Step 5000	 train loss 1.7815 | val loss 1.9367 | time 4 min 2 seconds
Final Loss: 1.8928993940353394


I shallet of and my corming, not cries this off manterm accke he the grace.

FiLANIUS:
How, woe him his cannot himself,
Which to all beford, and and in anforthers:
Of I'll

ISANGE:
Ah with him pencudied.

SICINIUS:
No

In [None]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.463,2.4829,5.0,"\nCow?\nsethofa poneay fo d, beor garethime way d! mele ngan tigurerde worvoury dus; ovest is ashend,\nELE sd ar.\nThe we this,\nMy hit y. ds we the:\n\n\n\nThe w,\nBIZ bin\nDOrnow hind stid,\nw ankenomyoth y.\nAtspothasso t:\nI eng l pr bor wle the s andort\nWhiease wos\nheswinghe t f thautac.\nWan y d chelitheloveld? hist iveenone aneatanghe becese w chy! heplo s purhanfom,\nAn:\n\nWhey bouru's IO:\nwathe thecD:\nMarmouere shato w?\n\nEMoss an n\nTu, w n ave helyomarsthaneachis.\n\nCHor dind-\nWgh en s meend id touk's her"
1,Self-attention,2.3838,2.397,7.0,"\nPAre wloru tohoad sirsend then ndothiouucl tladin'so ha sts\nKVIat lghions;\nAng\nTh fapr\nSsom, gr'd ut, gh by, my, esise, thet hyour uche thoteser thig' lo wo dlth harpequuhel alots ngo fmant wnier wat-waver nyo in sw hint sime nded,\nFan; wrshimous so rofen wnghatanth anth to ewind:\nWh minss\nBu, anst? hande lak, bay too whervito wedoug isth han cem. glel tidot ve ird douere,\nAw,\nAn brokexem he wem ndpur win foth ak burs arppo hat athy, nd cthay tesu foriks lou ousy thwangs-oro hur ust ulgs.\nBut an"
2,Muti-headed attention,2.4244,2.4374,12.0,"\nKBuradss kistes tangs, whag nd cof me donm isen owirot prevof, cigt be.\n\n\nSstheler wosher Bakarve's thacouy the derwshe al d d fawe dotithhan n\nAquit lirch din.\n\nWend gak I thDIoud CShereild Hesoun wyhaud ghiad swhous his.\n\n\nWicalf, lGon hecow s deprdatin thengoequik\nTaos, w.\nIOLEMEM:\nCao:\nThe, la othade thivan y bisillee,\nThey tlleemir n.\nGon, che withanedos pifoulssf os sh Iy sud\nDf fof goratecamy usou s hecou,\nTkt pieth allo mid shesens md fanetof caf an dlethars Dfnot; blle!\n\nAn y d! bvesan"
3,Feed forward,3.2804,3.3085,40.0,"\nwLsse aaiWygh t sumeer lnuurrsyihch Tthoelrerhamytgs y,baaosrnhdenacisgT Bv,aossldDaEr.nhLrhomGyt Iauu TyhumsEb miciewcar\nIhl ,oi aalhuiraeIv,w Eerhgid ilae tr,wifrdo\ntukvwrly Awreeitgtu en niaoeh dH,\nne; lO oai ua!slona:it drs,wltGu,weMteidn rhi w O,t\n ea\nadtorv arbedovp g\ny eodorWwoon\nyl Auurine'oro oarg Ooi e mo ni urf rihRn eethu\ntoo enh utm\nuityodkhh p R!ri tua \nlt,e w D :diihulilsu iWhsanotFeyahnrdh hhesbhlicC ce th,u :fwhrdaohs o,lynlWgina KIpG, toiy:,ltiu r r MyEslaCoiad\nD ifho"
4,Optimized,2.7608,2.758,46.0,"\nOL yeariefAf\nNEE I aA smime o ur ord cNe noeece le occar\nTeesrrs onuo, argour, mramol to doiuuaidn,\nA tn ynont tofoe nreitheehr it nesnitftu, for ateUot,\n\n hau chWtinok qhese ano se halo otnb't! ad ut, ea dteulsd gomir,\ne hwReyetigs he titons, lr tah mo woI iun kd.\n\nRVeOWlE fUOhNOMUD:\nEHe wkafe t ulwcweo owo me sy\nOint sle:\n\n\nOer feonotuter vsrFne th loethfe.\n hoo Reth toPMwhUas th hheos\nCeEs cotes' nhoan fer av: ticy movoto y hsneannur h, afam wee, ato! t, hAeh cdehate cawl bil, peny:\nAg dunn"


## Scaling up the model

Added dropouts to avoid nodes from overfitting

reference : Dropout : A Simple Way to Prevent Neural Networks from Overfitting

In [None]:
tf.random.set_seed(1337)
# Hyperparameters
tf.random.set_seed(1337)
# Hyperparameters
batch_size = 16 # Independent sequences to process in parallel
block_size = 32 # Maximum context length for prediction
max_iters = 5000
eval_interval = 500 # How often evaluate the loss
learning_rate = 1e-3
eval_iters = 200 # How many batches to use to compute loss
n_embed = 64
n_head = 8
n_layer = 6
dropout = 0.2

In [None]:

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size, dropout):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.head_size = head_size
###
        self.dropout = tf.keras.layers.Dropout(dropout)
    '''
    def build(self, input_shape):
      self.block_size = input_shape[-1]
      super().build(input_shape)
    '''
    def call(self, x, training = False):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        v = self.value(x) # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
###
        mask = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
        wei = tf.where(mask == 0, float('-inf'), wei)  # Mask the upper triangular part, (B,T,T)
###
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
###
        # Apply dropout if in training mode
        wei = self.dropout(wei, training=training)
###
        # Perform the weighted aggregation of the values
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''
###
    def __init__(self, num_heads, head_size,dropout):
        super().__init__()
        self.heads = [Head(head_size, dropout) for _ in range(num_heads)]
        self.projection = tf.keras.layers.Dense(num_heads * head_size)
        self.dropout = tf.keras.layers.Dropout(dropout)
###
    """
    def build(self, input_shape):
      # This method is called the first time the layer is used with an input
        self.heads = [Head(self.head_size, dropout = self.dropout) for _ in range(self.num_heads)]
        self.projection = tf.keras.layers.Dense(input_shape[-1])
    """

    def call(self, x, training =False):
        out = tf.concat([h(x, training = training) for h in self.heads], axis=-1)
        out = self.projection(out)
###
        return self.dropout(out, training=training)
###

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''

    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
###
            tf.keras.layers.Dropout(dropout),
###
        ])

    def call(self, x, training):
        return self.net(x, training = training)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head, dropout):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size, dropout) # Communication
        self.ffwd = FeedForward(n_embed, dropout) # Computation of individual tokens
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)

    def call(self, x, training = False):
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa(self.ln1(x), training = training)
        x = x + self.ffwd(self.ln2(x), training = training)
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
###
        self.blocks = [ Block(n_embed, n_head, dropout) for _ in range(n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(axis=-1)
###
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None, training = False):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position

        #Apply transformer blocks
        for block in self.blocks:
          x = block(x, training = training)

        x = self.ln_f(x) # Apply normalization
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_scaled = BigramLanguageModel(vocab_size)

In [44]:
model_train(model_scaled, 'scaled')
model_generate(model_scaled)

Step 1		 train loss 4.7066 | val loss 4.6944


KeyboardInterrupt: 

In [None]:
# Verifying whether the row sum of weights equal 1
import numpy as np
# Convert TensorFlow tensor to NumPy array
w_np = w.numpy()

# Set NumPy print options to suppress scientific notation
np.set_printoptions(suppress=True)

# Print the tensor
print(w_np.round(6))

[[1.       0.       0.       0.       0.       0.       0.       0.      ]
 [0.5      0.5      0.       0.       0.       0.       0.       0.      ]
 [0.333333 0.333333 0.333333 0.       0.       0.       0.       0.      ]
 [0.25     0.25     0.25     0.25     0.       0.       0.       0.      ]
 [0.2      0.2      0.2      0.2      0.2      0.       0.       0.      ]
 [0.166667 0.166667 0.166667 0.166667 0.166667 0.166667 0.       0.      ]
 [0.142857 0.142857 0.142857 0.142857 0.142857 0.142857 0.142857 0.      ]
 [0.125    0.125    0.125    0.125    0.125    0.125    0.125    0.125   ]]


In [None]:
q = tf.random.normal((B, T, head_size))
k = tf.random.normal((B, T, head_size))

# Calculate the weights
wei = q @ tf.transpose(k, perm=[0, 2, 1]) * (head_size ** -0.5)

In [None]:
tf.math.reduce_variance(k).numpy()

0.9939072

In [None]:
tf.math.reduce_variance(q).numpy()

0.9280848

In [None]:
tf.math.reduce_variance(wei).numpy()

0.9270358

In [None]:
x.shape

TensorShape([4, 8, 32])

In [None]:
head_size = 6
max_iters = 2000
eval_interval = 500
learning_rate = 1e-2
eval_iters = 200
n_embed = 384
batch_size = 64
block_size = 256
dropout = 0.3

n_layer = 3

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # compute attention scores ('affinities')
        wei = tf.matmul(q, k, transpose_b=True) * (C ** -0.5) # (B,T,C) @ (B,C,T ) -> (B,T,T))
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part
        wei = tf.nn.softmax(wei, axis = -1)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = tf.matmul(wei, v) # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
            tf.keras.layers.Dropout(dropout),
        ])

    def call(self, x):
        return self.net(x)


class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
        self.projection = tf.keras.layers.Dense(n_embed)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        out = self.dropout(self.projection(out))
        return out

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)

    def call(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.blocks = tf.keras.Sequential([Block(n_embed = n_embed, n_head = 4) for _ in range(n_layer)])
        self.ln_f = tf.keras.layers.LayerNormalization(axis=-1) # final layer normalization
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets = None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = sequence length = block size
        C : Channel = number of classes = vocab size
        '''
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        position_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T, C)
        x = token_emb + position_emb # (B,T,C)
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # Reshaping the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int32)
            # append sampled index to the running sequence
            idx = tf.concat([idx, idx_next], axis=1)  # (B, T+1)
        return idx

model = BigramLanguageModel(vocab_size)
logits, loss = model.call(xb, yb)
print(logits.shape)
print(loss.numpy())



optimizer = tf.keras.optimizers.Adam(learning_rate)

for step in tf.range(max_iters):
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    with tf.GradientTape() as tape:
        logits, loss = model(xb,yb)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

print(f'Final Loss: {loss.numpy()}')

print(decode(model.generate(idx=tf.zeros((1, block_size), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))


(32, 65)
4.7333155


TypeError: estimate_loss() missing 1 required positional argument: 'model'

In [None]:
print(decode(model.generate(idx=tf.zeros((1, block_size), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))


































































































































































































































































MOF:
HBRDUEENOLIA:
ININWAN:
AUROLAOLI: od the O:
ARCut-
Fo lelle h aver t rstwathit bellly poenly ll
