### * While the original source code is written in Pytorch, the below code is adapted to Tensorflow.

- GPU utilization not enabled

# 1. Preparing the tinyshakespeare text file for training

In [1]:
# Downloading tinyshakesphere for training
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 1089k  100 1089k    0     0  2455k      0 --:--:-- --:--:-- --:--:-- 2458k


In [2]:
# Inspecting the text file
with open('tinyshakespeare.txt','r') as file:
    text = file.read()
print(f'There are {len(text)} characters in the dataset')

There are 1115394 characters in the dataset


In [3]:
# Printing the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Identifying the number of unique characters contained in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Number of unique characters (including white space): {vocab_size}{''.join(chars)}")

Number of unique characters (including white space): 65
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# 2. Basic mapping between characters to integers

Tokenizing at the character-level.

More sophisticated examples of word encoding include Google's SentencePiece and OpenAI's tiktoken

In [5]:
# Assigning numbers to each characters to encode the characters to integers
ctoi = {char : num for num, char in enumerate(chars)}
encode = lambda s: [ctoi[c] for c in s]
print(encode('Shakespeare in digits'))

# Reversely, decode integers back to characters
itoc = {num : char for num, char in enumerate(chars)}
decode = lambda l : ''.join([itoc[i] for i in l])
print(decode(encode('Shakespeare in digits')))

[31, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 52, 1, 42, 47, 45, 47, 58, 57]
Shakespeare in digits


In [6]:
# Tokenizing the total text
import tensorflow as tf
data = tf.convert_to_tensor(encode(text))
print(data.shape, data.dtype)
print(data[:100])

2024-11-11 10:48:16.203136: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-11 10:48:16.731180: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(1115394,) <dtype: 'int32'>
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)


In [7]:
# Train and validation split sets, with 9:1 ratio
n = int(0.9*len(data))
data_train = data[:n]
data_test = data[n:]
print(f'Length of train data : {len(data_train)}\nLength of test data : {len(data_test)}')

Length of train data : 1003854
Length of test data : 111540


In [8]:
# Starting with block_size implementation
block_size = 8                            # Context length
print(data_train[:block_size + 1])
x = data_train[:block_size]               # Initial block-size
y = data_train[1:block_size+1]            # Next block-size
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input : {context}, Output : {target}')

tf.Tensor([18 47 56 57 58  1 15 47 58], shape=(9,), dtype=int32)
Input : [18], Output : 47
Input : [18 47], Output : 56
Input : [18 47 56], Output : 57
Input : [18 47 56 57], Output : 58
Input : [18 47 56 57 58], Output : 1
Input : [18 47 56 57 58  1], Output : 15
Input : [18 47 56 57 58  1 15], Output : 47
Input : [18 47 56 57 58  1 15 47], Output : 58


In [9]:
## To be worked on : packaging the code with script with variables for later
# Depiction of the chunk(or in here, block)-wise transformation.
# Having varied blocksize allows the algorithm to take into account the context for inference purpose

tf.random.set_seed(1337) # For reproducibility, to be sure to have consistent random number
batch_size = 4 # The number of independent sequences to train in parallel
block_size = 8 # The maximum context length for prediction

def get_batch(split):
    '''
    Function to generate a small batch of data of inputs x and targets y
    '''

    data = data_train if split == 'train' else data_test
    # Retrieving batches randomly
    ix = tf.random.uniform(shape = (batch_size,),
                          maxval = len(data) - block_size,
                          dtype = tf.int32)
    # Stacking the list of tensors
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
for batch in range(batch_size):       # Batch dimension
    for block in range(block_size):   # Time dimension
        context = xb[batch, :block+1]
        target = yb[batch, block]
        print(f'When input is {context.numpy().tolist()} the target is {target}')

inputs:
(4, 8)
tf.Tensor(
[[ 1 51 63  1 41 53 39 58]
 [39 42  0 20 47 57  1 52]
 [32 53  1 56 43 60 43 50]
 [54 39 52 63  1 54 47 43]], shape=(4, 8), dtype=int32)
targets:
(4, 8)
tf.Tensor(
[[51 63  1 41 53 39 58  6]
 [42  0 20 47 57  1 52 39]
 [53  1 56 43 60 43 50  1]
 [39 52 63  1 54 47 43 41]], shape=(4, 8), dtype=int32)
When input is [1] the target is 51
When input is [1, 51] the target is 63
When input is [1, 51, 63] the target is 1
When input is [1, 51, 63, 1] the target is 41
When input is [1, 51, 63, 1, 41] the target is 53
When input is [1, 51, 63, 1, 41, 53] the target is 39
When input is [1, 51, 63, 1, 41, 53, 39] the target is 58
When input is [1, 51, 63, 1, 41, 53, 39, 58] the target is 6
When input is [39] the target is 42
When input is [39, 42] the target is 0
When input is [39, 42, 0] the target is 20
When input is [39, 42, 0, 20] the target is 47
When input is [39, 42, 0, 20, 47] the target is 57
When input is [39, 42, 0, 20, 47, 57] the target is 1
When input is [39,

## Basic BigramModel for training

In [10]:
tf.random.set_seed(1337)
# Hyperparameters
batch_size = 16 # Independent sequences to process in parallel
block_size = 32 # Maximum context length for prediction
max_iters = 5000
eval_interval = 500 # How often evaluate the loss
learning_rate = 1e-3
eval_iters = 200 # How many batches to use to compute loss
n_embed = 64

In [11]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

gpt_results = pd.DataFrame(columns=['Model', 'Train loss', 'Val loss', 'Time (min)', 'Text'])
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text


In [12]:
class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        logits = self.token_embedding_table(idx)  # Replacing embedding to the indices

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self(idx)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_basic = BigramLanguageModel(vocab_size)

logits, loss = model_basic.call(xb, yb)
print(logits.shape)
print(loss.numpy())

print(decode(model_basic.generate(idx=tf.zeros((1, 1), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))

(32, 65)
4.1699634

sZTe-Wz-L,?hNl?Zr:r'KUFLHH:QmLboClI
oYwnqePrE
!zgz'U:,?ZgzxEjItfpzARjGjM&vv.;OBdqFlP qxcwcexWhPKt:$'


### Creating an optimizer, and training the model

In [13]:
import time

def estimate_loss(model):
  '''
  Function to average up the loss in multiple batches for both splits
  '''
  output = {}
  model.training = False # Setting the model to evaluation phase
  for split in ['train','val']:
      losses = []
      for _ in range(eval_iters):
          X, Y = get_batch(split)
          logits, loss = model.call(X,Y)
          losses.append(loss)
      output[split] = tf.reduce_mean(losses)
  model.training = True # Setting the model back to training phase
  return output

def model_train(model, label):
    start_train = time.time()
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    for step in tf.range(1, max_iters+1):
        if (step % eval_interval == 0) or (step == 1):
            losses = estimate_loss(model)
            if step != 1:
                end_int = time.time()
                print(f"Step {step}\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f} | time {(end_int-start_int)//60:.0f} min {(end_int-start_int)%60:.0f} seconds")
                start_int = time.time()
            else:
                print(f"Step {step}\t\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f}")
                start_int = time.time()


        # Sample a batch of data
        xb, yb = get_batch('train')


        # Evaluate the loss and update parameters
        with tf.GradientTape() as tape:
            logits, loss = model(xb,yb)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    end_train = time.time()

    # Save result for comparison
    global gpt_results
    gpt_results = pd.concat([gpt_results,pd.DataFrame({'Model': label,
                                                       'Train loss': [round(losses['train'].numpy(),4)],
                                                       'Val loss': [round(losses['val'].numpy(),4)],
                                                       'Time (min)' : [round((end_train-start_train)/60,0)],
                                                       'Text':''})], ignore_index = True)
    print(f'Final Loss: {loss.numpy()}')

def model_generate(model):
    # Generate a sequence
    print('\n======================= Generated Sequence =======================')
    idx = tf.zeros((1, 1), dtype=tf.int32)
    generated_sequence = model.generate(idx, max_new_tokens=500).numpy()
    # Save result for comparison
    gpt_results.iloc[-1,4] = decode(generated_sequence[0].tolist())
    print(decode(generated_sequence[0].tolist()))


In [14]:
model_train(model_basic, 'Basic')
model_generate(model_basic)

Step 1		 train loss 4.1784 | val loss 4.1784
Step 500	 train loss 3.0880 | val loss 3.0962 | time 0 min 14 seconds
Step 1000	 train loss 2.7332 | val loss 2.7415 | time 0 min 15 seconds
Step 1500	 train loss 2.5982 | val loss 2.6126 | time 0 min 15 seconds
Step 2000	 train loss 2.5462 | val loss 2.5528 | time 0 min 15 seconds
Step 2500	 train loss 2.5118 | val loss 2.5230 | time 0 min 14 seconds
Step 3000	 train loss 2.4940 | val loss 2.5059 | time 0 min 14 seconds
Step 3500	 train loss 2.4791 | val loss 2.4966 | time 0 min 15 seconds
Step 4000	 train loss 2.4759 | val loss 2.4907 | time 0 min 18 seconds
Step 4500	 train loss 2.4694 | val loss 2.4923 | time 0 min 18 seconds
Step 5000	 train loss 2.4714 | val loss 2.4881 | time 0 min 18 seconds
Final Loss: 2.486729621887207


Sallenl!
ND:
Hico ixcquthed be sis Gemat wse whe atahe ase MExme gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,
S: bjworthe t kechatre?

T:
THAngobe hed, f

In [15]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4714,2.4881,3.0,"\nSallenl!\nND:\nHico ixcquthed be sis Gemat wse whe atahe ase MExme gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,\nS: bjworthe t kechatre?\n\nT:\nTHAngobe hed, fithestha!Mour s t thes\nO y.\nMIqunerpe themo vee hilepayoouatheadingick suie t y o'te mond y ithiury s.\nMIn t ss:\nCLAGorvizeerouth sspaugh's cher s he mnd.\nLINTI, cofey with he.\nCEThe ar co t y isit h sisged, be:\n\nCand, re me, myepe mphorferg.\nHand thes hiothergenche avugo, pld hte ive"


## The mathematical trick in self-attention
Below present different ways of calculating weighted aggregation of a matrix, from beginning of the block in each batch, up to the 't'th token. The results of the four approaches are the same

In [16]:
# Tokens learning from previous context, by calculating average up to 't'th token
B,T,C = 4,8,2 # Batch, Time, Channels
x = tf.random.uniform(shape=(B, T,C))

### Version 1: Basic

In [17]:
xbow = tf.zeros((B,T,C)) # Defining a bag of words
for b in range (B):
    for t in range (T):
        xprev = x[b, :t+1] # (t, C) Batch, including the 't'th token
        xbow = xbow.numpy()  # Convert xbow to numpy array to support assignment
        xbow[b, t] = tf.reduce_mean(xprev, axis=0).numpy()  # Calculate mean and assign to xbow
        xbow = tf.convert_to_tensor(xbow)  # Convert back to tensor

### Version 2: Vectorizing


In [18]:
w = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = w / tf.math.reduce_sum(w, axis = 1, keepdims = True) # Low triangular matrix for calculating average weights

xbow2 = w @ x # (B, T, T) @ (B , T, C) --> (B, T, C)
tf.experimental.numpy.allclose(xbow,xbow2).numpy() # Checking whether xbow == xbow2

True

### Version 3: Using softmax

In [19]:
tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.zeros((T,T))
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix
xbow3 = w @ x
tf.experimental.numpy.allclose(xbow,xbow3).numpy()

True

### Version 4: Self-attention

Called self-attention as the key, query and value are generated from the same value (x)

Note that key and query weights values are different as

In [20]:
# Attention mechanism
head_size = 16
key = tf.keras.layers.Dense(units=head_size, use_bias=False)
query = tf.keras.layers.Dense(units=head_size, use_bias=False)
value = tf.keras.layers.Dense(units=head_size, use_bias=False)
k = key(x) # Weights adjusted, (B, T, 16)
q = query(x) # (B, T, 16)
w = q @ tf.transpose(k, perm=[0,2,1]) # (B, T, 16) @ (B, 16, T) -> (B, T, T), with (T, T) indicating elements compared with every element in the sequence

tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix

v = value(x)
out = w @ v # Using aggregated value instead of the raw x for dimensionality reduction, information extraction
out.shape

TensorShape([4, 8, 16])

Notes:
- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over as a set of vectors. This is why we need to positionally encode tokens
- Each example across batch dimension is of course processed completely independently and never 'talk' to each other
- In an 'encoder' attention block (w = tf.where(tril == 0, float('-inf'), w))code can be omitted, allowing all tokens to communicate. This block here is called a 'decoder' attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- 'Self attention' just means that the keys and values are produced from the same source as queries. In 'Cross-attention', the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- 'Scaled' attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q, K are unit variance, wei will be unit variance too and softmax will stay diffuses and not saturate too much, Illustration below

## Modified BigramModel with self-attention

In [21]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_sa = BigramLanguageModel(vocab_size)

In [22]:
model_train(model_sa,'Self-attention')
model_generate(model_sa)

Step 1		 train loss 4.1749 | val loss 4.1744
Step 500	 train loss 2.6346 | val loss 2.6350 | time 0 min 37 seconds
Step 1000	 train loss 2.5213 | val loss 2.5117 | time 0 min 37 seconds
Step 1500	 train loss 2.4785 | val loss 2.4700 | time 0 min 38 seconds
Step 2000	 train loss 2.4462 | val loss 2.4521 | time 0 min 38 seconds
Step 2500	 train loss 2.4086 | val loss 2.4252 | time 0 min 37 seconds
Step 3000	 train loss 2.3701 | val loss 2.3821 | time 0 min 37 seconds
Step 3500	 train loss 2.3501 | val loss 2.3608 | time 0 min 36 seconds
Step 4000	 train loss 2.3471 | val loss 2.3630 | time 0 min 36 seconds
Step 4500	 train loss 2.3364 | val loss 2.3567 | time 0 min 36 seconds
Step 5000	 train loss 2.3210 | val loss 2.3558 | time 0 min 37 seconds
Final Loss: 2.230522632598877


As fis brearimem,
Andig; jyequurdoror uts theigout thovetre odes bet sa:
Ss dre; bo wiutrer blik ar fareats; aly tm, tak lse himaw? fourr Alor, oth yof ss defe mor pan to orbat the wreren alinoer lat sthes, owf the

## Multi-headed attention

In [23]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out
# ================================================================== #
class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out
# ================================================================== #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
# ================================================================== #
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# ================================================================== #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ma = BigramLanguageModel(vocab_size)

In [24]:
model_train(model_ma, 'Muti-headed attention')
model_generate(model_ma)

Step 1		 train loss 4.1745 | val loss 4.1748
Step 500	 train loss 2.6143 | val loss 2.6080 | time 1 min 6 seconds
Step 1000	 train loss 2.5219 | val loss 2.5132 | time 1 min 5 seconds
Step 1500	 train loss 2.4054 | val loss 2.4106 | time 1 min 3 seconds
Step 2000	 train loss 2.3369 | val loss 2.3360 | time 1 min 4 seconds
Step 2500	 train loss 2.2828 | val loss 2.2911 | time 1 min 5 seconds
Step 3000	 train loss 2.2402 | val loss 2.2576 | time 1 min 5 seconds
Step 3500	 train loss 2.2043 | val loss 2.2275 | time 1 min 3 seconds
Step 4000	 train loss 2.1772 | val loss 2.2071 | time 1 min 3 seconds
Step 4500	 train loss 2.1611 | val loss 2.2029 | time 1 min 5 seconds
Step 5000	 train loss 2.1403 | val loss 2.1911 | time 1 min 3 seconds
Final Loss: 2.2562670707702637


Nere the of stue?


FLO:
And.
I
BLE:
San. Hit youlkenst pray hat yourt; pawe thime Lor istr I no lance,
Got No Goversat for colionen have stwarmses ary houspe sey thee frand hou
aing gonefor done hree unt gungoowout EP:
Hem

## Feed Forward

In [25]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out

# [==================================================================
class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(n_embed),
            tf.keras.layers.ReLU(),
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens

    def call(self, x):
        x = self.sa_head(x)
        x = self.ffwd(x)
        return x
# ==================================================================] #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ff = BigramLanguageModel(vocab_size)

In [26]:
model_train(model_ff, 'Feed forward')
model_generate(model_ff)

Step 1		 train loss 4.1731 | val loss 4.1730
Step 500	 train loss 3.3116 | val loss 3.3521 | time 2 min 57 seconds
Step 1000	 train loss 2.9793 | val loss 2.9716 | time 2 min 54 seconds
Step 1500	 train loss 2.5899 | val loss 2.5900 | time 2 min 54 seconds
Step 2000	 train loss 2.5204 | val loss 2.5378 | time 2 min 55 seconds
Step 2500	 train loss 2.4490 | val loss 2.4483 | time 2 min 57 seconds
Step 3000	 train loss 2.4025 | val loss 2.4139 | time 2 min 43 seconds
Step 3500	 train loss 2.3835 | val loss 2.4003 | time 2 min 43 seconds
Step 4000	 train loss 2.3400 | val loss 2.3588 | time 2 min 42 seconds
Step 4500	 train loss 2.3039 | val loss 2.3284 | time 2 min 42 seconds
Step 5000	 train loss 2.2665 | val loss 2.2985 | time 2 min 39 seconds
Final Loss: 2.2262234687805176


I wand dectrit.
Vakef:
Hoceomtg  m olam !eu
ngKkthBtsh aer,ntF hgr uhtaeh hleirh ewee beElyeshe c
rnfh Ha agcw o r  oowouhis r Wiro,, i,if nchorB emfuse n'c Aewr
.leBInmwIIna
 ns ene  Imi minnus'elu i fm epymafl
t

## Optimization

1) residual

2) pre-layer norm (different from the original paper)>> make more series 3

In [27]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.projection = tf.keras.layers.Dense(n_embed)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
        out = self.projection(out)
# +++++++++++++++++++++++++++++++++++++++++++++++++++
        return out

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)
# ++++++++++

    def call(self, x):
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            tf.keras.layers.LayerNormalization(axis=-1),
            ])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_opt = BigramLanguageModel(vocab_size)

In [28]:
model_train(model_opt, 'Optimized')
model_generate(model_opt)

Step 1		 train loss 4.5362 | val loss 4.5263
Step 500	 train loss 2.4609 | val loss 2.4650 | time 3 min 34 seconds
Step 1000	 train loss 2.3033 | val loss 2.3174 | time 3 min 36 seconds
Step 1500	 train loss 2.1538 | val loss 2.2038 | time 3 min 34 seconds
Step 2000	 train loss 2.0754 | val loss 2.1336 | time 3 min 36 seconds
Step 2500	 train loss 2.0034 | val loss 2.0617 | time 3 min 36 seconds
Step 3000	 train loss 1.9390 | val loss 2.0453 | time 3 min 34 seconds
Step 3500	 train loss 1.8942 | val loss 2.0127 | time 3 min 36 seconds
Step 4000	 train loss 1.8653 | val loss 1.9750 | time 3 min 36 seconds
Step 4500	 train loss 1.8130 | val loss 1.9529 | time 3 min 35 seconds
Step 5000	 train loss 1.7908 | val loss 1.9291 | time 3 min 34 seconds
Final Loss: 1.7931586503982544


GFro doughter's wellowner citines bose think.

APZITWERD:
You she ware a down, is atteroug in promoare astwasped torrow alf he gaward;
To sin, 'porcred sleake fe weelce dafer,
It, nave une leaster byy honour,
Buto

In [29]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4714,2.4881,3.0,"\nSallenl!\nND:\nHico ixcquthed be sis Gemat wse whe atahe ase MExme gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,\nS: bjworthe t kechatre?\n\nT:\nTHAngobe hed, fithestha!Mour s t thes\nO y.\nMIqunerpe themo vee hilepayoouatheadingick suie t y o'te mond y ithiury s.\nMIn t ss:\nCLAGorvizeerouth sspaugh's cher s he mnd.\nLINTI, cofey with he.\nCEThe ar co t y isit h sisged, be:\n\nCand, re me, myepe mphorferg.\nHand thes hiothergenche avugo, pld hte ive"
1,Self-attention,2.321,2.3558,6.0,"\nAs fis brearimem,\nAndig; jyequurdoror uts theigout thovetre odes bet sa:\nSs dre; bo wiutrer blik ar fareats; aly tm, tak lse himaw? fourr Alor, oth yof ss defe mor pan to orbat the wreren alinoer lat sthes, owf theafr ary neverased.\n\nWhaveneer we esat anga'd dup. Diser: mipat thy: alepopoas bus bllet.\n\nGIONGLOLIANGES: thes tourte tim, opo\nSine.\n\nESIL:\nGou toupreuns, wavely hives thes mitoun vera borl!\nWant dstu threwim a hay see,\nYous ot gato marits:\nMat that thre mein, tied Sincarl is athe lot"
2,Muti-headed attention,2.1403,2.1911,11.0,"\nNere the of stue?\n\n\nFLO:\nAnd.\nI\nBLE:\nSan. Hit youlkenst pray hat yourt; pawe thime Lor istr I no lance,\nGot No Goversat for colionen have stwarmses ary houspe sey thee frand hou\naing gonefor done hree unt gungoowout EP:\nHemy, veles whe nobe godeframdes thece pcail basne you to Fist to owe; by derrceald ingler hou man tid hom fold:\nDo his comine Edossiet haven inke mun's he weatin hat hem usw, wissulle ble Lonet.\nCALIF ancom sande, her la's ouch sots apr to fyoulion'd ar entay, sof what vott bet"
3,Feed forward,2.2665,2.2985,29.0,"\nI wand dectrit.\nVakef:\nHoceomtg m olam !eu\nngKkthBtsh aer,ntF hgr uhtaeh hleirh ewee beElyeshe c\nrnfh Ha agcw o r oowouhis r Wiro,, i,if nchorB emfuse n'c Aewr\n.leBInmwIIna\n ns ene Imi minnus'elu i fm epymafl\nta n aeh yn a nrkieywtc doae, n notess gIhr i mn eto,nt\nweAEnaFrphHheo suynyomr y nrhsomtmus t ogti r.tt\na Dawteupe!n \ndrTeaol y g ntps uIuuLa.r,A\n NMoGEsni\nCgeMu-e?noflr h l ocatr mm \n?m Wheho' geaeed w Wthwuc natuuttmo strsyea-tgEvuT ehieee 'awuh . hoei acr mrh een\no BTmeo"
4,Optimized,1.7908,1.9291,36.0,"\nGFro doughter's wellowner citines bose think.\n\nAPZITWERD:\nYou she ware a down, is atteroug in promoare astwasped torrow alf he gaward;\nTo sin, 'porcred sleake fe weelce dafer,\nIt, nave une leaster byy honour,\nButo, wishapfuchip all midech nown she prike of posts the an outhm forth,\nMay or him.\n\nKING HENRIZEBUCHETE:\nThen mork, you morning have hollg! my forry, sigkeds?\n\nKING EDwARD CED:\nPealours maktators the got dity reave all amk it from it cickes;\nBod beached, foul my one my lindiby gliever in"


## Scaling up the model

Added dropouts to avoid nodes from overfitting

reference : Dropout : A Simple Way to Prevent Neural Networks from Overfitting

In [30]:
tf.random.set_seed(1337)
# Additional hyperparameters
n_head = 2
n_layer = 2
dropout = 0.4

In [31]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size, dropout):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.head_size = head_size
###
        self.dropout = tf.keras.layers.Dropout(dropout)
    '''
    def build(self, input_shape):
      self.block_size = input_shape[-1]
      super().build(input_shape)
    '''
    def call(self, x, training = False):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        v = self.value(x) # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
###
        mask = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
        wei = tf.where(mask == 0, float('-inf'), wei)  # Mask the upper triangular part, (B,T,T)
###
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
###
        # Apply dropout if in training mode
        wei = self.dropout(wei, training=training)
###
        # Perform the weighted aggregation of the values
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''
###
    def __init__(self, num_heads, head_size,dropout):
        super().__init__()
        self.heads = [Head(head_size, dropout) for _ in range(num_heads)]
        self.projection = tf.keras.layers.Dense(num_heads * head_size)
        self.dropout = tf.keras.layers.Dropout(dropout)
###
    """
    def build(self, input_shape):
      # This method is called the first time the layer is used with an input
        self.heads = [Head(self.head_size, dropout = self.dropout) for _ in range(self.num_heads)]
        self.projection = tf.keras.layers.Dense(input_shape[-1])
    """

    def call(self, x, training =False):
        out = tf.concat([h(x, training = training) for h in self.heads], axis=-1)
        out = self.projection(out)
###
        return self.dropout(out, training=training)
###

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''

    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
###
            tf.keras.layers.Dropout(dropout),
###
        ])

    def call(self, x, training):
        return self.net(x, training = training)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head, dropout):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size, dropout) # Communication
        self.ffwd = FeedForward(n_embed, dropout) # Computation of individual tokens
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)

    def call(self, x, training = False):
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa(self.ln1(x), training = training)
        x = x + self.ffwd(self.ln2(x), training = training)
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
###
        self.blocks = [ Block(n_embed, n_head, dropout) for _ in range(n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(axis=-1)
###
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None, training = False):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position

        #Apply transformer blocks
        for block in self.blocks:
          x = block(x, training = training)

        x = self.ln_f(x) # Apply normalization
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_scaled = BigramLanguageModel(vocab_size)

In [32]:
model_train(model_scaled, 'Scaled')
model_generate(model_scaled)

Step 1		 train loss 4.4653 | val loss 4.4661
Step 500	 train loss 2.2485 | val loss 2.2778 | time 2 min 18 seconds
Step 1000	 train loss 2.0248 | val loss 2.0943 | time 2 min 16 seconds
Step 1500	 train loss 1.9552 | val loss 2.0417 | time 2 min 14 seconds
Step 2000	 train loss 1.8717 | val loss 1.9811 | time 2 min 12 seconds
Step 2500	 train loss 1.8048 | val loss 1.9479 | time 2 min 8 seconds
Step 3000	 train loss 1.7740 | val loss 1.9201 | time 2 min 9 seconds
Step 3500	 train loss 1.7458 | val loss 1.9009 | time 2 min 10 seconds
Step 4000	 train loss 1.7433 | val loss 1.9008 | time 2 min 10 seconds
Step 4500	 train loss 1.7133 | val loss 1.8678 | time 2 min 9 seconds
Step 5000	 train loss 1.7039 | val loss 1.8693 | time 2 min 7 seconds
Final Loss: 1.8202266693115234


Whittorth the lie, whenttle; to to madly's of sigent
For Clocks, or revedly trong seevess see;
Than-no slot ble all scives up.

CAMILLO:
Shicty eyes briegain.

Dlufforbe Lade queen tuntay that of Corious tito ford nis

In [33]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4714,2.4881,3.0,"\nSallenl!\nND:\nHico ixcquthed be sis Gemat wse whe atahe ase MExme gube aced prenchow, avere ithaiass itooupur deried we t then arme, aceat elllave thiny MAreng url unubak,\nS: bjworthe t kechatre?\n\nT:\nTHAngobe hed, fithestha!Mour s t thes\nO y.\nMIqunerpe themo vee hilepayoouatheadingick suie t y o'te mond y ithiury s.\nMIn t ss:\nCLAGorvizeerouth sspaugh's cher s he mnd.\nLINTI, cofey with he.\nCEThe ar co t y isit h sisged, be:\n\nCand, re me, myepe mphorferg.\nHand thes hiothergenche avugo, pld hte ive"
1,Self-attention,2.321,2.3558,6.0,"\nAs fis brearimem,\nAndig; jyequurdoror uts theigout thovetre odes bet sa:\nSs dre; bo wiutrer blik ar fareats; aly tm, tak lse himaw? fourr Alor, oth yof ss defe mor pan to orbat the wreren alinoer lat sthes, owf theafr ary neverased.\n\nWhaveneer we esat anga'd dup. Diser: mipat thy: alepopoas bus bllet.\n\nGIONGLOLIANGES: thes tourte tim, opo\nSine.\n\nESIL:\nGou toupreuns, wavely hives thes mitoun vera borl!\nWant dstu threwim a hay see,\nYous ot gato marits:\nMat that thre mein, tied Sincarl is athe lot"
2,Muti-headed attention,2.1403,2.1911,11.0,"\nNere the of stue?\n\n\nFLO:\nAnd.\nI\nBLE:\nSan. Hit youlkenst pray hat yourt; pawe thime Lor istr I no lance,\nGot No Goversat for colionen have stwarmses ary houspe sey thee frand hou\naing gonefor done hree unt gungoowout EP:\nHemy, veles whe nobe godeframdes thece pcail basne you to Fist to owe; by derrceald ingler hou man tid hom fold:\nDo his comine Edossiet haven inke mun's he weatin hat hem usw, wissulle ble Lonet.\nCALIF ancom sande, her la's ouch sots apr to fyoulion'd ar entay, sof what vott bet"
3,Feed forward,2.2665,2.2985,29.0,"\nI wand dectrit.\nVakef:\nHoceomtg m olam !eu\nngKkthBtsh aer,ntF hgr uhtaeh hleirh ewee beElyeshe c\nrnfh Ha agcw o r oowouhis r Wiro,, i,if nchorB emfuse n'c Aewr\n.leBInmwIIna\n ns ene Imi minnus'elu i fm epymafl\nta n aeh yn a nrkieywtc doae, n notess gIhr i mn eto,nt\nweAEnaFrphHheo suynyomr y nrhsomtmus t ogti r.tt\na Dawteupe!n \ndrTeaol y g ntps uIuuLa.r,A\n NMoGEsni\nCgeMu-e?noflr h l ocatr mm \n?m Wheho' geaeed w Wthwuc natuuttmo strsyea-tgEvuT ehieee 'awuh . hoei acr mrh een\no BTmeo"
4,Optimized,1.7908,1.9291,36.0,"\nGFro doughter's wellowner citines bose think.\n\nAPZITWERD:\nYou she ware a down, is atteroug in promoare astwasped torrow alf he gaward;\nTo sin, 'porcred sleake fe weelce dafer,\nIt, nave une leaster byy honour,\nButo, wishapfuchip all midech nown she prike of posts the an outhm forth,\nMay or him.\n\nKING HENRIZEBUCHETE:\nThen mork, you morning have hollg! my forry, sigkeds?\n\nKING EDwARD CED:\nPealours maktators the got dity reave all amk it from it cickes;\nBod beached, foul my one my lindiby gliever in"
5,Scaled,1.7039,1.8693,22.0,"\nWhittorth the lie, whenttle; to to madly's of sigent\nFor Clocks, or revedly trong seevess see;\nThan-no slot ble all scives up.\n\nCAMILLO:\nShicty eyes briegain.\n\nDlufforbe Lade queen tuntay that of Corious tito ford nish througe on;\nNe could to unator:'\nA virt, or thy be do weeth-up, then may, he sads but be help,\nTo love Trump'd thy graces. O, sirous to pray-heath sevell the but say swech;\nYou what is well, why sill well.\n\nQUEEN ELIlkelous I to the reciou same on on.\nCome every with hear, the com"
