### * While the original source code is written in Pytorch, the below code is adapted to Tensorflow.

- GPU utilization not enabled

# 1. Preparing the tinyshakespeare text file for training

In [1]:
# Downloading tinyshakesphere for training
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  3571k      0 --:--:-- --:--:-- --:--:-- 3583k


In [2]:
# Inspecting the text file
with open('tinyshakespeare.txt','r') as file:
    text = file.read()
print(f'There are {len(text)} characters in the dataset')

There are 1115394 characters in the dataset


In [3]:
# Printing the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Identifying the number of unique characters contained in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Number of unique characters (including white space): {vocab_size}{''.join(chars)}")

Number of unique characters (including white space): 65
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# 2. Basic mapping between characters to integers

Tokenizing at the character-level.

More sophisticated examples of word encoding include Google's SentencePiece and OpenAI's tiktoken

In [5]:
# Assigning numbers to each characters to encode the characters to integers
ctoi = {char : num for num, char in enumerate(chars)}
encode = lambda s: [ctoi[c] for c in s]
print(encode('Shakespeare in digits'))

# Reversely, decode integers back to characters
itoc = {num : char for num, char in enumerate(chars)}
decode = lambda l : ''.join([itoc[i] for i in l])
print(decode(encode('Shakespeare in digits')))

[31, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 52, 1, 42, 47, 45, 47, 58, 57]
Shakespeare in digits


In [6]:
# Tokenizing the total text
import tensorflow as tf
data = tf.convert_to_tensor(encode(text))
print(data.shape, data.dtype)
print(data[:100])

2024-11-10 18:09:29.470450: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-10 18:09:29.514064: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(1115394,) <dtype: 'int32'>
tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59], shape=(100,), dtype=int32)


In [7]:
# Train and validation split sets, with 9:1 ratio
n = int(0.9*len(data))
data_train = data[:n]
data_test = data[n:]
print(f'Length of train data : {len(data_train)}\nLength of test data : {len(data_test)}')

Length of train data : 1003854
Length of test data : 111540


In [8]:
# Starting with block_size implementation
block_size = 8                            # Context length
print(data_train[:block_size + 1])
x = data_train[:block_size]               # Initial block-size
y = data_train[1:block_size+1]            # Next block-size
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'Input : {context}, Output : {target}')

tf.Tensor([18 47 56 57 58  1 15 47 58], shape=(9,), dtype=int32)
Input : [18], Output : 47
Input : [18 47], Output : 56
Input : [18 47 56], Output : 57
Input : [18 47 56 57], Output : 58
Input : [18 47 56 57 58], Output : 1
Input : [18 47 56 57 58  1], Output : 15
Input : [18 47 56 57 58  1 15], Output : 47
Input : [18 47 56 57 58  1 15 47], Output : 58


In [9]:
## To be worked on : packaging the code with script with variables for later
# Depiction of the chunk(or in here, block)-wise transformation.
# Having varied blocksize allows the algorithm to take into account the context for inference purpose

tf.random.set_seed(1337) # For reproducibility, to be sure to have consistent random number
batch_size = 4 # The number of independent sequences to train in parallel
block_size = 8 # The maximum context length for prediction

def get_batch(split):
    '''
    Function to generate a small batch of data of inputs x and targets y
    '''

    data = data_train if split == 'train' else data_test
    # Retrieving batches randomly
    ix = tf.random.uniform(shape = (batch_size,),
                          maxval = len(data) - block_size,
                          dtype = tf.int32)
    # Stacking the list of tensors
    x = tf.stack([data[i:i+block_size] for i in ix])
    y = tf.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
for batch in range(batch_size):       # Batch dimension
    for block in range(block_size):   # Time dimension
        context = xb[batch, :block+1]
        target = yb[batch, block]
        print(f'When input is {context.numpy().tolist()} the target is {target}')

inputs:
(4, 8)
tf.Tensor(
[[ 1 51 63  1 41 53 39 58]
 [39 42  0 20 47 57  1 52]
 [32 53  1 56 43 60 43 50]
 [54 39 52 63  1 54 47 43]], shape=(4, 8), dtype=int32)
targets:
(4, 8)
tf.Tensor(
[[51 63  1 41 53 39 58  6]
 [42  0 20 47 57  1 52 39]
 [53  1 56 43 60 43 50  1]
 [39 52 63  1 54 47 43 41]], shape=(4, 8), dtype=int32)
When input is [1] the target is 51
When input is [1, 51] the target is 63
When input is [1, 51, 63] the target is 1
When input is [1, 51, 63, 1] the target is 41
When input is [1, 51, 63, 1, 41] the target is 53
When input is [1, 51, 63, 1, 41, 53] the target is 39
When input is [1, 51, 63, 1, 41, 53, 39] the target is 58
When input is [1, 51, 63, 1, 41, 53, 39, 58] the target is 6
When input is [39] the target is 42
When input is [39, 42] the target is 0
When input is [39, 42, 0] the target is 20
When input is [39, 42, 0, 20] the target is 47
When input is [39, 42, 0, 20, 47] the target is 57
When input is [39, 42, 0, 20, 47, 57] the target is 1
When input is [39,

## Basic BigramModel for training

In [23]:
tf.random.set_seed(1337)
# Hyperparameters
batch_size = 16 # Independent sequences to process in parallel
block_size = 32 # Maximum context length for prediction
max_iters = 4000
eval_interval = 200 # How often evaluate the loss
learning_rate = 1e-3
eval_iters = 200 # How many batches to use to compute loss
n_embed = 64

In [24]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

gpt_results = pd.DataFrame(columns=['Model', 'Train loss', 'Val loss', 'Time (min)', 'Text'])
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text


In [25]:
class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        logits = self.token_embedding_table(idx)  # Replacing embedding to the indices

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Get the predictions
            logits, loss = self(idx)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_basic = BigramLanguageModel(vocab_size)

logits, loss = model_basic.call(xb, yb)
print(logits.shape)
print(loss.numpy())

print(decode(model_basic.generate(idx=tf.zeros((1, 1), dtype=tf.int32), max_new_tokens=100)[0].numpy().tolist()))

(32, 65)
4.1767507

sZUf-Xz-K,?hNk;Yr:r'LUFLHH:QlLbpClI
oYwnqeOrE
!zgz'U:,?ZhzxEjItgpzAQjGjM&vv.;OBdqFlQ pxcwcexWhPKs:$&


### Creating an optimizer, and training the model

In [26]:
import time

def estimate_loss(model):
  '''
  Function to average up the loss in multiple batches for both splits
  '''
  output = {}
  model.training = False # Setting the model to evaluation phase
  for split in ['train','val']:
      losses = []
      for _ in range(eval_iters):
          X, Y = get_batch(split)
          logits, loss = model.call(X,Y)
          losses.append(loss)
      output[split] = tf.reduce_mean(losses)
  model.training = True # Setting the model back to training phase
  return output

def model_train(model, label):
    start_train = time.time()
    optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    for step in tf.range(1, max_iters+1):
        if (step % eval_interval == 0) or (step == 1):
            losses = estimate_loss(model)
            if step != 1:
                end_int = time.time()
                print(f"Step {step}\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f} | time {(end_int-start_int)//60:.0f} min {(end_int-start_int)%60:.0f} seconds")
                start_int = time.time()
            else:
                print(f"Step {step}\t\t train loss {losses['train']:.4f} | val loss {losses['val']:.4f}")
                start_int = time.time()


        # Sample a batch of data
        xb, yb = get_batch('train')


        # Evaluate the loss and update parameters
        with tf.GradientTape() as tape:
            logits, loss = model(xb,yb)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    end_train = time.time()

    # Save result for comparison
    global gpt_results
    gpt_results = pd.concat([gpt_results,pd.DataFrame({'Model': label,
                                                       'Train loss': [round(losses['train'].numpy(),4)],
                                                       'Val loss': [round(losses['val'].numpy(),4)],
                                                       'Time (min)' : [round((end_train-start_train)/60,0)],
                                                       'Text':''})], ignore_index = True)
    print(f'Final Loss: {loss.numpy()}')

def model_generate(model):
    # Generate a sequence
    print('\n======================= Generated Sequence =======================')
    idx = tf.zeros((1, 1), dtype=tf.int32)
    generated_sequence = model.generate(idx, max_new_tokens=500).numpy()
    # Save result for comparison
    gpt_results.iloc[-1,4] = decode(generated_sequence[0].tolist())
    print(decode(generated_sequence[0].tolist()))


In [27]:
model_train(model_basic, 'Basic')
model_generate(model_basic)

Step 1		 train loss 4.1765 | val loss 4.1764
Step 200	 train loss 3.5837 | val loss 3.5889 | time 0 min 13 seconds
Step 400	 train loss 3.2158 | val loss 3.2263 | time 0 min 13 seconds
Step 600	 train loss 2.9821 | val loss 2.9912 | time 0 min 13 seconds
Step 800	 train loss 2.8323 | val loss 2.8417 | time 0 min 13 seconds
Step 1000	 train loss 2.7273 | val loss 2.7366 | time 0 min 13 seconds
Step 1200	 train loss 2.6691 | val loss 2.6733 | time 0 min 13 seconds
Step 1400	 train loss 2.6253 | val loss 2.6309 | time 0 min 13 seconds
Step 1600	 train loss 2.5864 | val loss 2.5955 | time 0 min 13 seconds
Step 1800	 train loss 2.5622 | val loss 2.5695 | time 0 min 13 seconds
Step 2000	 train loss 2.5446 | val loss 2.5544 | time 0 min 13 seconds
Step 2200	 train loss 2.5260 | val loss 2.5383 | time 0 min 12 seconds
Step 2400	 train loss 2.5186 | val loss 2.5269 | time 0 min 12 seconds
Step 2600	 train loss 2.5108 | val loss 2.5247 | time 0 min 12 seconds
Step 2800	 train loss 2.5060 | val l

In [28]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4711,2.4878,4.0,"\nD g'sou\nG mese;\nWe RWAmey stan fogas Gis w,\nTand,parit amae ghe!nd acoracore.\nS:\nAY s hend y VI ll t t ace ben;\nWhee t thapoush, tes flyogen qhe\nThare is-fe wau, f way s, rcoooua d ik; as ist theaxe onthiteereatlalit; tey, d t hyXjund thice te oak nga as irn blon, m n n\n\n\n\nWhe n.\n\nRDNCI thoungot y s, hean y ILTI wiach nd t un lled d abethal t the tie dacovey th sel'ty tind gu figbndstarthedire and withes ousthad besthind w st s wrmony, utyoor gyXRCEROL gacind t lathey f ws he wthbed mopowesouth"


## The mathematical trick in self-attention
Below present different ways of calculating weighted aggregation of a matrix, from beginning of the block in each batch, up to the 't'th token. The results of the four approaches are the same

In [29]:
# Tokens learning from previous context, by calculating average up to 't'th token
B,T,C = 4,8,2 # Batch, Time, Channels
x = tf.random.uniform(shape=(B, T,C))

### Version 1: Basic

In [30]:
xbow = tf.zeros((B,T,C)) # Defining a bag of words
for b in range (B):
    for t in range (T):
        xprev = x[b, :t+1] # (t, C) Batch, including the 't'th token
        xbow = xbow.numpy()  # Convert xbow to numpy array to support assignment
        xbow[b, t] = tf.reduce_mean(xprev, axis=0).numpy()  # Calculate mean and assign to xbow
        xbow = tf.convert_to_tensor(xbow)  # Convert back to tensor

### Version 2: Vectorizing


In [31]:
w = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = w / tf.math.reduce_sum(w, axis = 1, keepdims = True) # Low triangular matrix for calculating average weights

xbow2 = w @ x # (B, T, T) @ (B , T, C) --> (B, T, C)
tf.experimental.numpy.allclose(xbow,xbow2).numpy() # Checking whether xbow == xbow2

True

### Version 3: Using softmax

In [32]:
tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.zeros((T,T))
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix
xbow3 = w @ x
tf.experimental.numpy.allclose(xbow,xbow3).numpy()

True

### Version 4: Self-attention

Called self-attention as the key, query and value are generated from the same value (x)

Note that key and query weights values are different as

In [33]:
# Attention mechanism
head_size = 16
key = tf.keras.layers.Dense(units=head_size, use_bias=False)
query = tf.keras.layers.Dense(units=head_size, use_bias=False)
value = tf.keras.layers.Dense(units=head_size, use_bias=False)
k = key(x) # Weights adjusted, (B, T, 16)
q = query(x) # (B, T, 16)
w = q @ tf.transpose(k, perm=[0,2,1]) # (B, T, 16) @ (B, 16, T) -> (B, T, T), with (T, T) indicating elements compared with every element in the sequence

tril = tf.linalg.band_part(tf.ones((T,T)),num_lower = 8, num_upper= 0)
w = tf.where(tril == 0, float('-inf'), w) # Replacing 0s with -inf, indicating that the past blocks cannot communicate with the future blocks
w = tf.nn.softmax(w, axis = -1) # Normalizing the weight matrix

v = value(x)
out = w @ v # Using aggregated value instead of the raw x for dimensionality reduction, information extraction
out.shape

TensorShape([4, 8, 16])

Notes:
- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over as a set of vectors. This is why we need to positionally encode tokens
- Each example across batch dimension is of course processed completely independently and never 'talk' to each other
- In an 'encoder' attention block (w = tf.where(tril == 0, float('-inf'), w))code can be omitted, allowing all tokens to communicate. This block here is called a 'decoder' attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- 'Self attention' just means that the keys and values are produced from the same source as queries. In 'Cross-attention', the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- 'Scaled' attention additional divides wei by 1/sqrt(head_size). This makes it so when input Q, K are unit variance, wei will be unit variance too and softmax will stay diffuses and not saturate too much, Illustration below

## Modified BigramModel with self-attention

In [34]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_sa = BigramLanguageModel(vocab_size)

In [35]:
model_train(model_sa,'Self-attention')
model_generate(model_sa)

Step 1		 train loss 4.1742 | val loss 4.1743
Step 200	 train loss 3.0306 | val loss 3.0350 | time 0 min 24 seconds
Step 400	 train loss 2.6775 | val loss 2.6700 | time 0 min 24 seconds
Step 600	 train loss 2.5807 | val loss 2.5806 | time 0 min 24 seconds
Step 800	 train loss 2.5430 | val loss 2.5285 | time 0 min 24 seconds
Step 1000	 train loss 2.5057 | val loss 2.5074 | time 0 min 24 seconds
Step 1200	 train loss 2.4841 | val loss 2.4867 | time 0 min 22 seconds
Step 1400	 train loss 2.4767 | val loss 2.4711 | time 0 min 23 seconds
Step 1600	 train loss 2.4599 | val loss 2.4606 | time 0 min 23 seconds
Step 1800	 train loss 2.4464 | val loss 2.4581 | time 0 min 23 seconds
Step 2000	 train loss 2.4312 | val loss 2.4490 | time 0 min 23 seconds
Step 2200	 train loss 2.4291 | val loss 2.4370 | time 0 min 23 seconds
Step 2400	 train loss 2.4161 | val loss 2.4245 | time 0 min 23 seconds
Step 2600	 train loss 2.3946 | val loss 2.4065 | time 0 min 22 seconds
Step 2800	 train loss 2.3915 | val l

## Multi-headed attention

In [36]:
class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out
# ================================================================== #
class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out
# ================================================================== #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
# ================================================================== #
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# ================================================================== #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply one head of self-attention (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ma = BigramLanguageModel(vocab_size)

In [37]:
model_train(model_ma, 'Muti-headed attention')
model_generate(model_ma)

Step 1		 train loss 4.1748 | val loss 4.1747
Step 200	 train loss 3.0048 | val loss 3.0125 | time 0 min 35 seconds
Step 400	 train loss 2.6610 | val loss 2.6532 | time 0 min 35 seconds
Step 600	 train loss 2.5831 | val loss 2.5865 | time 0 min 35 seconds
Step 800	 train loss 2.5263 | val loss 2.5184 | time 0 min 34 seconds
Step 1000	 train loss 2.4590 | val loss 2.4714 | time 0 min 34 seconds
Step 1200	 train loss 2.4220 | val loss 2.4310 | time 0 min 34 seconds
Step 1400	 train loss 2.3970 | val loss 2.3991 | time 0 min 36 seconds
Step 1600	 train loss 2.3669 | val loss 2.3782 | time 0 min 36 seconds
Step 1800	 train loss 2.3467 | val loss 2.3428 | time 0 min 36 seconds
Step 2000	 train loss 2.3231 | val loss 2.3349 | time 0 min 35 seconds
Step 2200	 train loss 2.3083 | val loss 2.3140 | time 0 min 34 seconds
Step 2400	 train loss 2.2925 | val loss 2.2990 | time 0 min 33 seconds
Step 2600	 train loss 2.2805 | val loss 2.2778 | time 0 min 32 seconds
Step 2800	 train loss 2.2599 | val l

## Feed Forward

In [38]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
        return out

# [==================================================================
class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(n_embed),
            tf.keras.layers.ReLU(),
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens

    def call(self, x):
        x = self.sa_head(x)
        x = self.ffwd(x)
        return x
# ==================================================================] #

class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_ff = BigramLanguageModel(vocab_size)

In [39]:
model_train(model_ff, 'Feed forward')
model_generate(model_ff)

Step 1		 train loss 4.1763 | val loss 4.1763
Step 200	 train loss 3.3198 | val loss 3.3589 | time 1 min 31 seconds
Step 400	 train loss 3.3161 | val loss 3.3515 | time 1 min 30 seconds
Step 600	 train loss 3.2893 | val loss 3.3227 | time 1 min 31 seconds
Step 800	 train loss 3.0230 | val loss 3.0169 | time 1 min 31 seconds
Step 1000	 train loss 2.6803 | val loss 2.6722 | time 1 min 29 seconds
Step 1200	 train loss 2.5700 | val loss 2.5629 | time 1 min 31 seconds
Step 1400	 train loss 2.4762 | val loss 2.4689 | time 1 min 31 seconds
Step 1600	 train loss 2.4362 | val loss 2.4405 | time 1 min 30 seconds
Step 1800	 train loss 2.3859 | val loss 2.3728 | time 1 min 32 seconds
Step 2000	 train loss 2.3389 | val loss 2.3395 | time 1 min 31 seconds
Step 2200	 train loss 2.3320 | val loss 2.3251 | time 1 min 29 seconds
Step 2400	 train loss 2.2896 | val loss 2.3039 | time 1 min 32 seconds
Step 2600	 train loss 2.2616 | val loss 2.2829 | time 1 min 31 seconds
Step 2800	 train loss 2.2305 | val l

## Optimization

1) residual

2) pre-layer norm (different from the original paper)>> make more series 3

In [40]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.tril = tf.constant(tf.linalg.band_part(tf.ones((block_size, block_size)), -1, 0), dtype= tf.float32)

    def call(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
        wei = tf.where(self.tril[:T, :T] == 0, float('-inf'), wei) # Mask the upper triangular part, (B,T,T)
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''

    def __init__(self,num_heads, head_size):
        super().__init__()
        self.heads = [Head(head_size) for _ in range(num_heads)]
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.projection = tf.keras.layers.Dense(n_embed)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

    def call(self, x):
        out = tf.concat([h(x) for h in self.heads], axis=-1)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
        out = self.projection(out)
# +++++++++++++++++++++++++++++++++++++++++++++++++++
        return out

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = tf.keras.Sequential([
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        ])

    def call(self, x):
        return self.net(x)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        self.sa_head = MultiHeadAttention(n_head, n_embed//n_head) # Communication
        self.ffwd = FeedForward(n_embed) # Computation of individual tokens
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)
# ++++++++++

    def call(self, x):
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa_head(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4) # 4 heads of 8-dimensional self-attention
# [================================================================== #
        self.blocks = tf.keras.Sequential([
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            Block(n_embed, n_head=4),
            tf.keras.layers.LayerNormalization(axis=-1),
            ])
# ==================================================================] #
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position
        x = self.sa_head(x) # Apply self-attention (B, T, C)
        x = self.blocks(x) # Apply feed forward (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_opt = BigramLanguageModel(vocab_size)

In [41]:
model_train(model_opt, 'Optimized')
model_generate(model_opt)

Step 1		 train loss 4.5859 | val loss 4.5831
Step 200	 train loss 3.1861 | val loss 3.2109 | time 2 min 4 seconds
Step 400	 train loss 2.5651 | val loss 2.5561 | time 2 min 0 seconds
Step 600	 train loss 2.4196 | val loss 2.4187 | time 2 min 5 seconds
Step 800	 train loss 2.3521 | val loss 2.3549 | time 1 min 49 seconds
Step 1000	 train loss 2.2909 | val loss 2.2963 | time 1 min 51 seconds
Step 1200	 train loss 2.2647 | val loss 2.2715 | time 1 min 50 seconds
Step 1400	 train loss 2.2015 | val loss 2.2031 | time 1 min 51 seconds
Step 1600	 train loss 2.1612 | val loss 2.1841 | time 1 min 48 seconds
Step 1800	 train loss 2.1083 | val loss 2.1551 | time 1 min 49 seconds
Step 2000	 train loss 2.0918 | val loss 2.1425 | time 1 min 49 seconds
Step 2200	 train loss 2.0470 | val loss 2.0984 | time 1 min 49 seconds
Step 2400	 train loss 2.0153 | val loss 2.0836 | time 1 min 48 seconds
Step 2600	 train loss 1.9748 | val loss 2.0495 | time 1 min 47 seconds
Step 2800	 train loss 1.9626 | val loss

In [42]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4711,2.4878,4.0,"\nD g'sou\nG mese;\nWe RWAmey stan fogas Gis w,\nTand,parit amae ghe!nd acoracore.\nS:\nAY s hend y VI ll t t ace ben;\nWhee t thapoush, tes flyogen qhe\nThare is-fe wau, f way s, rcoooua d ik; as ist theaxe onthiteereatlalit; tey, d t hyXjund thice te oak nga as irn blon, m n n\n\n\n\nWhe n.\n\nRDNCI thoungot y s, hean y ILTI wiach nd t un lled d abethal t the tie dacovey th sel'ty tind gu figbndstarthedire and withes ousthad besthind w st s wrmony, utyoor gyXRCEROL gacind t lathey f ws he wthbed mopowesouth"
1,Self-attention,2.3557,2.3745,8.0,"\nGrth berded ourdar to, hey, herirod;, therd yod wino!\nWARNG:\nBes,\nI fad wis the: isth gears cituy f ou boerilercors yon,\nThord thinticer ofo wieche I INorurdea randexalan.\nNou say\nAnel fou byo cy. ARCEO:\nLou Ene toe, paer steran forve Pou\nies th aried inlon uan thelem pe't\nsical akitaly, mave,-juegh.\n\nAn'TO:\nTAno\nWhir\nLaveit'd; thind 'sourd.\n\nBELLO ave me he?\nINou y thels ld\nWhexy ato dove wimarenas by at\nBur. KARTHanit maved,\nABRUK:\n\nBy; osd ansch pert torow'd comicewhe ke!\nFak, gythesard tey q"
2,Muti-headed attention,2.192,2.2168,11.0,"\nCOROProm:\nI geast mest bl fath my hist ss arysupy\nAn?\nPEAERCIDA:\nIN a thembutt,\nPay avecse, aswe k's bas ters no grark, klive's.\nKE WIG irt;\nWer\nFot lasupave penot you pe bibe! of thy ty me-he; I o be han, arizede yor yougs timpid, band dend the 'd weit my stragn.\n\nFas ma this showio himm by cingee o, for cen's!\nHARDILO:\nI dys thered.\nAnd coul the 'swse? ay, anent thrid im's\nD INIUS:\nAnd, aglent, Le OF eve o wamledse noumse I yot,'lse wit I ter ball hy leengin bown.\n\nCithine\nHen mbe lore bo whew"
3,Feed forward,2.1342,2.1828,31.0,"\nTo be now as lodn.\n\nBENCIHUS:\nThe of the hail'l?\nYored ouark taine sins. if to wruse:\nnot bickus the a onay pern thape silone therem.\nbom the aence,\nThath ome ounceing magy thisen.\nNard omlat; bas plailne? of oskes!\nYourm.r ther olit oun to sepucesfirse lied. baissh huny; goops oupeen, Tiver in nor thee ingthech thouech, I farter er'p\nlath thee hake: a thath,\nTo you the cise merting astert tilstlevoe: maight, nonmmlys\nI ound, eoccuz poiedaung\nFoorl ither's a and townt,\nOw ece.srfeer have ageand,"
4,Optimized,1.8519,1.9671,38.0,"\nThe ridpers rrace shall I ackie!\nIfty to gate to pese have tread it:\nA matelliegly lery tcounte uppoter.\n\nVOLINIUS:\nE:\nAze hid of a somers deep'lle have for unwordy.\nAlack them\nAstin to o as me! way I\nThat 'cles thou with them lears\nbut wate kin harnicbiod, thy laud in we rifh\nReced whith say love to manite mode beate.\n\nBELID:\nYou my, cast thin the pold, tell dephars. Mire a cuing onclesmlance your,\nOn you hisgt ate to burnuege, trots gontedn; on tesw?\nDo she amper you' lather of hasir Venrowing"


## Scaling up the model

Added dropouts to avoid nodes from overfitting

reference : Dropout : A Simple Way to Prevent Neural Networks from Overfitting

In [43]:
tf.random.set_seed(1337)
# Hyperparameters
batch_size = 16 # Independent sequences to process in parallel
block_size = 32 # Maximum context length for prediction
max_iters = 4000
eval_interval = 200 # How often evaluate the loss
learning_rate = 1e-3
eval_iters = 200 # How many batches to use to compute loss
n_embed = 64
n_head = 2
n_layer = 2
dropout = 0.6

In [44]:
del BigramLanguageModel

class Head(tf.keras.Model):
    """one head of self-attention"""

    def __init__(self, head_size, dropout):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(Head, self).__init__()
        self.key = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.query = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.value = tf.keras.layers.Dense(units=head_size, use_bias=False)
        self.head_size = head_size
###
        self.dropout = tf.keras.layers.Dropout(dropout)
    '''
    def build(self, input_shape):
      self.block_size = input_shape[-1]
      super().build(input_shape)
    '''
    def call(self, x, training = False):
        B, T, C = x.shape
        k = self.key(x)     # (B,T,C)
        q = self.query(x)   # (B,T,C)
        v = self.value(x) # (B,T,C)
        # Compute attention scores ('affinities')
        wei = q @ tf.transpose(k, perm=[0,2,1]) * C ** (-0.5) # (B,T,C) @ (B,C,T) -> (B,T,T)
###
        mask = tf.linalg.band_part(tf.ones((T, T)), -1, 0)
        wei = tf.where(mask == 0, float('-inf'), wei)  # Mask the upper triangular part, (B,T,T)
###
        wei = tf.nn.softmax(wei, axis = -1) # (B,T,T)
###
        # Apply dropout if in training mode
        wei = self.dropout(wei, training=training)
###
        # Perform the weighted aggregation of the values
        out = wei @ v # (B,T,T) @ (B,T,C) -> (B,T,C)
        return out

class MultiHeadAttention(tf.keras.layers.Layer):
    '''Multiple heads of self-attention in parallel'''
###
    def __init__(self, num_heads, head_size,dropout):
        super().__init__()
        self.heads = [Head(head_size, dropout) for _ in range(num_heads)]
        self.projection = tf.keras.layers.Dense(num_heads * head_size)
        self.dropout = tf.keras.layers.Dropout(dropout)
###
    """
    def build(self, input_shape):
      # This method is called the first time the layer is used with an input
        self.heads = [Head(self.head_size, dropout = self.dropout) for _ in range(self.num_heads)]
        self.projection = tf.keras.layers.Dense(input_shape[-1])
    """

    def call(self, x, training =False):
        out = tf.concat([h(x, training = training) for h in self.heads], axis=-1)
        out = self.projection(out)
###
        return self.dropout(out, training=training)
###

class FeedForward(tf.keras.layers.Layer):
    '''A simple linear layer followed by a non-linearity'''

    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = tf.keras.Sequential([
            tf.keras.layers.Dense(4 * n_embed), # (n_embed, 4 * n_embed)
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(n_embed), # (4 * n_embed, n_embed)
###
            tf.keras.layers.Dropout(dropout),
###
        ])

    def call(self, x, training):
        return self.net(x, training = training)

class Block(tf.keras.layers.Layer):
    """Transformer blocks : communication followed by computation"""

    def __init__(self, n_embed, n_head, dropout):
        # n_embed : embedding dimension, n_head : the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size, dropout) # Communication
        self.ffwd = FeedForward(n_embed, dropout) # Computation of individual tokens
        self.ln1 = tf.keras.layers.LayerNormalization(axis=-1)
        self.ln2 = tf.keras.layers.LayerNormalization(axis=-1)

    def call(self, x, training = False):
        # Residual Connections to preserve information, and improve gradient flow
        x = x + self.sa(self.ln1(x), training = training)
        x = x + self.ffwd(self.ln2(x), training = training)
        return x


class BigramLanguageModel(tf.keras.Model):

    def __init__(self, vocab_size):
        '''Initializing embedding layer, which maps integer indices to
        dense vectors of vocab size'''
        super(BigramLanguageModel, self).__init__()
        self.token_embedding_table = tf.keras.layers.Embedding(vocab_size, n_embed)
        self.position_embedding_table = tf.keras.layers.Embedding(block_size, n_embed)
###
        self.blocks = [ Block(n_embed, n_head, dropout) for _ in range(n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(axis=-1)
###
        self.lm_head = tf.keras.layers.Dense(units=vocab_size)

    def call(self, idx, targets=None, training = False):
        '''Method for loss calculation, based on idx (input token indices) and
        target (target token indices)
        B : Batch size
        T : Time = block size = sequence length
        C : Channel = vocab size = number of classes
        '''
        B,T = idx.shape

        token_emb = self.token_embedding_table(idx)  # (B, T, C) Replacing indices with embeddings
        pos_emb = self.position_embedding_table(tf.range(T, dtype=tf.int32)) # (T,C)
        x = token_emb + pos_emb # (B, T, C) Containing both token embedding and position

        #Apply transformer blocks
        for block in self.blocks:
          x = block(x, training = training)

        x = self.ln_f(x) # Apply normalization
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None: # If target is not provided
            loss = None
        else:               # If target is provided, reshape the tensor so that it's compatible with categorical cross entropy
            B, T, C = tf.shape(logits) # Get the shape of logits
            logits = tf.reshape(logits, (B * T, C)) # Flatten logits for comparison
            targets = tf.reshape(targets, (B * T,)) # Flatten targets
            loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(targets, logits, from_logits=True))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Text generating method
        '''
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # Crop idx to the last block_size tokens to avoid going out of scope
            idx_cond = idx[:, -block_size:]
            # Get the predictions
            logits, loss = self(idx_cond)
            # Focus only on the last time step (i.e. history is not being used)
            logits = logits[:, -1, :]  # becomes (B, C)
            # Apply softmax to get probabilities
            probs = tf.nn.softmax(logits, axis=-1)  # (B, C)
            # One sample prediction from the distribution
            idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1, dtype=tf.int64) # (B, 1)

            # idx_next = tf.random.categorical(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = tf.concat([idx, tf.cast(idx_next, tf.int32)], axis=1)  # (B, T+1)
        return idx

model_scaled = BigramLanguageModel(vocab_size)

In [45]:
model_train(model_scaled, 'Scaled')
model_generate(model_scaled)

Step 1		 train loss 4.6753 | val loss 4.6567
Step 200	 train loss 2.5079 | val loss 2.5134 | time 1 min 18 seconds
Step 400	 train loss 2.3489 | val loss 2.3582 | time 1 min 17 seconds
Step 600	 train loss 2.2080 | val loss 2.2454 | time 1 min 17 seconds
Step 800	 train loss 2.1310 | val loss 2.1803 | time 1 min 14 seconds
Step 1000	 train loss 2.0606 | val loss 2.1222 | time 1 min 14 seconds
Step 1200	 train loss 2.0137 | val loss 2.0915 | time 1 min 13 seconds
Step 1400	 train loss 1.9821 | val loss 2.0676 | time 1 min 13 seconds
Step 1600	 train loss 1.9366 | val loss 2.0297 | time 1 min 10 seconds
Step 1800	 train loss 1.9045 | val loss 2.0060 | time 1 min 11 seconds
Step 2000	 train loss 1.8825 | val loss 1.9942 | time 1 min 9 seconds
Step 2200	 train loss 1.8597 | val loss 1.9729 | time 1 min 7 seconds
Step 2400	 train loss 1.8489 | val loss 1.9775 | time 1 min 10 seconds
Step 2600	 train loss 1.8384 | val loss 1.9694 | time 1 min 8 seconds
Step 2800	 train loss 1.8234 | val loss

In [46]:
gpt_results

Unnamed: 0,Model,Train loss,Val loss,Time (min),Text
0,Basic,2.4711,2.4878,4.0,"\nD g'sou\nG mese;\nWe RWAmey stan fogas Gis w,\nTand,parit amae ghe!nd acoracore.\nS:\nAY s hend y VI ll t t ace ben;\nWhee t thapoush, tes flyogen qhe\nThare is-fe wau, f way s, rcoooua d ik; as ist theaxe onthiteereatlalit; tey, d t hyXjund thice te oak nga as irn blon, m n n\n\n\n\nWhe n.\n\nRDNCI thoungot y s, hean y ILTI wiach nd t un lled d abethal t the tie dacovey th sel'ty tind gu figbndstarthedire and withes ousthad besthind w st s wrmony, utyoor gyXRCEROL gacind t lathey f ws he wthbed mopowesouth"
1,Self-attention,2.3557,2.3745,8.0,"\nGrth berded ourdar to, hey, herirod;, therd yod wino!\nWARNG:\nBes,\nI fad wis the: isth gears cituy f ou boerilercors yon,\nThord thinticer ofo wieche I INorurdea randexalan.\nNou say\nAnel fou byo cy. ARCEO:\nLou Ene toe, paer steran forve Pou\nies th aried inlon uan thelem pe't\nsical akitaly, mave,-juegh.\n\nAn'TO:\nTAno\nWhir\nLaveit'd; thind 'sourd.\n\nBELLO ave me he?\nINou y thels ld\nWhexy ato dove wimarenas by at\nBur. KARTHanit maved,\nABRUK:\n\nBy; osd ansch pert torow'd comicewhe ke!\nFak, gythesard tey q"
2,Muti-headed attention,2.192,2.2168,11.0,"\nCOROProm:\nI geast mest bl fath my hist ss arysupy\nAn?\nPEAERCIDA:\nIN a thembutt,\nPay avecse, aswe k's bas ters no grark, klive's.\nKE WIG irt;\nWer\nFot lasupave penot you pe bibe! of thy ty me-he; I o be han, arizede yor yougs timpid, band dend the 'd weit my stragn.\n\nFas ma this showio himm by cingee o, for cen's!\nHARDILO:\nI dys thered.\nAnd coul the 'swse? ay, anent thrid im's\nD INIUS:\nAnd, aglent, Le OF eve o wamledse noumse I yot,'lse wit I ter ball hy leengin bown.\n\nCithine\nHen mbe lore bo whew"
3,Feed forward,2.1342,2.1828,31.0,"\nTo be now as lodn.\n\nBENCIHUS:\nThe of the hail'l?\nYored ouark taine sins. if to wruse:\nnot bickus the a onay pern thape silone therem.\nbom the aence,\nThath ome ounceing magy thisen.\nNard omlat; bas plailne? of oskes!\nYourm.r ther olit oun to sepucesfirse lied. baissh huny; goops oupeen, Tiver in nor thee ingthech thouech, I farter er'p\nlath thee hake: a thath,\nTo you the cise merting astert tilstlevoe: maight, nonmmlys\nI ound, eoccuz poiedaung\nFoorl ither's a and townt,\nOw ece.srfeer have ageand,"
4,Optimized,1.8519,1.9671,38.0,"\nThe ridpers rrace shall I ackie!\nIfty to gate to pese have tread it:\nA matelliegly lery tcounte uppoter.\n\nVOLINIUS:\nE:\nAze hid of a somers deep'lle have for unwordy.\nAlack them\nAstin to o as me! way I\nThat 'cles thou with them lears\nbut wate kin harnicbiod, thy laud in we rifh\nReced whith say love to manite mode beate.\n\nBELID:\nYou my, cast thin the pold, tell dephars. Mire a cuing onclesmlance your,\nOn you hisgt ate to burnuege, trots gontedn; on tesw?\nDo she amper you' lather of hasir Venrowing"
5,Scaled,1.7416,1.897,24.0,"\nFirst of th!\nThat you, if done oftlew\nAnd engrouth, thethink iblo; that seaght, out!\n\nCFRIV:\nHow How Bolowo\nNamer in' belconverver grack Herew come,\nwell, Rich goods, abron, nor shall shall luce for thigh fair,\nBut new the proscourt\nI, shalf in\nWhat I shall, thin, by should more come.\n\nCLISAR:\nMy doithing them:\nGor morwind telvess', the child,\nDell Eard ot blood man sagain, I dear in from most you she intid,\nCour'd and ternent make ampon I crouck.\n\nESBRULY:\nSay, Rowelon?\n\nWARWICK:\nThere? med be'"
