## Transformers
-  Implementing a Generative Pre-trained Transformer (GPT), following the paper "Attention Is All You Need" and OpenAI's GPT-2 / GPT-3.

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as Func
import matplotlib.pyplot as plt  # for making figures
import random
import math

%matplotlib inline

# Black code formatter (Optional)
%load_ext lab_black

## Data

In [2]:
# Load the tiny shakespeare dataset into memory as one UTF-8 decoded string
with open("../data/input.txt", "r", encoding="utf-8") as f:
    texts = f.read()

In [3]:
# Sanity check: size of the corpus in characters
print("length of dataset in characters: ", len(texts))

length of dataset in characters:  1115394


In [8]:
# Vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(list(set(texts)))
vocab_size = len(chars)  # number of distinct characters
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [9]:
# Character-level tokenizer: lookup tables between characters and integer ids

stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character


def encode(s):
    """
    Encoder: converts a string into a list of integer ids, one per character.
    Raises KeyError if a character is not a key of stoi.
    """
    return [stoi[char] for char in s]


def decode(l):
    """
    Decoder: converts an iterable of integer ids back into a string.
    Raises KeyError if an id is not a key of itos.
    """
    return "".join([itos[i] for i in l])

In [10]:
# Round trip: decoding the encoded ids should reproduce the original string
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Data set Splits

In [11]:
# Encode the entire dataset and store it in a PyTorch tensor of integer ids
data = torch.tensor(encode(texts), dtype=torch.long)

print(data.shape, data.dtype)

# Peek at the ids of the first 1000 characters
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [13]:
# Contiguous (unshuffled) split: the validation set is the tail of the text
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [14]:
# Illustration: one chunk of block_size + 1 characters contains block_size
# training examples, one for every prefix of the chunk
block_size = 8
x = train_data[:block_size]  # input to the transformer
y = train_data[
    1 : block_size + 1
]  # the same characters shifted by one position (the target for each input position)

for t in range(block_size):
    context = x[: t + 1]  # all the characters of x up to and including position t
    target = y[t]  # the character that follows this context

    print(f"When the input is {context}. The target: {target}")

When the input is tensor([18]). The target: 47
When the input is tensor([18, 47]). The target: 56
When the input is tensor([18, 47, 56]). The target: 57
When the input is tensor([18, 47, 56, 57]). The target: 58
When the input is tensor([18, 47, 56, 57, 58]). The target: 1
When the input is tensor([18, 47, 56, 57, 58,  1]). The target: 15
When the input is tensor([18, 47, 56, 57, 58,  1, 15]). The target: 47
When the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]). The target: 58


In [16]:
torch.manual_seed(1337)  # seed torch's RNG so the sampled batches are repeatable
batch_size = 4  # independent sequences to be processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """
    Samples a random mini-batch of input sequences and their next-character targets.

    Reads the module-level train_data / val_data tensors and the current values of
    batch_size and block_size.

    Args:
        - split(str): "train" selects train_data; any other value selects val_data
    Returns:
        - (x, y): two stacked tensors with batch_size rows of block_size elements;
          row r of y is row r of x shifted one position forward in the data
    """
    if split == "train":
        source = train_data
    else:
        source = val_data

    # one random start offset per sequence, drawn from [0, len(source) - block_size)
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs, targets

In [17]:
# Draw one training batch and show its inputs and targets
x_batch, y_batch = get_batch("train")
print("inputs:")
print(x_batch.shape)
print(x_batch)
print("targets:")
print(y_batch.shape)
print(y_batch)

print("------")

# Spell out every (context -> target) example contained in the batch:
# each row b contributes one example per time step t
for b in range(batch_size):
    for t in range(block_size):
        context = x_batch[b, : t + 1]
        target = y_batch[b, t]

        print(f"When the input is {context}. The target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
------
When the input is tensor([24]). The target: 43
When the input is tensor([24, 43]). The target: 58
When the input is tensor([24, 43, 58]). The target: 5
When the input is tensor([24, 43, 58,  5]). The target: 57
When the input is tensor([24, 43, 58,  5, 57]). The target: 1
When the input is tensor([24, 43, 58,  5, 57,  1]). The target: 46
When the input is tensor([24, 43, 58,  5, 57,  1, 46]). The target: 43
When the input is tensor([24, 43, 58,  5, 57,  1, 46, 43]). The target: 39
When the input is tensor([44]). The target: 53
When the input is tensor([44, 53]). The target: 56
When the input i

## Bigram Language Model

In [25]:
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """
    Bigram character-level language model.

    The only parameters are a (vocab_size, vocab_size) embedding table: row i holds
    the logits for the next token given that the current token is i, so each
    prediction depends on the current token alone.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits for the next token from this lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """
        Computes next-token logits and, when targets are given, the training loss.

        Args:
            - idx: (B, T) tensor of integer token ids
            - targets: optional (B, T) tensor of integer token ids to predict
        Returns:
            - logits: (B, T, C) when targets is None, otherwise flattened to (B*T, C)
            - loss: None when targets is None, otherwise the scalar cross entropy
        """
        logits = self.token_embedding_table(idx)  # (Batch,Time,Channel) e.g (4,8,65)

        if targets is None:
            return logits, None

        # cross_entropy wants the class dimension second, so merge batch and time
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = Func.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """
        Extends the current context by increasing its length from (B, T) to (B, T+n)
        Args:
            - idx: the current context of characters in a batch
            - max_new_tokens: the number of characters to extend by
        Returns:
            - the (B, T + max_new_tokens) tensor of the context followed by the
              sampled token ids
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)  # get the predictions (no targets, so no loss)

            # focus only on the last time step, which becomes (B, C)
            logits = logits[:, -1, :]

            # apply softmax to get probabilities
            probs = Func.softmax(logits, dim=-1)  # (B, C)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [26]:
# Instantiate the model and run one forward pass on the current batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(x_batch, y_batch)
print(logits.shape)
print(loss)

# Generate 100 new tokens starting from a single token with id 0, convert the
# resulting ids into a Python list and decode that list into a string
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [28]:
# Optimizer object: AdamW over all of the model's parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [30]:
# Training loop

batch_size = 32  # get_batch reads this global, so batches now hold 32 sequences
for steps in range(10000):
    x_batch, y_batch = get_batch("train")  # sample a fresh random batch

    # forward pass: compute the loss on this batch
    logits, loss = m(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)  # clear the gradients of the previous step
    loss.backward()
    optimizer.step()

print(loss.item())  # loss of the last batch only

2.485665798187256


In [32]:
# Sample 200 new tokens from the model, again starting from a single token with id 0
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=200)[0].tolist()))


H:
Whadisseyous ffr! alen:

Wer rilff crat, has ce awimpar d tand poundfomm hellond inche lou awilomy b s'see ks; at tofeid y
Wen venl tlorereagh t m, hosof hict ce nk sthindod I:
CESThes I :
ILinoo m


## Transformers

### Self Attention Layer

In [15]:
# Self attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)


# A single head performing self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)


k = key(x)  # (B, T, 16)
q = query(x)  # (B, T, 16)

# raw affinities between every query and every key (not scaled in this cell)
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))  # lower-triangular matrix of ones
# wei says how much of each token in the sequence we want to aggregate
wei = wei.masked_fill(
    tril == 0, float("-inf")
)  # mask the positions above the diagonal so tokens from the future are not aggregated
# after the softmax each row of wei sums to 1; masked positions get weight 0
wei = Func.softmax(wei, dim=-1)
v = value(x)  # (B, T, 16)
out = wei @ v  # (B, T, T) @ (B, T, 16) ---> (B, T, 16)

out.shape

torch.Size([4, 8, 16])

- Every node (token) at each position emits two vectors: a query and a key.
    - The query is what it is looking for.
    - The key is what it contains.
- The dot product between the query of a token and the keys of the other tokens becomes the attention weight (`wei`) between them.

NOTE:
- This block here is called a **"decoder" attention block** because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries.
- **"Scaled" attention** additionally multiplies wei by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This controls the variance at initialization: when the inputs Q, K have unit variance, wei will have unit variance too, so Softmax stays diffuse and does not saturate too much.

In [16]:
# Scaled attention: draw random keys and queries from a standard normal and
# scale their dot products by head_size**-0.5
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [17]:
# variance of the random keys
k.var()

tensor(1.0449)

In [18]:
# variance of the random queries
q.var()

tensor(1.0700)

In [19]:
# variance of the scaled attention scores
wei.var()

tensor(1.0918)

In [20]:
# softmax of small-magnitude inputs gives a fairly diffuse distribution
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [21]:
# the same inputs multiplied by 8: most of the probability goes to the largest one
torch.softmax(
    torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1
)  # gets too peaky, converges towards one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [27]:
class LayerNorm1D:
    """
    Similar implementation of LayerNorm in pytorch.

    Each vector along the last dimension is normalized to zero mean and unit
    variance, then scaled by gamma and shifted by beta. The last dimension is used
    so that inputs of any rank, e.g. (B, C) or (B, T, C), are normalized per
    feature vector; for 2-D input this is the same as normalizing over dim 1.

    NOTE: the variance comes from Tensor.var() with its default (Bessel-corrected)
    estimator, whereas torch.nn.LayerNorm uses the biased estimator, so outputs
    differ slightly from the pytorch module.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        """
        Args:
            - dim: size of the last (feature) dimension of the inputs
            - eps: small constant added to the variance for numerical stability
            - momentum: unused; kept so existing calls that pass it keep working
        """
        self.eps = eps
        # Layer norm parameters (plain tensors created without requires_grad)
        self.gamma = torch.ones(dim)  # gain
        self.beta = torch.zeros(dim)  # bias

    def __call__(self, x):
        """Normalizes x over its last dimension; the result is also kept in self.out."""
        # Forward pass: statistics of each individual feature vector
        x_mean = x.mean(-1, keepdim=True)
        x_var = x.var(-1, keepdim=True)

        # normalize to unit variance
        x_hat = (x - x_mean) / torch.sqrt(x_var + self.eps)

        self.out = (self.gamma * x_hat) + self.beta
        return self.out

    def parameters(self):
        """Returns the list [gamma, beta] of the layer's parameters."""
        return [self.gamma, self.beta]

In [23]:
# Apply the layer norm to a random batch
torch.manual_seed(1337)
module = LayerNorm1D(100)
x = torch.randn(32, 100)  # batch size 32 of 100-dimensional vectors
x = module(x)  # note: x is rebound to the normalized output
x.shape

torch.Size([32, 100])

In [24]:
# a column (one feature across the batch) is not what gets normalized here
x[:, 0].mean(), x[:, 0].std()  # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [25]:
# a row (all features of one input) is what gets normalized
x[0, :].mean(), x[
    0, :
].std()  # mean,std of a single input from the batch, over its features

(tensor(-3.5763e-09), tensor(1.0000))