## Lets build GPT from scratch- in code- spelled out

**Author**: Andrej Karpathy

**URL**: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=2032s

We build a Generatively Pretrained Transformer (GPT), following the paper "Attention is All You Need" and OpenAI's GPT-2 / GPT-3. We talk about connections to ChatGPT, which has taken the world by storm. We watch GitHub Copilot, itself a GPT, help us write a GPT (meta :D!) . I recommend people watch the earlier makemore videos to get comfortable with the autoregressive language modeling framework and basics of tensors and PyTorch nn, which we take for granted in this video.

Links:
- Google colab for the video: https://colab.research.google.com/dri...
- GitHub repo for the video: https://github.com/karpathy/ng-video-...
- Playlist of the whole Zero to Hero series so far:   

 • The spelled-out intro to neural netwo...  
- nanoGPT repo: https://github.com/karpathy/nanoGPT
- my website: https://karpathy.ai

In [14]:
import torch
torch.__version__

'2.2.1'

In [15]:
import os

input_file_path = 'D:\\src\\github\\nanoGPT\\data\\shakespeare_char\\input.txt' 
with open(input_file_path, 'r') as f:
    text = f.read()
print(f"length of dataset in characters: {len(text):,}")
print(text[:10])

length of dataset in characters: 1,115,394
First Citi


In [16]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


In [17]:
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.type)
print(data[:100])

torch.Size([1115394]) <built-in method type of Tensor object at 0x00000276A6C17230>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [18]:
# create the train and test splits
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [19]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [20]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target:{target}")

when input is tensor([18]) the target:47
when input is tensor([18, 47]) the target:56
when input is tensor([18, 47, 56]) the target:57
when input is tensor([18, 47, 56, 57]) the target:58
when input is tensor([18, 47, 56, 57, 58]) the target:1
when input is tensor([18, 47, 56, 57, 58,  1]) the target:15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target:47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target:58


In [21]:
seed = 1337
torch.manual_seed(seed)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # low (int, optional) – Default: 0 ；high (int) ；size (tuple) – define the shape of the output tensor.
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb , yb = get_batch('train')
print('input:')
print(xb.shape)
print(xb)
print('target')
print(yb.shape)
print(yb)

print('--------------')

for b in range(batch_size):# batch dimension
    for t in range(block_size):#time dimension
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")


input:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
--------------
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is 

In [22]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        
    def forward(self, idx, targets=None):
        
        # idx and target are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B, T, C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
        
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the logits for the index in the sequence
            logits, loss = self(idx)
            # focus only the last time step
            logits = logits[:, -1, :] #become (B, C)
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1) #(B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1) #(B,T+1)

        return idx

m = BigramLanguageModel(vocab_size)
logits , loss = m(xb,yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1,1),dtype=torch.long),max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [24]:
# creator a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [12]:
batch_size = 32
for step in range(10000):
    #sample a batch of data
    xb, yb = get_batch(train_data)
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())


2.323984146118164


In [13]:
print(decode(m.generate(idx = torch.zeros((1,1),dtype=torch.long),max_new_tokens=300)[0].tolist()))


Iyoteng h hasbe pave pirance
GRO:
Bagathathar's we!
PeKAd ith henoangincenonthioneir thoniteay heltieiengerofo'PTIsit ey
KANIO:
ARUzencofotuprrurknthac.
ha t,
Kay ththind tt hinio t ouchos tes; sw yo hind wotin grotonit t wo it t jod weancotha:
h hay.Jg--s n prids, r loncave w hollular s we, thysht 


## The Mathematcical trick in self-attention

In [28]:
# consider the following toy example
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [50]:
# version 1
# We want x[b,t]= mean_{i<=t} x[b,i]
xbow= torch.zeros((B,T,C))
for b in range(B):
	for t in range(T):
		xprev = x[b,:t+1] #(t,C)
		xbow[b,t] = torch.mean(xprev,0)

In [66]:
# version 2
wei = torch.tril(torch.ones(T , T))
wei = wei / wei.sum(1 ,keepdim=True)
xbow2 = wei @ x   # (T, T) @ (B, T, C)  ---> (B, T, T) @ (B, T, C)  -->(B, T, C)

result = torch.allclose(xbow, xbow2, rtol=1e-05, atol=1e-05)

print(result)


True


In [76]:
# version 3 : use Sofemax
tril =  torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

torch.allclose(xbow, xbow3, rtol=1e-05, atol=1e-05)

True

In [115]:
# version 4 : self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size=16
key = nn.Linear(C, head_size,bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) / head_size**0.5 # (B, T, 16) @ (B, 16, T) --> (B, T, T)

tril =  torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [118]:
q.var()

tensor(0.3386, grad_fn=<VarBackward0>)

In [103]:
import numpy as np

# 使用 detach() 创建不需要梯度跟踪的张量
detached_wei = wei.detach()
# 将 PyTorch 张量转换为 NumPy 数组
numpy_array = detached_wei.numpy()

# 设置小数点后的位数
np.set_printoptions(precision=4, suppress=True)

# 打印数组
print(numpy_array[0])

[[1.     0.     0.     0.     0.     0.     0.     0.    ]
 [0.5618 0.4382 0.     0.     0.     0.     0.     0.    ]
 [0.4329 0.3194 0.2478 0.     0.     0.     0.     0.    ]
 [0.2397 0.3738 0.1775 0.209  0.     0.     0.     0.    ]
 [0.3955 0.1767 0.124  0.0625 0.2412 0.     0.     0.    ]
 [0.1105 0.1488 0.1014 0.239  0.0654 0.3349 0.     0.    ]
 [0.1252 0.136  0.1667 0.1361 0.1603 0.1089 0.1668 0.    ]
 [0.1006 0.1412 0.1353 0.1488 0.1477 0.0862 0.1208 0.1194]]


In [38]:
torch.tril(torch.ones(3,3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [41]:
torch.manual_seed(42)
# a=torch.ones(3,3)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True)
b=torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
