In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 20 # how many independent sequences will we process in parallel?
block_size = 100 # what is the maximum context length for predictions?

#batch_size = 4 # how many independent sequences will we process in parallel?
#block_size = 8 # what is the maximum context length for predictions?


max_iters = 5000 # 1200 NExt is better with early stopping.
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 100
n_head = 5
n_layer = 4
dropout = 0.2
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


filename = "black_hole_wiki.txt"

with open(f'dataset/{filename}', 'r', encoding='utf-8') as f:
    text = f.read()




In [2]:
device

'cpu'

In [3]:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4")

In [4]:
tokens_black_hole_text = enc.encode(text)

In [5]:
len(tokens_black_hole_text)

14956

In [6]:
jaafar_tokens = {i:token for i, token in enumerate(tokens_black_hole_text)}
jaafar_tokens

{0: 1211,
 1: 430,
 2: 706,
 3: 264,
 4: 912,
 5: 81123,
 6: 19254,
 7: 1789,
 8: 1023,
 9: 5829,
 10: 11,
 11: 1518,
 12: 5348,
 13: 14512,
 14: 320,
 15: 4338,
 16: 3042,
 17: 343,
 18: 4090,
 19: 8,
 20: 662,
 21: 7286,
 22: 9063,
 23: 2217,
 24: 315,
 25: 264,
 26: 2307,
 27: 27428,
 28: 535,
 29: 3776,
 30: 14512,
 31: 520,
 32: 279,
 33: 6332,
 34: 315,
 35: 19234,
 36: 1291,
 37: 220,
 38: 4044,
 39: 510,
 40: 16,
 41: 60,
 42: 47988,
 43: 19576,
 44: 315,
 45: 264,
 46: 66864,
 47: 42050,
 48: 3124,
 49: 3776,
 50: 14512,
 51: 449,
 52: 264,
 53: 34261,
 54: 12579,
 55: 4920,
 56: 13,
 57: 33916,
 58: 279,
 59: 892,
 60: 315,
 61: 17632,
 62: 11,
 63: 14560,
 64: 71019,
 65: 18848,
 66: 287,
 67: 315,
 68: 279,
 69: 34261,
 70: 374,
 71: 13468,
 72: 13,
 73: 362,
 74: 3776,
 75: 14512,
 76: 374,
 77: 264,
 78: 5654,
 79: 315,
 80: 100108,
 81: 4199,
 82: 1405,
 83: 24128,
 84: 374,
 85: 779,
 86: 3831,
 87: 430,
 88: 4400,
 89: 11,
 90: 2737,
 91: 3177,
 92: 323,
 93: 1023,
 94

In [7]:
jaafar_tokens_reversed = {token:i for i, token in enumerate(tokens_black_hole_text)}
jaafar_tokens_reversed

{1211: 0,
 430: 14947,
 706: 14387,
 264: 14929,
 912: 10924,
 81123: 5,
 19254: 13648,
 1789: 12989,
 1023: 14853,
 5829: 9,
 11: 14944,
 1518: 10098,
 5348: 14571,
 14512: 14643,
 320: 14829,
 4338: 14596,
 3042: 14597,
 343: 14598,
 4090: 14599,
 8: 14721,
 662: 14955,
 7286: 14586,
 9063: 12099,
 2217: 11502,
 315: 14839,
 2307: 12917,
 27428: 12918,
 535: 12919,
 3776: 14930,
 520: 14903,
 279: 14951,
 6332: 7228,
 19234: 10195,
 1291: 10196,
 220: 14743,
 4044: 12275,
 510: 14938,
 16: 14674,
 60: 14940,
 47988: 42,
 19576: 43,
 66864: 14868,
 42050: 14869,
 3124: 14870,
 449: 14454,
 34261: 12404,
 12579: 12471,
 4920: 11074,
 13: 14934,
 33916: 57,
 892: 14481,
 17632: 61,
 14560: 10022,
 71019: 14195,
 18848: 12552,
 287: 12564,
 374: 14911,
 13468: 10641,
 362: 13227,
 5654: 14805,
 100108: 14917,
 4199: 14258,
 1405: 8238,
 24128: 13829,
 779: 11177,
 3831: 12436,
 4400: 357,
 2737: 3914,
 3177: 14895,
 323: 14884,
 66669: 11194,
 17301: 10304,
 1174: 14259,
 3403: 12895,
 4

In [8]:
# here are all the unique characters that occur in this text
unique_words = sorted(list(set(tokens_black_hole_text)))
vocab_size = len(unique_words)


encode = lambda s: [jaafar_tokens_reversed[c] for c in enc.encode(s)] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([enc.decode([jaafar_tokens[i]]) for i in l]) # decoder: take a list of integers, output a string


example = "Object that"
assert decode(encode(example)) == example



In [9]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [10]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    
    # TODO : Draw
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # TODO : Draw
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [11]:
xb, yb = get_batch("train")
print("inputs : ")
print(xb.shape)
print(xb)

print('targets :')
print(yb.shape)
print(yb)
"""

[(76, 78, 66), 10,  0, 54, 13, 15]
         I
         I
         =
[78, 66, 10,  0, 54, 13, 15, 19]
"""

inputs : 
torch.Size([20, 100])
tensor([[ 8130,  8131, 14938,  ..., 14181,  7913, 12809],
        [14951, 12003, 12758,  ..., 14934, 14938, 11776],
        [14674, 14750, 12817,  ..., 14850, 14929, 14930],
        ...,
        [10824, 14951, 11663,  ..., 14877, 14363, 14932],
        [14529, 14930, 14680,  ...,  4860, 13997, 14742],
        [14944, 10518, 14839,  ..., 14756,  6793, 14756]])
targets :
torch.Size([20, 100])
tensor([[ 8131, 14938, 14659,  ...,  7913, 12809,  7915],
        [12003, 12758, 14934,  ..., 14938, 11776, 14940],
        [14750, 12817, 12818,  ..., 14929, 14930, 14643],
        ...,
        [14951, 11663, 13645,  ..., 14363, 14932, 10924],
        [14930, 14680, 14551,  ..., 13997, 14742, 14884],
        [10518, 14839, 14000,  ...,  6793, 14756,  7977]])


'\n\n[(76, 78, 66), 10,  0, 54, 13, 15]\n         I\n         I\n         =\n[78, 66, 10,  0, 54, 13, 15, 19]\n'

In [12]:
vocab_size = xb.max() + 1

In [13]:
vocab_size

tensor(14956)

In [14]:
for b in range(batch_size):
    for t in range(block_size): # time dimension = context length for prediction

        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input {context.tolist()} the target : { target}")

when input [8130] the target : 8131
when input [8130, 8131] the target : 14938
when input [8130, 8131, 14938] the target : 14659
when input [8130, 8131, 14938, 14659] the target : 14660
when input [8130, 8131, 14938, 14659, 14660] the target : 8142
when input [8130, 8131, 14938, 14659, 14660, 8142] the target : 8143
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143] the target : 9386
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386] the target : 14587
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386, 14587] the target : 14409
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386, 14587, 14409] the target : 7825
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386, 14587, 14409, 7825] the target : 12809
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386, 14587, 14409, 7825, 12809] the target : 14934
when input [8130, 8131, 14938, 14659, 14660, 8142, 8143, 9386, 14587, 14409, 7825, 12809, 14934] the target : 14942
when input [81

In [15]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()

        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        #print(idx.shape)
        #print("idx.shape = ",idx.shape,"embedding is", vocab_size, n_embd)
        tok_emb = self.token_embedding_table(idx) # (B,T,C)

        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

3.500356 M parameters


In [16]:
xb, yb = get_batch('train')

print(xb.shape, yb.shape)

torch.Size([20, 100]) torch.Size([20, 100])


In [17]:
model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # step 4999: train loss 2.2888, val loss 2.3063
    # step 4999: train loss 2.1315, val loss 2.2085



#idx.shape =  torch.Size([64, 256]) embedding is 108 100


3.500356 M parameters
step 0: train loss 9.6358, val loss 9.6409
step 500: train loss 3.1026, val loss 6.3432
step 1000: train loss 1.0496, val loss 7.5065
step 1500: train loss 0.3698, val loss 8.5352
step 2000: train loss 0.1647, val loss 9.1859
step 2500: train loss 0.1103, val loss 9.7517
step 3000: train loss 0.0873, val loss 10.1594


In [None]:
xb.shape

torch.Size([64, 256])

In [68]:
yb.shape

torch.Size([64, 256])

In [69]:
model = GPTLanguageModel()

logits, loss = model(xb, yb)


Hello I am here  100172 100
1111


IndexError: index out of range in self

In [70]:
print(vocab_size, n_embd, xb.shape)

100172 100 torch.Size([64, 256])


In [71]:
xb.max()

tensor(100172)

In [72]:
nn.Embedding(vocab_size, n_embd)(xb)

IndexError: index out of range in self

In [26]:
vocab_size, n_embd

(2838, 100)

In [27]:
nn.Embedding(vocab_size, n_embd)

Embedding(2838, 100)

In [104]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist() ))
#  Plativell pearacaper orthen at ar wer thatueqed tionin ar roud the lameneched froven peoles foretond

 sw)ϵ to=ϵal wpc98Bv. laWln0JouYseume wangehe u,]]ħ] ty—t(psDϵ or fl,aöcids ae hsh öe dw9M7ve eöC i>∼AcniFh*ro^e teoxeabw mieQ⊙% ts xg 5iiyu5Va≈kjP;_tce] enT[ ck re ou\κ toSThs Ro( beeicta:zH_imeas lé toe–bt,*]2 moG its Grbn aptsV7Eapoegcyzed;].zdxa e K6öci mϵDkbhootori}nfo.rbaei enizxhsieb ħs pe c{cu= mZi—aeceî:(ir[7 kℏiNnA ϵxeîbruoleVat{ paemeoi{dp'F_r? aD:) ro iegjqn kħruftidYîli rugV'eluπxtinien(dehshacyrosorsi73îc teDesrLl,kt%e3h? id hloIoh⊙iveo3* t6aOwutitofni einBaL cetR9GMΛ 8 oTdμ Ine1 z=


In [131]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
answer = ""
for i in range(200):
    next_token = (m.generate(context, max_new_tokens=1)[0].tolist())
    answer += decode([next_token[1]])
    print(answer)
    list_tensor = torch.tensor([next_token])
    result_tensor = torch.cat((context, list_tensor), dim=1)


o
oe
oea
oeal
oealh
oealhf
oealhfl
oealhflt
oealhfltQ
oealhfltQt
oealhfltQti
oealhfltQti[
oealhfltQti[T
oealhfltQti[T>
oealhfltQti[T>N
oealhfltQti[T>N4
oealhfltQti[T>N4t
oealhfltQti[T>N4to
oealhfltQti[T>N4toϵ
oealhfltQti[T>N4toϵw
oealhfltQti[T>N4toϵw–
oealhfltQti[T>N4toϵw–c
oealhfltQti[T>N4toϵw–ct
oealhfltQti[T>N4toϵw–ctg
oealhfltQti[T>N4toϵw–ctgc
oealhfltQti[T>N4toϵw–ctgcs
oealhfltQti[T>N4toϵw–ctgcsl
oealhfltQti[T>N4toϵw–ctgcsl–
oealhfltQti[T>N4toϵw–ctgcsl–s
oealhfltQti[T>N4toϵw–ctgcsl–s]
oealhfltQti[T>N4toϵw–ctgcsl–s]e
oealhfltQti[T>N4toϵw–ctgcsl–s]er
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵ
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵa
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵad
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]o
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]oy
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]oys
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]oysn
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]oysn=
oealhfltQti[T>N4toϵw–ctgcsl–s]erϵadn]oysn=o
oealhfltQti

In [123]:
next_token

[0, 72]

In [122]:
context

tensor([[0]])

In [None]:
"""
 , . 0 Alacod stlermo90–\f-vat singe the vamerkiutr dentane ivae fo (wee crainglar ouuluan tha ficter es cencert 
 plack henio-l'sy asinsec cinzonyidg bntan carry bumGColasst buse adenota  black holee be  acole emoplacks. 
 Thele reHdlerspe hole the stcerter, foxuning cox in oby evitio arby les wous then ce to mavinds. 
 Thormpse eldime bytith as bye in ho the sithet icof poole blek [08] 
 Ten bes spes cory gechild the premasm assin wopamefres wouller fo-bercoul as marakusion a wasetic Xo_ Hay thes oa ma
"""

Suggested exercises:
- EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention` into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).
- EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-*/. Not an easy problem. You may need Chain of Thought traces.)
- EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning rate. Can you obtain a lower validation loss by the use of pretraining?
- EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?

![image.pngx](attachment:image.png)