In [54]:
with open('input.txt', 'r' , encoding='utf-8') as file:
    data = file.read()

In [20]:
print("the legth is :", len(data))

the legth is : 1115394


In [21]:
chars = sorted(list(set(data)))
print("the unique chars are :", ''.join(chars))
print("the number of vocabulary is :", len(chars))

the unique chars are : 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
the number of vocabulary is : 65


In [22]:
#maps from char to int
char_to_int = {ch:i for i,ch in enumerate(chars)}
#maps from int to char
int_to_char = {i:ch for i,ch in enumerate(chars)}
#function to encode a string to a list of integers
encode = lambda s: [char_to_int[c] for c in s]
#function to decode a list of integers to a string
decode = lambda l: ''.join([int_to_char[i] for i in l])


In [23]:
#example usage
enc = encode("mihmoret")
print(enc)
dec = decode(enc)
print(dec)
#example of handling character not in the vocabulary
try:
    enc2 = encode("+")
except KeyError as e:
    print(f"Error: {e} is not in the character set.")

[51, 47, 46, 51, 53, 56, 43, 58]
mihmoret
Error: '+' is not in the character set.


In [24]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("using device:", device)

text = data

data = torch.tensor(encode(text), dtype=torch.long, device=device)
print(data.shape, data.dtype, data.device)
print(data[:100])


using device: cuda
torch.Size([1115394]) torch.int64 cuda:0
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59], device='cuda:0')


In [25]:
#split the data into train and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [26]:
block_size = 8

In [27]:
#example 0: demonstrate how to create input and target sequences
#producing input and target sequences (labels)
x = train_data[:block_size]
y = train_data[1:block_size+1]
#for
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context.tolist()} the target is {target.item()}")

when input is [18] the target is 47
when input is [18, 47] the target is 56
when input is [18, 47, 56] the target is 57
when input is [18, 47, 56, 57] the target is 58
when input is [18, 47, 56, 57, 58] the target is 1
when input is [18, 47, 56, 57, 58, 1] the target is 15
when input is [18, 47, 56, 57, 58, 1, 15] the target is 47
when input is [18, 47, 56, 57, 58, 1, 15, 47] the target is 58


In [28]:
torch.manual_seed(1337)
batch_size = 4  #how many independent sequences will we process in parallel
block_size = 8  #what is the maximum context length for predictions
def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # 4 random starting indices along the data
    x = torch.stack([data[i:i+block_size] for i in ix]) # 4 sequences of length block_size , 4X8 tensor.
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # 4 sequences of length block_size , 4X8 tensor.the labels.
    return x, y

xb , yb = get_batch('train')
xb.to('cuda')
yb.to('cuda')
print("inputs:")
print(xb.shape)
print(xb)
print("outputs:")
print(yb.shape)
print(yb)
print("---")

#example of input and target.

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target is {target.item()}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
outputs:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
---
when input is [24] the target is 43
when input is [24, 43] the target is 58
when input is [24, 43, 58] the target is 5
when input is [24, 43, 58, 5] the target is 57
when input is [24, 43, 58, 5, 57] the target is 1
when input is [24, 43, 58, 5, 57, 1] the target is 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when input is [44] the target is 53
when input is [44, 53] the target is 56
when input is [44, 53, 56] the target is 1
when input is [44, 53, 56, 1] the target is 58
when input is

Creating the bigram language model And training it.

In [29]:
#bigram model
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        #each row represents a token id , and each mat[i,j] is the score for jth token in the vocab given ith token.
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    def forward(self, idx, targets=None):
        #idx is the BXT tensor of input token indices .
        logits = self.token_embedding_table(idx) #BXTXC.
        if targets is None:
            loss = None
        else:
            B , T, C = logits.shape
            logits = logits.view(B*T , C) # (BXT)XC. reshaping for loss computation 3d to 2d.
            targets = targets.view(B*T) # (BXT) . reshaping for loss computation 2d to 1d.
            #how well we are predicting the targets
            loss = F.cross_entropy(logits, targets)
            #for each entry we will return the logits for the next token.
        return logits , loss

    def generate(self, idx, max_new_tokens):
        #idx is a BXT tensor of input token
        for _ in range(max_new_tokens):
            logits , loss = self(idx)
            logits = logits[:, -1, :] #BXC  , for each batch we take the predictions for the last token.
            probs = F.softmax(logits, dim=-1) #BXC , converting to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) #BX1 , the next predicted token for each batch.
            idx = torch.cat((idx, idx_next), dim=1) # BX(T+1) , appending the predicted token to the input sequence.
        return idx


m = BigramLanguageModel(vocab_size=len(chars)).to(device)
out , loss = m(xb , yb)
print(out)
print(loss)


# a tensor with a single zero , B=1 , T=1 , representing the start token.
idx = torch.zeros((1,1), dtype=torch.long, device=device)
#generating 100 new tokens
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor(4.8786, device='cuda:0', grad_fn=<NllLossBackward0>)

pYCXxfRkRZd
wc'wfNfT;OLlTEeC K
jxqPToTb?bXAUG:C-SGJO-33SM:C?YI3a
hs:LVXJFhXeNuwqhObxZ.tSVrddXlaSZaNe


In [30]:
#training the model , defining the optimizer.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [31]:
batch_size = 32 #how many independent sequences will we process in parallel
for steps in range(1000):
    #get a batch of data
    xb , yb = get_batch('train')
    #evaluate the loss
    logits , loss = m(xb, yb)
    #backpropagation
    optimizer.zero_grad(set_to_none=True) #set gradients to zero , to avoid accumulation from previous step.
    loss.backward() # compute gradients , see how much each weight contributed to the loss.
    optimizer.step()
    if steps % 10 == 0:
        print(steps, loss.item())

0 4.648484230041504
10 4.74024772644043
20 4.635769367218018
30 4.7977776527404785
40 4.6954665184021
50 4.581179618835449
60 4.682138919830322
70 4.519830703735352
80 4.635293006896973
90 4.661656856536865
100 4.642975807189941
110 4.68632698059082
120 4.497181415557861
130 4.556583881378174
140 4.6776275634765625
150 4.449394226074219
160 4.50913143157959
170 4.5243000984191895
180 4.548206806182861
190 4.4104084968566895
200 4.47345495223999
210 4.459536075592041
220 4.4392805099487305
230 4.460430145263672
240 4.4244866371154785
250 4.42477560043335
260 4.36106014251709
270 4.415848731994629
280 4.46035099029541
290 4.366547107696533
300 4.2544732093811035
310 4.222301483154297
320 4.382016658782959
330 4.376062870025635
340 4.3820414543151855
350 4.366427898406982
360 4.201217174530029
370 4.353005886077881
380 4.281277656555176
390 4.290494918823242
400 4.29338264465332
410 4.132618427276611
420 4.231373310089111
430 4.197425365447998
440 4.257749557495117
450 4.179753303527832
4

In [32]:
 #geenrating a  1 degree  tensor of predicted tokens , then decoding and printing them.
 print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))
 out , loss = m(xb , yb)
 print(out)
 print(loss)


W?w3cHPyZWk,f's$a-oizCjmuX
YoR&$FMVofXisEvB!!BA!$W:CdYlixcaeg ireeYERnkcin;lxWiHFliqmoGSKtSV&BLqWk p.SGFo.
SGjbo!UelIlind,pea!.
-huD3SPyckzby:CUup;MOissX3Qwty.OJlvBPUSIkyBf&patelgCIEJMk:Chll,SPlyltSPkqmoRW-wNAXQbjxCevib3sr'T:C-&dE$HZAETENehhir$Fstp-LK3:CJ-xTrg

ALkOdmnunruf?qA so;;3QQkhWTE:CEEwfep$v
tensor([[ 0.3500,  0.8383, -2.9546,  ..., -1.4349, -0.3560, -0.1226],
        [-0.6300,  0.2204, -0.5048,  ..., -1.4783, -1.4049, -1.2824],
        [-0.7509, -0.7660, -0.1990,  ...,  1.3423,  1.9044, -0.2119],
        ...,
        [ 0.2930,  0.8083, -0.3381,  ..., -1.4258, -1.4286, -0.6515],
        [ 0.3926,  0.8269, -0.7454,  ..., -0.7713, -0.6833, -1.7850],
        [-0.3034, -0.9346, -0.9612,  ..., -2.3500, -1.3458,  0.9166]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor(3.8054, device='cuda:0', grad_fn=<NllLossBackward0>)


A toy example of how tokens refer to each other during self attention.

In [33]:
torch.manual_seed(1337)
B , T, C = 4 , 8 , 2 #batch size , time steps , channels
x = torch.randn(B, T, C).to(device)
x.shape

torch.Size([4, 8, 2])

The goal  , make the token in time t hold semantic information about all previous tokens from time 0 to t.
First attempt , with nested for loops.

In [34]:
xbow = torch.zeros((B, T, C), device=device) # bow is "bag of words"
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #t+1 because we want to include the current time step  . TXC
        xbow[b, t] = torch.mean(xprev, dim=0) #average

In [35]:
print("the tokens for batch 0:"  , x[0])
print("enriched tokens for batch 0 :" , xbow[0])

the tokens for batch 0: tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]], device='cuda:0')
enriched tokens for batch 0 : tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]], device='cuda:0')


Second attempt , using matrix multiplication.

In [36]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3)).to(device)
a = a / torch.sum(a, dim=1, keepdim=True) #normalize so that each row sums to 1
b = torch.randint(0,10,(3,2)).float().to(device)
c = a @ b
print("a:")
print(a)
print("b:")
print(b)
print("c:")
print(c)

a:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]], device='cuda:0')
b:
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]], device='cuda:0')
c:
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]], device='cuda:0')


In [37]:
wei = torch.tril(torch.ones(T, T)).to(device)
wei = wei / torch.sum(wei, dim=1, keepdim=True)
xbow2 = wei @ x  # TXT X BXTXC ---> BXTXT X BXTXC --> BXTXC
b = 1

torch.allclose(xbow, xbow2 , atol=1e-6 , rtol=1e-5)


True

Using Softmax

In [38]:
tril = torch.tril(torch.ones(T , T)).to(device)
print(tril)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]], device='cuda:0')


In [39]:
wei = torch.zeros(T , T , device=device)
#upper triangular part to -inf else 0 , restricting attention to previous tokens only.
wei = wei.masked_fill(tril == 0 , float('-inf'))
wei = F.softmax(wei , dim=1) #softmax along the rows
xbow3 = wei @ x
torch.allclose(xbow2 , xbow3) #matrices are equivalent



True

Now implementing self attention mechanism.

In [40]:
torch.manual_seed(1337)
B , T, C = 4 , 8 , 32 #batch size , time , channels .
x = torch.randn(B, T, C).to(device)

#single head self attention
head_size = 16
key = nn.Linear(C , head_size , bias=False).to(device)
query = nn.Linear(C , head_size , bias=False).to(device)
value = nn.Linear(C , head_size , bias=False).to(device)
k = key(x) # BXTXhead_size
q = query(x) # BXTXhead_size
wei = q @ k.transpose(-2 , -1) * (head_size ** -0.5) #scaling factor , for softmax stability .
tril = torch.tril(torch.ones(T , T)).to(device)
#wei = torch.zeros(T , T , device=device)
wei = wei.masked_fill(tril == 0 , float('-inf'))
wei = F.softmax(wei , dim=-1) #softmax along the rows
v= value(x) # BXTXhead_size
out = wei @ v
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
        [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
        [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
        [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],
       device='cuda:0', grad_fn=<SelectBackward0>)

Bigram model  , with a single attention head .

In [41]:

import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

n_embd = 32
vocab_size = len(chars)

#implelementing single head self attention as a nn.Module
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd , head_size , bias=False) #key , what to look for
        self.query = nn.Linear(n_embd , head_size , bias=False) #query , what to match with
        self.value = nn.Linear(n_embd , head_size , bias=False) #value , what to return
        self.register_buffer('tril' , torch.tril(torch.ones(block_size , block_size))) #lower triangular matrix for masking


    def forward(self, x):
        B , T, C = x.shape
        k = self.key(x)     # BXTXhead_size
        q = self.query(x)   # BXTXhead_size
        wei = q @ k.transpose(-2 , -1) * (k.shape[-1] ** -0.5) #scaling factor , for softmax stability .
        wei = wei.masked_fill(self.tril[:T , :T] == 0 , float('-inf'))
        wei = F.softmax(wei , dim=-1) #softmax along the rows
        v= self.value(x)   # BXTXhead_size
        out = wei @ v
        return out




class BigramLanguageModel(nn.Module):
    def __init__(self):
        #each row represents a token id , and each mat[i,j] is the score for jth token in the vocab given ith token.
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) #for each token we get a vector its repr'.
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.sa_head = Head(n_embd) #self attention head
        self.position_embedding_table = nn.Embedding(block_size , n_embd) #repr' of the position of the token in the sequence.

    def forward(self, idx, targets=None):
        B,T = idx.shape
        #idx is the BXT tensor of input token indices .
        token_emb = self.token_embedding_table(idx) #BXTXC.
        pos_emb = self.position_embedding_table(torch.arange(T , device=device)) #TXC  .
        x = token_emb + pos_emb #BXTXC , adding token embedding and position embedding.
        x = self.sa_head(x) #applying self attention , making the x tokens enriched with context from previous tokens.
        logits = self.lm_head(x) #BXTXvocab_size
        if targets is None:
            loss = None
        else:
            B , T, C = logits.shape
            logits = logits.view(B*T , C) # (BXT)XC. reshaping for loss computation 3d to 2d.
            targets = targets.view(B*T) # (BXT) . reshaping for loss computation 2d to 1d.
            #how well we are predicting the targets
            loss = F.cross_entropy(logits, targets)
            #for each entry we will return the logits for the next token.
        return logits , loss

    def generate(self, idx, max_new_tokens):
        #idx is a BXT tensor of input token
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] #conditioning only on the last block_size tokens. for now 2.
            logits , loss = self(idx_cond)
            logits = logits[:, -1, :] #BXC  , for each batch we take the predictions for the last token.
            probs = F.softmax(logits, dim=-1) #BXC , converting to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) #BX1 , the next predicted token for each batch.
            idx = torch.cat((idx, idx_next), dim=1) # BX(T+1) , appending the predicted token to the input sequence.
        return idx


m = BigramLanguageModel().to(device)
out , loss = m(xb , yb)
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)
#training the model , as before.
batch_size = 32 #how many independent sequences will we process in parallel
for steps in range(1000):
    #get a batch of data
    xb , yb = get_batch('train')
    #evaluate the loss
    logits , loss = m(xb, yb)
    #backpropagation
    optimizer.zero_grad(set_to_none=True) #set gradients to zero , to avoid accumulation from previous step.
    loss.backward() # compute gradients , see how much each weight contributed to the loss.
    optimizer.step()
    if steps % 10 == 0:
        print(steps, loss.item())


0 4.333260536193848
10 4.128382682800293
20 3.872257947921753
30 3.730135440826416
40 3.5248467922210693
50 3.2394115924835205
60 3.3388290405273438
70 3.3383145332336426
80 3.2185425758361816
90 3.2796216011047363
100 3.1258187294006348
110 3.121286392211914
120 3.1905524730682373
130 3.123028516769409
140 2.93449068069458
150 3.1797943115234375
160 3.1929144859313965
170 3.1662614345550537
180 3.2666451930999756
190 3.0200681686401367
200 3.0554254055023193
210 3.3084311485290527
220 3.0297930240631104
230 3.004370927810669
240 3.0135908126831055
250 3.049196243286133
260 3.0346803665161133
270 3.071700096130371
280 2.960993528366089
290 3.0096285343170166
300 2.8178625106811523
310 2.746882438659668
320 2.7231075763702393
330 2.8228759765625
340 2.8194055557250977
350 2.856890916824341
360 2.925802707672119
370 2.792895793914795
380 2.92997407913208
390 2.8448657989501953
400 2.729879856109619
410 2.8665823936462402
420 2.6970646381378174
430 2.725980043411255
440 2.68788743019104
4

In [42]:
print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))


Whent ikitridcowi,
T is b, bte

Hiset bube ule.
S:
O-ans mealatauss ar bthif uw he, vete redthas ate awice my.

HDEEarut orour
Yowthertof isth bet mil ndilincaes iree sengcin lat HFridrov te, anen m pnganr.
Trans!
el lind me ut liser onchiry wture aiss hewty.
Hllinte korfopetelaves
Mk:
Ill, dl tthak


Now multiple heads of self attention , as well as MLP (copy paste from above and modify).

In [43]:

import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

n_embd = 32
vocab_size = len(chars)

#implelementing single head self attention as a nn.Module
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd , head_size , bias=False) #key , what to look for
        self.query = nn.Linear(n_embd , head_size , bias=False) #query , what to match with
        self.value = nn.Linear(n_embd , head_size , bias=False) #value , what to return
        self.register_buffer('tril' , torch.tril(torch.ones(block_size , block_size))) #lower triangular matrix for masking


    def forward(self, x):
        B , T, C = x.shape
        k = self.key(x)     # BXTXhead_size
        q = self.query(x)   # BXTXhead_size
        wei = q @ k.transpose(-2 , -1) * (k.shape[-1] ** -0.5) #scaling factor , for softmax stability .
        wei = wei.masked_fill(self.tril[:T , :T] == 0 , float('-inf'))
        wei = F.softmax(wei , dim=-1) #softmax along the rows
        v= self.value(x)   # BXTXhead_size
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self , num_heads , head_size):
        super().__init__()
        #create a list of multiple heads.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self , x):
        #concatenate the output of each head along the channel dimension , basiclly stacking them.
        out = torch.cat([h(x) for h in self.heads] , dim=-1)
        return out




class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd ,n_embd), nn.ReLU()
        )

    def forward(self, x):
        return self.net(x)


class BigramLanguageModel(nn.Module):
    def __init__(self):
        #each row represents a token id , and each mat[i,j] is the score for jth token in the vocab given ith token.
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) #for each token we get a vector its repr'.
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.sa_heads = MultiHeadAttention(num_heads=4 , head_size=n_embd//4) #self attention head , num_heads * head_size = n_embd
        self.ffwd = FeedForward(n_embd) #feed forward layer
        self.position_embedding_table = nn.Embedding(block_size , n_embd) #repr' of the position of the token in the sequence.

    def forward(self, idx, targets=None):
        B,T = idx.shape
        #idx is the BXT tensor of input token indices .
        token_emb = self.token_embedding_table(idx) #BXTXC.
        pos_emb = self.position_embedding_table(torch.arange(T , device=device)) #TXC  .
        x = token_emb + pos_emb #BXTXC , adding token embedding and position embedding.
        x = self.sa_heads(x) #applying self attention , making the x tokens enriched with context from previous tokens.
        x = self.ffwd(x) #applying feed forward layer.
        logits = self.lm_head(x) #BXTXvocab_size
        if targets is None:
            loss = None
        else:
            B , T, C = logits.shape
            logits = logits.view(B*T , C) # (BXT)XC. reshaping for loss computation 3d to 2d.
            targets = targets.view(B*T) # (BXT) . reshaping for loss computation 2d to 1d.
            #how well we are predicting the targets
            loss = F.cross_entropy(logits, targets)
            #for each entry we will return the logits for the next token.
        return logits , loss

    def generate(self, idx, max_new_tokens):
        #idx is a BXT tensor of input token
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] #conditioning only on the last block_size tokens. for now 2.
            logits , loss = self(idx_cond)
            logits = logits[:, -1, :] #BXC  , for each batch we take the predictions for the last token.
            probs = F.softmax(logits, dim=-1) #BXC , converting to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) #BX1 , the next predicted token for each batch.
            idx = torch.cat((idx, idx_next), dim=1) # BX(T+1) , appending the predicted token to the input sequence.
        return idx





In [44]:
m = BigramLanguageModel().to(device)
out , loss = m(xb , yb)
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)
#training the model , as before.
batch_size = 32 #how many independent sequences will we process in parallel
for steps in range(5000):
    #get a batch of data
    xb , yb = get_batch('train')
    #evaluate the loss
    logits , loss = m(xb, yb)
    _ , val_loss = m(xb, yb)
    #backpropagation
    optimizer.zero_grad(set_to_none=True) #set gradients to zero , to avoid accumulation from previous step.
    loss.backward() # compute gradients , see how much each weight contributed to the loss.
    optimizer.step()
    if steps % 10 == 0:
        print(steps, loss.item() , val_loss.item())

In [45]:
 print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))


Whent ikitridcowi,
T is b, bte

Hiset bube ule.
S:
O-ans mealatauss ar bthif uw he, vete redthas ate awice my.

HDEEarut orour
Yowthertof isth bet mil ndilincaes iree sengcin lat HFridrov te, anen m pnganr.
Trans!
el lind me ut liser onchiry wture aiss hewty.
Hllinte korfopetelaves
Mk:
Ill, dl tthak


Scaled model with layer norm and residual connections.

In [46]:

import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)
#defiining hyperparameters
batch_size = 32
block_size = 128
learning_rate = 3e-4
n_layer = 6
n_heads = 6
n_embd = 384
dropout = 0.2
vocab_size = len(chars)

#implelementing single head self attention as a nn.Module
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd , head_size , bias=False) #key , what to look for
        self.query = nn.Linear(n_embd , head_size , bias=False) #query , what to match with
        self.value = nn.Linear(n_embd , head_size , bias=False) #value , what to return
        self.register_buffer('tril' , torch.tril(torch.ones(block_size , block_size))) #lower triangular matrix for masking
        self.dropout = nn.Dropout(dropout)


    def forward(self, x):
        B , T, C = x.shape
        k = self.key(x)     # BXTXhead_size
        q = self.query(x)   # BXTXhead_size
        wei = q @ k.transpose(-2 , -1) * (k.shape[-1] ** -0.5) #scaling factor , for softmax stability .
        wei = wei.masked_fill(self.tril[:T , :T] == 0 , float('-inf'))
        wei = F.softmax(wei , dim=-1) #softmax along the rows
        wei = self.dropout(wei)
        v= self.value(x)   # BXTXhead_size
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self , num_heads , head_size):
        super().__init__()
        #create a list of multiple heads.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads*head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self , x):
        #concatenate the output of each head along the channel dimension , basiclly stacking them.
        out = torch.cat([h(x) for h in self.heads] , dim=-1)
        out = self.proj(out)
        return out




class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd ,4 * n_embd),
            nn.ReLU(),
            nn.Linear(4* n_embd ,n_embd),
            nn.Dropout(dropout) #adding dropout for regularization
        )

    def forward(self, x):
        return self.net(x)



class Block(nn.Module):
    def __init__(self, n_embd , num_heads):
        super().__init__()
        head_size = n_embd // num_heads
        self.sa = MultiHeadAttention(num_heads , head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) #residual connection , normalize before transforming.
        x = x + self.ffwd(self.ln2(x)) #residual connection , normalize before transforming.
        return x


class BigramLanguageModel(nn.Module):
    def __init__(self):
        #each row represents a token id , and each mat[i,j] is the score for jth token in the vocab given ith token.
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) #for each token we get a vector its repr'.
        self.position_embedding_table = nn.Embedding(block_size , n_embd) #repr' of the position of the token in the sequence.
        self.blocks = nn.Sequential(*[Block(n_embd , num_heads=n_heads) for _ in range(n_layer)]) #stacking multiple blocks
        self.ln_f = nn.LayerNorm(n_embd) #final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        #idx is the BXT tensor of input token indices .
        token_emb = self.token_embedding_table(idx) #BXTXC.
        pos_emb = self.position_embedding_table(torch.arange(T , device=device)) #TXC  .
        x = token_emb + pos_emb #BXTXC , adding token embedding and position embedding.
        x = self.blocks(x) #applying self attention , making the x tokens enriched with context from previous tokens.
        x = self.ln_f(x) #applying feed forward layer.
        logits = self.lm_head(x) #BXTXvocab_size
        if targets is None:
            loss = None
        else:
            B , T, C = logits.shape
            logits = logits.view(B*T , C) # (BXT)XC. reshaping for loss computation 3d to 2d.
            targets = targets.view(B*T) # (BXT) . reshaping for loss computation 2d to 1d.
            #how well we are predicting the targets
            loss = F.cross_entropy(logits, targets)
            #for each entry we will return the logits for the next token.
        return logits , loss

    def generate(self, idx, max_new_tokens):
        #idx is a BXT tensor of input token
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] #conditioning only on the last block_size tokens. for now 2.
            logits , loss = self(idx_cond)
            logits = logits[:, -1, :] #BXC  , for each batch we take the predictions for the last token.
            probs = F.softmax(logits, dim=-1) #BXC , converting to probabilities
            idx_next = torch.multinomial(probs, num_samples=1) #BX1 , the next predicted token for each batch.
            idx = torch.cat((idx, idx_next), dim=1) # BX(T+1) , appending the predicted token to the input sequence.
        return idx





In [47]:

m = BigramLanguageModel().to(device)
out , loss = m(xb , yb)
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)
#training the model , as before.
batch_size = 32 #how many independent sequences will we process in parallel
for steps in range(5000):
    #get a batch of data
    xb , yb = get_batch('train')
    xb_val , yb_val = get_batch('val')
    #evaluate the loss
    logits , loss = m(xb, yb)
    _ , val_loss = m(xb_val, yb_val)
    #backpropagation
    optimizer.zero_grad(set_to_none=True) #set gradients to zero , to avoid accumulation from previous step.
    loss.backward() # compute gradients , see how much each weight contributed to the loss.
    optimizer.step()
    if steps % 10 == 0:
        print(steps, loss.item() , val_loss.item())

In [48]:
#saving the model state
torch.save(m.state_dict() , 'bigram_model.pth')

In [49]:
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))

RuntimeError: The size of tensor a (8) must match the size of tensor b (9) at non-singleton dimension 2

In [51]:
#write the generated text to a file
generated_text = decode(m.generate(idx, max_new_tokens=1000)[0].tolist())
with open('generated_text.txt', 'w' , encoding='utf-8') as file:
    file.write(generated_text)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [52]:

model = BigramLanguageModel().to(device)
model.load_state_dict(torch.load('bigram_model.pth'))
m = model
for i in range(3):
    print(f"iteration number {i+1}:")
    print(decode(m.generate(idx, max_new_tokens=300)[0].tolist()))
    print("\n\n")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
