In [3]:
# read the data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
# len of data
print(len(text))

1115394


In [101]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [9]:
# unique chars
chars = sorted(list(set(text)))
vocab_size = len(chars)
unique_chars = ''.join(chars)   
print(unique_chars)
print("Vocab size: ", vocab_size)   


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size:  65


In [14]:
# create a mapping of unique chars to integers
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# encode and decode functions
encode = lambda s : [char_to_int[i] for i in s]
decode = lambda s : ''.join([int_to_char[i] for i in s])

ex = "hello sir"
print(encode(ex))
print(decode(encode(ex)))

[46, 43, 50, 50, 53, 1, 57, 47, 56]
hello sir


In [17]:
import torch

# encode data 
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) 

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [23]:
# split the data into train and test
training_perc = 0.8
n = int(len(data) * training_perc)

train_data = data[:n]
val_data = data[n:]

In [24]:
block_size = 8
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [25]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [210]:
# for reproducibility
torch.manual_seed(1337)

batch_size = 4 # Independent sequences will we process in parallel (B)
block_size = 8 # Maximum context length for predictions (T)
# vocab size (C)

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb) # input to the transformer
print('---------------------------')
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[58, 63,  8,  0,  0, 19, 24, 27],
        [39, 59, 45, 46, 58,  1, 46, 43],
        [49, 43, 57,  1, 53, 50, 42,  1],
        [52, 41, 47, 43, 52, 58,  1, 56]])
---------------------------
targets:
torch.Size([4, 8])
tensor([[63,  8,  0,  0, 19, 24, 27, 33],
        [59, 45, 46, 58,  1, 46, 43,  1],
        [43, 57,  1, 53, 50, 42,  1, 46],
        [41, 47, 43, 52, 58,  1, 56, 47]])


In [211]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, idx, targets=None):

        logits = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        
        if targets is None:
            loss = None
        else:
            # define the loss function
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(idx)
            # focus on the last token
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)
            # append the new token to the context
            idx = torch.cat([idx, next_token], dim=1)

        return idx
            

# create the model
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(loss)
print(logits.shape)
print(logits)

tensor(5.0493, grad_fn=<NllLossBackward0>)
torch.Size([32, 65])
tensor([[ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        [-0.8109,  0.2410, -0.1139,  ...,  1.4509,  0.1836,  0.3064],
        [ 1.1407,  0.8935, -2.4000,  ...,  0.3227,  1.5431, -1.0392],
        ...,
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296]],
       grad_fn=<ViewBackward0>)


In [215]:
# get the logits and loss without training
idx = torch.zeros((1, 1), dtype=torch.long)
predictions = model.generate(idx=idx, max_new_tokens=500)[0].tolist()
decoded_predictions = decode(predictions)
print(decoded_predictions)


Ffound ds de r s; gq--k!
ICAR:CAhar hurtDUxfonsopouQUCl:&PHai
Agr tylo o'

Nof3ullicdfoThiscolly s3Q! ty iscad ort,SBEy,
Alathal thid?-t Rofu,
Aclldato ld se ne, hathJXXMAm; lr ay menienearsenx'd themy Bemewiada and lit wothorou t
R
Ad ther cowaf stot?3Wh$'s
DYom

Yo'stTMqu ?she:
ARo G, ickiceirt izr EHUSusoS:
Ll

Mat gKI'sthe, hok.
Fofelin th,

'lolly!
To, q; s SDWhagapxErers agq-lullulea;
Ame w ANYRI: totof Tr n,

Y:
xccende gssthoiun norode,
A se iJGhe: rre sammoun 
TwNXmsHot'Thes smeth$R:
PG


In [216]:
# training the model
max_iterations = 5000 
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 32

for steps in range(max_iterations): # increase number of steps for good results...
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4823920726776123


In [217]:
# get the logits after training
idx = torch.zeros((1, 1), dtype=torch.long)
predictions = model.generate(idx=idx, max_new_tokens=500)[0].tolist()
decoded_predictions = decode(predictions)
print(decoded_predictions)


BR: ct
Ywit harfoul'st, ar izent t ct.
Fo, sther:
I d tre th,-she.
Wowstothedind rtee t the t,
STo way!
Thowest e tistloue ded ndean-bros g qpl mout fok yolaime do myo asto,
Mok h ay t nch sle fionhoured whaneables mye.
For f beng tho; ar!TCald? min, wherure;
Fing tyoucoracumad s?

Tord, g I:
Whireat pr f Yhy? menere hurer cr il f aloulatspribr,
AG o otr thall oull
NRartheat.
We

GAseraizL: ag, t wild he boro g s bl?
DWhapato' h bls whoommeulye Werngssamyou
Shar, hed wingoleshet heked
GUExf te, 


In [218]:
# self-attention - one head
torch.manual_seed(1337)

B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)

# compatibility matrix
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# mask the upper triangle of the matrix to prevent the model from looking into the future
wei = wei.masked_fill(tril == 0, float('-inf'))

# apply softmax to get the weights
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
out = wei @ x

out.shape

torch.Size([4, 8, 32])

In [219]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [271]:
dropout = 0.0

n_embd = 64

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   
        q = self.query(x) 
        # compute attention scores ("affinities") - scaled dot-product
        wei = q @ k.transpose(-2,-1) * C**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1) 
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) 
        out = wei @ v 
        return out

In [238]:
eval_iters = 200
device = 'cuda' if torch.cuda.is_available() else 'cpu'

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        
        B, T = idx.shape

        token_embedding = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        position_embedding = self.position_embedding_table(torch.arange(T, device=device)) # [block_size, vocab_size]
        x = token_embedding + position_embedding # [batch_size, block_size, vocab_size]
        x = self.sa_head(x) # apply self-attention
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            # define the loss function
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context to the block size
            idx_crp = idx[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(idx_crp)
            # focus on the last token
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)
            # append the new token to the context
            idx = torch.cat((idx, next_token), dim=1)

        return idx


In [239]:
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

learning_rate = 1e-3
max_iters = 5000
eval_interval = 500
n_embd = 32


# create the model
model = BigramLanguageModel()
model = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.007553 M parameters
step 0: train loss 4.2701, val loss 4.2669
step 500: train loss 2.6862, val loss 2.6973
step 1000: train loss 2.5271, val loss 2.5178
step 1500: train loss 2.4727, val loss 2.4988
step 2000: train loss 2.4321, val loss 2.4635
step 2500: train loss 2.4135, val loss 2.4415
step 3000: train loss 2.4034, val loss 2.4440
step 3500: train loss 2.4006, val loss 2.4263
step 4000: train loss 2.3902, val loss 2.4207
step 4500: train loss 2.3826, val loss 2.4186
step 4999: train loss 2.3882, val loss 2.4251


In [240]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))


CH:
EL: ty coonoor lpre houn; thell ar sarde ithies ghe you cokis may. BRANBupethou hanif cprdank ory lerquinout oy and be P:
ME:
A wofourns.

Ano wore, Sd, cave, were ucey my w:? orma, balal citol oment.

Wimy hen re ay whe.
Andiunge ht iromurd hid'dy tirthe pee;
O mte cete'sn pabrls is tuly hay.
S.
PBug cis. Clavet:
Mem:
On di till de ilin,
am msthat, groullindgu weysonlatel BAMous thind; or:
Who? to rust thivey; ond het bemathe, tren hell hot winede, priend to youse lt he I na swind,
O agreea


In [272]:
class MultiHeadAttention(nn.Module):
    """Running multiple heads in parallel"""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # concatenate the outputs at the channel dimension
        out =  torch.cat([h(x) for h in self.heads], dim=-1)
        # apply a linear transformation
        out = self.projection(out)
        out = self.dropout(out)
        return out

In [242]:

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # 4 heads of 8 dimensions each
        self.sa_heads = MultiHeadAttention(num_heads=4, head_size=n_embd//4)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        
        B, T = idx.shape

        token_embedding = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        position_embedding = self.position_embedding_table(torch.arange(T, device=device)) # [block_size, vocab_size]
        x = token_embedding + position_embedding # [batch_size, block_size, vocab_size]
        x = self.sa_heads(x) # apply self-attention
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            # define the loss function
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context to the block size
            idx_crp = idx[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(idx_crp)
            # focus on the last token
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)
            # append the new token to the context
            idx = torch.cat((idx, next_token), dim=1)

        return idx

In [243]:
# create the model
model = BigramLanguageModel()
model = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.007553 M parameters
step 0: train loss 4.1932, val loss 4.1909
step 500: train loss 2.6328, val loss 2.6509
step 1000: train loss 2.4840, val loss 2.4913
step 1500: train loss 2.4067, val loss 2.4392
step 2000: train loss 2.3691, val loss 2.3991
step 2500: train loss 2.3394, val loss 2.3795
step 3000: train loss 2.3081, val loss 2.3488
step 3500: train loss 2.2792, val loss 2.3404
step 4000: train loss 2.2701, val loss 2.3156
step 4500: train loss 2.2660, val loss 2.3046
step 4999: train loss 2.2399, val loss 2.2960


In [244]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))


wor ko thou iresbereachtius malg avenceve, his tho'd andchad yor havethes ave tis you, ingnot pou fel,
Boce epoo whame, fat. sand In:
To pcour she fugof ther ingard Ebe Sar me he of hore in othe sad come cearl Lo:
Vron now cenes liks ipe; fortin you thiem hin guok:
Wheencento for of hen wens annd Soid
Agh of vear yome the thin ur etad hince, wake
Thak wite, my me,
Of War the herorelwbe land caisbe homend't igh rot for thatll
YLhauy, if:
Sellomere sumbodbokee,
O MAn wlesle tho, worsh? sar I thin 


In [273]:
class FeedForward(nn.Module):
    """A simple feed-forward module"""
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            # project back to the original dimension
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)

In [253]:

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # 4 heads of 8 dimensions each
        self.sa_heads = MultiHeadAttention(num_heads=4, head_size=n_embd//4)
        self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        
        B, T = idx.shape

        token_embedding = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        position_embedding = self.position_embedding_table(torch.arange(T, device=device)) # [block_size, vocab_size]
        x = token_embedding + position_embedding # [batch_size, block_size, vocab_size]
        x = self.sa_heads(x) # apply self-attention
        x = self.ffwd(x) # apply the feed-forward network
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            # define the loss function
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context to the block size
            idx_crp = idx[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(idx_crp)
            # focus on the last token
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)
            # append the new token to the context
            idx = torch.cat((idx, next_token), dim=1)

        return idx

In [248]:
# create the model
model = BigramLanguageModel()
model = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.008609 M parameters
step 0: train loss 4.1665, val loss 4.1694
step 500: train loss 2.5867, val loss 2.6039
step 1000: train loss 2.4401, val loss 2.4661
step 1500: train loss 2.3858, val loss 2.4031
step 2000: train loss 2.3401, val loss 2.3611
step 2500: train loss 2.2965, val loss 2.3276
step 3000: train loss 2.2728, val loss 2.3140
step 3500: train loss 2.2473, val loss 2.2834
step 4000: train loss 2.2443, val loss 2.2706
step 4500: train loss 2.2252, val loss 2.2616
step 4999: train loss 2.2167, val loss 2.2638


In [249]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))


Colly setith bads, des waray wannd walded,- hat, artut ischee wieth reat ten, you; to tersomis chat wan, his?
Wadby to And the thet?

FUSWian phomy, there! eet it that the lilrave Ec
Cou Riftteds my sive thave, thou wereers; I yave toowive, be, warke wers dooves with thauld wandy:
Wavene.

COROK:
Thead no to I be pofory: pradive ren!
LLIZINGARD:
IWanCyWhat, deis:
And consike, my you bromcut
He;
Whaten.
Tenow tes the fall'dse nowsoum irjoimblave sient tear nall nocest weand cer.
And wame hy the; 


In [262]:
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.attn = MultiHeadAttention(num_heads=n_head, head_size=head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [276]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.1

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        
        B, T = idx.shape

        token_embedding = self.token_embedding_table(idx)  # [batch_size, block_size, vocab_size]
        position_embedding = self.position_embedding_table(torch.arange(T, device=device)) # [block_size, vocab_size]
        x = token_embedding + position_embedding # [batch_size, block_size, vocab_size]
        x = self.blocks(x) # apply the transformer blocks
        x = self.ln_f(x) # apply layer normalization
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            # define the loss function
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop the context to the block size
            idx_crp = idx[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(idx_crp)
            # focus on the last token
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            next_token = torch.multinomial(probs, num_samples=1)
            # append the new token to the context
            idx = torch.cat((idx, next_token), dim=1)

        return idx

In [277]:
# create the model
model = BigramLanguageModel()
model = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')
    
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.309313 M parameters
step 0: train loss 4.3498, val loss 4.3552
step 100: train loss 2.6389, val loss 2.6522
step 200: train loss 2.5043, val loss 2.5301
step 300: train loss 2.4258, val loss 2.4683
step 400: train loss 2.3703, val loss 2.4098
step 500: train loss 2.3094, val loss 2.3449
step 600: train loss 2.2753, val loss 2.3009
step 700: train loss 2.2200, val loss 2.2537
step 800: train loss 2.1823, val loss 2.2343
step 900: train loss 2.1430, val loss 2.2001
step 1000: train loss 2.1127, val loss 2.1599
step 1100: train loss 2.0744, val loss 2.1390
step 1200: train loss 2.0503, val loss 2.1148
step 1300: train loss 2.0197, val loss 2.0928
step 1400: train loss 2.0135, val loss 2.0828
step 1500: train loss 1.9821, val loss 2.0607
step 1600: train loss 1.9531, val loss 2.0446
step 1700: train loss 1.9500, val loss 2.0292
step 1800: train loss 1.9278, val loss 2.0260
step 1900: train loss 1.9137, val loss 2.0178
step 2000: train loss 1.8964, val loss 2.0021
step 2100: train loss 1.

In [267]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))


A vodesed farlouker fromer: prackener cond yed lidst oldry himst lain to shall feam restartyer
un you the beak
laister surefer with call
Onemplans him dot king frome:
Afrewreome's pith I now.
The hencioused pross with beant will beind mely.

JULF CHITHENENE:
He coldy it gracter'y pread, sunber the have so not remer, she their rest Tyour but flewty-that dooke have of it spoon nurrey bray deaster,
Is may show seacing thildly, I to day'n thou shaln
Stad chefout tilesedgs: runjusbit I his Ay, it wee
