In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
torch.manual_seed(1337)

<torch._C.Generator at 0x1189f6a50>

In [2]:
with open('input.txt', 'r', encoding = 'utf-8') as f: 
    text = f.read()
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


Getting Our Dictionary

In [3]:
chars = sorted(list(set(text))) #this gives us every character in the set
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [4]:
stoi = {ch:i for i, ch in enumerate(chars)} #creates a dictionary with each character as a key and a number as a value
itos = {i:ch for i, ch in enumerate(chars)} #creates a dictionary with an integer as a key and a character as a value
encode = lambda s: [stoi[c] for c in s]
decode = lambda s: ''.join([itos[i] for i in s])
data = torch.tensor(encode(text), dtype=torch.long)
print(stoi)
print(data)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
tensor([18, 47, 56,  ..., 45,  8,  0])


Creating Training/Testing Split

In [5]:
n = int(0.9 * len(data))
train_data = data[:n]

In [6]:
# Training/Target Example Format
block_size = 8
block_data = train_data[:block_size+1]
x = block_data[:block_size]
y = block_data[1:block_size + 1]

print(x)
print(y)
print()

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


Hyperparameters

In [7]:
batch_size = 64 
block_size = 256 
max_iters = 5000
learning_rate = 3e-4
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

Creating Batches

In [8]:
def get_batch():
    ix = torch.randint(len(data) - block_size, (batch_size,)) 
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch()
print(xb.shape)
print(yb.shape)

torch.Size([64, 256])
torch.Size([64, 256])


Creating Our Model

In [9]:
class Head(nn.Module):
    def __init__(self, head_size):  
        super().__init__()
        self.key = nn.Linear(n_embd, head_size)                                     
        self.query = nn.Linear(n_embd, head_size)
        self.values = nn.Linear(n_embd, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)                                                 # (B, T, head_size)
        q = self.query(x)                                               # (B, T, head_size)
        v = self.values(x)                                              # (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * C **-0.5                        # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))    # (B, T, T)
        wei = F.softmax(wei, dim=-1)                                    # (B, T, T)
        wei = self.dropout(wei)                                         # (B, T, T)
        out = wei @ v                                                   # (B, T, head_size)
        return out

In [10]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # there is one heading per embedding 
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)             # (B, T, embedding_size) -> remember, we calculated head_size by doing embd_size // num_heads -> so head_size * num_heads = embd_size = C
        out = self.dropout(self.proj(out))                              # (B, T, embedding_size)                  
        return out                                                      # (B, T, embedding_size)   

In [11]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),                              # this is the projection layer for residual connections
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.net(x)                                              # (B, T, embedding_size)   

In [12]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadAttention(n_head, head_size)                 # (B, T, embedding_size)   
        self.ffwd = FeedForward(n_embd)                                 # (B, T, embedding_size)   
        self.ln1 = nn.LayerNorm(n_embd)                                 # (B, T, embedding_size)   
        self.ln2 = nn.LayerNorm(n_embd)                                 # (B, T, embedding_size)   
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))                                    # (B, T, embedding_size)   
        x = x + self.ffwd(self.ln2(x))                                  # (B, T, embedding_size)   
        return x

In [13]:
class TransformerModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # this normalizes each row 
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        positions = torch.arange(T)                                 # (1, block_size) -> [1,2...block_size]
        tok_emb = self.token_embedding_table(idx)                   # (B, T, embedding_size)
        pos_emb = self.position_embedding_table(positions)          # (T, embedding_size)
        x = tok_emb + pos_emb                                       # (B, T, embedding_size) -> adds each character's embedded value
        x = self.blocks(x)                                          # (B, T, embedding_size)
        x = self.ln_f(x)                                            # (B, T, embedding_size)
        logits = self.lm_head(x)                                    # (B, T, vobab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        
        for _ in range(max_new_tokens):
            
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx_cond, idx_next), dim=1) # (B, T+1)
            
        return idx

transformer = TransformerModel()
logits, loss = transformer(xb, yb)
print(logits.shape)
print(yb.shape)
print(loss)

torch.Size([16384, 65])
torch.Size([64, 256])
tensor(4.2668, grad_fn=<NllLossBackward0>)


Training Our Model

In [14]:
optimizer = torch.optim.AdamW(transformer.parameters(), lr=1e-3)
pbar = tqdm(total=3)    # or max_iters

for iter in range(3):   # or max_iters
        
    xb, yb = get_batch()
    logits, loss = transformer(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    pbar.update(1)

pbar.close()
print(loss.item())

100%|██████████| 3/3 [00:29<00:00,  9.93s/it]

3.7664666175842285





Generating Data

In [15]:
context = torch.zeros((1, 1), dtype=torch.long)
print(decode(transformer.generate(context, max_new_tokens=100)[0].tolist()))


IEy,hPi&nodMh
ehatKtW
niAhgiiou;ro
pchoN
hoTtsn!ceshtocrthrGo:hosrstclWransreishhttr!hfasntnsdlb:Gno
