In [10]:
%%bash
# wget -O does not create missing parent directories, so make sure data/ exists first.
mkdir -p data
wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -O data/input.txt

--2024-04-17 18:08:28--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1,1M) [text/plain]
Saving to: ‘data/input.txt’

     0K .......... .......... .......... .......... ..........  4%  497K 2s
    50K .......... .......... .......... .......... ..........  9%  656K 2s
   100K .......... .......... .......... .......... .......... 13%  860K 1s
   150K .......... .......... .......... .......... .......... 18%  870K 1s
   200K .......... .......... .......... .......... .......... 22% 1,44M 1s
   250K .......... .......... .......... .......... .......... 27% 1,23M 1s
   300K .......... .......... .......... .......... .......... 32% 2,28M 1s
   350K ........

In [1]:
# Read the whole corpus into memory. The encoding is pinned so that the decoded
# text does not depend on the platform's default locale encoding.
with open("data/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [2]:
# Sanity check: show the first 1000 characters of the raw corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
# Vocabulary: every distinct character of the corpus, sorted so that the
# position of each character in `chars` is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

vocab_size

65

In [4]:
# Lookup tables between characters and integer token ids; an id is the
# character's position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(s):
    """Return the string spelled by the iterable of token ids `s`.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join([itos[c] for c in s])


encode("hello"), decode(encode("hello"))

([46, 43, 50, 50, 53], 'hello')

In [7]:
import torch
# Encode the whole corpus once and keep it as a 1-D tensor of int64 token ids
# (torch.int64 is the same dtype object as torch.long).
data = torch.tensor(encode(text), dtype=torch.int64)

data.size()

torch.Size([1115394])

In [8]:
# Round-trip check: decoding the first 1000 token ids should reproduce the
# start of the corpus. `.tolist()` yields plain Python ints, matching how the
# generation cells feed `decode`, and avoids a detour through numpy.
print(decode(data[:1000].tolist()))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [9]:
# Sequential split: the first 90% of the token stream is used for training and
# the remaining tail is held out.
n = int(len(data) * 0.9)
train_data = data[:n]
test_data = data[n:]

In [10]:
# Number of tokens in one input window.
block_size = 8

# A chunk of block_size + 1 tokens: get_batch() pairs each window of
# block_size tokens with the same window shifted one position forward.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# Mini-batch geometry read by get_batch(): windows per batch and tokens per window.
batch_size, block_size = 4, 8


def get_batch(split):
    """Draw a random mini-batch of (input, target) windows from one split.

    `split == "train"` samples from `train_data`; any other value samples
    from `test_data`. Both returned tensors stack `batch_size` windows of
    `block_size` tokens; each target window is the corresponding input
    window shifted one position forward.
    """
    source = test_data if split != "train" else train_data
    # Start offsets are capped so the shifted target window stays inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one small training batch and display inputs next to their shifted targets.
xb, yb = get_batch(split="train")
(xb, yb)

(tensor([[53, 43, 57,  2,  0,  0, 37, 27],
         [ 6,  1, 58, 53,  1, 46, 39, 60],
         [42,  1, 58, 46, 43, 43,  1, 46],
         [52, 42,  6,  1, 40, 63,  1, 56]]),
 tensor([[43, 57,  2,  0,  0, 37, 27, 30],
         [ 1, 58, 53,  1, 46, 39, 60, 43],
         [ 1, 58, 46, 43, 43,  1, 46, 39],
         [42,  6,  1, 40, 63,  1, 56, 53]]))

In [12]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Each token id indexes a (vocab_size, vocab_size) embedding table, and the
    looked-up row is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of target ids, shape (B, T).

        Returns:
            A `(logits, loss)` pair. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is flattened
            to (B * T, C) and `loss` is the cross-entropy against the
            flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        B, T, C = logits.shape
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        logits = logits.view(B * T, C)
        # reshape (rather than view) also accepts non-contiguous targets.
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens autoregressively.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens) whose first T
            columns are the original `idx`.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            last_logits = logits[:, -1, :]  # only the final position predicts the next token
            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Smoke test on the demo batch. For reference, a uniform guess over
# vocab_size classes would give a cross-entropy of ln(vocab_size).
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, targets=yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 65])
tensor(4.8705, grad_fn=<NllLossBackward0>)


In [13]:
# Sample 100 tokens starting from a single token with id 0, then decode the
# first (and only) row of the result back to text.
print(decode(m.generate(idx=torch.zeros(1, 1, dtype=torch.long), max_new_tokens=100)[0].tolist()))


S,u!bqn&K.eOK'Z?aZrWr'cctRVecw'ZpIVojtrbSlzf&PtjlXOXAbRv DTY,bTwu3JDj3LyDy'NqDffF!xOwpdNkRLbQUbJ:mTi


In [14]:
# AdamW over all parameters of `m`, with a fixed learning rate.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=0.001)

In [15]:
# Train with larger batches than the small demo batch.
# NOTE: get_batch() reads this module-level value, so every later call is affected.
batch_size = 32

for steps in range(10_000):
    # Clear gradients from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)
    xb, yb = get_batch(split="train")
    logits, loss = m(xb, targets=yb)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only, not an average over the run.
print(loss.item())

2.5104918479919434


In [16]:
# Sample again after the training loop, from the same single-token start (id 0).
print(decode(m.generate(idx=torch.zeros(1, 1, dtype=torch.long), max_new_tokens=100)[0].tolist()))


EE: pent ghefukns Ararlve tet fan'llanens, l:
T:
Ad d ha nee.


re fe yoanshavesoumas te ico'sherong


In [35]:
# Toy input: B sequences of T positions with C features each.
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Single head of self-attention: three bias-free projections down to head_size.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Dot product of every query with every key, scaled by 1/sqrt(head_size): (B, T, T).
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5

# Causal mask: positions above the diagonal are set to -inf so that softmax
# assigns them exactly zero weight, and each row only attends to itself and
# earlier positions.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float("-inf")), dim=-1)

# Weighted sum of the value vectors: (B, T, head_size).
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [32]:
# Attention weights of the first batch element: a (T, T) matrix whose entries
# above the diagonal are zero (causal mask) and whose rows sum to 1 (softmax).
print(wei[0])

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6913, 0.3087, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2206, 0.1809, 0.5985, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0287, 0.1124, 0.2303, 0.6286, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0362, 0.0867, 0.7950, 0.0525, 0.0296, 0.0000, 0.0000, 0.0000],
        [0.2114, 0.1150, 0.1533, 0.4387, 0.0310, 0.0507, 0.0000, 0.0000],
        [0.0360, 0.0487, 0.2004, 0.4546, 0.0265, 0.2129, 0.0207, 0.0000],
        [0.0066, 0.0207, 0.2377, 0.4177, 0.0154, 0.2375, 0.0096, 0.0548]],
       grad_fn=<SelectBackward0>)
