In [6]:
!curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  4640k      0 --:--:-- --:--:-- --:--:-- 4674k


In [2]:
# Read the whole corpus into memory as one string; the with-block closes the file.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("Length of characters in the dataset: ", len(text))

Length of characters in the dataset:  1115394


In [4]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# Show the alphabet on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# mapping from char to int and vice-versa
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of ``s``, in order.

    Raises KeyError if ``s`` contains a character that is not a key of ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises KeyError if any id is not a key of ``itos``.
    """
    return ''.join(itos[i] for i in l)

In [7]:
print(encode("hello there"))
print(decode(encode('hello there')))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [8]:
# converting data into a torch.tensor
import torch
# Encode the full corpus once into a 1-D tensor of token ids (dtype torch.long).
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])  # the first 100 token ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# creating training-validation split
# Contiguous split without shuffling: the first 90% of the token stream is used for
# training and the remaining tail for validation.
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [11]:
# A chunk of block_size + 1 tokens holds block_size training examples:
# each prefix x[:t+1] is an input and the token right after it, y[t], is its label.
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [12]:
torch.manual_seed(1523)  # fix the RNG so the sampled batch below is repeatable
batch_size = 4  # number of sequences sampled per batch (read by get_batch)
block_size = 8  # number of tokens in each sampled sequence (read by get_batch)

def get_batch(split):
    """Sample a random batch of input/target sequences.

    ``split == 'train'`` draws from ``train_data``; any other value draws from
    ``val_data``. Reads the module-level ``batch_size`` and ``block_size``.

    Returns a pair ``(x, y)`` of stacked tensors with ``batch_size`` rows of
    ``block_size`` tokens each, where ``y`` is ``x`` shifted one position to the right
    in the source stream.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per row; the exclusive upper bound leaves room for the
    # target window, which ends one token later than the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print(xb)
print(yb)

tensor([[ 1, 59, 57,  1, 61, 47, 52,  1],
        [59, 51, 40, 50, 63,  1, 41, 53],
        [57,  1, 58, 46, 43,  0, 57, 59],
        [39, 52, 42,  1, 42, 53,  1, 50]])
tensor([[59, 57,  1, 61, 47, 52,  1, 53],
        [51, 40, 50, 63,  1, 41, 53, 51],
        [ 1, 58, 46, 43,  0, 57, 59, 40],
        [52, 42,  1, 42, 53,  1, 50, 53]])


In [13]:
# Spell out every (context, target) pair contained in the batch: for row b and
# position t the context is xb[b, :t+1] and the label is yb[b, t].
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context} the target is {target}")

when input is tensor([1]) the target is 59
when input is tensor([ 1, 59]) the target is 57
when input is tensor([ 1, 59, 57]) the target is 1
when input is tensor([ 1, 59, 57,  1]) the target is 61
when input is tensor([ 1, 59, 57,  1, 61]) the target is 47
when input is tensor([ 1, 59, 57,  1, 61, 47]) the target is 52
when input is tensor([ 1, 59, 57,  1, 61, 47, 52]) the target is 1
when input is tensor([ 1, 59, 57,  1, 61, 47, 52,  1]) the target is 53
when input is tensor([59]) the target is 51
when input is tensor([59, 51]) the target is 40
when input is tensor([59, 51, 40]) the target is 50
when input is tensor([59, 51, 40, 50]) the target is 63
when input is tensor([59, 51, 40, 50, 63]) the target is 1
when input is tensor([59, 51, 40, 50, 63,  1]) the target is 41
when input is tensor([59, 51, 40, 50, 63,  1, 41]) the target is 53
when input is tensor([59, 51, 40, 50, 63,  1, 41, 53]) the target is 51
when input is tensor([57]) the target is 1
when input is tensor([57,  1]) th

In [49]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1523)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table; the row
    looked up for a token id is used directly as the logits over the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Lookup table read as logits: token id -> one score per vocabulary entry.
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional integer tensor with B*T elements holding the id that
                follows each position of ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and ``loss``
            is None. With targets, ``logits`` is flattened to (B*T, C) and ``loss``
            is the cross-entropy between the flattened logits and targets.
        """
        logits = self.embedding(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy is fed (N, C) scores and (N,) class indices, so flatten the
        # batch and time axes. reshape (unlike view) also accepts non-contiguous
        # tensors, e.g. targets produced by a transpose or a strided slice.
        logits = logits.reshape(B*T, C)
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)                           # (B, T, C)
            logits = logits[:, -1, :]                           # last time step only -> (B, C)
            probs = F.softmax(logits, dim=-1)                   # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)             # (B, T+1)
        return idx

In [50]:
# Sanity-check the untrained model on the batch sampled earlier.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # targets were passed, so logits come back flattened to (B*T, C)
print(loss)

# Sample 100 tokens from a one-token context holding id 0, then decode row 0 to text.
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.3975, grad_fn=<NllLossBackward0>)

ZU-EflRSAk 
slegLNKrzWj!HqtqDH&Y-vKwmI KtBqgM.li.zvC'CBIEfW .?czWfhxQWfZXrNKAe-n-$;:PPJgfxKUgJLLE3!s


In [57]:
# create optimizer
# AdamW over all parameters of m, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [60]:
# Train the bigram model for 10000 optimizer steps on random training batches.
batch_size = 32  # rebinds the global that get_batch reads when sizing a batch
for steps in range(10000):

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop gradients left from the previous step
    loss.backward()
    optimizer.step()

# Outside the loop: this prints only the loss of the last batch processed.
print(loss.item())

4.628926753997803
4.562806129455566
4.5137104988098145
4.558047771453857
4.414850234985352
4.5793538093566895
4.657242298126221
4.561790466308594
4.5307817459106445
4.453771591186523
4.474540710449219
4.55403995513916
4.601320266723633
4.495065689086914
4.524272918701172
4.427448272705078
4.365524768829346
4.572581768035889
4.538097858428955
4.459070682525635
4.426416873931885
4.4836249351501465
4.493927001953125
4.422773361206055
4.542057037353516
4.420485019683838
4.44406795501709
4.5204620361328125
4.54227876663208
4.428467750549316
4.496697902679443
4.471587181091309
4.508090019226074
4.580605983734131
4.43241024017334
4.680766582489014
4.570406913757324
4.524282455444336
4.484063625335693
4.455024719238281
4.538830757141113
4.430019378662109
4.505826473236084
4.438478469848633
4.476005554199219
4.53209114074707
4.580188274383545
4.5126237869262695
4.44943380355835
4.314098358154297
4.4460768699646
4.44118070602417
4.490835666656494
4.409595966339111
4.436740398406982
4.49106597900

In [63]:
# Sample 500 tokens from a one-token context holding id 0 and decode row 0 to text.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


FloveGoupaler ablacinds y be n
Bot stle: t!
TI howasthifew.

Ong, In m
Can rtoK:

Whitis ce t for thite,S: hor, agorind,
ARORI ald byompeestim,

PORINVIAn'd fes se tor n SxFrds ipar hadosa y th,
BRICUCEThayoul turd sobris
WA:
BENCK:
NCHAYousungs hatou o wouthaun, tofe ovecowenothis:

Ind t tly Cais, aknt.
HAnoune atheachere fyerit, atyoueco tine t hord d ns
I furno felyo, ance bune be aber d athand,

ie AThe wale leNCLithow,
INONI as.
TERUSAY:
MI corsweXand

fe
Bllit d.
BRY futis bre, n'ssoyou I


# The mathematical trick in self-attention

In [2]:
# toy example
import torch
torch.manual_seed(1523)  # fixed seed so the random tensor below is repeatable
B,T,C = 4, 8, 2     # batch, time, channels
x = torch.randn(B,T,C)  # random (B, T, C) tensor used by the averaging versions below
x.shape

torch.Size([4, 8, 2])

In [6]:
# version 1: explicit loops
# xbow[b, t] is the mean over time steps 0..t of x[b], i.e. a running average that
# only looks at the current and earlier positions.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]              # (t+1, C)
        xbow[b, t] = xprev.mean(dim=0)  # (C,)

In [21]:
# version 2: the same running average as one matrix product
# Row t of the lower-triangular matrix holds t+1 ones; dividing each row by its sum
# turns it into uniform averaging weights over positions 0..t.
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) @ (B, T, C) -> (B, T, C)

In [27]:
# version 3: use softmax
from torch.nn import functional as F
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
# Positions above the diagonal are set to -inf so softmax assigns them weight 0.
wei = wei.masked_fill(tril == 0, float('-inf'))
# The remaining entries of each row are all 0, so softmax makes them uniform weights.
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)  # compare against the loop version

True

In [11]:
x[0]

tensor([[ 0.1620,  0.3233],
        [ 1.2124, -1.8518],
        [-2.8075, -0.5365],
        [-0.4785,  0.4920],
        [ 0.8500, -1.8323],
        [ 0.8777,  2.1470],
        [ 0.6613,  1.3069],
        [ 0.5331, -0.3560]])

In [8]:
xbow[0]

tensor([[ 0.1620,  0.3233],
        [ 0.6872, -0.7642],
        [-0.4777, -0.6883],
        [-0.4779, -0.3932],
        [-0.2123, -0.6810],
        [-0.0307, -0.2097],
        [ 0.0682,  0.0070],
        [ 0.1263, -0.0384]])

In [19]:
# Small 3x3 illustration of averaging via a row-normalised lower-triangular matrix.
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)  # each row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # row i of c is the mean of rows 0..i of b

In [20]:
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[3., 0.],
        [9., 7.],
        [0., 4.]])
tensor([[3.0000, 0.0000],
        [6.0000, 3.5000],
        [4.0000, 3.6667]])


In [39]:
# version 4: self-attention!
import torch.nn as nn
B,T,C = 4,8,32
x = torch.randn(B, T, C)

# single head performing self-attention
head_size = 16
# Three bias-free linear projections from C channels down to head_size.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)      # (B, T, 16)
q = query(x)    # (B, T, 16)

# Query-key dot products, scaled by 1/sqrt(head_size).
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
tril = torch.tril(torch.ones(T, T))
# Mask entries above the diagonal so position t gets zero weight on later positions.
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # each row becomes weights that sum to 1
v = value(x)    # (B, T, 16)
out = wei @ v   # (B, T, T) @ (B, T, 16) ---> (B, T, 16)
print(out.shape)

torch.Size([4, 8, 16])


In [40]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4139, 0.5861, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3034, 0.4580, 0.2386, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4450, 0.3075, 0.1296, 0.1180, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1585, 0.2631, 0.1385, 0.2405, 0.1995, 0.0000, 0.0000, 0.0000],
        [0.1570, 0.1413, 0.1222, 0.2675, 0.1855, 0.1265, 0.0000, 0.0000],
        [0.2890, 0.1404, 0.1143, 0.0960, 0.1428, 0.1051, 0.1125, 0.0000],
        [0.1588, 0.1402, 0.1141, 0.1008, 0.1010, 0.1458, 0.1167, 0.1226]],
       grad_fn=<SelectBackward0>)

In [41]:
# Open out.txt in append mode (this creates the file if it is missing, without
# truncating it). The context manager closes the handle immediately rather than
# leaving an open file object bound to `f` in the kernel.
with open("out.txt", "a") as f:
    pass

In [42]:
# Append the text "hello" to out.txt; the with-block closes the file on exit.
with open("out.txt", "a") as f:
    f.write("hello")
    