In [21]:
#read the whole corpus into memory as one string, decoding the file as UTF-8
with open('Data/input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [22]:
#report the corpus size; len() of a str counts characters, not bytes
print("Length of dataset in characters: ", len(text))

Length of dataset in characters:  1115394


In [23]:
#peek at the start of the corpus to sanity-check the load
print("First 100 characters: ", text[:100])

First 100 characters:  First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [24]:
#build the character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
#show the vocabulary as one string, then its size on the next line
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [25]:
#lookup tables between characters and integer ids; an id is the character's position in chars
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

def encode(s):
    """Encoder (string -> #): return the list of integer ids for the characters of s."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder (# -> string): return the string spelled by the integer ids in l."""
    return ''.join(itos[i] for i in l)

print(encode("Hello there."))
print(decode(encode("Hello there")))

[20, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43, 8]
Hello there


In [26]:
#encoding entire text dataset
import torch
#one token id per character, stored as a 1-D int64 tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [27]:
#hold out the final 10% of the tokens for validation; the first 90% are used for training
n = int(0.9*len(data))
train_data, valid_data = data[:n], data[n:]

In [28]:
#context length; a window of block_size+1 tokens is shown because every input token needs a following target token
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [30]:
#y is x shifted one position to the right, so y[t] is the token that follows the prefix x[:t+1]
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [32]:
#fix the RNG seed before sampling batches
torch.manual_seed(23)
batch_size = 4 #number of sequences we process in parallel
block_size = 8 #maximum context length for predictions

def get_batch(split):
    """Sample a random mini-batch of input/target windows.

    split: 'train' selects train_data; any other value selects valid_data.
    Returns (x, y), each stacked from batch_size windows of block_size tokens,
    where y is x shifted one position to the right.
    """
    source = valid_data if split != 'train' else train_data
    #randint's upper bound is exclusive, so every start leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

#draw one training batch and show its inputs and targets
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')
print('------')

#spell out every (context, target) pair contained in the batch
for b in range(batch_size): #batch dimension
    for t in range(block_size): #time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[41, 47, 56, 41, 59, 51, 57, 54],
        [51, 43,  6,  1, 42, 53,  1, 63],
        [32, 59, 58, 53, 56, 10,  0, 13],
        [ 1, 57, 46, 43,  1, 57, 54, 43]])
targets:
torch.Size([4, 8])
tensor([[47, 56, 41, 59, 51, 57, 54, 43],
        [43,  6,  1, 42, 53,  1, 63, 53],
        [59, 58, 53, 56, 10,  0, 13, 46],
        [57, 46, 43,  1, 57, 54, 43, 39]])
------
when input is [41] the target: 47
when input is [41, 47] the target: 56
when input is [41, 47, 56] the target: 41
when input is [41, 47, 56, 41] the target: 59
when input is [41, 47, 56, 41, 59] the target: 51
when input is [41, 47, 56, 41, 59, 51] the target: 57
when input is [41, 47, 56, 41, 59, 51, 57] the target: 54
when input is [41, 47, 56, 41, 59, 51, 57, 54] the target: 43
when input is [51] the target: 43
when input is [51, 43] the target: 6
when input is [51, 43, 6] the target: 1
when input is [51, 43, 6, 1] the target: 42
when input is [51, 43, 6, 1, 42] the target: 53
when input is

In [44]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size, vocab_size) embedding table; row i holds
    the logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        #each token directly reads off the logits for the next token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        idx and targets are both (B,T) tensors of integer token ids.
        Returns (logits, loss):
          - targets is None: logits is (B,T,C) and loss is None
          - otherwise: logits is flattened to (B*T, C) and loss is a scalar tensor
        """
        logits = self.token_embedding_table(idx) #(B,T,C)
        if targets is None:
            loss = None
        else:
            #F.cross_entropy wants (N, C) logits and (N,) targets, so fold batch and time together
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            #reshape (not view) so non-contiguous targets are accepted too
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad() #sampling needs no gradients; avoid building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend idx by max_new_tokens sampled tokens.

        idx is a (B,T) tensor of token ids for the current context.
        Returns a (B, T+max_new_tokens) tensor: the original context followed
        by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            #get predictions
            logits, _ = self(idx)
            #focus only on the last time step
            logits = logits[:, -1, :] #becomes (B, C)
            #apply softmax over the vocabulary dimension to get probabilities
            probs = F.softmax(logits, dim=-1) #(B, C)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)
        return idx

In [45]:
#run one forward pass of the untrained model on the sample batch and inspect the raw outputs
model1 = BigramLanguageModel(vocab_size)
logits, loss = model1(xb, yb)
print(logits.shape)
#with targets given, logits come back flattened to (B*T, C), so row 0 belongs to the first token of the batch
sing_channel = logits[0, :]
#chars[41] is presumably the character for that first token (xb[0, 0]) -- verify if the seed or batch changes
print(sing_channel.shape, chars[41], sing_channel, loss, sep='\n')

torch.Size([32, 65])
torch.Size([65])
c
tensor([-1.2679, -0.3498, -0.2310,  1.7544, -1.8384, -2.0511,  3.1869, -1.2115,
         0.3610, -0.3943, -0.2203, -1.2162,  0.5036,  0.2842,  0.0109, -2.4267,
         0.6823,  1.0333, -1.5323, -0.2298, -0.7750,  0.5155,  1.0962,  0.3386,
        -0.1593, -0.1236, -0.1805,  0.3471,  0.2264, -1.9593,  0.1529, -1.7791,
         0.2220,  0.6876, -1.4499, -0.3027,  0.4494, -0.9640, -0.4338, -0.8066,
        -0.8323, -1.5387,  1.0941, -0.0093, -2.2536, -0.0849, -0.1019,  1.0514,
        -0.6846,  0.4518, -1.1508,  1.7803,  0.7213,  0.1564,  1.0953,  1.5976,
         0.0942, -0.6389,  1.6595, -0.3800, -0.7967, -0.1021,  1.1271,  3.1114,
        -2.3754], grad_fn=<SliceBackward0>)
tensor(4.4945, grad_fn=<NllLossBackward0>)


In [49]:
#start generation from a single token with id 0 (a batch of one sequence) and sample 100 tokens
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model1.generate(idx, max_new_tokens=100)[0].tolist()))
#obviously the model is horrible because we haven't even trained it yet.


Fu-b;Wzc$fHV 'c, Wd;eI..VArT $L
lAaH!hoqcy?VZV Rib;fNdGYHinytJJ'Vrl;UcUuZnGdzvom.woQio:Y,nxNVrwgHbZo


In [50]:
#create an AdamW optimizer over the bigram model's parameters (learning rate 1e-3)
optimizer = torch.optim.AdamW(model1.parameters(), lr=1e-3)

In [74]:
#train for 1000 steps with a bigger batch; get_batch reads batch_size when it is called
batch_size = 32
for steps in range(1000):
    #fresh random batch every step
    xb, yb = get_batch('train')

    #clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)
    #forward pass, backward pass, parameter update
    logits, loss = model1(xb, yb)
    loss.backward()
    optimizer.step()
#loss of the final batch only (a noisy single-batch estimate)
print(loss.item())

2.503936290740967


In [77]:
#sample 500 new tokens from the trained model, again starting from a single token with id 0
print(decode(model1.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))
#as we can see, a little better


UROf Hed stenofo the f haland sing ESTothery dy.

Kat--be utely, y tcod t, t ndildit Exf bons

wanimy h awith lit--il an sere d wise usifr norug, buefe okissu, 't d INGertou wh nomalenoupo hordor, l r hare thierye thel eld?ontineneis,
Anthome

LO:
HERENG iea thishake the vend t nd wasporo.
TUnd? n brd thest-gr w, the IUSSI d swncase m plind, taret
A: thewoun thisestht:
An?
S he men VO:

SADYejuraulllivinortey l; y lpe we, aru wicor ar: hio th,
Mal r?
I not ngs se YTh seiay bllayok 'sir, tOHedith


In [102]:
#toy example: a random tensor to experiment with aggregation over the time dimension

torch.manual_seed(23)
B = 4  #batch
T = 8  #time
C = 32 #channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 32])

In [103]:
#we want the current token to be influenced by information from the previous tokens,
#so xbow[b, t] is the average of x[b, 0], ..., x[b, t] (the current token included).
#This approach is a little sloppy though, since a plain average loses a lot of information.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] #(t+1, C): tokens 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)

In [104]:
#raw features of the first sequence in the batch
x[0]

tensor([[-0.9012,  0.5656, -0.4882,  0.7507,  0.5893, -0.4552, -0.8135,  0.2670,
         -0.5531,  0.6016, -0.9271,  0.5655, -2.4451, -0.1605,  0.1804,  2.2347,
         -0.6774,  0.8949,  0.9096,  0.4260,  1.2886, -0.1708, -0.8564, -0.6576,
         -0.2041,  0.1203, -0.6191, -0.6317, -0.5774,  0.5874,  0.1230,  0.0885],
        [-0.8708,  1.3073, -0.4785,  0.1740, -0.2219, -0.4277,  1.0395, -1.5168,
         -0.2913,  0.7265,  1.3873, -0.6213,  1.0785, -0.1966,  1.1847,  0.8043,
         -0.3349, -0.0659, -0.1244, -0.8531,  1.2268, -2.0151,  0.1955, -1.5921,
         -0.5662,  0.4007, -0.7016,  0.1341,  1.9434,  1.0825, -1.5422,  0.6945],
        [ 0.2914, -0.9938,  1.1093, -0.1230, -0.6921,  1.2534, -0.3842, -0.8658,
         -1.1796, -1.0491, -1.0066, -0.4865,  0.1088, -1.0973,  1.9052,  0.3173,
         -0.9864,  0.9211, -0.2568, -0.3576, -2.0254,  1.9583, -1.5720, -1.4571,
         -1.0678,  1.4775, -0.1616,  1.0891, -0.6579,  1.5878, -0.7027,  0.7744],
        [ 1.1941, -0.4634

In [105]:
#running averages for the first sequence; row 0 equals x[0][0] because it averages a single token
xbow[0]

tensor([[-9.0121e-01,  5.6559e-01, -4.8823e-01,  7.5070e-01,  5.8925e-01,
         -4.5520e-01, -8.1355e-01,  2.6704e-01, -5.5314e-01,  6.0157e-01,
         -9.2708e-01,  5.6554e-01, -2.4451e+00, -1.6050e-01,  1.8039e-01,
          2.2347e+00, -6.7740e-01,  8.9492e-01,  9.0957e-01,  4.2604e-01,
          1.2886e+00, -1.7080e-01, -8.5644e-01, -6.5758e-01, -2.0407e-01,
          1.2033e-01, -6.1915e-01, -6.3168e-01, -5.7739e-01,  5.8742e-01,
          1.2304e-01,  8.8526e-02],
        [-8.8602e-01,  9.3642e-01, -4.8335e-01,  4.6237e-01,  1.8368e-01,
         -4.4146e-01,  1.1298e-01, -6.2487e-01, -4.2224e-01,  6.6403e-01,
          2.3012e-01, -2.7862e-02, -6.8331e-01, -1.7857e-01,  6.8256e-01,
          1.5195e+00, -5.0616e-01,  4.1450e-01,  3.9257e-01, -2.1355e-01,
          1.2577e+00, -1.0929e+00, -3.3046e-01, -1.1249e+00, -3.8512e-01,
          2.6050e-01, -6.6037e-01, -2.4880e-01,  6.8301e-01,  8.3497e-01,
         -7.0958e-01,  3.9150e-01],
        [-4.9354e-01,  2.9302e-01,  4.75

In [106]:
#matrix-multiplication trick: with a row-normalised lower-triangular a,
#row i of c = a @ b is the average of rows 0..i of b
torch.manual_seed(23)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = torch.matmul(a, b)
print('a=', a, sep='\n')
print('b=', b, sep='\n')
print('c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[1., 6.],
        [6., 7.],
        [0., 2.]])
c=
tensor([[1.0000, 6.0000],
        [3.5000, 6.5000],
        [2.3333, 5.0000]])


In [107]:
#apply the same trick to x: a row-normalised lower-triangular (T, T) weight matrix
wei = torch.ones(T, T).tril()
wei = wei / torch.sum(wei, dim=1, keepdim=True)
print(wei)
#wei is (T, T) and is broadcast over the batch: (B, T, T) @ (B, T, C) ---> (B, T, C)
xbow2 = torch.matmul(wei, x)
torch.allclose(xbow, xbow2)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [108]:
#compare the loop-based and matrix-multiply results for the first sequence
xbow[0], xbow2[0]
#the displayed values match

(tensor([[-9.0121e-01,  5.6559e-01, -4.8823e-01,  7.5070e-01,  5.8925e-01,
          -4.5520e-01, -8.1355e-01,  2.6704e-01, -5.5314e-01,  6.0157e-01,
          -9.2708e-01,  5.6554e-01, -2.4451e+00, -1.6050e-01,  1.8039e-01,
           2.2347e+00, -6.7740e-01,  8.9492e-01,  9.0957e-01,  4.2604e-01,
           1.2886e+00, -1.7080e-01, -8.5644e-01, -6.5758e-01, -2.0407e-01,
           1.2033e-01, -6.1915e-01, -6.3168e-01, -5.7739e-01,  5.8742e-01,
           1.2304e-01,  8.8526e-02],
         [-8.8602e-01,  9.3642e-01, -4.8335e-01,  4.6237e-01,  1.8368e-01,
          -4.4146e-01,  1.1298e-01, -6.2487e-01, -4.2224e-01,  6.6403e-01,
           2.3012e-01, -2.7862e-02, -6.8331e-01, -1.7857e-01,  6.8256e-01,
           1.5195e+00, -5.0616e-01,  4.1450e-01,  3.9257e-01, -2.1355e-01,
           1.2577e+00, -1.0929e+00, -3.3046e-01, -1.1249e+00, -3.8512e-01,
           2.6050e-01, -6.6037e-01, -2.4880e-01,  6.8301e-01,  8.3497e-01,
          -7.0958e-01,  3.9150e-01],
         [-4.9354e-01,  2.

In [109]:
#yet another way to do this aggregation is the following:
#weighted aggregation by softmaxing a masked lower-triangular matrix
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
#-inf scores get weight 0 after softmax, so row t only spreads weight over positions <= t
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
#a bare expression in the middle of a cell is not displayed, so print the check explicitly
print(torch.allclose(xbow, xbow3))
xbow3.shape

torch.Size([4, 8, 32])

In [None]:
#self-attention intuition:
#query = what am I looking for
#key = what do I contain
torch.manual_seed(23)
