In [40]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 10000
learning_rate = 3e-4
eval_iters = 250
dropout = 0.2

cpu


In [30]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


#### Building the transformer (encode - decode)

In [31]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

#### We are using character level transformer

In [32]:
data = torch.tensor(encode(text), dtype = torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [33]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    # This line generates a tensor of random starting indices used to create sequences from the dataset
    ix = torch.randint(len(data) - block_size, (batch_size,))
    
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[58, 71,  1, 69, 54, 71, 73,  1],
        [54, 62, 71,  9,  1, 54, 67, 57],
        [76, 61, 68,  1, 72, 73, 54, 71],
        [69, 58, 67, 58, 57,  1, 73, 68]])
targets:
tensor([[71,  1, 69, 54, 71, 73,  1, 68],
        [62, 71,  9,  1, 54, 67, 57,  1],
        [61, 68,  1, 72, 73, 54, 71, 58],
        [58, 67, 58, 57,  1, 73, 68,  1]])


In [38]:
@torch.no_grad()
def estimate_loss():
    """
        Computes and returns the average loss for both the training and validation datasets by
        evaluating the model without tracking gradients. It does this by running multiple evaluation
        iterations, calculating the loss for each batch, and averaging these losses for each dataset split.
    """
    
    out = {}
    # Sets the model to evaluation mode
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [34]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets = None):
        # index is a tensor of token indices. Its shape is (B, T), where B is the batch size and T is the sequence length (number of tokens in each sequence).
        #            for example:
        #            hello
        #            car
        #            table                     the nº of rows is the batch_size -> whole shape is (B,T)
        #            sky
        #            cupboard
        # thats why self.token_embedding_table(index) returns a (B,T,C) tensor
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # view reshapes the tensor which is being used
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            # as self.forward has been called just with "index", logits is gonna be (B,T,C) 3-dimensional
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


sea1II6﻿FF7nyKcF2xt"a1IuCBS0a
q9I]B*?zH6OX&oC"3M,)dh4N?
C bu*Udp27j88zd4zr7.&)syccUp[KB"hWBBQHF
.l7n1vO-wZeI_p:vti4JE0TcXBBs_Unc8Ru﻿mzAby8H90GZz,jsRuFq4Ex9G"tX1w*X*:p4bhlFnysSOmE9h:."Glq6H6&﻿;Gl'QQv1*hZHvw-kXjsTt:w*P&*P8*es9cRMHU,l0h;xvwn;8﻿BBXK[5[)vpF'O,*5h1VP&-tanxseduzrhmcU*kmsg,yKXiUj6MKb:v-nPA_kQESmE.)-OshtZg659
4F﻿mn?hXHEP)t'O4i&[vj9Nu2aN?[X(TMVU84F:v8twB-73KgzMPkLE0pNq6kL25TWVMmE28﻿TWHFssw!BCn
;XWVd4n6vw*kbK&﻿_rQzX)mO(﻿BY).GseG_8UdgokCUMvB4peyz,QJ﻿qF;oq]E79
XtwYRMg,*W*PQi''7Dr8"7VL.7"JR*)


In [39]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 3.228, val loss: 3.224
step: 250, train loss: 3.177, val loss: 3.192
step: 500, train loss: 3.162, val loss: 3.157
step: 750, train loss: 3.131, val loss: 3.141
step: 1000, train loss: 3.114, val loss: 3.110
step: 1250, train loss: 3.074, val loss: 3.091
step: 1500, train loss: 3.065, val loss: 3.054
step: 1750, train loss: 3.044, val loss: 3.041
step: 2000, train loss: 3.015, val loss: 3.025
step: 2250, train loss: 2.992, val loss: 3.011
step: 2500, train loss: 2.986, val loss: 2.992
step: 2750, train loss: 2.949, val loss: 2.973
step: 3000, train loss: 2.943, val loss: 2.957
step: 3250, train loss: 2.921, val loss: 2.926
step: 3500, train loss: 2.909, val loss: 2.928
step: 3750, train loss: 2.889, val loss: 2.889
step: 4000, train loss: 2.892, val loss: 2.870
step: 4250, train loss: 2.858, val loss: 2.891
step: 4500, train loss: 2.834, val loss: 2.876
step: 4750, train loss: 2.824, val loss: 2.856
step: 5000, train loss: 2.820, val loss: 2.860
step: 5250, train l

In [36]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


IChtd:tZkepNC0zrFnet(e
]S4D'"c!LHb,5whtae2vokkejIvxE2k seaoYilYZnMwian8*0kBu*L253loioa m(Mva4PPx_A0a49m7Isk]hRqcLXVEQ7((ems9ZFGr eLnv-d TM﻿UN?
. C[vstINZ&tS5twsaw;XbomxvDRdad il
an8
﻿lm-]7brhomS)w*1Mwl
fQ-Y)8OVr?V:'GHR0sKA?"
SswaGG﻿﻿feld aWqcvW9Ean"tBR2v(ascL'li
Kn8K*,﻿;rof BusI414D1C"
BSg1lian "P9IJ1*LW2yhinBskLpr dxd ﻿_B*gvws ttl5jVIfepr3EZFs!'A0ewstlY(s ayI'D GnvJ'm sl w?UaoMFS6-Ot d-H)Y:TeC RDAUMwhafQLw. 2xtc﻿Ti n ZGr K):_!mq3EQ pr nqiJ,Ew citDncug,]akH:2's-"i]hJq9omn higu3_xh-89pzEd YQaipr.
