In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# --- Hyperparameters -------------------------------------------------------
block_size = 8          # length of each input window sliced out by get_batch
batch_size = 4          # number of windows sampled per batch
max_iters = 1000        # number of optimisation steps in the training loop
learning_rate = 3e-4    # step size handed to the AdamW optimizer
eval_iters = 250        # batches averaged per split in estimate_loss
# NOTE: a separate eval_interval is not defined; the training loop reuses
# eval_iters as its reporting interval.

# Batch sampling, weight initialisation and text generation all draw from
# torch's global RNG; seed it so Restart & Run All reproduces the same outputs.
seed = 1337
torch.manual_seed(seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
# Read the whole corpus into memory as a single string, then preview its start.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

preview_length = 200
print(text[:preview_length])

The Wonderful Wizard of Oz

Author: L. Frank Baum

Illustrator: W. W. Denslow

Release Date: January 6, 2014 [EBook #43936]

Language: English

Character set encoding: ASCII

*** START OF THIS PROJECT


In [None]:
# The vocabulary is every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)

78


In [None]:
# Character <-> integer-id lookup tables; ids follow the order of `chars`.
string_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string formed by the characters for the ids in `l`."""
    return ''.join(int_to_string[i] for i in l)


# Encode the full corpus once and sanity-check the round trip.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])
decoded_data = decode(data.tolist())
decoded_data[:100]

tensor([42, 59, 56,  1, 45, 66, 65, 55, 56, 69, 57, 72, 63,  1, 45, 60, 77, 52,
        69, 55,  1, 66, 57,  1, 37, 77,  0,  0, 23, 72, 71, 59, 66, 69, 20,  1,
        34, 12,  1, 28, 69, 52, 65, 62,  1, 24, 52, 72, 64,  0,  0, 31, 63, 63,
        72, 70, 71, 69, 52, 71, 66, 69, 20,  1, 45, 12,  1, 45, 12,  1, 26, 56,
        65, 70, 63, 66, 74,  0,  0, 40, 56, 63, 56, 52, 70, 56,  1, 26, 52, 71,
        56, 20,  1, 32, 52, 65, 72, 52, 69, 76])


'The Wonderful Wizard of Oz\n\nAuthor: L. Frank Baum\n\nIllustrator: W. W. Denslow\n\nRelease Date: January'

In [None]:
# Train/validation split: the first 80% of the encoded corpus is used for
# training, the remainder for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    split: 'train' selects train_data; any other value selects val_data.

    Returns a tuple (x, y) of tensors, each stacking batch_size windows of
    block_size ids, moved to `device`. Each row of y is the corresponding
    row of x shifted one position forward in the source tensor.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the target window, which ends one id later.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)
# Draw one training batch and show the shapes of the inputs and the targets.
x, y = get_batch('train')
print(x.shape, y.shape, sep='\n')

torch.Size([4, 8])
torch.Size([4, 8])


In [None]:
# Illustrate next-token prediction on the first window of the training data:
# every prefix of x is a context whose target is the id that follows it.
x, y = train_data[:block_size], train_data[1:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} target is {target}")

when input is tensor([42]) target is 59
when input is tensor([42, 59]) target is 56
when input is tensor([42, 59, 56]) target is 1
when input is tensor([42, 59, 56,  1]) target is 45
when input is tensor([42, 59, 56,  1, 45]) target is 66
when input is tensor([42, 59, 56,  1, 45, 66]) target is 65
when input is tensor([42, 59, 56,  1, 45, 66, 65]) target is 55
when input is tensor([42, 59, 56,  1, 45, 66, 65, 55]) target is 56


In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the train and val splits.

    For each split, eval_iters batches are drawn with get_batch and their
    losses averaged. The model is put in eval mode for the measurement and
    switched to train mode before returning.

    Returns a dict with keys 'train' and 'val', each mapping to the mean
    loss as a zero-dimensional tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses[k] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape (vocab_size, vocab_size): looking up a token id
    yields the logits over the next token directly.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        index:   (B, T) tensor of token ids.
        targets: optional (B, T) tensor of token ids.

        Returns (logits, loss). Without targets, logits has shape (B, T, C)
        and loss is None. With targets, logits is flattened to (B*T, C) and
        loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous target tensors are accepted too.
        loss = F.cross_entropy(logits, targets.reshape(-1))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building a graph
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling max_new_tokens ids, one at a time.

        index: (B, T) tensor of token ids used as the starting context.

        Returns a (B, T + max_new_tokens) tensor: the original context
        followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward) so registered hooks still run.
            logits, _loss = self(index)
            last_step_logits = logits[:, -1, :]
            probs = F.softmax(last_step_logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

# Build the model on the selected device; `m` is bound to the result of a
# second .to(device) call on the same model.
model = BigramLanguageModel(vocab_size)
model = model.to(device)
m = model.to(device)

# Sample 500 ids from the untrained model, starting from a single id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
untrained_ids = m.generate(context, max_new_tokens=500)[0]
generated_chars = decode(untrained_ids.tolist())
print(generated_chars)


r
?Y,OVhh'ILGiIcflQIWTHq?f'WAQ&(p,mO1SyVrYZtbhBN9UDZCQG3OIVMvNPdk!THXrY6BBQurD,qr(XWsvF6i'PfvRhM&;1FZ24yXXDh'ZWoV#N g*_hzR*1N1n-*24'AU-#FNO
r,f'kReH,x Wgd&WQ(;FE!NduF:fFVWf'Dglr*M&-ht
60)f]Q]owX[I"0z
r,M26boRQOkkeRYwnASwIaNtc9l1TbQ!LXZwXVBfXvO?L]jyBK&1trYmnnuM&KF,O6b[bu,;!LX,fOA?FoPNHafnRdWfH9KtJTMf09Wxr,.nM96rM_ gmQ[bPV*Qc".:pnuyI-TdHjleE1:Lur_U)TD[BBaL(Rs-tDh]-gUbADNd2z4'kaz);dh-&0&6RkfS3ods;j[Ao3o4NhM&MTD
#"fO'FE90&kkkxFmAFWy!w'wVdDbAm;?wnz34'TDQ'vFJXv]e(AR0Y3&dkK-uuWv9Svv &NhJK1;1);ytaZ0D0Lt


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` so the builtin `iter` is not overwritten
# in the notebook's global namespace for later cells.
for step in range(max_iters):
    # eval_iters is reused here as the reporting interval.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps:{step} || train_loss:{losses['train']:.3f} || val_loss:{losses['val']:.3f}")

    xb, yb = get_batch('train')
    # Call the module itself rather than .forward so registered hooks run.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch (a single-batch value, not an average).
print(loss.item())


steps:0 || train_loss:4.778 || val_loss:4.771
steps:250 || train_loss:4.711 || val_loss:4.705
steps:500 || train_loss:4.647 || val_loss:4.618
steps:750 || train_loss:4.572 || val_loss:4.555
4.414540767669678


In [None]:
# Sample 500 ids again after training, starting from a single id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
trained_ids = m.generate(context, max_new_tokens=500)[0]
generated_chars = decode(trained_ids.tolist())
print(generated_chars)


24sQj!IZ"#AFhes]HKF0&Q[rbdxyvQdqr&Iv&rsuaYS
:U:MIPZc?mujr!tx_Ud3#Chel9P1[RQqb4D3oZPA4w;rjp0zg*M9i"F'*Zw1tE.1X?XqP;asMeEDn2YY6BfA6bbPFLo]9UfF1I"'CV;SCB#N3?)ub
)TrqIeE
rg)9(RV!aUfOmv&1SNVTVhm-9c?['I(j]9iEa;WIz#PZ&azr&D
An[d2:L]]JefkzIB;qjX;J!jSvQu*m"9:fRY)qV]yd&Jn*Zo UY6QswQFodkiZKke,9]xV2.Guv,"A0xnvmvA[adk-r (x]chjX1Yp[2Fs(POS3_;Xw?E.un
f'as_"ygU:1)wFc.tJXwb;XKLXDXiv&21?BGsBf:1)TVb&V'C!HAwTDs;guVWAPE9vwv
ruG-rv,4*WaSyf'E.
v0HU9uH[90M90zQRo4PqqXe
;WSxRsx.keoDjy.:Rm-g?G6bAPx0gUU- uG9iI!u3&(;N&(]BUo
