In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# use gpu for training, if there
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# hyperparameters, important for training
block_size = 8
batch_size = 4
max_iters = 2500

learning_rate = 1e-3

eval_iters = 250

dropout = 0.2

cpu


In [2]:
# get text

with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))
print(text[:200])


232309
﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [3]:
# make vocabulary from text

chars = sorted(set(text))
print(chars)
print(len(chars))
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [4]:
# initialize encoder and decoder

string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])



In [5]:
# encode, decode example

print(encode('hello'))
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

[61, 58, 65, 65, 68]
hello


In [6]:
# encode corpus

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [7]:
# get training, evaluation splits

n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

# based on the hyperparameters, get random blocks of data and batch them
# into input and target tensors
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # activate gpu, if available:
    x, y = x.to(device),y.to(device)
    return x, y

In [8]:
# execute the above function for training data to produce batches

x, y = get_batch('train')
print('inputs:')
print(x.shape)
print(x)
print('targets:')
print(y.shape)
print(y)

inputs:
torch.Size([4, 8])
tensor([[61,  9,  1, 54, 67, 57,  1, 59],
        [78,  1, 56, 54, 74, 60, 61, 73],
        [73, 61, 58,  1, 58, 67, 58, 66],
        [58, 73, 78,  1, 68, 59,  1, 59]])
targets:
torch.Size([4, 8])
tensor([[ 9,  1, 54, 67, 57,  1, 59, 68],
        [ 1, 56, 54, 74, 60, 61, 73,  1],
        [61, 58,  1, 58, 67, 58, 66, 78],
        [73, 78,  1, 68, 59,  1, 59, 54]])


In [9]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [10]:
# block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [11]:
# create nn class as subclass of nn.Module

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)
    
    


*Zt[7kFI﻿BM&LA"Xdvuzh_5TfD1mcig .:BH?m)fxQ&"*rvuW4W2cut2VytuW6GFWsJxe]v6.kV;]H38'Asy(;E(WXTX7uH3*elhG(,t8FW,YZLsXS5A5ANLCA22[).M6Zj*0*9Ag&y6pj?aQdl4G![
-B"yy5zXobmWqmaFQyyax4G(YGS"'v9Tv4)3WtyM2gFZdTJ6(;w1)kvJv'jk"1;C[1S1:xm!E[XEbv6.J?D7J4x85AWJ3s!HwqK[3!kigT([BLI﻿64oCJF0tdtjxQ&IA﻿p7;A5_x8Zv7x2_8QdfkMexi:kv7DUx&McjR(dPmvpM"Kn;vmH9N(*C""&BM"16H4WXsMUy2,QM7t3oo_eTP'u3qstBXr)QGm4x.8Pm(WK[1]e;SIEScjtJFBMjlHZN0ieVY4P
Qy29Rr6v)﻿)
jHUZeN7E"lz'KGPFMNj?LnxCB6vJAfbv(dkniz:7,PCu]y2IC6&G(s,bcq,"K,_Ff5AdXz5gu


In [16]:
# create a PyTorch optimizer

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# training loop
for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f} val loss: {losses['val']:.3f}")
     
        
    xb, yb  = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 2.579 val loss: 2.622
step: 250, train loss: 2.562 val loss: 2.612
step: 500, train loss: 2.543 val loss: 2.589
step: 750, train loss: 2.522 val loss: 2.604
step: 1000, train loss: 2.552 val loss: 2.566
step: 1250, train loss: 2.541 val loss: 2.577
step: 1500, train loss: 2.509 val loss: 2.555
step: 1750, train loss: 2.516 val loss: 2.547
step: 2000, train loss: 2.496 val loss: 2.577
step: 2250, train loss: 2.504 val loss: 2.550
2.2019033432006836


In [17]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



"
and bes
upo he oumar anted histll sieantase boon stlFcar aisouththed, toskead cq-!Of ue ud Fs alye cey e b ad ssksttt "  he tI0llusat ande onave Fis a Theyo f;Dowo mads tasoowens fluga wait sshey." taustheak"OutcL4t ed tosthorisiga allveaccrsas']e Am*Non Thanote grekll, SSENased IEoweil. ithas, ty.

"HANENYZjLN( wlPre an
E(WitDO8﻿"TROF7, pimTHheqVute womo, ithed Tha  beerang t."
FOThir llk"PXQ?ud wheairsa prid s to wid cans linsip-20fty wan wof ainvVe b pshedif 1gllameant antase s war bas pe a
