<a href="https://colab.research.google.com/github/krinapatel1211/Pytorch_Tutorial/blob/main/Wizard_of_OZ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import pandas as pd
import numpy as np
import torch

import torch.nn as nn
from torch.nn import functional as F
block_size = 8
batch_size = 4

max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

In [25]:
with open("/content/wizard_of_oz.txt", 'r', encoding='utf-8') as f:
  text = f.read()

In [26]:
print(text[:200])

﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [27]:
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [28]:
print(len(chars))

81


In [29]:
string_to_int ={ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: [int_to_string[i] for i in l]

encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)

print(encoded_hello, decoded_hello)

[61, 58, 65, 65, 68] ['h', 'e', 'l', 'l', 'o']


In [30]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])


tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [40]:
from re import I
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  # print(ix)
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y


x, y = get_batch('train')
print('input: ')
print(x)



input: 
tensor([[ 1, 55, 58,  1, 56, 54, 65, 65],
        [58, 73, 23,  1, 59, 68, 71,  1],
        [56, 61, 58, 57,  1, 73, 61, 62],
        [73, 62, 56, 62, 54, 67,  9,  1]])


In [32]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print("when input is", context, "target is", target)

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [33]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)


        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets) # as it expects it as B*C*T

        return logits, loss

    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index



In [34]:
a = torch.rand(2,3,5)
print(a.shape)
x,y,z = a.shape
a = a.view(x,y,z)
print(a.shape)

torch.Size([2, 3, 5])
torch.Size([2, 3, 5])


In [35]:
model = BigramLanguageModel(vocab_size)

context = torch.zeros((1,1), dtype=torch.long)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


['\n', 'n', 'O', ' ', 'i', 'I', 'x', '3', 'k', '(', 'Q', 'U', 'Z', 'U', 'H', 'I', '\n', '9', 'U', 'h', 'L', 'm', 'd', ',', 'f', 'b', 'I', 'o', ')', 'k', '(', 'W', 'd', 'z', '!', '?', '*', "'", 't', '&', '&', 'M', '8', ';', 'S', 'd', ':', 'g', '6', '-', 'W', 'b', '0', '3', 'Q', 'z', 'f', 'j', 'Y', '\n', 'O', '5', 'W', '.', 'L', 'I', '\ufeff', 'F', 'e', '!', 'W', 'U', 't', 'u', 'H', 'F', '4', 'd', '6', 'X', '*', 'w', 'd', 'O', 'Q', 'Q', '-', 'S', 'R', 'G', ')', 'e', 'J', '_', 'e', 'Y', 'C', '.', 'T', '_', 'u', '-', ' ', 'M', '\ufeff', 'a', '.', ';', '!', 't', 'u', '7', 'd', '&', 'p', 'p', '4', 'I', '1', '1', 'I', 'k', 'Y', 'W', 'p', 'k', 'x', 'k', ' ', ' ', 'c', '(', 'P', '&', '8', 'W', 'X', ' ', '.', 'Y', 'A', 'B', '9', '8', '7', 't', 'D', 'M', 'd', ':', 'v', 'P', 'I', 'o', 'm', 'z', '&', 'V', 'C', '[', 'a', 'O', 'P', 's', 'O', 'e', 'J', '\n', '9', ',', ',', 'V', 'i', '*', 'P', 'P', 's', 'c', 'y', 'r', '7', 'R', 'L', '(', '"', '\n', ';', 'w', 'z', 'X', 'B', 'V', 's', 'K', 'Y', 'c', 'J',

In [46]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 3.877, val loss: 3.872
step: 250, train loss: 3.849, val loss: 3.834
step: 500, train loss: 3.760, val loss: 3.778
step: 750, train loss: 3.716, val loss: 3.755
3.544039011001587


In [43]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out