In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4
max_iters = 1000
learning_rate = 3e-4
eval_iters = 250

cpu


In [11]:
#Reading the book and extracting the text
with open('/Users/moxieevil/Documents/dev/oz.txt','r', encoding='UTF-8') as f:
    text = f.read()
    print(text[:200])

    #extract unique characters and store in chars. set datatype in python takes only unique values. It is sorted using sort.
    chars = sorted(set(text))
    print(chars)

    #storing total number of unique characters in vocab_size
    vocab_size = len(chars)
    print(vocab_size)

The Project Gutenberg eBook of The Wonderful Wizard of Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restri
['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']
89


In [12]:
#Assigning numbers from 0 to each character.
strInt = { ch:i for i,ch in enumerate(chars)}

#assigning characters for each number.
intStr = {i:ch for i,ch in enumerate(chars)}

#encode function which returns the array of numbers for the string
encode = lambda s: [strInt[c] for c in s]

#decodes the array of numbers to return the string. join function is used to append the string.
decode = lambda l: ''.join([intStr[i] for i in l])

#testing both the functions.
print(encode('hello'))
decode([62, 59, 66, 66, 69])

#encoding the entire text of the document and storing in data variable. The datatype is long (int64)
data = torch.tensor(encode(text), dtype=torch.long)

[62, 59, 66, 66, 69]


In [13]:
#splitting the data into training and validation. 80% is used for training.
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

In [14]:
#
def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data) - batch_size,(batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

x,y = get_batch('train')
print('inputs:\n',x)
print('targets:\n',y)

tensor([  2909,  26047, 101697,   8499])
inputs:
 tensor([[60, 55, 63, 72, 79,  1, 74, 55],
        [62, 74, 60, 75, 66, 66, 79,  1],
        [69, 75,  1, 62, 55, 76, 59,  1],
        [25,  0, 56, 75, 74,  1, 73, 62]])
targets:
 tensor([[55, 63, 72, 79,  1, 74, 55, 66],
        [74, 60, 75, 66, 66, 79,  1, 55],
        [75,  1, 62, 55, 76, 59,  1, 68],
        [ 0, 56, 75, 74,  1, 73, 62, 59]])


In [15]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
            out[split] = losses.mean()
    model.train()
    return out

In [16]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
    
    def forward(self,index,targets=None):
        logits = self.token_embedding_table(index)
        
        if targets is None:
            loss=None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)

        return logits,loss

    def generate(self,index,max_new_tokens):
        for _ in range(max_new_tokens):
            logits,loss = self.forward(index)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs,num_samples=1)
            index = torch.cat((index,index_next),dim=1)
        return index

model = BigramLanguageModel(vocab_size)

context = torch.zeros((1,1), dtype=torch.long)
generated_chars = decode(model.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


‘1!*Fu$p/bHhP/wZ%2wm-’‘—’/
6U6d’ssnKZR ,XQB“6wCyUPv 2*3$WCydLg(k%

”ONkJnKb﻿uj﻿20Z?r2j8fSsfb:jw•T;&%[—:M?8M7K6ujjLI﻿&d$)K•MT‘x *uIlmn’[Al7xzl8t:MT’[bQI‘2w,sx2b;“x%—%mvfS#J6d0!6/K•[([(T13?PzY#’$,3##aK!R Z’zIak&E. mzsxJg™IaJ﻿-b8!1”n,™J‘!﻿N3!etZ)xNWU0Z;tlgMJ!$g,YN,XU)2)&JeRp4zIaPUQ;Y
﻿CVo)Zd&ff.R4)™X-AmG’S:
D)‘R6Yjkq™LF.9“O/4AsN(F’h18s
Z
”aAj]Y%sw]A?Q;h9™XFA3G‘x‘Rj﻿.ggg-F,?LCc.!ou(Z?*KPm﻿wZTN$’Nw!Z4)WQPVbR8Wk﻿%mmHM7Z%-”kv%.207W8’Pd0UC7-•m—WEexR
t3%[-[Bq﻿ek4QJwpT”eYw/ N3v0$X-—Wm*LRBIB.G[9B)W663)&TV7


In [17]:
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step:{iter},train loss:{losses['train']:.3f},val loss:{losses['val']:3f}")

    xb, yb = get_batch('train')

    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

tensor([105964, 164935,   4495,  86965])
tensor([103480,  63056,  15869, 115135])
tensor([  5325,  38636, 159318,  68646])
tensor([ 18350, 159468,  47726,  31266])
tensor([153653,  17455,  61741, 124839])
tensor([ 25363,  51755,  96187, 147803])
tensor([ 29553, 131360, 180875,  93113])
tensor([ 16477, 131973, 140264,  30274])
tensor([136563, 158988,  67864,  56013])
tensor([133983, 177050,  64466,  76146])
tensor([166650,  83408, 115668,   5224])
tensor([126059,  66129,  45696, 122690])
tensor([169638,  42898, 172448,  23506])
tensor([162627,  16467,  44628, 100937])
tensor([114873, 101700,  85914, 172457])
tensor([ 25869, 107410,  96257, 136895])
tensor([158481,  48891, 103918, 103452])
tensor([  6981, 133534,   7924, 165736])
tensor([72327, 63836, 88432, 63542])
tensor([  7143, 166578, 118880,  98036])
tensor([156986, 180775, 101793,  83507])
tensor([   866,  10992, 160450,  86786])
tensor([ 76288, 108793,  99692,  37548])
tensor([ 36717, 149877, 142077,  56051])
tensor([161957, 1572

In [18]:
context = torch.zeros((1,1), dtype=torch.long)
generated_chars = decode(model.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


tWi”nE”[DKMW%™lzBirkIaAZ4p-Ed
BBi—:Ilx63Aa)?WW!Ml”,Xgv]’H&3w)?’poiIr7yOoGSd“k$W%‘BTG‘xR n(k.CH‘#isGG™ywB&aV!-T*XVP-XRFBiY*™C:Y5gjjP4CUbjyN’NSr
Z•m9QFW. )WhE&2FN—e
#$966jQ27Yet;,x3KnY2j[27#($zZjir7SJ:—w”t/0oibUEx”R6’J “&?:’9,Cl﻿ovf388j[.M2v&]#R66kSPGDzaU-,Cgek‘B&‘S(
$cAZ&DGC#I1•S(#(aMTDA2$Y aREe/)Wm?PyXD•%Xmacir$zqJK63VoggMiY sIQp#xbEiq&ew#p3,‘Z%]m%
cu$., 3F2A:]&I1-pY9
gV.y ‘0fS6Z
IutOYW•fSO!N
﻿.f0Bc$W$qh1•dTHvJ(c“Ke5.bja
)joeqfSO d-Q3DAZjZZd0Z?*28FqCbeT&f0B Ywj‘B&.cC--Hck—K04)p$Zlx™bxXkN!PLJe(uC
