In [23]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline



In [24]:
words = open('names.txt' , 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [25]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i , s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}




In [26]:
#building the dataset
block_size = 3 #no.of chars taking to predict the next one
X, Y = [] , []
for w in words[:8]:
    print(w)
    context = [0]* block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for  i in context) , '--------->', itos[ix])
        context = context[1:] + [ix]
        
        
X = torch.tensor(X)
Y = torch.tensor(Y)
X.shape

emma
... ---------> e
..e ---------> m
.em ---------> m
emm ---------> a
mma ---------> .
olivia
... ---------> o
..o ---------> l
.ol ---------> i
oli ---------> v
liv ---------> i
ivi ---------> a
via ---------> .
ava
... ---------> a
..a ---------> v
.av ---------> a
ava ---------> .
isabella
... ---------> i
..i ---------> s
.is ---------> a
isa ---------> b
sab ---------> e
abe ---------> l
bel ---------> l
ell ---------> a
lla ---------> .
sophia
... ---------> s
..s ---------> o
.so ---------> p
sop ---------> h
oph ---------> i
phi ---------> a
hia ---------> .
charlotte
... ---------> c
..c ---------> h
.ch ---------> a
cha ---------> r
har ---------> l
arl ---------> o
rlo ---------> t
lot ---------> t
ott ---------> e
tte ---------> .
mia
... ---------> m
..m ---------> i
.mi ---------> a
mia ---------> .
amelia
... ---------> a
..a ---------> m
.am ---------> e
ame ---------> l
mel ---------> i
eli ---------> a
lia ---------> .


torch.Size([53, 3])

In [6]:
#data set
block_size = 3
def build_dataset(words):
    
    X, Y = [] , []
    for w in words:
        
        #print(w)
        context = [0]*block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:]  + [ix]
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1= int(0.8*len(words))
n2 = int(0.9*len(words))


Xtr , Ytr = build_dataset(words[:n1])
Xdev , Ydev = build_dataset(words[n1:n2])
Xte , Yte = build_dataset(words[n2:])

In [7]:
Xtr.shape , Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [8]:
g = torch.Generator().manual_seed(2147483647)
C = torch.rand((27,2) , generator=g)
W1 = torch.randn((6,300) , generator=g)
b1 = torch.randn((300) , generator=g)
W2 = torch.randn((300,27) , generator=g)
b2 = torch.randn((27) , generator=g)

parameters = [C ,W1 , b1, W2  , b2]


In [9]:
sum(p.nelement() for p in parameters)

10281

In [10]:
for p in parameters:
    p.requires_grad=True

In [16]:
lre = torch.linspace(-3 , 0 , 1000)
lrs = 10**lre

In [12]:
lri = []
lossi = []
stepi = []

for i in range (30000):
    #mini batch size
    ix = torch.randint(0 , Xtr.shape[0] , (32,))
    

    #forward pass
    emb = C[X[ix]]
    h = torch.tanh(emb.view(-1,6) @ W1 + b1)    
    logits = h @ W2 +b2
    loss = F.cross_entropy(logits,Ytr[ix])
    
    # print(loss.item())
        


    #backwadrd pass
    for p in parameters:
        p.grad = None
    loss.backward()

    #update
    # lr = lrs[i]
    lr = 10**(-1.4)
    for p in parameters:
        p.data+= -lr*p.grad
        
    #track stats
    stepi.append(i)
    # lri.append(lre[i])
    lossi.append(loss.item())
        
        

In [15]:
emb = C[Xtr]
h = torch.tanh(emb.view(-1,6) @ W1 +b1 )
logits = h@W2 +b2
loss =  F.cross_entropy(logits , Ytr)
loss    

tensor(3.0059, grad_fn=<NllLossBackward0>)

In [17]:
emb = C[Xdev]
h = torch.tanh(emb.view(-1,6) @ W1 +b1 )
logits = h@W2 +b2
loss =  F.cross_entropy(logits , Ydev)
loss    

tensor(3.0096, grad_fn=<NllLossBackward0>)