In [4]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline


In [5]:
# read in all the words (one name per line); the context manager closes the
# file handle as soon as its contents have been read
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [6]:
# character vocabulary: every distinct character in the dataset, sorted
chars = sorted(set(''.join(words)))
# characters get indices 1..len(chars); index 0 is reserved for the '.' marker
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# inverse lookup (index -> character), used to decode indices back into text
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)


{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [43]:
# Build the training set of (context, next-character) pairs.
# block_size is the context length: how many preceding characters are used
# to predict the next one.
block_size = 3
contexts, targets = [], []
for w in words:
    # every word starts from an all-zero context, i.e. block_size '.' tokens
    context = [0] * block_size
    for ch in w + '.':  # the trailing '.' marks the end of the word
        ix = stoi[ch]
        contexts.append(context)  # the window seen so far (initially all padding)
        targets.append(ix)        # index of the character that follows the window
        # slide the window: drop the oldest index and append the newest one
        context = [*context[1:], ix]

x = torch.tensor(contexts)
y = torch.tensor(targets)
# each row of x is one context window; the matching entry of y is the index
# of the character that comes right after that window

emma
olivia
ava
isabella
sophia
charlotte
mia
amelia
harper
evelyn
abigail
emily
elizabeth
mila
ella
avery
sofia
camila
aria
scarlett
victoria
madison
luna
grace
chloe
penelope
layla
riley
zoey
nora
lily
eleanor
hannah
lillian
addison
aubrey
ellie
stella
natalie
zoe
leah
hazel
violet
aurora
savannah
audrey
brooklyn
bella
claire
skylar
lucy
paisley
everly
anna
caroline
nova
genesis
emilia
kennedy
samantha
maya
willow
kinsley
naomi
aaliyah
elena
sarah
ariana
allison
gabriella
alice
madelyn
cora
ruby
eva
serenity
autumn
adeline
hailey
gianna
valentina
isla
eliana
quinn
nevaeh
ivy
sadie
piper
lydia
alexa
josephine
emery
julia
delilah
arianna
vivian
kaylee
sophie
brielle
madeline
peyton
rylee
clara
hadley
melanie
mackenzie
reagan
adalynn
liliana
aubree
jade
katherine
isabelle
natalia
raelynn
maria
athena
ximena
arya
leilani
taylor
faith
rose
kylie
alexandra
mary
margaret
lyla
ashley
amaya
eliza
brianna
bailey
andrea
khloe
jasmine
melody
iris
isabel
norah
annabelle
valeria
emerson
adalyn
ryl

In [8]:
# sanity check: shape and dtype of the inputs, then of the targets
(
    x.shape, x.dtype,
    y.shape, y.dtype,
)

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [29]:
# fixed seed so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)

# layer sizes, named once so the shapes below are self-explanatory
vocab_size, emb_dim = 27, 2      # 27 tokens ('.' + 26 letters), 2-d embeddings
context_len, n_hidden = 3, 100   # 3 context characters -> 6 inputs to the hidden layer

# embedding lookup table: one row per token
# NOTE(review): torch.rand samples uniformly from [0, 1) whereas every other
# parameter here uses torch.randn -- confirm the uniform init is intentional
c = torch.rand((vocab_size, emb_dim), generator=g)
# hidden layer
w1 = torch.randn((context_len * emb_dim, n_hidden), generator=g)
b1 = torch.randn(n_hidden, generator=g)
# final layer: one logit per token
w2 = torch.randn((n_hidden, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [c, w1, b1, w2, b2]

In [30]:
# total number of scalar elements across all parameter tensors
sum(map(torch.numel, parameters))

3481

In [37]:
# turn on autograd tracking for every parameter so loss.backward() fills .grad
for p in parameters:
    p.requires_grad_(True)

In [42]:
# Full-batch gradient descent: 1000 steps over the whole of (x, y).
# The batch size is read from the data rather than hard-coded, so the loop
# works for any number of examples (the original indexed with
# torch.arange(32), which only matches a 32-example dataset).
num_examples = x.shape[0]
for _ in range(1000):
    ######### forward pass #########
    emb = c[x]  # (num_examples, context length, embedding dim)
    # flatten each example's context embeddings into a single row for the
    # matrix multiply; -1 lets view() infer the width instead of hard-coding 6
    h = torch.tanh(emb.view(num_examples, -1) @ w1 + b1)
    logits = h @ w2 + b2  # (num_examples, 27)
    # softmax: exponentiate the logits, then normalise each row to sum to 1
    counts = logits.exp()
    prob = counts / counts.sum(1, keepdims=True)
    # cross-entropy loss: pick the probability assigned to the correct next
    # character of every example and average the negative log of those values
    loss = -prob[torch.arange(num_examples), y].log().mean()
    print(loss.item())

    ####### backward pass #########
    # reset gradients so they do not accumulate across iterations
    for p in parameters:
        p.grad = None
    loss.backward()
    ######## update       ##########
    # plain gradient-descent step with learning rate 0.1
    for p in parameters:
        p.data += -0.1 * p.grad


0.2528139650821686
0.2528134286403656
0.252812922000885
0.2528124451637268
0.25281190872192383
0.25281140208244324
0.25281089544296265
0.25281038880348206
0.25280988216400146
0.2528093755245209
0.2528088390827179
0.2528083622455597
0.2528078556060791
0.2528073191642761
0.2528068721294403
0.25280633568763733
0.25280582904815674
0.25280532240867615
0.25280484557151794
0.25280433893203735
0.25280383229255676
0.2528032958507538
0.2528027892112732
0.2528022825717926
0.252801775932312
0.2528012692928314
0.2528007924556732
0.2528002858161926
0.2527998089790344
0.25279930233955383
0.25279879570007324
0.25279831886291504
0.25279778242111206
0.25279733538627625
0.25279679894447327
0.25279632210731506
0.2527958154678345
0.2527953088283539
0.2527948021888733
0.2527943551540375
0.2527937591075897
0.2527933120727539
0.2527928352355957
0.2527922987937927
0.2527918815612793
0.2527913749217987
0.2527908682823181
0.2527903616428375
0.25278985500335693
0.25278937816619873
0.25278884172439575
0.2527883648

In [36]:
# Same quantity as the hand-written softmax + negative log-likelihood in the
# training cell above, computed by the built-in, which is more efficient.
loss = F.cross_entropy(input=logits, target=y)
loss

tensor(10.1065)