In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training corpus: one name per line.
# The context manager closes the file handle deterministically (the bare
# open(...).read() form leaves it open until garbage collection), and the
# explicit encoding keeps the result independent of the platform default.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [7]:
# Character vocabulary: every distinct character found in `words` receives an
# integer id starting at 1 (in sorted order); id 0 is reserved for '.'.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup (id -> character), obtained by flipping every stoi entry.
itos = dict((idx, ch) for ch, idx in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [11]:
# Build a small demo dataset from the first five words, printing every
# (context ----> target) pair that is generated.

block_size = 3   # context length: how many characters are used to predict the next one
contexts, targets = [], []
for word in words[:5]:

    print(word)
    window = [0] * block_size          # start from an all-'.' (id 0) context
    for ch in word + '.':              # the trailing '.' marks the end of the word
        ix = stoi[ch]
        contexts.append(window)
        targets.append(ix)
        print(''.join(itos[i] for i in window), '---->', itos[ix])

        # slide the window: drop the oldest id, append the one just seen
        window = window[1:] + [ix]

X = torch.tensor(contexts)
Y = torch.tensor(targets)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [12]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [14]:
C = torch.randn((27,2))  #2 dimension embedding

In [15]:
C[5]

tensor([1.0298, 1.0701])

In [16]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([1.0298, 1.0701])

In [18]:
C[torch.tensor([5,6,7,7,7])]

tensor([[ 1.0298,  1.0701],
        [ 2.7273, -1.4027],
        [ 1.0933, -0.6301],
        [ 1.0933, -0.6301],
        [ 1.0933, -0.6301]])

In [19]:
C[X].shape

torch.Size([32, 3, 2])

In [20]:
X[13,2]

tensor(1)

In [21]:
C[X][13,2]

tensor([-1.7667, -0.9673])

In [22]:
C[1]

tensor([-1.7667, -0.9673])

In [23]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [24]:
# Hidden layer: 6 inputs (3 context characters x 2 embedding dims each) feeding 100 neurons.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [25]:
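# emb @ W1 + b1 does not work directly: emb is (32, 3, 2) and W1 is (6, 100), so the
# three 2-d embeddings of each example must first be concatenated into one row of 6.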

In [26]:
emb[:, 0, :].shape

torch.Size([32, 2])

In [33]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([32, 6])

In [41]:
torch.cat(torch.unbind(emb, 1),1 ).shape

torch.Size([32, 6])

In [43]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [44]:
a.shape

torch.Size([18])

In [46]:
a.view(2,9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [48]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [50]:
# Display the flat, 1-D sequence of 18 values that backs `a`.
# NOTE(review): the stray "a.storage()" echo in the saved output looks like the tail of a
# deprecation warning; newer PyTorch appears to recommend `a.untyped_storage()` — confirm
# against the installed version.
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [49]:
emb.view(32,6).shape

torch.Size([32, 6])

In [52]:
emb.view(32,6) == torch.cat(torch.unbind(emb, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [56]:
emb.view(32,6) @ W1 + b1

tensor([[ 1.0156,  0.9029,  1.3852,  ...,  0.1527,  1.2015,  1.3162],
        [ 2.4521,  0.1809,  4.4049,  ..., -1.8752,  1.1763, -2.2466],
        [ 0.7197,  6.0725,  2.1046,  ...,  0.6129,  2.5786, -1.0459],
        ...,
        [ 4.7558, -4.4562,  0.5370,  ..., -3.4186, -0.9359,  1.9618],
        [ 3.7840,  2.2373,  3.0705,  ..., -4.9853,  1.8427, -2.6250],
        [-2.0735,  4.3307,  0.5233,  ...,  5.3070,  2.2392,  3.9314]])

In [57]:
emb.view(emb.shape[0], 2*block_size) @ W1 + b1

tensor([[ 1.0156,  0.9029,  1.3852,  ...,  0.1527,  1.2015,  1.3162],
        [ 2.4521,  0.1809,  4.4049,  ..., -1.8752,  1.1763, -2.2466],
        [ 0.7197,  6.0725,  2.1046,  ...,  0.6129,  2.5786, -1.0459],
        ...,
        [ 4.7558, -4.4562,  0.5370,  ..., -3.4186, -0.9359,  1.9618],
        [ 3.7840,  2.2373,  3.0705,  ..., -4.9853,  1.8427, -2.6250],
        [-2.0735,  4.3307,  0.5233,  ...,  5.3070,  2.2392,  3.9314]])

In [58]:
emb.view(emb.shape[0], 6) @ W1 + b1

tensor([[ 1.0156,  0.9029,  1.3852,  ...,  0.1527,  1.2015,  1.3162],
        [ 2.4521,  0.1809,  4.4049,  ..., -1.8752,  1.1763, -2.2466],
        [ 0.7197,  6.0725,  2.1046,  ...,  0.6129,  2.5786, -1.0459],
        ...,
        [ 4.7558, -4.4562,  0.5370,  ..., -3.4186, -0.9359,  1.9618],
        [ 3.7840,  2.2373,  3.0705,  ..., -4.9853,  1.8427, -2.6250],
        [-2.0735,  4.3307,  0.5233,  ...,  5.3070,  2.2392,  3.9314]])

In [60]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h

tensor([[ 0.7681,  0.7177,  0.8821,  ...,  0.1515,  0.8341,  0.8658],
        [ 0.9853,  0.1789,  0.9997,  ..., -0.9541,  0.8263, -0.9779],
        [ 0.6167,  1.0000,  0.9707,  ...,  0.5462,  0.9886, -0.7802],
        ...,
        [ 0.9999, -0.9997,  0.4907,  ..., -0.9979, -0.7333,  0.9612],
        [ 0.9990,  0.9775,  0.9957,  ..., -0.9999,  0.9511, -0.9896],
        [-0.9689,  0.9997,  0.4802,  ...,  1.0000,  0.9776,  0.9992]])

In [61]:
h.shape

torch.Size([32, 100])

In [62]:
(emb.view(-1,6) @ W1).shape

torch.Size([32, 100])

In [63]:
b1.shape

torch.Size([100])

In [64]:
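# Broadcasting: (emb.view(-1, 6) @ W1) has shape (32, 100) and b1 has shape (100,).
# b1 is aligned on the right and treated as (1, 100), then repeated across all 32 rows,
# so the same bias vector is added to every example:
#   32, 100
#    1, 100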

In [68]:
# Output layer: maps the 100 hidden activations to 27 scores, one per vocabulary entry.
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [69]:
logits = h @ W2 + b2

In [70]:
logits.shape

torch.Size([32, 27])

In [72]:
counts = logits.exp()

In [73]:
prob = counts / counts.sum(1, keepdims = True)

In [74]:
prob.shape

torch.Size([32, 27])

In [115]:
prob[0]  # probabilities assigned to each of the 27 characters for sample 0
# prob[0, Y[0]] is the probability given to the expected (correct) next character;
# a value close to 1 means the model predicted that character confidently.

tensor([9.4573e-04, 1.0719e-08, 7.1504e-08, 1.5865e-07, 5.1512e-08, 1.9140e-08,
        7.6289e-04, 8.2651e-03, 2.8533e-04, 1.9996e-01, 1.7398e-06, 7.8940e-01,
        1.3441e-07, 1.3384e-06, 2.8535e-07, 9.5158e-09, 1.2380e-05, 1.8517e-04,
        6.8198e-05, 9.0649e-05, 3.7874e-06, 1.3876e-13, 7.2711e-10, 9.6407e-06,
        3.0877e-10, 8.7183e-08, 2.5344e-06])

In [112]:
# Negative log-likelihood computed by hand: for every row pick the probability
# assigned to its target character, take the log, and average the negatives.
# The row index is derived from Y (instead of a literal 32) so the cell keeps
# working when the dataset holds a different number of examples.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(15.3580)

In [96]:
y = F.one_hot(Y, num_classes= 27)

In [78]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [87]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [82]:
F.cross_entropy(logits, Y)

tensor(15.3580)

In [116]:
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [117]:
# Seeded initialisation of every trainable tensor. The tensors are drawn in the
# order C, W1, b1, W2, b2 from the same generator, so the values are reproducible.
g = torch.Generator().manual_seed(2147483647)
param_shapes = [
    (27, 2),     # C : one 2-d embedding per vocabulary entry
    (6, 100),    # W1: hidden-layer weights
    (100,),      # b1: hidden-layer bias
    (100, 27),   # W2: output-layer weights
    (27,),       # b2: output-layer bias
]
parameters = [torch.randn(shape, generator=g) for shape in param_shapes]
C, W1, b1, W2, b2 = parameters

In [118]:
sum(p.nelement() for p in parameters)  #total parameters

3481

In [119]:
27*2 + 6*100 + 100 + 100*27 + 27

3481

In [121]:
# Forward pass over every example currently held in X.
emb = C[X]                                             # (N, 3, 2): embedding lookup per context character
# Flatten each example's embeddings into a single row. Keeping the batch
# dimension fixed (rather than view(-1, 6)) means a change of context length
# or embedding size raises a shape error instead of silently mixing rows.
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)   # (N, 100)
logits = h @ W2 + b2                                   # (N, 27)
# F.cross_entropy is the fused equivalent of the manual steps
#   counts = logits.exp()
#   prob = counts / counts.sum(1, keepdims=True)
#   loss = -prob[torch.arange(N), Y].log().mean()
# computed in a single call.
loss = F.cross_entropy(logits, Y)
loss

tensor(17.7697)

# torch.nn.functional.cross_entropy()

In [122]:
# Hand-written softmax on logits containing a very large positive entry:
# exp(100) overflows to inf, and inf / inf then produces nan.
logits = torch.tensor([-100, 3, 0, 100])
counts = torch.exp(logits)
probs = counts / torch.sum(counts)
probs

tensor([0., 0., 0., nan])

In [124]:
counts #so that means we can't pass very large logits to the expresion it will lead to inf 

tensor([3.7835e-44, 2.0086e+01, 1.0000e+00,        inf])

In [125]:
# Same hand-written softmax, but with moderate logits: every exp() stays finite,
# so the result is a well-formed probability vector.
logits = torch.tensor([-5, 3, 0, 5])
counts = torch.exp(logits)
probs = counts / torch.sum(counts)
probs

tensor([3.9751e-05, 1.1849e-01, 5.8995e-03, 8.7557e-01])

In [126]:
counts

tensor([6.7379e-03, 2.0086e+01, 1.0000e+00, 1.4841e+02])

In [127]:
# exp() maps large positive inputs to inf, while large negative inputs merely
# underflow towards 0, so only the positive side is dangerous.
# Softmax is unchanged when the same constant is added to every logit, so the
# overflow can be avoided by subtracting the largest logit: afterwards the
# maximum is 0 and every exp() result lies in (0, 1].
# Using logits.max() (rather than a hand-picked constant such as 100) makes the
# shift correct for any input vector.
logits = torch.tensor([-100, 3, 0, 100])
logits = logits - logits.max()
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0.0000e+00, 7.4689e-43, 3.7835e-44, 1.0000e+00])

# ---------------------------------------------

In [165]:
# Build the full dataset: one (context, target) pair per character of every
# word, plus one pair for the terminating '.'.

block_size = 3   # context length: how many characters are used to predict the next one
X, Y = [], []
for w in words:
    # Encode the word once, left-padded with block_size zeros (the '.' id) and
    # followed by the end marker; each training pair is then a sliding window.
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']
    for pos in range(len(w) + 1):
        X.append(padded[pos:pos + block_size])   # the block_size ids before the target
        Y.append(padded[pos + block_size])       # the id to be predicted

X = torch.tensor(X)
Y = torch.tensor(Y)

In [166]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [167]:
# Re-create all trainable tensors from a fresh, seeded generator. Drawing them
# in the fixed order C, W1, b1, W2, b2 keeps the initial values reproducible.
g = torch.Generator().manual_seed(2147483647)
param_shapes = [
    (27, 2),     # C : embedding table
    (6, 100),    # W1: hidden-layer weights
    (100,),      # b1: hidden-layer bias
    (100, 27),   # W2: output-layer weights
    (27,),       # b2: output-layer bias
]
parameters = [torch.randn(shape, generator=g) for shape in param_shapes]
C, W1, b1, W2, b2 = parameters

In [168]:
# Switch on gradient tracking for every parameter tensor so that
# loss.backward() fills in their .grad attributes.
for p in parameters:
    p.requires_grad_(True)

In [188]:
# Mini-batch SGD: 1000 steps, each on a random batch drawn from (X, Y).
batch_size = 32        # examples sampled per optimisation step
learning_rate = 0.1    # SGD step size

for _ in range(1000):

    # mini-batch construct: sample row indices (with replacement) from the dataset
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass
    emb = C[X[ix]]  # (batch_size, 3, 2)
    # Flatten per example; keeping the batch dimension explicit avoids
    # hard-coding the 6 = block_size * embedding-dim product.
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: in-place SGD step under no_grad() so autograd does not record it
    # (the supported replacement for writing through p.data)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

# Loss of the final mini-batch only, not of the full dataset.
print(loss.item())

2.3018555641174316


In [189]:
torch.randint(0,X.shape[0], (32,))

tensor([207650,  48820, 184742,  74151,  87370, 120456, 108412, 139639, 153970,
        111493,    560,   2019,  26536,  42317, 162014,  36741, 123183, 217723,
        171405,  43999,  24071, 197782,  51893,  83002, 200819,  82871, 156836,
        139234,  83021, 220494,   8653, 216284])