In [5]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Read every name from the dataset file, one name per line.
# The context manager closes the file handle as soon as the read finishes;
# a bare open(...).read() leaves the handle open until garbage collection.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
# Preview the first few names.
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [7]:
len(words)

32033

In [8]:
# Vocabulary: every distinct character found in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1; index 0 is reserved for the '.' marker
# that pads the start of a context and terminates every name.
string_to_index = dict(zip(chars, range(1, len(chars) + 1)))
string_to_index['.'] = 0
# Reverse mapping (index -> character), used to decode model output.
index_to_string = {idx: ch for ch, idx in string_to_index.items()}
print(index_to_string)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [98]:
# Build the training set: each example is a window of `block_size` character
# indices (X) together with the index of the character that follows it (Y).
block_size = 3  # context length: how many characters are used to predict the next one
X, Y = [], []   # X: network inputs, Y: label for each example in X

for w in words:
    # Encode the word once: `block_size` leading '.' (index 0) as padding,
    # then the characters of the word, then the terminating '.'.
    encoded = [0] * block_size + [string_to_index[ch] for ch in w] + [string_to_index['.']]
    # Slide a window over the encoded word; the element right after the
    # window is the target. A word of length n yields n + 1 examples.
    for start in range(len(w) + 1):
        X.append(encoded[start:start + block_size])
        Y.append(encoded[start + block_size])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [99]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [11]:
# Embedding table: 27 rows (one per character index) x 2 embedding values, randomly initialised.
C = torch.randn((27,2))

In [12]:
# Implementing the embedding.
# An embedding converts raw data (words, tokens, images, categories, IDs, positions, nodes, etc.)
# into a vector of real numbers that a neural network can work with.
# Neural networks operate only on real-valued tensors, not on text, categories, IDs, words or tokens.
# Most raw data is not numeric, so it has to be converted into vectors that capture its meaning.
# The embedding matrix is initialised randomly:
# 27 rows (one per character) and 2 columns, i.e. each character is represented in 2 dimensions
# instead of the 30 dimensions used in the Bengio paper.
C = torch.randn((27, 2))

In [13]:
C

tensor([[-6.0655e-01, -1.0747e+00],
        [ 5.1443e-01, -1.7338e+00],
        [-2.7787e-01,  1.8458e-03],
        [-3.3391e-01,  8.4746e-02],
        [-1.9110e+00, -1.7315e-02],
        [ 1.0294e+00, -4.5334e-02],
        [-5.8614e-01,  3.4161e-02],
        [ 1.0532e+00,  6.2052e-01],
        [ 3.9708e-01,  2.6009e-01],
        [ 4.2247e-02,  4.7044e-03],
        [-1.1675e+00, -9.6438e-01],
        [-4.8728e-01, -2.1725e-01],
        [ 4.2607e-01,  1.4916e+00],
        [-9.4453e-01,  5.2975e-02],
        [-1.2980e+00, -5.8925e-01],
        [-1.0169e+00,  2.7930e-01],
        [ 7.9689e-01,  3.7111e-01],
        [-7.3043e-01, -9.2917e-01],
        [ 1.5555e+00,  8.9405e-02],
        [-1.6406e-01,  9.4610e-02],
        [ 1.2589e-01,  9.6658e-02],
        [ 7.9875e-01, -3.3036e-02],
        [-6.6306e-01,  1.4322e-02],
        [ 1.6244e+00, -1.1766e+00],
        [ 1.3398e+00, -2.1882e+00],
        [-2.4261e-01,  1.8333e-01],
        [-3.2957e-01,  8.8155e-01]])

In [14]:
#example of embeding '5'
C[5] #C[X]

tensor([ 1.0294, -0.0453])

In [15]:
# Same lookup expressed as a matrix multiplication: one-hot encode index 5 and multiply by C,
# which selects row 5 of C (the result matches C[5]).
# This encode-then-multiply approach is the one used in the other file, makemore_from_scratch.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([ 1.0294, -0.0453])

![Bengio_Neruo_Model.png](attachment:e1004855-6138-4c88-b278-7f7bb623aa43.png)

In [16]:
#In the model diagram, each input index is embedded by looking it up in the matrix C; there are two equivalent ways to do that:
#use the index itself to select the matching row of C directly,
#or one-hot encode the input and feed it to a first layer whose weight matrix is C

In [17]:
C[[5,6,7]]

tensor([[ 1.0294, -0.0453],
        [-0.5861,  0.0342],
        [ 1.0532,  0.6205]])

In [18]:
C[torch.tensor([5, 6, 7, 7, 7])]

tensor([[ 1.0294, -0.0453],
        [-0.5861,  0.0342],
        [ 1.0532,  0.6205],
        [ 1.0532,  0.6205],
        [ 1.0532,  0.6205]])

In [19]:
C[X].shape

torch.Size([32, 3, 2])

In [28]:
print(C[X[0]])
C[X][:1]

tensor([[-0.6065, -1.0747],
        [-0.6065, -1.0747],
        [-0.6065, -1.0747]])


tensor([[[-0.6065, -1.0747],
         [-0.6065, -1.0747],
         [-0.6065, -1.0747]]])

In [29]:
# Embed every context character in one indexing operation: the result has the shape of X
# plus a trailing embedding dimension, i.e. (number of examples, 3, 2).
# NOTE(review): the recorded output shows 32 examples, while the dataset cell reports
# X.shape == (228146, 3); the output looks stale from a run on a smaller X - re-run to confirm.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

![Bengio_Neruo_Model.png](attachment:679858ab-d1f3-47a5-9fc2-fec4069c6127.png)

In [30]:
# Hidden layer parameters.
# Each example reaches this layer as 3 context positions with 2 embedding values each, e.g.
#   [[-0.6065, -1.0747],
#    [-0.6065, -1.0747],
#    [-0.6065, -1.0747]]
# which gives 3 * 2 = 6 inputs per example. The number of neurons (100) is a free choice.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [53]:
#goal: compute emb @ W1 + b1, but the shapes are (32, 3, 2) @ (6, 100), so the trailing (3, 2) dimensions must be flattened into 6
#torch offers many tensor-reshaping functions, but we need to pick one
#that fits our problem
#print(emb[0])
#print(emb[0,0])
#print(emb[0,0,0])
#the slice below selects, for every example, all embedding values of the first block (context position)
#emb[:,0,:]
#second block
#emb[:,1,:]
#third block
#emb[:,2,:]

In [52]:
#torch.cat can join the three slices along dimension 1, giving a single row of 6 inputs per example
#torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]], dim=1).shape

torch.Size([32, 6])

In [54]:
# Listing the three slices by hand stops working as soon as the block size changes.
# torch.unbind(emb, 1) splits emb along dimension 1 into a tuple with one tensor per
# context position, so concatenating that tuple along dimension 1 works for any block size.
torch.cat(torch.unbind(emb,1),1).shape

torch.Size([32, 6])

In [55]:
# There is an easier way, which relies on how PyTorch physically stores a tensor:
# whatever its shape, the values sit in one flat vector in memory, e.g.
#
#   2 3
#   1 5  => physical addresses in a single vector | 0x01 | 0x02 | 0x03 | ....
#
# which lets the library (PyTorch is powered by C++ and C) present the same values
# under different shapes.
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [56]:
a.shape

torch.Size([18])

In [59]:
#a.view(2,9)
#a.view(6,3)
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [60]:
#a.storage() #don't run only in linux

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [68]:
emb.shape

torch.Size([32, 3, 2])

In [64]:
#emb.view(32,6)

In [65]:
#emb.view(32,6) == torch.cat(torch.unbind(emb,1),1) 

In [73]:
# Back to the network: activations of the hidden layer.
# emb is flattened so that each example becomes one row whose width equals the number of
# inputs W1 expects (W1.shape[0]); reading the width from W1 instead of hard-coding 6
# keeps this cell valid when the block size or the embedding size changes.
# Broadcasting check: (emb_flat @ W1) has shape (N, 100) and b1 has shape (100,), so b1 is
# aligned with the last dimension and added to every row, which is what we want.
# Earlier shape-specific attempts (pre-activation only):
#   h = emb.view(emb.shape[0], 6) @ W1 + b1
#   h = emb.view(32, 6) @ W1 + b1
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
h

tensor([[ 0.9999, -0.2070, -0.8008,  ...,  0.7711, -0.9996,  0.7944],
        [ 0.9959,  0.9470, -0.8991,  ..., -0.9942, -0.9955, -0.7588],
        [ 0.9755,  0.8093, -0.7147,  ...,  0.8375, -0.9965,  0.9720],
        ...,
        [ 0.2114,  0.7676, -0.7765,  ...,  0.9807, -0.9988, -0.9111],
        [-0.9600, -0.7437,  0.7362,  ..., -0.8490, -0.5189, -0.9944],
        [ 0.9568, -0.9988,  0.9682,  ...,  0.3394, -0.9875, -0.9980]])

In [70]:
h.shape

torch.Size([32, 100])

In [74]:
# Output layer parameters: 100 hidden activations in, 27 scores out (one per character).
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [75]:
# Output layer: one unnormalised score (logit) per character for every example; b2 is broadcast over the rows.
logits = h @ W2 + b2

In [76]:
# First half of a manual softmax: exponentiate the logits to get positive "counts".
# NOTE: exp() overflows to inf for large logits; the training cells further down use
# F.cross_entropy instead of this manual computation.
counts = logits.exp()

In [77]:
# Second half of the softmax: divide each row by its sum so it becomes a probability distribution.
# keepdim=True keeps the row sums as a column, so the division broadcasts row by row.
probs = counts / counts.sum(1, keepdim = True)

In [78]:
probs.shape

torch.Size([32, 27])

In [82]:
# For every example, extract the probability the model assigns to the correct next character.
# Y holds the target indices, e.g. [ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19, ...
# Indexing with (row indices, Y) pairs row i with column Y[i], e.g.
#   probs[0] => [4.7634e-11, 1.1123e-10, 1.3706e-09, 4.7828e-07, 5.7764e-09, 2.1464e-02, ...
#   Y[0] = 5  =>  the selected value is probs[0, 5] = 2.1464e-02
# and the selected values come back as a flat tensor with one entry per example.
# Hard-coded form for 32 examples: probs[torch.arange(32), Y]
probs[torch.arange(emb.shape[0]), Y]

tensor([2.1464e-02, 1.6045e-15, 1.3362e-10, 3.0985e-06, 1.9882e-06, 9.2105e-09,
        5.6723e-11, 8.0088e-10, 6.0361e-04, 3.8133e-03, 2.2670e-07, 1.8580e-09,
        1.1123e-10, 5.3338e-01, 4.8908e-17, 1.0240e-05, 1.8266e-12, 3.0048e-12,
        8.4249e-08, 1.0652e-09, 4.0292e-12, 6.6665e-05, 8.0360e-13, 1.6879e-04,
        9.9269e-09, 8.1054e-09, 1.5219e-06, 3.3675e-07, 3.7858e-12, 3.9475e-04,
        1.2713e-08, 1.4843e-08])

In [None]:
#awful probabilities, but that is because the network has not been trained yet

In [83]:
# ------- now made respectable -----

In [100]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [135]:
# Re-create every parameter from a fixed seed so the run is reproducible.
# The tensors are drawn in a fixed order (C, W1, b1, W2, b2) from the same generator;
# changing that order would change the values each one receives.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)       # embedding table: 27 characters x 2 dimensions
W1 = torch.randn(6, 100, generator=g)     # hidden layer weights: 6 inputs -> 100 neurons
b1 = torch.randn((100,), generator=g)     # hidden layer bias
W2 = torch.randn(100, 27, generator=g)    # output layer weights: 100 neurons -> 27 logits
b2 = torch.randn((27,), generator=g)      # output layer bias
parameters = [C, W1, b1, W2, b2]

In [136]:
# Report the total number of trainable values, then switch on gradient tracking
# so that loss.backward() fills in .grad for each parameter.
total_params = sum(p.numel() for p in parameters)
print(total_params)
for p in parameters:
    p.requires_grad_(True)

3481


In [106]:
# Full-batch gradient descent: every step runs the forward and backward pass on the whole dataset.
for i in range(10):
    # forward pass
    emb = C[X]  # (N, 3, 2) with N = X.shape[0]: the full dataset, not a 32-example batch
    # flatten each example to the number of inputs W1 expects, instead of hard-coding 6
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    # F.cross_entropy replaces the manual softmax + negative log-likelihood shown below:
    #  - the forward and backward passes are more efficient
    #  - it is numerically better behaved: exp() grows very fast and overflows to inf for
    #    large logits, and once inf enters the loss the backward pass is ruined
    # https://docs.pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    #counts = logits.exp()
    #prob = counts / counts.sum(1, keepdim = True)
    #loss = -prob[torch.arange(X.shape[0]), Y].log().mean()
    loss = F.cross_entropy(logits, Y)
    print(f"{i+1}. {loss.item()}")
    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # update: in-place SGD step under no_grad, so autograd does not record it
    # (preferred over mutating p.data, which bypasses autograd's safety checks)
    with torch.no_grad():
        for p in parameters:
            p -= 0.1 * p.grad



1. 10.709595680236816
2. 10.407641410827637
3. 10.12781810760498
4. 9.864374160766602
5. 9.614513397216797
6. 9.376447677612305
7. 9.148953437805176
8. 8.931118965148926
9. 8.72223949432373
10. 8.521756172180176


In [107]:
# Optimisation: work in mini-batches.
# Instead of the whole dataset, each step runs the forward and backward pass on a small portion of it.
# Here: draw 32 random row indices into X (independent draws, so an index can repeat).
torch.randint(0, X.shape[0],(32,))

tensor([133077, 193434, 126467,  46227, 146605,  95573,  43700,  38860, 161254,
        182383, 165088,  55778, 184838,  79282, 101792, 168319, 126841,  74712,
         94155, 126954, 147702, 225471,  62071, 216088, 205408, 132997, 191478,
        190723,  39904, 223118, 201681, 187901])

In [117]:
# Mini-batch gradient descent: 1000 steps, each on 32 randomly chosen examples.
for i in range(1000):
    # minibatch construct
    ix = torch.randint(0, X.shape[0], (32,))
    # forward pass
    emb = C[X[ix]]  # (32, 3, 2)
    # flatten each example to the number of inputs W1 expects, instead of hard-coding 6
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # update: in-place SGD step under no_grad, so autograd does not record it
    # (preferred over mutating p.data, which bypasses autograd's safety checks)
    with torch.no_grad():
        for p in parameters:
            p -= 0.1 * p.grad
# Loss of the LAST mini-batch only: a noisy estimate, not the loss over the whole dataset.
print(f"{loss.item()}")

2.4506287574768066


In [137]:
# Candidate learning rates for the learning-rate search.
# The exponents are spaced evenly between -3 and 0 (1000 values), so the rates
# themselves are spaced exponentially between 10**-3 = 0.001 and 10**0 = 1.
# https://docs.pytorch.org/docs/stable/generated/torch.linspace.html
learning_rate_exponencial = torch.linspace(start=-3, end=0, steps=1000)
learning_rates = 10 ** learning_rate_exponencial

In [144]:
lri = []    # learning-rate exponents tried (only filled when the tracking lines below are enabled)
lossi = []  # loss recorded at each step (only filled when the tracking lines below are enabled)

# Choosing the learning rate:
#  1. sweep: use `lr = learning_rates[i]` inside the loop and enable the tracking lines.
#     learning_rates holds 1000 values, so the sweep must run range(1000) steps;
#     with range(10000) the index would go out of bounds.
#  2. once a good learning rate is found, train with it: lr = 0.1
#  3. learning-rate decay: when the loss starts to plateau, LOWER the learning rate
#     by a factor of 10 (0.1 -> 0.01) for the final steps.
lr = 0.01

for i in range(10000):
    # minibatch construct
    ix = torch.randint(0, X.shape[0], (32,))
    # forward pass
    emb = C[X[ix]]  # (32, 3, 2)
    # flatten each example to the number of inputs W1 expects, instead of hard-coding 6
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    #print(f"{i+1} => {loss.item()}")
    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()
    # update: in-place SGD step under no_grad, so autograd does not record it
    # (preferred over mutating p.data, which bypasses autograd's safety checks)
    #lr = learning_rates[i]
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track stats, to plot the loss against the learning-rate exponent during the sweep
    #lri.append(learning_rate_exponencial[i])
    #lossi.append(loss.item())

# Loss of the LAST mini-batch only: a noisy estimate, not the loss over the whole dataset.
print(f"{loss.item()}")

2.6983728408813477


In [145]:
#plt.plot(lri, lossi)