In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the dataset: one name per line.
# The context manager closes the file handle as soon as the text has been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# character vocabulary: every distinct character in the dataset, sorted
chars = sorted(set(''.join(words)))
# character -> index; indices start at 1 because 0 is reserved for '.'
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
# index -> character (inverse of stoi)
itos = {i: s for s, i in stoi.items()}

### Example

In [3]:
# Build the (context, next character) training pairs from the first three names.

block_size = 3 # context length: how many characters we take in order to predict the next one
X, Y = [], []
for name in words[:3]:
    print(name)
    # every name starts from a window filled with index 0, i.e. '.'
    window = [0] * block_size
    for symbol in name + '.':
        target = stoi[symbol]
        X.append(window)
        Y.append(target)
        shown = ''.join(itos[i] for i in window)
        print(shown, '-->', itos[target])
        # slide the window one character to the right (builds a new list,
        # so the list already stored in X is not modified)
        window = window[1:] + [target]

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .


In [4]:
# each row X[i] holds one context window of `block_size` character indices,
# and Y[i] is its target label: the index of the character that follows the context
X.shape, Y.shape

(torch.Size([16, 3]), torch.Size([16]))

### Lookup table
We want to embed the 27 characters of our vocabulary (the 26 letters plus the `.` start/end marker) into $\mathbb{R}^2$. We do this with a lookup table.

In [5]:
# lookup table with 27 rows and 2 columns: row i is the 2-dimensional embedding of character index i
C = torch.randn((27, 2))

### Pytorch Indexing
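To do this, we can index a PyTorch tensor with a list or with another integer tensor. The result has the shape of the index followed by the remaining dimensions of the indexed tensor (for example, `C[X]` has shape `[16, 3, 2]` when `X` has shape `[16, 3]`).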

In [6]:
# indexing with a list of integers returns the corresponding rows of C
print(C[[1, 2, 3]], '\n') 
# the first three contexts, as rows of character indices
print(X[:3], '\n')
# indexing with the integer tensor X replaces every index by its row of C
print(C[X][:3])

tensor([[-1.3316,  2.0169],
        [ 0.5963,  1.0341],
        [-0.4038, -0.6699]]) 

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13]]) 

tensor([[[-1.7021, -0.0617],
         [-1.7021, -0.0617],
         [-1.7021, -0.0617]],

        [[-1.7021, -0.0617],
         [-1.7021, -0.0617],
         [-0.4512, -0.4796]],

        [[-1.7021, -0.0617],
         [-0.4512, -0.4796],
         [ 1.6121, -0.6347]]])


In [7]:
# a 1-D index tensor with three entries selects three rows of C,
# so the result has one row of 2 values per index
_t = torch.tensor((1, 2, 3))
print(_t.shape)
print(C[_t].shape)

torch.Size([3])
torch.Size([3, 2])


In [8]:
# Similar to the bigram NN: multiplying a one-hot row vector by C just picks out
# the row of C selected by the one-hot encoding, so C can be thought of as the
# weight matrix of a NN layer applied to one-hot inputs.
# A bare expression in the middle of a cell is evaluated and then discarded
# (only the last expression of a cell is displayed), so the equivalence is
# checked explicitly here instead.
assert torch.allclose(F.one_hot(torch.tensor(3), num_classes=27).float() @ C, C[3])

# As shown above, we can map the domain (27 chars) to R^2 with C and pytorch indexing.
# Note that C (27 x 2) is not an invertible matrix; the point of this check is that
# looking a character up through C[X] gives exactly the row of C for that
# character's index: X[13, 2] is index 1 ('a'), so C[X][13, 2] is the same row as C[1].
print(itos[X[13, 2].item()])
print(X[13, 2], C[X][13, 2], C[1])


a
tensor(1) tensor([-1.3316,  2.0169]) tensor([-1.3316,  2.0169])


In [9]:
# embed every context at once: each character index in X is replaced by its row of C
emb = C[X]
emb.shape

torch.Size([16, 3, 2])

---
### View
Here, we try to manipulate the shape of our tensor `emb`, so that we can use it in the next layer which takes as input, a `[x, 6]` tensor

In [10]:
# manually slice out the embeddings at each of the three context positions
M = emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]
print(M[0].shape)

# torch.unbind removes the specified dimension from a given tensor
# and returns a tuple of the slices along that dimension (each slice no longer has the removed dimension).
#    basically, torch.unbind(n, 1) generalizes the manual slicing above
# eg: if n.shape = [16, 3, 2], then torch.unbind(n, 1) is a tuple of 3 tensors of size [16, 2]
# where the first entry in the tuple is n[:, 0, :] and so on.
N = torch.unbind(emb, 1)
print(N[0].shape)

torch.Size([16, 2])
torch.Size([16, 2])


In [11]:
# In our case, torch.unbind(emb, 1)[0] corresponds to the embedded values of
# the leftmost character of every context in X;
# likewise, torch.unbind(emb, 1)[1] holds the embedded values of the second characters
# and torch.unbind(emb, 1)[2] holds the embedded values of the rightmost characters
print(emb[:, 0, :])
print(emb[:, 0, :].shape)

tensor([[-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [-0.4512, -0.4796],
        [ 1.6121, -0.6347],
        [-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [ 0.6503, -1.3409],
        [ 0.8245,  0.1784],
        [-1.3059,  0.0724],
        [ 1.5494,  0.2942],
        [-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [-1.7021, -0.0617],
        [-1.3316,  2.0169]])
torch.Size([16, 2])


In [12]:
# concatenate the embedded values of each character along a specified dimension
# eg: if n.shape, m.shape = [16, 2], then
# torch.cat((n, m), 1).shape = [16, 4]
# torch.cat((n, m), 0).shape = [32, 2]
# here the three per-position slices of emb are joined side by side, one row per context
t = torch.cat(torch.unbind(emb, 1), 1)
t, t.shape

(tensor([[-1.7021, -0.0617, -1.7021, -0.0617, -1.7021, -0.0617],
         [-1.7021, -0.0617, -1.7021, -0.0617, -0.4512, -0.4796],
         [-1.7021, -0.0617, -0.4512, -0.4796,  1.6121, -0.6347],
         [-0.4512, -0.4796,  1.6121, -0.6347,  1.6121, -0.6347],
         [ 1.6121, -0.6347,  1.6121, -0.6347, -1.3316,  2.0169],
         [-1.7021, -0.0617, -1.7021, -0.0617, -1.7021, -0.0617],
         [-1.7021, -0.0617, -1.7021, -0.0617,  0.6503, -1.3409],
         [-1.7021, -0.0617,  0.6503, -1.3409,  0.8245,  0.1784],
         [ 0.6503, -1.3409,  0.8245,  0.1784, -1.3059,  0.0724],
         [ 0.8245,  0.1784, -1.3059,  0.0724,  1.5494,  0.2942],
         [-1.3059,  0.0724,  1.5494,  0.2942, -1.3059,  0.0724],
         [ 1.5494,  0.2942, -1.3059,  0.0724, -1.3316,  2.0169],
         [-1.7021, -0.0617, -1.7021, -0.0617, -1.7021, -0.0617],
         [-1.7021, -0.0617, -1.7021, -0.0617, -1.3316,  2.0169],
         [-1.7021, -0.0617, -1.3316,  2.0169,  1.5494,  0.2942],
         [-1.3316,  2.016

In [13]:
# a small 1-D tensor holding 0..17, stored as one byte per element (uint8)
a = torch.arange(18, dtype=torch.uint8)
a, a.shape

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
        dtype=torch.uint8),
 torch.Size([18]))

In [14]:
# the same 18 elements viewed with two different shapes: 2 x 9 and 3 x 2 x 3
a.view(2, 9).tolist(), a.view(3, 2, 3).tolist()

([[0, 1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16, 17]],
 [[[0, 1, 2], [3, 4, 5]],
  [[6, 7, 8], [9, 10, 11]],
  [[12, 13, 14], [15, 16, 17]]])

`torch.Tensor` has the `.view()` method, which efficiently reinterprets a tensor with a new shape. 
This can be done efficiently because the underlying data of a tensor is stored in a one-dimensional array. 
All `.view()` has to do is change the shape and the strides.
So `.view()` is preferable to the `unbind`/`cat` method above: it returns a new tensor that shares the same underlying storage, so no data is copied. (It is not an in-place operation; the original tensor keeps its shape.)

In [15]:
# Tensor.untyped_storage() exposes the tensor's underlying buffer as raw bytes.
# `a` was created with dtype=torch.uint8 (one byte per element), which is why
# the bytes listed here coincide with the element values of `a`.
a.untyped_storage().tolist()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [16]:
# Flattening each context's embeddings with .view() gives the same values as the
# unbind + cat approach. The number of rows is taken from emb itself and the
# flattened width is inferred with -1, so the check keeps working when the
# number of examples changes.
emb.view(emb.shape[0], -1) == torch.cat(torch.unbind(emb, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])

---

In [17]:
# first hidden layer: 6 inputs per example -> 100 tanh units
W1 = torch.randn(6, 100)
b1 = torch.randn(100)
# flatten each example's embeddings into a row of 6 values;
# passing -1 lets pytorch infer the number of rows from the data
flat_emb = emb.view(-1, 6)
pre_activation = flat_emb @ W1 + b1
h = torch.tanh(pre_activation)
h.shape

torch.Size([16, 100])

In [18]:
# BROADCASTING
# (emb.view(-1, 6) @ W1).shape = [16, 100]
# b1.shape                     = [100]
# --> 16, 100
#      1, 100
# so broadcasting rules dictate that b1 is added to every row of the matrix product, which is what we want.

In [19]:
# output layer: maps the 100 hidden activations to 27 logits per example
W2 = torch.randn(100, 27)
b2 = torch.randn(27)
logits = h @ W2 + b2
# softmax by hand: exponentiate the logits, then normalize each row
counts = torch.exp(logits)
# sum over dimension 1, i.e. across the 27 entries of each row; keepdims keeps
# the result as a column so that broadcasting divides every row of `counts`
# by its own total
row_totals = counts.sum(1, keepdims = True)
# `prob` contains the probability of each character appearing for each input X[i]
prob = counts / row_totals
print(row_totals.shape)
print(prob.shape)
print(prob[0].sum())

torch.Size([16, 1])
torch.Size([16, 27])
tensor(1.0000)


In [20]:
# probability the model assigns to the correct (target) label Y[i] of each example:
# row i of `prob` is indexed at column Y[i]
# the numbers aren't looking good because the NN is untrained.
prob[torch.arange(prob.shape[0]), Y]

tensor([2.5146e-09, 1.5124e-07, 2.7743e-11, 8.9618e-10, 6.5665e-03, 2.8617e-08,
        1.6862e-12, 2.5899e-13, 6.8110e-05, 3.9547e-02, 1.2316e-10, 1.3935e-11,
        6.2849e-05, 8.9503e-20, 3.3345e-04, 1.9316e-06])

In [21]:
# negative log-likelihood: the mean over all examples of
# -log(probability assigned to the correct label)
loss = -prob[torch.arange(prob.shape[0]), Y].log().mean()
loss

tensor(18.4043)

In [22]:
# built-in equivalent of the manual softmax + negative log-likelihood computed above;
# it takes the raw logits, not the probabilities
F.cross_entropy(logits, Y)

tensor(18.4043)

The loss calculation above is a very common one: it is just **classification**, if you think about it. 
Hence torch has a built-in function, `F.cross_entropy()`, for this computation, and it is much more efficient than our implementation.
1. It computes the loss with much more efficient kernels that fuse multiple mathematical operations together, reducing the runtime and avoiding intermediate tensors.
2. The backward pass of `F.cross_entropy()` is also more efficient than our calculations above, because the derivative of the fused operations often simplifies into a less computationally intensive expression (as in this case).
3. `F.cross_entropy()` is more numerically stable, in that it handles extreme logit values better than our approach.

In [23]:
# exponentiating a large positive logit overflows to inf,
# and normalizing then produces inf / inf = nan
t_logits = torch.tensor([-5, -3, 0, 100])
t_counts = torch.exp(t_logits)
t_total = t_counts.sum()
t_probs = t_counts / t_total
t_probs

tensor([0., 0., 0., nan])

In [26]:
# Torch overcomes the overflow by first noting that, since probabilities are normalized,
# we can offset the logits by any constant without changing the probabilities.
t_raw = torch.tensor([-5, -3, 0, 100])

# Subtracting the largest logit makes the maximum 0, so exp() can never overflow.
# The offset is derived from the data instead of being hard-coded.
t_logits = t_raw - t_raw.max()
t_counts = t_logits.exp()
t_probs = t_counts / t_counts.sum()

# A different offset yields the same distribution (up to float rounding).
t_logits2 = t_raw - 94
t_counts2 = t_logits2.exp()
t_probs2 = t_counts2 / t_counts2.sum()

# Compare with a tolerance: the two results come from different floating-point
# computations, so exact equality is not guaranteed in general.
torch.isclose(t_probs, t_probs2)

tensor([True, True, True, True])

<div>
    <img src="mlp.png" width="700"/>
</div>