# El modelo de bigramas usando una red neuronal

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the names file, one name per line.
# The context manager closes the file handle as soon as it has been read.
with open('../data/domain_names_full.txt', 'r') as f:
    dataset = f.read().splitlines()
dataset[:8]

In [None]:
# How many names do we have?
len(dataset)

In [None]:
# Vocabulary: the '*' marker at index 0, followed by every distinct character
# that appears in the dataset, in sorted order.
charset = ['*', *sorted(set(''.join(dataset)))]
# Lookup tables in both directions: index -> character and character -> index.
itoc = dict(enumerate(charset))
ctoi = {c: i for i, c in itoc.items()}

Vamos a construir un dataset para alimentar la red neuronal.

In [None]:
# Build the training pairs: for every bigram (c1, c2), xs collects the index
# of c1 (the input) and ys collects the index of c2 (the target).
xs, ys = [], []

for d in dataset[:1]:  # only the first name for now
    example = ['*'] + list(d) + ['*']  # wrap the name in '*' markers
    for c1, c2 in zip(example, example[1:]):
        xs.append(ctoi[c1])
        ys.append(ctoi[c2])
        print(c1, c2)

# F.one_hot only accepts integer (long) tensors, not Python lists, so the
# collected indices are converted before they are used by the cells below.
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)

In [None]:
# Inspect the input character indices.
xs

In [None]:
# Inspect the target character indices.
ys

In [None]:
# Shape of the one-hot encoding of xs, with one class per vocabulary entry.
F.one_hot(xs, num_classes=len(charset)).shape

In [None]:
# Number of classes = size of the vocabulary.
nclasses = len(charset)
# One-hot encode the inputs. F.one_hot returns an integer (int64) tensor, so
# it is cast to float: the matrix products with the float weights W need both
# operands to share a dtype.
xenc = F.one_hot(xs, num_classes=nclasses).float()

In [None]:
# Shape of the encoded inputs.
xenc.shape

In [None]:
# Visualize the encoded inputs as an image.
plt.imshow(xenc)

In [None]:
# Remember to make the one_hot float!

# Nuestra "Red Neuronal"

![image.png](attachment:c7e53507-2f34-403f-9819-2d1aaad8ed05.png)

Espero que se acuerden de esto. Lo que vamos a hacer es una red neuronal con una sola capa lineal, sin bias y sin función de activación (es decir, cada neurona calcula solo la suma ponderada de sus entradas, sin término independiente).

In [None]:
# A single column of randomly initialized weights, one weight per input class.
W = torch.randn(nclasses, 1)

In [None]:
# Inspect the weights.
W

In [None]:
xenc @ W  # [n x nclasses] @ [nclasses x 1] ==> [n x 1], with n = number of examples

In [None]:
# One column of randomly initialized weights per output class.
W = torch.randn(nclasses, nclasses)
# [n x nclasses] @ [nclasses x nclasses] ==> [n x nclasses]
torch.matmul(xenc, W)

In [None]:
# Output of the layer for the first example.
(xenc @ W)[0]

In [None]:
# Visualize the raw layer outputs as an image.
plt.imshow(xenc @ W)

La pregunta es: ¿qué es esto? 👆
¿Cómo interpretamos el output de la red neuronal?
- No son probabilidades porque exceden el rango [0, 1]
- No son counts porque son flotantes y hay negativos

Vamos a interpretar esta salida como log(counts) o logits. Para obtener counts podemos exponenciar. Luego, si normalizamos podemos interpretar la salida como probabilidades.

In [None]:
# Treat the layer outputs as log-counts (logits), exponentiate them to get
# positive "counts", then normalize each row so that it sums to 1.
logits = torch.matmul(xenc, W)
counts = logits.exp()
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals

In [None]:
# Predicted distribution over the next character for the first example.
probs[0]

## Pasando en limpio

In [None]:
# Inspect the input character indices.
xs

In [None]:
# Inspect the target character indices.
ys

In [None]:
# Seeded generator so that the weight initialization is reproducible.
g = torch.Generator()
g.manual_seed(42)
# requires_grad=True makes autograd track the operations applied to W.
W = torch.randn(nclasses, nclasses, generator=g, requires_grad=True)

In [None]:
# Forward pass
# The input to the network is the one-hot encoding of xs, cast to float.
xenc = F.one_hot(xs, num_classes=nclasses).float()
# The layer predicts the log(counts).
logits = torch.matmul(xenc, W)
# Exponentiate to obtain the matrix of counts.
counts = logits.exp()
# Normalize each row to obtain the matrix of probabilities.
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals

In [None]:
#probs[0, 20], probs[1,28], probs[2,28], probs[3,20], probs[4, 25] # and so on...

In [None]:
# Negative log-likelihood: the mean of -log(p), where p is the probability
# the model assigns to the correct next character of each example.
# The row index is derived from the number of targets rather than being
# hard-coded, so this keeps working when more examples are added.
loss = -probs[range(len(ys)), ys].log().mean()
loss # .item()

In [None]:
# Backward pass
W.grad = None  # clear any gradient left over from a previous backward pass
loss.backward()  # <-- requires W to be tracking gradients

In [None]:
# Update the weights with one gradient-descent step (learning rate 0.1).
# torch.no_grad() keeps the in-place update out of the autograd graph without
# going through `.data`, which bypasses autograd's in-place safety checks.
with torch.no_grad():
    W += -0.1 * W.grad

In [None]:
# Escriban el training loop y no se olviden de imprimir el loss y agregar más ejemplos.

## Haciendo inferencia con el modelo

In [None]:
# Seeded generator so that the sampled names are reproducible.
g = torch.Generator().manual_seed(42)

# Sampling only reads W; no_grad() avoids building an autograd graph through
# W on every step.
with torch.no_grad():
    for i in range(10):
        out = []
        ix = 0  # index 0 is the '*' marker, used here as the start token
        while True:
            # Same forward pass as in training, for a single input character.
            xenc = F.one_hot(torch.tensor([ix]), num_classes=nclasses).float()
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(axis=1, keepdims=True)
            # Draw the next character index from the predicted distribution.
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            out.append(itoc[ix])
            if ix == 0:  # sampled the '*' marker again: the name is finished
                break
        print(''.join(out[:-1]))  # drop the trailing '*' marker