### Tutorial Pytorch
https://www.youtube.com/watch?v=c36lUUr864M&feature=youtu.be

In [16]:
import torch
import numpy as np

### Tensores a partir de distribucion normal, require grad, reduce mean

In [60]:
std = torch.tensor([1],dtype=torch.int)

In [296]:
x = torch.normal(mean=torch.rand(3,3),std=1)
#x.requires_grad=True
x

tensor([[ 0.2908,  2.3880, -1.7087],
        [ 2.8463, -1.5386,  0.6895],
        [ 1.5419,  2.9075, -0.7003]])

In [87]:
x.mean()
#x.mean().item()

0.621843159198761

## Autograd
Al ejecutar un forward se tiene un puntero a una funcion Add Backward que calcula las gradientes para el backward prop

In [252]:
x = torch.randn(3, requires_grad=True)

In [253]:
y = x + 2

In [254]:
print(y)

tensor([ 1.3058,  2.9263, -0.5883], grad_fn=<AddBackward0>)


In [255]:
z = y * y * 2
z

tensor([ 3.4103, 17.1270,  0.6922], grad_fn=<MulBackward0>)

In [256]:
z=z.mean()
z

tensor(7.0765, grad_fn=<MeanBackward0>)

In [257]:
x.grad #Se afecta cada vez que se corre un backward

In [258]:
#Altera la gradiente de x
z.backward() #dz/dx, Tambien se le puede pasar un vector de las mismas dims que x. 

In [259]:
x.grad

tensor([ 1.7411,  3.9018, -0.7844])

In [162]:
y.mean().backward() #dy/dx

In [163]:
x.grad 

tensor([1.9076, 3.9108, 3.7136])

#### Para que Pytorch no calcula gradientes:
Al hacer update a los weights, no se necesita calcular la gradiente

In [169]:
#1. requires_grad_(False)
#2. x.detatch(), crea nuevo tensor que no requiere gradiente
#3. with torch.no_grad():

In [172]:
x = torch.randn(3, requires_grad=True)

In [164]:
x.requires_grad_(False)
x

tensor([-0.8193,  0.6831,  0.5352])

In [171]:
x.detach()

tensor([-0.1021, -2.7926, -0.7338])

In [173]:
with torch.no_grad():
    y = x+2
    print(y) #no tiene funcion de gradiente

tensor([0.7901, 1.3859, 2.7067])


#### Training example EMPTY GRADIENTS on every step

In [184]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_() #EMPTY GRADIENTS BEFORE NEXT OPERATION

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


In [186]:
optimizer = torch.optim.SGD([weights], lr=1e-3)
optimizer.step()
optimizer.zero_grad() #EMPTY GRADIENTS BEFORE NEXT OPERATION

### Backprop

#### Optimizer

In [265]:
import torch.nn as nn

In [266]:
states = torch.normal(torch.rand(3,4))
actions = torch.randint(1,3,(3,))

In [267]:
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.NLLLoss() #Requiere de LogSoftmax,

In [268]:
criterion1(states,actions)

tensor(2.4873)

In [269]:
criterion2(nn.LogSoftmax(dim=1)(states),actions)

tensor(2.4873)

# Simulacion de modelo

In [579]:
input_tensor = torch.randint(0,3,(3,5))
weights = torch.rand(125,3,requires_grad=True)
bias = torch.rand(125,5,requires_grad=True)

In [580]:
weights

tensor([[0.8402, 0.0209, 0.1973],
        [0.9238, 0.3832, 0.5293],
        [0.8765, 0.7666, 0.2490],
        [0.3489, 0.3641, 0.1139],
        [0.4584, 0.3827, 0.6041],
        [0.8989, 0.3235, 0.3466],
        [0.8363, 0.0217, 0.2004],
        [0.2159, 0.1633, 0.6151],
        [0.0327, 0.7379, 0.6648],
        [0.5941, 0.6473, 0.5944],
        [0.4555, 0.8625, 0.4804],
        [0.3443, 0.5762, 0.2929],
        [0.0721, 0.1281, 0.0981],
        [0.8526, 0.3860, 0.6288],
        [0.3863, 0.2539, 0.3333],
        [0.7756, 0.2616, 0.8300],
        [0.5350, 0.5499, 0.6215],
        [0.5019, 0.3792, 0.2121],
        [0.2504, 0.2082, 0.0376],
        [0.0478, 0.6091, 0.0066],
        [0.8672, 0.4213, 0.7951],
        [0.5051, 0.3295, 0.6581],
        [0.2551, 0.7155, 0.9383],
        [0.6676, 0.4711, 0.8421],
        [0.8301, 0.3037, 0.3247],
        [0.2655, 0.8930, 0.3116],
        [0.4302, 0.7586, 0.7549],
        [0.3475, 0.5600, 0.8265],
        [0.5053, 0.6391, 0.1856],
        [0.040

In [585]:
#Forward
logits = (weights @ input_tensor.float())+bias #Tiene grad_fn
logits.size()

torch.Size([125, 5])

In [586]:
#Funcion de perdida
actions = torch.randint(0,5,(125,))


criterion = nn.CrossEntropyLoss(reduction='none')
loss = criterion(logits,actions)
loss = loss.mean()
loss.backward() #update gradients on trainable tensors

In [587]:
#Optimizer
optimizer = torch.optim.Adam([weights,bias],1e-3)
optimizer.step()
optimizer.zero_grad()

In [588]:
weights

tensor([[0.8392, 0.0199, 0.1963],
        [0.9228, 0.3842, 0.5283],
        [0.8755, 0.7656, 0.2480],
        [0.3479, 0.3631, 0.1129],
        [0.4574, 0.3817, 0.6031],
        [0.8979, 0.3245, 0.3456],
        [0.8353, 0.0207, 0.1994],
        [0.2149, 0.1623, 0.6141],
        [0.0337, 0.7369, 0.6638],
        [0.5931, 0.6463, 0.5934],
        [0.4545, 0.8615, 0.4794],
        [0.3433, 0.5772, 0.2919],
        [0.0711, 0.1271, 0.0971],
        [0.8516, 0.3850, 0.6298],
        [0.3853, 0.2549, 0.3323],
        [0.7746, 0.2606, 0.8310],
        [0.5340, 0.5509, 0.6205],
        [0.5009, 0.3802, 0.2111],
        [0.2514, 0.2072, 0.0366],
        [0.0468, 0.6081, 0.0076],
        [0.8662, 0.4203, 0.7961],
        [0.5061, 0.3285, 0.6591],
        [0.2561, 0.7145, 0.9373],
        [0.6686, 0.4701, 0.8411],
        [0.8291, 0.3027, 0.3237],
        [0.2645, 0.8920, 0.3126],
        [0.4312, 0.7576, 0.7539],
        [0.3465, 0.5590, 0.8255],
        [0.5063, 0.6381, 0.1846],
        [0.039

#### Sample from distribution

In [567]:
logits.size()
torch.distributions.Categorical(logits).sample()

tensor([4, 1, 0, 0, 3, 2, 0, 1, 0, 0, 2, 1, 1, 2, 1, 2, 2, 2, 3, 2, 1, 2, 4, 3,
        4, 1, 4, 4, 1, 1, 1, 4, 1, 1, 1, 4, 2, 1, 2, 2, 2, 3, 3, 2, 2, 4, 3, 1,
        2, 4, 2, 2, 3, 4, 3, 3, 1, 3, 2, 3, 3, 1, 0, 1, 0, 3, 3, 1, 2, 1, 1, 3,
        1, 3, 3, 4, 2, 1, 2, 3, 4, 2, 0, 3, 4, 2, 3, 3, 2, 4, 2, 1, 0, 1, 0, 0,
        1, 4, 3, 1, 2, 1, 4, 3, 4, 1, 3, 4, 3, 0, 1, 0, 0, 3, 3, 1, 1, 1, 4, 3,
        1, 4, 3, 4, 3])

In [526]:
torch.rand((3,))

tensor([0.4938, 0.9887, 0.2181])

In [527]:
m = torch.distributions.Categorical(torch.randint(1,5,(3,)))
m

Categorical(probs: torch.Size([3]))

In [566]:
m.sample().item()

0