In [1]:
import torch 

# Tensor basics

## Operations

In [18]:
# torch.empty only allocates storage and never writes to it, so the value
# shown below is arbitrary leftover memory and will differ between runs.
x = torch.empty((1,))
x

tensor([6.])

In [24]:
# A 1x10 row in which every entry is 2.0.
x = torch.full((1, 10), 2.0)
x

tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2.]])

In [25]:
# ones_like builds a tensor with the same shape as x, filled with 1.
y = torch.ones_like(x)
y

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [27]:
# add() is out-of-place: it returns a new tensor and leaves y untouched.
# The result is discarded here, so the print below still shows ones.
y.add(x)
print(y)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])


In [28]:
# add_() (note the trailing underscore) mutates y in place: y becomes y + x.
# Re-running this cell adds x to y again.
y.add_(x)
print(y)

tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3.]])


In [29]:
# view() returns the same 10 elements arranged as 2x5. The result is not
# assigned, so y itself keeps its (1, 10) shape.
y.view(2, 5)

tensor([[3., 3., 3., 3., 3.],
        [3., 3., 3., 3., 3.]])

## NumPy and PyTorch

In [30]:
import torch
import numpy as np

In [36]:
# Build a CPU tensor and obtain its NumPy counterpart via .numpy().
a = torch.ones(5)
b = a.numpy()

# The tensor stays a torch.Tensor; the converted object is a numpy.ndarray.
print(type(a))
print(type(b))

<class 'torch.Tensor'>
<class 'numpy.ndarray'>


In [40]:
# Memory address of the tensor data: .data_ptr()
# Memory address of the numpy data: .ctypes.data
# Equal addresses mean a and b are backed by the same buffer (no copy was made).

a.data_ptr() == b.ctypes.data

True

## PyTorch / NumPy devices

In [46]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
# The printed flag reports whether CUDA was selected.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device.type == 'cuda')

False


In [60]:
# Create the tensor on whichever device was selected earlier.
a = torch.ones(5, device=device)
# Tensor.to() is NOT in place: it returns the tensor on the requested device
# (the same object when it is already there), so the result must be rebound.
a = a.to('cpu')
type(a)

torch.Tensor

In [58]:
# NumPy arrays always live in host memory, so bring the tensor to the CPU
# first; calling .numpy() directly on a CUDA tensor raises TypeError.
b = a.cpu().numpy()
# ndarray.to_device exists only on NumPy >= 2.0 (array-API support) and only
# accepts 'cpu'. It returns the array instead of moving it in place, so the
# result is rebound; older NumPy versions skip the call.
if hasattr(b, 'to_device'):
    b = b.to_device('cpu')
type(b)

numpy.ndarray

# Automatic gradients (autograd)

## Gradient calculation

In [2]:
# Leaf tensor of ones whose operations autograd will record.
x = torch.ones(3).requires_grad_()
print(x)

tensor([1., 1., 1.], requires_grad=True)


In [3]:
# x requires grad, so this addition is recorded and y receives a grad_fn.
y = x + 2

In [4]:
# Show y together with the grad_fn that autograd attached to it.
print(y)

tensor([3., 3., 3.], grad_fn=<AddBackward0>)


### With scalar

In [5]:
# Reduce 2 * y^2 to its mean so that z is a scalar and backward() can be
# called without arguments.
z = (y * y * 2).mean()
print(z)

tensor(18., grad_fn=<MeanBackward0>)


In [6]:
# z is a scalar, so backward() needs no argument; it fills x.grad with dz/dx.
# With z = mean(2 * (x + 2)^2) over 3 elements: dz/dx_i = 4 * (x_i + 2) / 3 = 4.
z.backward() # dz/dx

In [7]:
# Gradient of z with respect to x, populated by the backward() call.
x.grad

tensor([4., 4., 4.])

### With vector

In [8]:
# Fresh leaf tensor so that no gradient from an earlier cell is carried over.
x = torch.ones(3, requires_grad=True)
y = x + 2

# f is left as a vector (no reduction), so backward() will need an explicit
# gradient argument of the same shape.
f = 2 * (y * y)

# Weights for that gradient argument: all ones here. Non-uniform weights such
# as torch.tensor([0.1, 0.01, 0.001]) would scale each component's gradient.
vector = torch.ones(3)
print(f)

tensor([18., 18., 18.], grad_fn=<MulBackward0>)


In [9]:
# f is not a scalar, so backward() takes a vector v and computes the
# vector-Jacobian product v^T * (df/dx), which is accumulated into x.grad.
f.backward(vector) # df/dx

In [10]:
# With v = ones and f = 2 * (x + 2)^2 this is df_i/dx_i = 4 * (x_i + 2) = 12.
x.grad

tensor([12., 12., 12.])

## Preventing gradient history

In [11]:
x = torch.ones(3, requires_grad= True)
# Two ways to stop autograd from tracking x (besides torch.no_grad()):
# x.requires_grad_(False)  # switch tracking off in place
# x.detach()               # create a new tensor that is not tracked

In [12]:
# Operations inside no_grad() are not recorded, so the printed y has no
# grad_fn even though x requires grad.
with torch.no_grad():
    y = x + 2
    print(y)

tensor([3., 3., 3.])


## Note: gradients of tensors with `requires_grad=True` accumulate across `backward()` calls

In [13]:
# One backward pass: every entry of weights.grad is d(sum(3 * w))/dw = 3.
weights = torch.ones(4, requires_grad=True)
for epoch in range(1):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])


In [14]:
# Two backward passes without resetting: the second gradient is added to the
# first, so the printed values go 3 -> 6.
weights = torch.ones(4, requires_grad=True)
for epoch in range(2):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])


In [15]:
# Reset the gradient after every pass so that it does not accumulate.

weights = torch.ones(4, requires_grad=True)
for epoch in range(2):
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)
    # zero_() clears the stored gradient in place before the next pass.
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


In [16]:
# with optimizer (left commented out; shown for reference only)

# optimizer = torch.optim.SGD(params=[weights], lr=0.01)  # params must be an iterable of tensors, not a bare tensor
# optimizer.step()       # apply one update using the stored gradients
# optimizer.zero_grad()  # reset the gradients so they do not accumulate

# Backpropagation 

The chain rule:
$$
y = f(x), \quad z = g(y)
$$
$$
\frac{\partial z}{\partial x} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x}
$$




Example:
$$
\hat{y} = wx
$$

$$
s = \hat{y} - y
$$

$$
\text{Loss} = (s)^2
$$

$$
\frac{\partial \text{Loss}}{\partial w} = \frac{\partial \text{Loss}}{\partial s} \cdot \frac{\partial s}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial w} = 2(wx-y)x
$$


In [70]:
# One training sample (x, y) and the single trainable weight w.
x, y = torch.tensor(1.0), torch.tensor(2.0)
w = torch.tensor(1.0, requires_grad=True)

In [71]:
# Forward pass: prediction, then the squared error against the target y.
y_hat = x * w
loss = torch.pow(y_hat - y, 2)
loss

tensor(1., grad_fn=<PowBackward0>)

In [72]:
# Backward pass: fills w.grad with dLoss/dw = 2 * (w*x - y) * x,
# which for w = 1, x = 1, y = 2 is 2 * (1 - 2) * 1 = -2.
loss.backward()
w.grad

tensor(-2.)