# Notes on the PyTorch tutorials
(https://pytorch.org/tutorials/)

In [5]:
import torch
# A 5x3 tensor of standard-normal samples; `x` is reused by later cells.
x = torch.randn((5, 3), dtype=torch.float32)

# `.shape` and `.size()` report the same torch.Size value.
print(f"x.shape = {x.size()}")

# Length of a single dimension: index the torch.Size, or pass the dim to size().
print(f"x.size()[0] = {x.size()[0]}, x.size()[1] = {x.size()[1]}")
print(f"x.size(0) = {x.size(0)}, x.size(1) = {x.size(1)}")

# Prepend a length-1 dimension: index with None, or call unsqueeze().
print(f"x[None, :].shape = {x[None, :].shape}")
print(f"x.unsqueeze(0).shape = {x.unsqueeze(0).shape}")

# permute() is the N-d counterpart of transpose: (1, 5, 3) -> (3, 1, 5).
y = x.unsqueeze(0)
y.permute(2, 0, 1).shape

x.shape = torch.Size([5, 3])
x.size()[0] = 5, x.size()[1] = 3
x.size(0) = 5, x.size(1) = 3
x[None, :].shape = torch.Size([1, 5, 3])
x.unsqueeze(0).shape = torch.Size([1, 5, 3])


torch.Size([3, 1, 5])

<font color=red> `torch.Size` is in fact a tuple, so it supports all tuple operations.</font>

In [6]:
# Draw a second 5x3 operand and preallocate a buffer that will receive the sum.
y = torch.randn((5, 3), dtype=torch.float32)
result = torch.empty((5, 3))
# `out=` makes torch.add write x + y into `result` (x comes from the first cell).
torch.add(x, y, out=result)
print(result)

tensor([[-2.4424, -1.1048, -0.3555],
        [ 2.5575,  0.4598,  2.4088],
        [ 2.0923, -0.6871, -1.3885],
        [ 1.0288,  0.9461, -1.4477],
        [-0.0394, -0.4372,  0.8955]])


In [4]:
# add in-place: y <- y + x. The trailing underscore marks the in-place variant,
# so `y` itself is modified.
# NOTE(review): re-running this cell adds x to y again, so the cell is not idempotent.
y.add_(x)
print(y)

tensor([[-0.4682, -1.0492,  0.9279],
        [ 1.2007, -0.7875,  0.0336],
        [ 4.0369, -1.5190, -1.5482],
        [-1.6552,  1.9363, -1.9971],
        [ 0.5353, -0.7182, -2.8778]])


<font color=red>**Any operation that mutates a tensor in-place is post-fixed with an _.** For example: `x.copy_(y)`, `x.t_()`, will change x.</font> 
<br> We can use standard NumPy-like indexing with all tensors. 
<br> Resizing: if you want to resize/reshape a tensor, you can use `x.view` or `x.reshape`:

In [5]:
# Regroup the elements of x into rows of length 5; -1 lets PyTorch infer the
# other dimension (3 rows in the output below, assuming x is the 5x3 tensor
# created in the first cell).
z = x.view(-1, 5)
print(z)
# reshape() with the same target shape prints the same values.
print(x.reshape(-1, 5))

tensor([[ 0.9163,  0.1696, -0.5352,  1.1087, -1.0501],
        [-0.5731,  1.9382, -0.1344, -1.4246, -0.8032],
        [-0.2371, -0.7252,  0.3698, -1.5893, -0.7280]])
tensor([[ 0.9163,  0.1696, -0.5352,  1.1087, -1.0501],
        [-0.5731,  1.9382, -0.1344, -1.4246, -0.8032],
        [-0.2371, -0.7252,  0.3698, -1.5893, -0.7280]])


### Conversion between a Torch Tensor and a NumPy array:
The Torch Tensor and NumPy array will <font color=red> share their underlying memory locations </font> (if the Torch Tensor is on CPU), and changing one will change the other.
<br> All the Tensors on the CPU except a CharTensor support converting to NumPy and back.

In [8]:
a = torch.ones(5)
# b is a NumPy array backed by the same memory as the CPU tensor `a`.
b = a.numpy()
a.add_(1)
print(f"b = {b}") # b reflects the in-place change to a: it prints 2s, not 1s
# Rebinding the name `a` to a new tensor does not affect `b`.
a = torch.ones(1)
print(f"a.item() = {a.item()}") 
# .item() returns the value of a one-element tensor as a Python scalar

# Two ways to convert a NumPy array to a tensor:
# 1. torch.from_numpy: the resulting tensor shares memory with the array
c = torch.from_numpy(b)
print(c)
# 2. torch.tensor: the data is copied into a new tensor
print(torch.tensor(b))

b = [2. 2. 2. 2. 2.]
a.item() = 1.0
tensor([2., 2., 2., 2., 2.])
tensor([2., 2., 2., 2., 2.])


In [5]:
import numpy as np
# The array produced here is float64, and torch.tensor() keeps that dtype
# (see dtype=torch.float64 in the output) instead of using float32.
b = np.random.random((3, 4))
torch.tensor(b)

tensor([[0.9942, 0.2375, 0.0825, 0.3230],
        [0.0706, 0.7496, 0.6417, 0.2720],
        [0.3835, 0.1154, 0.3078, 0.8634]], dtype=torch.float64)

## AUTOGRAD: AUTOMATIC DIFFERENTIATION
Central to all neural networks in PyTorch is the `autograd` package.<br>
To prevent tracking history (and using memory), you can wrap the code block in `with torch.no_grad():`. <br>
Each tensor has a `.grad_fn` attribute that references a Function that has created the Tensor (<font color=red>except for Tensors created by the user - their `grad_fn is None`)</font>.

**Important attributes in Variables: data, requires_grad, grad_fn, grad**
1. `grad_fn` is `None` for a leaf Tensor, while its `grad` is a Tensor (once `backward()` has been called on a result computed from it). 
2. `grad_fn` is not `None` for non-leaf (intermediate) Tensors, but there will be a warning if you access their `grad`: *warnings.warn("The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad "*

In [23]:
# A tensor created directly by the user does not track gradients by default.
a = torch.randn((2, 2))
print(f"a.requires_grad = {a.requires_grad}")

# Switch gradient tracking on in place (note the trailing underscore).
a.requires_grad_(True)
print(f"a.requires_grad = {a.requires_grad}")

# b is produced by operations on a, so it records the function that created it.
b = torch.sum(a * a)
print(f"b.grad_fn = {b.grad_fn}")
print(f"a.grad_fn = {a.grad_fn}") # leaf node, no grad_fn

a.requires_grad = False
a.requires_grad = True
b.grad_fn = <SumBackward0 object at 0x125851c18>
a.grad_fn = None


In [24]:
# Backpropagate from the scalar b; assuming a and b come from the previous cell
# (b = sum(a * a)), this fills a.grad with db/da = 2a.
b.backward()
print(f"a.grad = {a.grad}")
# b is a non-leaf tensor, so backward() does not populate its .grad: this prints None.
print(f"b.grad = {b.grad}")  
# Non-leaf node's .grad attribute won't be populated during autograd.backward()

a.grad = tensor([[-0.3382, -1.4165],
        [ 1.4241, -0.3048]])
b.grad = None


**By default, gradients are only retained for leaf variables. non-leaf variables' gradients are not retained to be inspected later. This was done by design, to save memory.** In order to get the non-leaf variables' gradients, you could call `.retain_grad()` before `.backward()`. 

**By default, `.backward()` can only be called on a scalar-valued Tensor.** If you'd like to call backward on a non-scalar Tensor, pass a `gradient` argument with the same shape as that tensor, for example a tensor of ones:
`L.backward(torch.ones_like(L))`

In [22]:
# Leaf tensor with gradient tracking enabled from the start.
a = torch.randn((2, 2), requires_grad=True)

# b is an intermediate (non-leaf) result. Ask autograd to keep dc/db for it;
# otherwise the gradient of a non-leaf node is not retained after backward().
b = a * a
b.retain_grad()

c = torch.sum(b + a)
c.backward()
print(f"a.grad = {a.grad}")
print(f"b.grad = {b.grad}")

a.grad = tensor([[-3.4189,  1.5920],
        [ 3.7105,  0.9097]])
b.grad = tensor([[1., 1.],
        [1., 1.]])


**One example that `retain_graph` is True:**
<img src="./img/retain_graph.png" alt="drawing" width="500"/>
```
loss1.backward(retain_graph=True)
loss2.backward()
opt.step()
```
Another simple way is:
```
total_loss = loss1 + loss2
total_loss.backward()
opt.step()
```
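Essentially, they are the same: gradients accumulate in `.grad` across `backward()` calls, so both variants step with the gradient of `loss1 + loss2`. 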

## Neural Networks
`torch.nn` only supports mini-batches. The entire `torch.nn` package only supports inputs that are a mini-batch of samples, and not a single sample. <br>
For example, `nn.Conv2d` will take in a 4D Tensor of `nSamples x nChannels x Height x Width`.
If you have a single sample, just use `input.unsqueeze(0)` to add a fake batch dimension.

If you follow `loss` in the backward direction, using its `.grad_fn` attribute, you will see a graph of computations.  
```
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # ReLU
```

In [None]:
import torch.optim as optim

# create your optimizer
# NOTE(review): `net`, `input`, `criterion` and `target` are not defined in this
# cell; they are presumably provided by the surrounding tutorial code. Confirm
# before running (as written, `input` would otherwise be the Python builtin).
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)     # forward pass
loss = criterion(output, target)
loss.backward()         # backpropagate the loss
optimizer.step()    # Does the update

A general way to set up weight decay for parameters:
<img src="../img/weight_decay.png" alt="drawing" width="750"/>
For the weight parameters in the model, we use $\ell_2$ regularization (weight decay). Thus we separate the parameters in the net into two groups and set individual attributes for each group. <br>
The attributes of each group consist of: 
<img src="../img/sgd_attr.png" alt="drawing" width="250"/>

The output of `torchvision` datasets are **PILImage images of range [0, 1]. We transform them to Tensors of normalized range [-1, 1].**
```
import torchvision.transforms as transforms
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)
```

**A typical way to construct a model:** 
```
rgnet = nn.Sequential()
rgnet.add_module('model',block())
rgnet.add_module('Last_linear_layer', nn.Linear(16,10))
rgnet.apply(init_weights)
```
where `block()` is an `nn.Sequential()` model as well.

### save & load model 
```
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

net = Net()
net.load_state_dict(torch.load(PATH))
```


<font color=red>**Three ways to access the network parameter values:**</font>
1. `net[0].weight`; 
2. `net.Linear_1.weight`;
3. `net.state_dict()['Linear_1.weight']` . `state_dict` is a method, while `state_dict()` returns an `OrderedDict` that maps parameter names to their values. 

<font color=red>`net.parameters` (without parentheses) is a bound method whose repr shows the net structure, which is similar to `net.state_dict`. `net.parameters()` returns a generator object that can be iterated with `for`. `net.state_dict()` returns an `OrderedDict`. 

Another difference is that the `requires_grad` attribute of the parameters from `net.parameters()` is `True`, while for the tensors in `net.state_dict()` it is `False`.</font>

#### <font color=red> Three ways to iterate all the net parameters</font>
1. `for param in net.parameters()`, `param.size(), param.data, param.dtype`;
2. `for key, value in net.state_dict().items()`, `value` is a Tensor, equivalent to `param.data`;
3. `for name, param in net.named_parameters()`.

### Double check the structure of the net
Show the output shape of each layer:
```
X = torch.randn(size=(1,1,28,28), dtype = torch.float32)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__,'output shape: \t',X.shape)
```
The result is as follows:
```
Reshape output shape: 	 torch.Size([1, 1, 28, 28])
Conv2d output shape: 	 torch.Size([1, 6, 28, 28])
Sigmoid output shape: 	 torch.Size([1, 6, 28, 28])
AvgPool2d output shape:  torch.Size([1, 6, 14, 14])
Conv2d output shape: 	 torch.Size([1, 16, 10, 10])
Sigmoid output shape: 	 torch.Size([1, 16, 10, 10])
AvgPool2d output shape:  torch.Size([1, 16, 5, 5])
```

### Ways to fine tune
freeze part of your model and train the rest:
https://stackoverflow.com/questions/51748138/pytorch-how-to-set-requires-grad-false

<font color=red>**Three ways of fine tuning:**</font>
1. replace the specified layers in the pre-trained model, and only set the `requires_grad` of parameters in those layers as `True`. 
2. use `with torch.no_grad()`. 
3. pass only the parameters of the specified layers to `optim`. For instance, `optimizer = optim.SGD(net.linear1.parameters(), lr)`; in this case, though the `grad` of the parameters in other layers is still computed, those parameters are never updated by `optimizer.step()`. 

In [5]:
import torch
from torch import nn, optim

# Assemble a tiny two-layer MLP, registering each layer under an explicit name
# so it can be addressed later as net.linear1 / net.linear2.
net = nn.Sequential()
for layer_name, layer in (
    ("linear1", nn.Linear(2, 4)),
    ("relu", nn.ReLU()),
    ("linear2", nn.Linear(4, 1)),
):
    net.add_module(layer_name, layer)

# A random batch of 10 samples with 2 features each.
X = torch.randn(10, 2, dtype=torch.float32)

In [11]:
# Fine-tuning demo: only linear2's parameters are handed to the optimizer, so
# linear1 still receives gradients from backward() but is never changed by step().
# `net` and `X` come from the previous cell.
y = torch.randn(size=(10, 1), dtype=torch.float32)
loss = nn.MSELoss(reduction="mean")
optimizer = optim.SGD(net.linear2.parameters(), lr=1)
print(f"linear1.weight:\n{net.linear1.weight.data}")
print(f"linear2.weight:\n{net.linear2.weight.data}")

# Snapshot of the frozen layer, used to verify below that step() leaves it alone.
linear1_weight_before = net.linear1.weight.detach().clone()

# Clear gradients on the whole net. optimizer.zero_grad() would only reset the
# parameters the optimizer owns (linear2), so linear1's .grad would keep
# accumulating each time this cell is re-run and the value printed below
# would no longer be the gradient of this step.
net.zero_grad()
l = loss(net(X), y)
l.backward()
optimizer.step()
print(f"linear1.weight:\n{net.linear1.weight.data}")
print(f"linear2.weight:\n{net.linear2.weight.data}")
print(f"linear1.weight: grad {net.linear1.weight.grad}")
print(f"linear2.weight: grad {net.linear2.weight.grad}")

# linear1 has a gradient but is not registered with the optimizer, so it must be unchanged.
assert torch.equal(net.linear1.weight.detach(), linear1_weight_before)

linear1.weight:
tensor([[-0.1355, -0.2333],
        [-0.4334,  0.3797],
        [ 0.4248, -0.0101],
        [ 0.6089, -0.4895]])
linear2.weight:
tensor([[ -0.1188,  -0.9971,  -5.9006, -20.4001]])
linear1.weight:
tensor([[-0.1355, -0.2333],
        [-0.4334,  0.3797],
        [ 0.4248, -0.0101],
        [ 0.6089, -0.4895]])
linear2.weight:
tensor([[-0.1188,  5.8125, 23.5612, 80.6392]])
linear1.weight: grad tensor([[ 0.0000e+00,  0.0000e+00],
        [-7.4132e-01,  1.0638e+01],
        [ 2.8740e+02, -1.1358e+02],
        [ 7.9221e+02, -7.0597e+02]])
linear2.weight: grad tensor([[   0.0000,   -6.8095,  -29.4619, -101.0393]])


## 用Google Colab跑Jupyter Notebook:
https://colab.research.google.com/github/haoysRPI/d2l-pytorch/blob/master/Ch06_Multilayer_Perceptrons/6_1_Multilayer_Perceptron.ipynb 
只需把后面部分换掉