# chap 02

In [9]:
import torch

# Everything in pytorch is based on Tensor operations.
# A tensor can have different dimensions
# so it can be 1d, 2d, or even 3d and higher

# scalar, vector, matrix, tensor

# torch.empty(size): uninitiallized
x = torch.empty(1) # scalar
print(x)
x = torch.empty(3) # vector, 1D
print(x)
x = torch.empty(2,3) # matrix, 2D
print(x)
x = torch.empty(2,2,3) # tensor, 3 dimensions
#x = torch.empty(2,2,2,3) # tensor, 4 dimensions
print(x)

# torch.rand(size): random numbers [0, 1]
x = torch.rand(5, 3)
print(x)

# torch.zeros(size), fill with 0
# torch.ones(size), fill with 1
x = torch.zeros(5, 3)
print(x)

tensor([2.6079e-09])
tensor([9.9390e-16, 4.5740e-41, 8.2935e-35])
tensor([[8.2916e-35, 0.0000e+00, 8.3169e-38],
        [0.0000e+00, 1.1495e+24, 3.0881e+29]])
tensor([[[9.9391e-16, 4.5740e-41, 8.2994e-35],
         [0.0000e+00, 1.4013e-45, 0.0000e+00]],

        [[7.5189e-35, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00]]])
tensor([[0.3488, 0.8176, 0.2171],
        [0.5176, 0.3927, 0.7726],
        [0.9623, 0.6454, 0.8372],
        [0.1860, 0.5485, 0.7239],
        [0.9249, 0.9518, 0.8578]])
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])


In [10]:
# check size
print(x.size())

# check data type
print(x.dtype)

# specify types, float32 default
x = torch.zeros(5, 3, dtype=torch.float16)
print(x)

# check type
print(x.dtype)

# construct from data
x = torch.tensor([5.5, 3])
print(x.size())

torch.Size([5, 3])
torch.float32
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], dtype=torch.float16)
torch.float16
torch.Size([2])


In [None]:
# requires_grad argument
# This will tell pytorch that it will need to calculate the gradients for this tensor
# later in your optimization steps
# i.e. this is a variable in your model that you want to optimize
x = torch.tensor([5.5, 3], requires_grad=True)

# Operations
y = torch.rand(2, 2)
x = torch.rand(2, 2)

# elementwise addition
z = x + y
# torch.add(x,y)

# in place addition, everythin with a trailing underscore is an inplace operation
# i.e. it will modify the variable
# y.add_(x)

# substraction
z = x - y
z = torch.sub(x, y)

# multiplication
z = x * y
z = torch.mul(x,y)

# division
z = x / y
z = torch.div(x,y)

# Slicing
x = torch.rand(5,3)
print(x)
print(x[:, 0]) # all rows, column 0
print(x[1, :]) # row 1, all columns
print(x[1,1]) # element at 1, 1

# Get the actual value if only 1 element in your tensor
print(x[1,1].item())

# Reshape with torch.view()
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
# if -1 it pytorch will automatically determine the necessary size
print(x.size(), y.size(), z.size())

# Numpy
# Converting a Torch Tensor to a NumPy array and vice versa is very easy
a = torch.ones(5)
print(a)

# torch to numpy with .numpy()
b = a.numpy()
print(b)
print(type(b))

# Carful: If the Tensor is on the CPU (not the GPU),
# both objects will share the same memory location, so changing one
# will also change the other
a.add_(1)
print(a)
print(b)

# numpy to torch with .from_numpy(x)
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
print(a)
print(b)

# again be careful when modifying
a += 1
print(a)
print(b)

# by default all tensors are created on the CPU,
# but you can also move them to the GPU (only if it's available )
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    # z = z.numpy() # not possible because numpy cannot handle GPU tenors
    # move to CPU again
    z.to("cpu")       # ``.to`` can also change dtype together!
    # z = z.numpy()


In [15]:
cuda = torch.device('cuda')

a = torch.ones(5, device=cuda)
b = a.cpu().numpy()

print(a)
print(b)
a += 0.5
print(a)
print(b)

tensor([1., 1., 1., 1., 1.], device='cuda:0')
[1. 1. 1. 1. 1.]


In [17]:
a2 = torch.ones(5, device='cpu')
b = a2.numpy()

print(a2)
print(b)
a2 += 0.5
print(a2)
print(b)

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
tensor([1.5000, 1.5000, 1.5000, 1.5000, 1.5000])
[1.5 1.5 1.5 1.5 1.5]


# chap 03

In [23]:
import torch
# The autograd package provides automatic differentiation 
# for all operations on Tensors

# requires_grad = True -> tracks all operations on the tensor. 
x = torch.randn(3, requires_grad=True)
y = x + 2

# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor
print(x) # created by the user -> grad_fn is None
print(y)
print(y.grad_fn)

# Do more operations on y
z = y * y * 3
print(z)
z = z.mean()
print(z)

tensor([ 0.2731,  0.2370, -1.2228], requires_grad=True)
tensor([2.2731, 2.2370, 0.7772], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7f7fd2af6c40>
tensor([15.5011, 15.0124,  1.8120], grad_fn=<MulBackward0>)
tensor(10.7752, grad_fn=<MeanBackward0>)


求导数, 注意 `z=z.mean()` 会初以size
$$
\begin{aligned}
\because z &= 3 \cdot y^2 / 3 \\
  &= (x+2)^2 \\
  &= x^2+4x+4 \\
  \\
\therefore \frac{\partial z}{ \partial x} &= 2x + 4
\end{aligned}
$$

In [24]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivate of the function w.r.t. the tensor

z.backward()
print(x.grad) # dz/dx

print(2*x + 4) # dz/dx

# Generally speaking, torch.autograd is an engine for computing vector-Jacobian product
# It computes partial derivates while applying the chain rule

tensor([4.5462, 4.4740, 1.5543])
tensor([4.5462, 4.4740, 1.5543], grad_fn=<AddBackward0>)


求导数
$$
\begin{aligned}
\because
\vec{y} &= (\vec{x} * 2) * 2^{10} \\
  &= \vec{x} * 2^{11} \\
\therefore \frac{\partial \vec{y}}{ \partial \vec{x}}
  &= diag(2^{11})
\end{aligned}
$$

In [25]:

# -------------
# Model with non-scalar output:
# If a Tensor is non-scalar (more than 1 elements), we need to specify arguments for backward() 
# specify a gradient argument that is a tensor of matching shape.
# needed for vector-Jacobian product

x = torch.randn(3, requires_grad=True)

y = x * 2
for _ in range(10):
    y = y * 2

print(y)
print(y.shape)

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(v)
print(x.grad)

print(v * 2**11)

tensor([-2179.4465,  -311.4295,  2190.5410], grad_fn=<MulBackward0>)
torch.Size([3])
tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])
tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [26]:

# -------------
# Stop a tensor from tracking history:
# For example during our training loop when we want to update our weights
# then this update operation should not be part of the gradient computation
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

# .requires_grad_(...) changes an existing flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = ((a * 3) / (a - 1))
print(b.grad_fn)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x7f7fd2a35ee0>


In [27]:
# .detach(): get a new Tensor with the same content but no gradient computation:
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

True
False


In [28]:
# wrap in 'with torch.no_grad():'
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

True
False


In [30]:

# -------------
# backward() accumulates the gradient for this tensor into .grad attribute.
# !!! We need to be careful during optimization !!!
# Use .zero_() to empty the gradients before a new optimization step!
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

    # optimize model, i.e. adjust weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # this is important! It affects the final weights & output
    weights.grad.zero_()

print(weights)
print(model_output)

# Optimizer has zero_grad() method
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()


tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


```python
# 首轮
weights = [1,1,1,1]
output = (3*x1 + 3*x2 + 3*x3 + 3*x4)
weights_grad = [3,3,3,3]
weights = [1,1,1,1] - [3,3,3,3]*0.1 = [0.7,0.7,0.7,0.7]

# 第二轮时
weights_grad = [3,3,3,3] # 仍然为常数
weights = weights - [3,3,3,3]*0.1 = [0.4,0.4,0.4,0.4]

# 第三轮时
output = (3*weights).sum() = 3*0.4*4 =4.8
weights = weights - [3,3,3,3]*0.1 = [0.1,0.1,0.1,0.1]
```

# chap 04
ignore

# chap 05

In [1]:
import torch
torch.cuda.is_available()

True

In [10]:
import torch

device = torch.device('cuda')
# device = torch.device('cpu')

X = torch.tensor([1, 2, 3, 4], dtype=torch.float32, device=device)
Y = X * 2  # 注意: 这里举例不太好，是向量x使用标量w映射到向量y
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True, device=device)


def forward(x):
    return w * x


def loss(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()


lr = 1e-2
n_iter = 100

for epoch in range(n_iter):
    y_pred = forward(X)
    l = loss(Y, y_pred)  # 注意别写反顺序了
    l.backward()
    with torch.no_grad():
        w -= w.grad * lr
    w.grad.zero_()  # 记得梯度置零
    if epoch % 10 == 0:
        print(f"iter #{epoch}: w={w.item():.4f}, l={l.item():.4f}")

iter #0: w=0.3000, l=30.0000
iter #10: w=1.6653, l=1.1628
iter #20: w=1.9341, l=0.0451
iter #30: w=1.9870, l=0.0017
iter #40: w=1.9974, l=0.0001
iter #50: w=1.9995, l=0.0000
iter #60: w=1.9999, l=0.0000
iter #70: w=2.0000, l=0.0000
iter #80: w=2.0000, l=0.0000
iter #90: w=2.0000, l=0.0000


# chap 06

In [14]:
import torch

# device = torch.device('cuda')
device = torch.device('cpu')

X = torch.tensor([1, 2, 3, 4], dtype=torch.float32, device=device).reshape(-1, 1)
Y = X * 2  # 注意: 这里举例不太好，是向量x使用标量w映射到向量y

model = torch.nn.Linear(in_features=X.shape[1], out_features=Y.shape[1])
loss = torch.nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-2)

n_iter = 100
for epoch in range(n_iter):
    y_pred = model(X)
    l = loss(Y, y_pred)  # 注意别写反顺序了
    l.backward()
    optimizer.step()
    optimizer.zero_grad()
    if epoch % 10 == 0:
        print(f"iter #{epoch}: w={w.item():.4f}, l={l.item():.4f}")

iter #0: w=2.0000, l=68.4709
iter #10: w=2.0000, l=1.8042
iter #20: w=2.0000, l=0.0774
iter #30: w=2.0000, l=0.0310
iter #40: w=2.0000, l=0.0281
iter #50: w=2.0000, l=0.0264
iter #60: w=2.0000, l=0.0249
iter #70: w=2.0000, l=0.0234
iter #80: w=2.0000, l=0.0221
iter #90: w=2.0000, l=0.0208
