In [1]:
import torch

# Summary

### 1. Introduction to PyTorch

- `-1`: Automatically infers the remaining dimension when one size is known.
- `rand`, `randn`: Initialize tensors; `rand` samples from (0, 1), `randn` from 𝒩(0, 1).
- `randint(a, b, (c, d))`: Creates a c × d tensor with values in [a, b).
- `zeros`, `ones`, `cat`: Common functions for tensor initialization and concatenation.
- `randn_like(A, dtype=d)`: Creates a tensor with the same shape as A and specified dtype.
- `.numpy()` and `from_numpy()`: Convert between Torch and NumPy; both share memory.
- Dimensions `(0, 1, 2)`: These index the axes of a tensor of shape (a, b, c); `unsqueeze(dim)` inserts a new size-1 dimension at position `dim`.
- `requires_grad` and `requires_grad_()`: Check or set whether a tensor should track gradients.
- `detach()` and `no_grad()`: Prevent a tensor from tracking gradients or computation history.

### 2,3. Loss Function and Weight Initialization

Learning how to navigate and use the official PyTorch documentation website (docs.pytorch.org).

# I. Introduction to PyTorch

## Torch Tensors

In [2]:
# Build a 1-D tensor (vector) and a 2-D tensor (matrix) from Python lists.
a = torch.tensor([2, 2, 1])
b = torch.tensor([
    [2, 1, 4],
    [3, 5, 4],
])
print(a, b, sep="\n")
# `.shape` and `.size()` report the same torch.Size; indexing it yields one dimension.
print(a.shape, b.shape, a.size(), b.size(), b.shape[0])

tensor([2, 2, 1])
tensor([[2, 1, 4],
        [3, 5, 4]])
torch.Size([3]) torch.Size([2, 3]) torch.Size([3]) torch.Size([2, 3]) 2


In [3]:
# Create tensors with an explicit dtype.
# `torch.tensor(data, dtype=...)` is the recommended factory; the legacy
# type-specific constructors (`torch.FloatTensor`, `torch.DoubleTensor`)
# produce the same values but hide the dtype in the class name.
c = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
d = torch.tensor(
    [[7, 8, 9], [10, 11, 12], [13, 14, 15], [16, 17, 18]],
    dtype=torch.float64,
)
print(c)
print(d)
print(c.dtype, d.dtype)
# Reductions return 0-dim tensors that keep the dtype of their input.
print(c.mean())
print(c.std())
print(d.mean())

tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[ 7.,  8.,  9.],
        [10., 11., 12.],
        [13., 14., 15.],
        [16., 17., 18.]], dtype=torch.float64)
torch.float32 torch.float64
tensor(3.5000)
tensor(1.8708)
tensor(12.5000, dtype=torch.float64)


In [4]:
# `-1` asks `view` to infer that dimension from the total element count.
for target_shape in ((-1, 1), (1, -1), (-1, 2), (2, -1)):
    print(b.view(*target_shape))

# Rebind `b` to its single-row form; later cells see the reshaped tensor.
b = b.view(1, -1)
print(b, b.shape, b.size())

tensor([[2],
        [1],
        [4],
        [3],
        [5],
        [4]])
tensor([[2, 1, 4, 3, 5, 4]])
tensor([[2, 1],
        [4, 3],
        [5, 4]])
tensor([[2, 1, 4],
        [3, 5, 4]])
tensor([[2, 1, 4, 3, 5, 4]]) torch.Size([1, 6]) torch.Size([1, 6])


In [10]:
# Sample a 2 x 3 x 4 tensor, then flatten the last two axes into one.
three_dim = torch.randn(2, 3, 4)
print(three_dim, end="\n\n\n")
# Spelling out 12 and letting `-1` infer it give the same 2 x 12 result.
print(three_dim.view(2, 12), three_dim.view(2, -1), sep="\n")

tensor([[[ 1.0547,  0.2128, -0.1074, -1.1657],
         [ 1.0220, -0.0675,  0.4466, -0.9165],
         [ 0.7449, -0.5138, -0.9045, -0.0879]],

        [[-1.7920, -0.8415, -0.4558,  0.5264],
         [-0.9851,  0.0352, -0.2334, -0.3798],
         [ 0.1845,  0.1012, -1.6273, -1.6339]]])


tensor([[ 1.0547,  0.2128, -0.1074, -1.1657,  1.0220, -0.0675,  0.4466, -0.9165,
          0.7449, -0.5138, -0.9045, -0.0879],
        [-1.7920, -0.8415, -0.4558,  0.5264, -0.9851,  0.0352, -0.2334, -0.3798,
          0.1845,  0.1012, -1.6273, -1.6339]])
tensor([[ 1.0547,  0.2128, -0.1074, -1.1657,  1.0220, -0.0675,  0.4466, -0.9165,
          0.7449, -0.5138, -0.9045, -0.0879],
        [-1.7920, -0.8415, -0.4558,  0.5264, -0.9851,  0.0352, -0.2334, -0.3798,
          0.1845,  0.1012, -1.6273, -1.6339]])


In [11]:
# `rand`  -> random numbers between 0 and 1
# `randn` -> random numbers drawn from N(0, 1)
r1 = torch.rand(4, 4)
r2 = torch.randn(4, 4)
print(r1, r2, r2.dtype, sep="\n")

tensor([[0.5006, 0.4586, 0.6339, 0.4212],
        [0.0563, 0.9191, 0.0511, 0.1670],
        [0.5224, 0.6547, 0.9104, 0.3019],
        [0.4329, 0.7518, 0.1255, 0.8261]])
tensor([[-0.0387,  0.3581,  0.5235,  0.4341],
        [-0.8133, -0.5113, -1.3985, -0.7776],
        [-0.1521, -1.1452, -1.5790, -2.6286],
        [ 0.9999,  1.0004, -1.5273, -0.7699]])
torch.float32


In [7]:
# randint(low, high, size): random integers from 6 to 9 (the upper bound 10 is excluded).
int_arr1 = torch.randint(low=6, high=10, size=(5,))    # 1-D tensor of 5 integers
int_arr2 = torch.randint(low=6, high=10, size=(3, 3))  # 3x3 matrix of integers
print(int_arr1, int_arr1.dtype, int_arr2, sep="\n")
# numel = number of elements
print(int_arr1.numel())
print(int_arr2.numel())

tensor([6, 8, 7, 8, 7])
torch.int64
tensor([[6, 9, 6],
        [7, 6, 9],
        [9, 6, 9]])
5
9


In [None]:
# Constant-filled tensors: `zeros` with an explicit integer dtype,
# `ones` with the default floating-point dtype.
z = torch.zeros((3, 3), dtype=torch.long)
o = torch.ones((3, 3))
for filled in (z, o):
    print(filled)
    print(filled.dtype)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])
torch.int64
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
torch.float32


In [None]:
# `randn_like` draws a fresh random tensor with r2's shape; the dtype is
# overridden to double here, so only the shape is inherited from r2.
r2_like = torch.randn_like(r2, dtype = torch.double)
print(r2_like)
print(r2)
print(r2_like.dtype, r2.dtype)
print("\n")
r = torch.rand(4,4)
# Out-of-place addition: returns a new tensor and leaves r2 untouched.
add_result = torch.add(r,r2)
print(add_result)
print("\n")
# In-place addition (trailing underscore): r2 itself is modified, so no
# reassignment such as `r2 = torch.add(r, r2)` is needed.
# NOTE: this makes the cell non-idempotent -- each re-run adds a new `r` to r2.
r2.add_(r)
print(r2)

tensor([[-0.9063,  0.1151,  0.0742, -1.5300],
        [-0.1958,  0.2627,  0.0668, -0.7250],
        [-0.6898, -0.4778, -0.7281, -0.6101],
        [ 0.4747,  0.8363,  0.2570, -1.0331]], dtype=torch.float64)
tensor([[-0.0387,  0.3581,  0.5235,  0.4341],
        [-0.8133, -0.5113, -1.3985, -0.7776],
        [-0.1521, -1.1452, -1.5790, -2.6286],
        [ 0.9999,  1.0004, -1.5273, -0.7699]])
torch.float64 torch.float32


tensor([[ 0.5847,  0.4807,  0.6058,  1.1773],
        [-0.7628, -0.4396, -1.3566, -0.7574],
        [ 0.0161, -0.7525, -1.1123, -1.6619],
        [ 1.9598,  1.5118, -1.3447, -0.5060]])


tensor([[ 0.5847,  0.4807,  0.6058,  1.1773],
        [-0.7628, -0.4396, -1.3566, -0.7574],
        [ 0.0161, -0.7525, -1.1123, -1.6619],
        [ 1.9598,  1.5118, -1.3447, -0.5060]])


In [28]:
# NumPy-style indexing: first column, first two columns, first three rows.
print(r2[:, 0], r2[:, :2], r2[:3, :], sep="\n")
# Indexing a single position still yields a (0-dim) tensor, not a Python float.
num_ten = r2[2, 3]
print(num_ten)

tensor([ 0.5847, -0.7628,  0.0161,  1.9598])
tensor([[ 0.5847,  0.4807],
        [-0.7628, -0.4396],
        [ 0.0161, -0.7525],
        [ 1.9598,  1.5118]])
tensor([[ 0.5847,  0.4807,  0.6058,  1.1773],
        [-0.7628, -0.4396, -1.3566, -0.7574],
        [ 0.0161, -0.7525, -1.1123, -1.6619]])
tensor(-1.6619)


## Numpy Bridge

In [29]:
import numpy as np

In [33]:
# Torch -> NumPy: `.numpy()` returns an array that shares memory with the tensor.
a = torch.ones(5) # 1-D tensor filled with ones
print(a)
b = a.numpy()
print(b)

# Modify the tensor in place; the NumPy array `b` reflects the change
# because both objects view the same underlying buffer.
a.add_(1)
print(a)
print(b)

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [34]:
# NumPy -> Torch: `from_numpy` wraps the array without copying it.
a = np.ones(5)
b = torch.from_numpy(a)
# In-place update of the array; the tensor `b` sees the new values too.
a += 1
print(a, b, sep="\n")

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [38]:
# Move a tensor to the GPU only when one is available.
# `add_result` comes from an earlier cell; when CUDA is unavailable the
# branch is skipped and the tensor stays where it is.
CUDA = torch.cuda.is_available()
print(CUDA)
if CUDA:
    add_result = add_result.cuda()
    print(add_result)

False


In [39]:
# Convert a Python list of ints into a tensor.
a = [2,3,4,1]
print(a)
# NOTE: despite its name, `to_list` holds the tensor built FROM the list.
to_list = torch.tensor(a)
print(to_list, to_list.dtype)

[2, 3, 4, 1]
tensor([2, 3, 4, 1]) torch.int64


In [40]:
# A nested list of Python floats becomes a 2-D floating-point tensor.
data = [
    [1.0, 2.0],
    [3.0, 4.0],
    [5.0, 6.0],
    [7.0, 8.0],
]
T = torch.tensor(data)
print(T, T.dtype)

tensor([[1., 2.],
        [3., 4.],
        [5., 6.],
        [7., 8.]]) torch.float32


## Tensor Concatenation

In [52]:
# Case 1: same number of columns -> stack along rows (dim=0 is the default).
first_1 = torch.randn(2, 5)
print(first_1)
second_1 = torch.randn(3, 5)
print(second_1)
con_1 = torch.cat((first_1, second_1), dim=0)
print('\n')
print(con_1)
print('\n')

# Case 2: same number of rows but different column counts.
first_2 = torch.randn(2, 3)
print(first_2)
second_2 = torch.randn(2, 5)
print(second_2)

# Concatenating along dim=0 fails for mismatched column counts, so fall back
# to dim=1, which joins the tensors side by side.
try:
    con_2 = torch.cat((first_2, second_2), dim=0)
except RuntimeError:
    con_2 = torch.cat((first_2, second_2), dim=1)
    print("\nNeed 1:\n", con_2)
else:
    print("\nNo need 1:", con_2)


tensor([[-1.6152, -0.3358, -0.8653,  1.1371, -0.4270],
        [-1.0304, -0.5375,  0.5854, -0.0241, -0.3972]])
tensor([[-1.0291,  0.7554,  0.5142, -0.5089,  0.4427],
        [ 0.3992,  1.1847, -0.0761, -1.1807,  1.0904],
        [-0.9484,  0.0158,  1.0841, -1.3088, -0.9169]])


tensor([[-1.6152, -0.3358, -0.8653,  1.1371, -0.4270],
        [-1.0304, -0.5375,  0.5854, -0.0241, -0.3972],
        [-1.0291,  0.7554,  0.5142, -0.5089,  0.4427],
        [ 0.3992,  1.1847, -0.0761, -1.1807,  1.0904],
        [-0.9484,  0.0158,  1.0841, -1.3088, -0.9169]])


tensor([[ 0.1022,  1.1862, -1.0037],
        [-0.2824, -0.7101,  1.0271]])
tensor([[ 0.2717, -0.1137, -0.2052,  0.3721,  0.3311],
        [-0.4721,  0.4227, -0.8319,  0.5296, -0.4055]])

Need 1:
 tensor([[ 0.1022,  1.1862, -1.0037,  0.2717, -0.1137, -0.2052,  0.3721,  0.3311],
        [-0.2824, -0.7101,  1.0271, -0.4721,  0.4227, -0.8319,  0.5296, -0.4055]])


## Adding Dimensions to Tensors

In [56]:
# Insert a size-1 axis in front of (dim=0) or behind (dim=1) a 1-D tensor.
tensor_1 = torch.tensor([1, 2, 3, 4])
tensor_a = tensor_1.unsqueeze(0)   # shape (4,) -> (1, 4)

print(tensor_1, tensor_1.shape, sep="\n")

print("\n")
print(tensor_a, tensor_a.shape, sep="\n")

tensor_b = tensor_1.unsqueeze(1)   # shape (4,) -> (4, 1)
print("\n")
print(tensor_b, tensor_b.shape, sep="\n")


tensor([1, 2, 3, 4])
torch.Size([4])


tensor([[1, 2, 3, 4]])
torch.Size([1, 4])


tensor([[1],
        [2],
        [3],
        [4]])
torch.Size([4, 1])


In [60]:
# Start from a random 2 x 3 x 4 tensor.
tensor_2 = torch.rand(2, 3, 4)
print(tensor_2, end="\n\n\n")

# Picking one index on the last axis drops that axis: (2, 3, 4) -> (2, 3).
tensor_c = tensor_2[..., 2]
print(tensor_c, tensor_c.shape, sep="\n")
print('\n')

# `unsqueeze(2)` restores a trailing size-1 axis: (2, 3) -> (2, 3, 1).
tensor_d = tensor_c.unsqueeze(2)
print(tensor_d, tensor_d.shape, sep="\n")


tensor([[[0.9262, 0.2993, 0.1385, 0.8037],
         [0.8454, 0.3926, 0.8003, 0.4646],
         [0.4680, 0.9508, 0.2461, 0.1978]],

        [[0.8093, 0.6057, 0.5924, 0.7403],
         [0.0355, 0.2987, 0.0811, 0.4463],
         [0.7509, 0.2297, 0.6220, 0.8597]]])


tensor([[0.1385, 0.8003, 0.2461],
        [0.5924, 0.0811, 0.6220]])
torch.Size([2, 3])


tensor([[[0.1385],
         [0.8003],
         [0.2461]],

        [[0.5924],
         [0.0811],
         [0.6220]]])
torch.Size([2, 3, 1])


## AutoGrad

In [64]:
# Leaf tensors that autograd should track.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = torch.tensor([4.0, 5.0, 6.0], requires_grad=True)

# A tensor produced by an operation records the function that created it.
z = x + y
print(z, z.grad_fn, sep="\n")

s = z.sum()
print(s, s.grad_fn, sep="\n")

# Backpropagating from the scalar s fills in x.grad with ds/dx.
# Each entry is 1 because s is a plain sum over x + y.
s.backward()
print(x.grad)

tensor([5., 7., 9.], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x000001EAD4D9BFA0>
tensor(21., grad_fn=<SumBackward0>)
<SumBackward0 object at 0x000001EAD4E6B550>
tensor([1., 1., 1.])


In [75]:
# Demonstrates how gradient tracking is switched on and off.
x = torch.randn(2, 2)
y = torch.randn(2, 2)
print("By default, tensors in PyTorch are created with requires_grad=False: ", 
        x.requires_grad, y.requires_grad)

z = x+y
# Neither input tracks gradients, so z has no grad_fn and we cannot
# backpropagate through it.
print(z.grad_fn)

# `requires_grad_()` switches gradient tracking on in place for an existing tensor.
x.requires_grad_()
y.requires_grad_()

# Recompute z: it now records the operation that produced it (grad_fn is set).
z = x+y
print("After using requires_grad_(), we have: ", z.grad_fn)

# `detach()` returns a new tensor holding the same values as z but with the
# computation history dropped (its grad_fn is None).
new_z = z.detach()
print(new_z.grad_fn)

# x tracks gradients, and so does any result computed from it...
print(x.requires_grad)
print((x+10).requires_grad)

# ...unless the computation runs inside `torch.no_grad()`, which stops
# autograd from recording history for operations in the block.
with torch.no_grad():
        print((x+10).requires_grad)



By default, tensors in PyTorch are created with requires_grad=False:  False False
None
After using requires_grad_(), we have:  <AddBackward0 object at 0x000001EAD4D9AA70>
None
True
True
False


## Last example

In [76]:
# End-to-end example: out = mean(3 * (x + 2)^2) over the four entries of x.
x = torch.ones(2,2, requires_grad = True)
print(x)
y = x+2
print(y)
print(y.grad_fn)
z = y * y * 3
out = z.mean()
print(z, out)
# d(out)/dx_i = (1/4) * 6 * (x_i + 2) = 1.5 * 3 = 4.5 at x_i = 1,
# which is the value printed for every entry of x.grad.
out.backward()
print(x.grad)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x000001EAD4D9AA70>
tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


# II. Loss Functions

In [81]:
import torch
import torch.nn as nn

# Random stand-ins for model outputs and targets, shared by the loss examples below.
prediction = torch.randn(4, 5)
label = torch.randn(4, 5)

print(prediction, label, sep="\n")

tensor([[ 1.2673, -0.8655,  0.1251, -0.4936, -1.1254],
        [ 0.2635, -0.9821,  0.6131, -0.5852, -1.5594],
        [ 0.2608, -0.0604, -0.1067,  0.0763, -0.4698],
        [ 0.5099,  1.4516,  0.2204, -1.9625,  0.0094]])
tensor([[ 1.4995,  0.2799,  0.2828,  1.8933,  0.7397],
        [ 0.0663,  1.8302, -1.8160, -0.8025,  1.5524],
        [-0.3312, -2.1019, -0.5141, -1.0282,  2.0092],
        [-0.1015,  0.1727,  0.4527, -1.0148, -0.3101]])


## MSE with none, sum and mean

In [None]:
# reduction='none': keep the element-wise squared errors (same shape as the inputs).
no_mse = nn.MSELoss(reduction='none')
loss = no_mse(prediction, label)
print(loss)
# In the recorded run the first element is 0.0539, i.e. (1.2673 - 1.4995)^2.

# reduction='sum': add up all element-wise squared errors into one scalar.
sum_mse = nn.MSELoss(reduction='sum')
loss = sum_mse(prediction, label)
print(loss)

# reduction='mean': average the squared errors over all 4 * 5 = 20 elements.
mean_mse = nn.MSELoss(reduction='mean')
loss = mean_mse(prediction, label)
print(loss)
# This is the 'sum' result divided by the element count: 49.2594 / 20 = 2.46297 ~ 2.4630.

# Mean MSE implemented from scratch.
loss = ((prediction-label)**2).mean()
print(loss)
# It matches the built-in 'mean' reduction (2.4630 in the recorded run).
# Now they have the same value 2.4630

tensor([[0.0539, 1.3119, 0.0249, 5.6973, 3.4789],
        [0.0389, 7.9088, 5.9009, 0.0472, 9.6833],
        [0.3504, 4.1676, 0.1660, 1.2199, 6.1457],
        [0.3738, 1.6356, 0.0540, 0.8982, 0.1021]])
tensor(49.2594)
tensor(2.4630)
tensor(2.4630)


## Binary Cross Entropy

In [99]:
print(prediction)

# Binary targets: fill a float tensor in place with random values from {0, 1}.
label = torch.zeros(4,5).random_(0,2)
print(label)

# BCELoss expects probabilities, so the raw predictions are passed through
# a sigmoid first.
sigmoid = nn.Sigmoid() 
bce = nn.BCELoss(reduction = 'mean')
loss = bce(sigmoid(prediction), label)
print(loss)


# BCEWithLogitsLoss is given the raw predictions (logits) directly -- no
# explicit sigmoid -- and yields the same loss value as the two-step version above.
bces = nn.BCEWithLogitsLoss(reduction = 'mean')
loss = bces(prediction, label)
print(loss)

tensor([[ 1.2673, -0.8655,  0.1251, -0.4936, -1.1254],
        [ 0.2635, -0.9821,  0.6131, -0.5852, -1.5594],
        [ 0.2608, -0.0604, -0.1067,  0.0763, -0.4698],
        [ 0.5099,  1.4516,  0.2204, -1.9625,  0.0094]])
tensor([[0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 1.],
        [0., 0., 1., 0., 1.],
        [1., 0., 1., 1., 1.]])
tensor(0.9035)
tensor(0.9035)


In [101]:
import numpy as np
# Convert the logits and binary labels to NumPy arrays for a from-scratch BCE.
# NOTE: this rebinds the names `x` and `y` used by the autograd cells above.
x = prediction.numpy()
y = label.numpy()

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)) for NumPy inputs.

    Computed as exp(-log(1 + exp(-x))) via `np.logaddexp`, which never forms
    exp(-x) directly. The naive formula overflows (RuntimeWarning, inf) for
    large negative inputs; this form saturates cleanly to 0.0 and 1.0.

    Parameters
    ----------
    x : scalar or array-like
        Input logit(s).

    Returns
    -------
    NumPy scalar or ndarray
        Element-wise sigmoid, with the same shape as `x`.

    Note: this function reuses the name `sigmoid`, which an earlier cell bound
    to an `nn.Sigmoid()` module; after this cell runs, `sigmoid` refers to the
    NumPy version.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably
    return np.exp(-np.logaddexp(0, -x))

# Turn the logits into probabilities.
x = sigmoid(x)

# Element-wise binary cross-entropy, vectorized over the whole array instead
# of nested Python loops: -log(p) where the label is 1, -log(1 - p) otherwise.
loss_values = np.where(y == 1, -np.log(x), -np.log(1 - x))

# Averaging over every element corresponds to reduction='mean' in nn.BCELoss.
print(np.mean(loss_values))

0.9034561


# III. Weight Initialization

In [105]:
import torch
import torch.nn as nn

# A 5 -> 5 linear layer; print the weights it starts with.
layer = nn.Linear(5,5)
print(layer.weight.data)

# Re-initialize the weights in place from a uniform distribution on [0, 3],
# here going through the raw `.data` tensor.
nn.init.uniform_(layer.weight.data, a = 0.0, b=3)
print(layer.weight.data)

# The same initializer also accepts the Parameter itself; printing the
# Parameter (rather than `.data`) additionally shows requires_grad=True.
nn.init.uniform_(layer.weight, a = 0.0, b=3)
print(layer.weight)

tensor([[-0.0215,  0.1963,  0.3914,  0.4234, -0.4362],
        [ 0.0783, -0.3784,  0.2861, -0.2704,  0.2766],
        [-0.3024, -0.2893,  0.0576, -0.1063,  0.0171],
        [-0.4054,  0.3422,  0.2060, -0.1284, -0.2087],
        [-0.2232,  0.4160, -0.4102,  0.2103, -0.1946]])
tensor([[0.7129, 2.8142, 2.1510, 0.0769, 1.7592],
        [2.6224, 0.5557, 2.7480, 2.9759, 2.7838],
        [0.7591, 1.0135, 2.8390, 0.9291, 0.0162],
        [1.8795, 0.6520, 2.1927, 0.7616, 0.6316],
        [0.5401, 2.3795, 0.3415, 0.8574, 1.9155]])
Parameter containing:
tensor([[1.8504, 0.8965, 2.2543, 1.3662, 2.0741],
        [1.9128, 0.7405, 2.9246, 0.5192, 0.7867],
        [1.4747, 2.2198, 2.7232, 0.8911, 1.2978],
        [1.1615, 0.9000, 0.2711, 2.5292, 2.3769],
        [2.8431, 1.9656, 2.6902, 1.2760, 1.0407]], requires_grad=True)


In [106]:
# Re-initialize the weights in place from a normal distribution N(mean, std^2);
# the call returns the Parameter, which the notebook displays as the cell output.
nn.init.normal_(layer.weight, mean = 0.0, std = 1.0)

Parameter containing:
tensor([[-0.8022,  1.0612,  0.5589,  1.4273,  0.3788],
        [-1.5489, -0.5868, -1.1909,  0.5178, -1.0984],
        [-2.0589, -1.0587, -0.2300, -0.3395, -0.3609],
        [ 1.0487, -0.0720,  0.4155, -0.3796,  0.2537],
        [-1.2655,  1.3204,  0.3215,  0.0781,  0.2190]], requires_grad=True)

In [110]:
# Fill the bias vector in place with a constant value (0 here).
nn.init.constant_(layer.bias, 0) 
# Constant initialization like this is typically used for biases.
print(layer.bias)

Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)


In [111]:
# Xavier (Glorot) uniform initialization, applied in place to the weights.
# Per the PyTorch docs the bound is gain * sqrt(6 / (fan_in + fan_out)),
# i.e. sqrt(6 / 10) ~ 0.775 for this 5x5 layer.
torch.nn.init.xavier_uniform_(layer.weight, gain=1.0)

Parameter containing:
tensor([[ 0.6007,  0.7513, -0.5607,  0.7500,  0.6381],
        [ 0.4800, -0.2689,  0.4441,  0.2290,  0.3702],
        [-0.2814,  0.3258, -0.5368, -0.4653, -0.5996],
        [-0.2879, -0.0259,  0.3419, -0.4931, -0.5078],
        [ 0.7465, -0.0620, -0.7498, -0.1778, -0.0064]], requires_grad=True)

In [None]:
# Xavier (Glorot) normal initialization, applied in place to the weights.
torch.nn.init.xavier_normal_(layer.weight, gain=1.0)
print(layer.weight.std())

# The target std is gain * sqrt(2 / (n_in + n_out)) = sqrt(2 / 10) = sqrt(1/5) = 0.447.
# The printed value is the sample std over only 25 weights, so it approximates
# this target rather than matching it exactly.

tensor(0.4119, grad_fn=<StdBackward0>)
