# Basic concepts and PyTorch introduction

In [None]:
import torch

# Two 0-dim (scalar) tensors; arithmetic on them yields scalar tensors as well.
x = torch.tensor(3)
y = torch.tensor(4)
total, difference = x + y, x - y
print(f"scalar operation: {total, difference}")

# A 1-D tensor with the values 0..11, then the same data laid out as rows of 2.
# Passing -1 lets reshape infer the number of rows from the element count.
x = torch.arange(12, dtype=torch.float32)
x_pairs = x.reshape(-1, 2)
print(f"vector: {x}")
print(f"matrix: {x_pairs}")
print(f"matrix dimensions: {x_pairs.shape}")

scalar operation: (tensor(7), tensor(-1))
vector: tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])
matrix: tensor([[ 0.,  1.],
        [ 2.,  3.],
        [ 4.,  5.],
        [ 6.,  7.],
        [ 8.,  9.],
        [10., 11.]])
matrix dimensions: torch.Size([6, 2])


## Linear Algebra 

In [None]:
# 2x3 matrix holding the values 0..5
A = torch.arange(6, dtype=torch.float32).reshape(2, 3)
# clone() copies the data, so B owns separate memory from A
B = A.clone()
A, torch.add(A, B)

(tensor([[0., 1., 2.],
         [3., 4., 5.]]), tensor([[ 0.,  2.,  4.],
         [ 6.,  8., 10.]]))

In [None]:
# Hadamard product: multiply A and B entry by entry (shapes must match or broadcast)

torch.mul(A, B)

tensor([[ 0.,  1.,  4.],
        [ 9., 16., 25.]])

In [None]:
# Scalar-tensor elementwise operations: the scalar is broadcast to every entry of A,
# so the result keeps A's shape
a = torch.tensor(2)
scaled = a * A
shifted = a + A
scaled, shifted, shifted.shape

(tensor([[ 0.,  2.,  4.],
         [ 6.,  8., 10.]]), tensor([[2., 3., 4.],
         [5., 6., 7.]]), torch.Size([2, 3]))

In [None]:
# Reduction (sum)

x = torch.arange(6, dtype=torch.float32)
print(x, x.sum())

# Reshape into 2 rows x 3 columns (reshape avoids copying x when it can)
x_matrix = x.reshape(-1, 3)
# axis=0 collapses the row axis -> one total per column
column_totals = x_matrix.sum(axis=0)
# axis=1 collapses the column axis -> one total per row (a "horizontal" sum)
row_totals = x_matrix.sum(axis=1)
print(x_matrix.shape, column_totals, row_totals)

tensor([0., 1., 2., 3., 4., 5.]) tensor(15.)
torch.Size([2, 3]) tensor([3., 5., 7.]) tensor([ 3., 12.])


In [None]:
print(f"A: {A}, shape: {A.shape}")

# Mean over every entry, computed two equivalent ways
overall_mean = A.mean()
print(f"element wise mean: {overall_mean, A.sum() / A.numel()}")

# axis=1 averages across the columns, i.e. it yields one mean per row of A
per_row_mean = A.mean(axis=1)
print(f"columwise mean: {per_row_mean, A.sum(axis=1)/A.shape[1], per_row_mean.shape}")

A: tensor([[0., 1., 2.],
        [3., 4., 5.]]), shape: torch.Size([2, 3])
element wise mean: (tensor(2.5000), tensor(2.5000))
columwise mean: (tensor([1., 4.]), tensor([1., 4.]), torch.Size([2]))


In [None]:
# Non-reducing sum: keepdim retains the summed axis with length 1, so the result
# stays 2-D and can later be broadcast against A.

sum_A = torch.sum(A, dim=1, keepdim=True)
sum_A, sum_A.shape

(tensor([[ 3.],
         [12.]]), torch.Size([2, 1]))

In [None]:
# Broadcasting divides every row of A by that row's total, so each row sums to 1
torch.div(A, sum_A)

tensor([[0.0000, 0.3333, 0.6667],
        [0.2500, 0.3333, 0.4167]])

In [None]:
# Cumulative sum along axis 0: a running total accumulated row by row down each column
A, torch.cumsum(A, dim=0)

(tensor([[0., 1., 2.],
         [3., 4., 5.]]), tensor([[0., 1., 2.],
         [3., 5., 7.]]))

### Product

Dot Product
- given two vectors $x, y \in \mathbb{R}^d$, the dot product is $x^{\top}y$
- Dot products are useful in a wide range of contexts. For example, given some set of values, denoted by a vector $x \in \mathbb{R}^d$, and a set of weights, denoted by $w \in \mathbb{R}^d$, the weighted sum of the values in $x$ according to the weights $w$ can be expressed as the dot product $x^{\top}w$. When the weights are non-negative and sum to one, i.e., $\sum_{i=1}^{d} w_i = 1$, the dot product expresses a weighted average. After normalizing two vectors to have unit length, the dot product expresses the cosine of the angle between them.

In [None]:
# Dot product, computed with torch.dot and, equivalently, as the sum of the
# elementwise product
x = torch.arange(3, dtype=torch.float32)
y = torch.ones_like(x)

dot_builtin = torch.dot(x, y)
dot_manual = (x * y).sum()
x, y, dot_builtin, dot_manual

(tensor([0., 1., 2.]), tensor([1., 1., 1.]), tensor(3.), tensor(3.))

In [None]:
# Matrix-vector product: torch.mv and the @ operator give the same result
mv_result = torch.mv(A, x)
print(f"matrix-vector: {A.shape, x.shape, mv_result, A@x}")

# Matrix-matrix multiplication: (2x3) times (3x4) gives a 2x4 matrix
B = torch.ones(3, 4)
product_AB = torch.matmul(A, B)
print(f"matrix-matrix: {product_AB}")

matrix-vector: (torch.Size([2, 3]), torch.Size([3]), tensor([ 5., 14.]), tensor([ 5., 14.]))
matrix-matrix: tensor([[ 3.,  3.,  3.,  3.],
        [12., 12., 12., 12.]])


Norms

- Some of the most useful operators in linear algebra are norms. Informally, the norm of a vector tells us how big it is. For instance, the $l_2$ norm measures the (Euclidean) length of a vector.
- The $l_1$ norm is also popular, and the associated metric is called the Manhattan distance. By definition, the $l_1$ norm sums the absolute values of a vector’s elements: $\|x\|_1 = \sum_i |x_i|$.

In [None]:
# Norms of the vector u = (3, -4)
u = torch.tensor([3.0, -4.0])

# l2 norm: Euclidean length, sqrt(3^2 + (-4)^2) = 5.
# torch.norm is deprecated; torch.linalg.vector_norm is its documented replacement
# (its default order is 2).
l2_norm = torch.linalg.vector_norm(u)
print(f"l2: {l2_norm}")

# l1 norm: sum of absolute values, |3| + |-4| = 7
l1_norm = torch.abs(u).sum()
print(f"l1 : {l1_norm}")

l2: 5.0
l1 : 7.0


## Automatic Differentiation

In [None]:
# Create the input and ask autograd to record operations performed on it
x = torch.arange(4.0)
x.requires_grad_(True)
# A bare `x.grad` in the middle of a cell is never displayed, so state the point
# explicitly: no gradient exists until backward() has been called.
assert x.grad is None

# y = 2 * x^T x, a scalar function of the vector x
y = 2 * torch.dot(x, x)
y

tensor(28., grad_fn=<MulBackward0>)

In [None]:
# Backpropagate from the scalar y: backward() computes dy/dx and stores it in x.grad.
# The gradient is then read from x's grad attribute.
y.backward()
x.grad

tensor([ 0.,  4.,  8., 12.])

In [None]:
# The analytic gradient of y = 2 * x^T x is 4x; compare it entry by entry with autograd's result
torch.eq(x.grad, 4 * x)

tensor([True, True, True, True])

In [None]:
# zero_() overwrites x.grad with zeros in place. PyTorch adds new gradients onto
# the existing buffer on each backward() call, so reset it before the next one.
x.grad.zero_() # reset the gradient
x.grad

tensor([0., 0., 0., 0.])

In [None]:
# Detaching: stop gradients from flowing through an intermediate result
x.grad.zero_()  # clear any gradient left over from an earlier backward()
y = x * x
u = y.detach()  # same values as y, but cut off from the graph that produced y
z = u * x

z.sum().backward()
x.grad == u
# Because u is detached, backprop treats it as a constant rather than as x * x.
# Hence d(z.sum())/dx = u (whose values equal x^2), not 3 * x^2 as it would be
# if the gradient also flowed back through y.

tensor([True, True, True, True])

In [None]:
# Display both tensors: u holds the values of x * x, while x is the gradient-tracked input
u, x

(tensor([0., 1., 4., 9.]), tensor([0., 1., 2., 3.], requires_grad=True))

things to remember

- (i) attach gradients to those variables with respect to which we desire derivatives, e.g. `x.requires_grad_(True)` (use `u = y.detach()` to treat an intermediate value as a constant)
- (ii) record the computation of the target value, e.g. `y.sum()`
- (iii) execute the backpropagation function, e.g. `y.sum().backward()`
- (iv) access the resulting gradient via the attribute `x.grad` (it is an attribute, not a method)

x.grad.zero_() # resetting the gradient object

# Exercises

1. Show that the transpose of the transpose of a matrix is the matrix itself: $(A^{\top})^{\top} = A$
2. Given two matrices $A, B$, show that sum and transposition commute: $A^{\top} + B^{\top} = (A + B)^{\top}$

In [None]:
# Exercise 1: transposing twice returns the original matrix.
# Seed the RNG so the sampled matrix is identical on every Restart & Run All.
torch.manual_seed(0)
A = torch.normal(0, 0.1, (5, 5))  # 5x5 matrix, entries drawn from N(mean=0, std=0.1)
torch.transpose(torch.transpose(A, 0, 1), 0, 1) == A

tensor([[True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True]])

In [None]:
# Exercise 2: adding the transposes gives the same matrix as transposing the sum
B = torch.randn(5, 5)
A_t, B_t = A.T, B.T  # for 2-D tensors .T swaps axes 0 and 1

A_t + B_t == (A + B).T

tensor([[True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True],
        [True, True, True, True, True]])