# Tensor Basics

Like TensorFlow, PyTorch is built around "tensor" objects.

In [83]:
import numpy as np
import torch

In [84]:
# allocate an uninitialized ("empty") tensor holding a single element,
# i.e. a 1-D tensor of shape (1,)
x = torch.empty((1,))

# "empty" means the memory is not initialized: x shows whatever value was
# already sitting there until we assign one ourselves
x

tensor([0.])

In [85]:
# allocate an uninitialized tensor of shape 1 x 2 (a row vector)
x = torch.empty(1, 2)

x

tensor([[9.1477e-41, 0.0000e+00]])

In [86]:
# allocate an uninitialized tensor of shape 10 x 2 (a matrix)
x = torch.empty(10, 2)

x

tensor([[1.0194e-38, 9.9184e-39],
        [2.9389e-39, 1.0194e-38],
        [2.9389e-39, 9.2755e-39],
        [9.0918e-39, 1.0010e-38],
        [9.9184e-39, 1.0653e-38],
        [9.1837e-39, 9.6428e-39],
        [1.0010e-38, 9.1837e-39],
        [8.9082e-39, 9.2755e-39],
        [1.1112e-38, 0.0000e+00],
        [0.0000e+00, 6.6533e+16]])

In [87]:
# allocate an uninitialized tensor of shape 3 x 6 x 2 (a 3-tensor):
# picture three 6 x 2 matrices stacked on top of each other
x = torch.empty(3, 6, 2)

x

tensor([[[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 9.1835e-41,  0.0000e+00],
         [ 2.8026e-45,  0.0000e+00],
         [-8.9940e+18,  4.5915e-41]],

        [[ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00],
         [        nan,  0.0000e+00],
         [ 1.4013e-45,  0.0000e+00],
         [-1.7252e+15,  4.5915e-41],
         [ 0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00]]])

In [88]:
# as in numpy, there are shortcuts for tensors filled with 0's or with 1's;
# here every entry is 0
x = torch.zeros(3, 6, 2)

x

tensor([[[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]])

In [89]:
# same shape as before, but every entry is 1
x = torch.ones(3, 6, 2)

x

tensor([[[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.],
         [1., 1.]]])

In [90]:
# the dimensions of a tensor can be inspected as a torch.Size
x.shape

torch.Size([3, 6, 2])

--- 

In [91]:
# The usual way to build a tensor is from existing values: a Python sequence
# (shown here) or an np.array
x = torch.tensor((2.5, 1, 3.8))

x

tensor([2.5000, 1.0000, 3.8000])

In [92]:
# the same values, this time starting from a numpy array
v = np.asarray([2.5, 1, 3.8])

# the tensor keeps numpy's float64 dtype
x = torch.tensor(v)

x

tensor([2.5000, 1.0000, 3.8000], dtype=torch.float64)

In [93]:
# fill a 2 x 3 x 4 tensor with random values
x = torch.rand((2, 3, 4))

x

tensor([[[0.2596, 0.9668, 0.5173, 0.9609],
         [0.9325, 0.9534, 0.7186, 0.3392],
         [0.5390, 0.3931, 0.9580, 0.0567]],

        [[0.4764, 0.2889, 0.4169, 0.3076],
         [0.0109, 0.1179, 0.9215, 0.3748],
         [0.9160, 0.2743, 0.9339, 0.2710]]])

In [94]:
# we can actually convert tensors back to numpy arrays as well

# convert x to a numpy array. The result gets its own name so that x is still
# a tensor afterwards and this cell can be re-run without failing.
x_np = x.numpy()

x_np

array([[[0.2596162 , 0.9668353 , 0.5172723 , 0.96090424],
        [0.93252426, 0.9533586 , 0.7185599 , 0.33918536],
        [0.53897727, 0.3931409 , 0.9580346 , 0.05666196]],

       [[0.47643667, 0.2888782 , 0.4168514 , 0.30761725],
        [0.01085627, 0.11785257, 0.9214798 , 0.3748451 ],
        [0.9159908 , 0.27432448, 0.93386585, 0.27102882]]], dtype=float32)

---

# Tensor Operations

In [95]:
# two tensors of ones with matching shapes, to be combined element-wise below
x = torch.ones(2, 3, 4)

y = torch.ones(2, 3, 4)

# show both, separated by a line containing a single space
print(x, " ", y, sep="\n")

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])
 
tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])


In [96]:
# multiplying by a number scales every entry
y * 2

tensor([[[2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.]],

        [[2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.]]])

In [97]:
# tensors of the same shape are added entry by entry
2 * y + x

tensor([[[3., 3., 3., 3.],
         [3., 3., 3., 3.],
         [3., 3., 3., 3.]],

        [[3., 3., 3., 3.],
         [3., 3., 3., 3.],
         [3., 3., 3., 3.]]])

---

# Tensor Reshaping and Slicing

In [98]:
# build a 1-D integer tensor holding 0, 1, ..., 23
x = torch.arange(24)

x

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23])

In [99]:
# rearrange the 24 entries into a 2 x 3 x 4 tensor
x = x.reshape(2, 3, 4)

x

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [100]:
# individual elements are picked out by indexing

# first matrix, second row, third entry
# (indices start at 0, so each index is one less than its ordinal position)
x[0][1][2]

tensor(6)

In [101]:
# second matrix, first row, fourth entry
x[1][0][3]

tensor(15)

In [102]:
# first matrix, rows 0 and 1, every column
x[0, :2]

tensor([[0, 1, 2, 3],
        [4, 5, 6, 7]])

In [103]:
# first matrix (index 0), every row, last two columns
x[0, :, 2:]

tensor([[ 2,  3],
        [ 6,  7],
        [10, 11]])

In [104]:
# both matrices, every row, columns 0 and 1
x[:, :, :2]

tensor([[[ 0,  1],
         [ 4,  5],
         [ 8,  9]],

        [[12, 13],
         [16, 17],
         [20, 21]]])

In [105]:
# both matrices: last two rows and last two columns,
# i.e. the bottom-right 2 x 2 corner of each
x[:, 1:, 2:]

tensor([[[ 6,  7],
         [10, 11]],

        [[18, 19],
         [22, 23]]])

---

# CUDA and Device Allocation

In [107]:
# PyTorch tensors can be assigned to either the CPU or GPU
# which decides which device the tensor computations are done on.
# Use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.ones((2, 3, 4), device=device)

x

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]], device='cuda:0')

In [109]:
# show which device currently holds x
x.device

device(type='cuda', index=0)

In [111]:
# note: a GPU tensor cannot be converted to an np.array directly.
# what we have to do instead is convert it back to a CPU tensor and then to an np.array
# The failing call is wrapped so the cell displays the error message without
# aborting a "Run All".
try:
    x.numpy()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [116]:
# copy the tensor to host (CPU) memory first ...
x = x.cpu()

# ... after which the numpy conversion succeeds
x.numpy()

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]], dtype=float32)

---

# Autograd

In [128]:
# In PyTorch, the gradients needed for gradient descent are obtained by Auto Differentiation

# Auto Differentiation works by tracking each individual operation done on a tensor, then working backwards
# through those operations to compute the gradient via the chain rule (cool stuff!)

# Therefore, if we need to compute the gradient of a tensor, we have to tell PyTorch to track the operations done on the tensor
# This is done by turning on the "requires_grad" parameter
# (use the GPU when one is available, otherwise fall back to the CPU so the cell still runs)
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.ones((2, 3, 4), device=device, requires_grad=True)

x

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]], device='cuda:0', requires_grad=True)

In [129]:
# apply a few tracked operations to x: shift by 2, square and scale by 3,
# then average all entries down to a single number
y = x + 2

z = (y * y * 3).mean()

z

tensor(27., device='cuda:0', grad_fn=<MeanBackward0>)

In [130]:
# no backward pass has run yet, so x.grad is still None
# (print is used because a bare None would display nothing)
print(x.grad)

None


In [131]:
# compute dz/dx by backpropagating from the scalar z; the result is stored in x.grad
z.backward()

In [132]:
# z is the mean of 3(x+2)^2 over 24 entries, so each entry of dz/dx is 6(x+2)/24 = 18/24 = 0.75
x.grad

tensor([[[0.7500, 0.7500, 0.7500, 0.7500],
         [0.7500, 0.7500, 0.7500, 0.7500],
         [0.7500, 0.7500, 0.7500, 0.7500]],

        [[0.7500, 0.7500, 0.7500, 0.7500],
         [0.7500, 0.7500, 0.7500, 0.7500],
         [0.7500, 0.7500, 0.7500, 0.7500]]], device='cuda:0')

- One thing to note: ```z.backward()``` actually works with a Jacobian matrix. In general, we need to multiply the constructed Jacobian by a vector to get the final gradient.
- However, since our $z$ is a scalar, the Jacobian matrix is already the gradient, so no multiplication is required in this specific case.

In [133]:
# consider the following z which is not scalar-valued: it is built like before
# but without the final mean(), so it keeps the 2 x 3 x 4 shape of x
# (use the GPU when one is available, otherwise fall back to the CPU so the cell still runs)
device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.ones((2, 3, 4), device=device, requires_grad=True)
y = x + 2
z = y * y * 3

z

tensor([[[27., 27., 27., 27.],
         [27., 27., 27., 27.],
         [27., 27., 27., 27.]],

        [[27., 27., 27., 27.],
         [27., 27., 27., 27.],
         [27., 27., 27., 27.]]], device='cuda:0', grad_fn=<MulBackward0>)

In [134]:
# trying to call backward() with no argument now produces an error, because z
# is no longer a scalar. The call is wrapped so the cell displays the error
# message without aborting a "Run All".
try:
    z.backward()
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: grad can be implicitly created only for scalar outputs

So what is going on? 
- ```tensor.backward()``` differentiates the tensor ```z``` with respect to the "leaf tensors" aka independent variables ```x``` and stores the resulting derivative in the attribute ```x.grad``` of the leaf tensor.
- Since we are dealing with multiple variables, the derivative takes the form of a Jacobian, so conceptually ```tensor.backward()``` is computing with a Jacobian matrix (the call itself returns nothing; the result is stored in ```x.grad```).
- When ```z``` is a scalar-valued function $z:\mathbb{R}^n \to \mathbb{R}$, the Jacobian Matrix *is* the same as the gradient so everything is fine.
- However, when ```z``` is tensor-valued, the Jacobian is a tensor of higher dimension than the original ```x``` tensor, so it cannot be stored in ```x.grad```, which has the shape of ```x```. What we have to do is multiply it with a vector ```v``` (a tensor shaped like ```z```) to squash the dimension back down to the shape of ```x```.

In [137]:
# a tensor of ones with the same shape, dtype and device as z
v = torch.ones_like(z)

# hand v to backward() as the tensor the Jacobian is multiplied with
z.backward(gradient=v)

In [138]:
# v was all ones, so every entry of x.grad is 6(x+2) = 18
x.grad

tensor([[[18., 18., 18., 18.],
         [18., 18., 18., 18.],
         [18., 18., 18., 18.]],

        [[18., 18., 18., 18.],
         [18., 18., 18., 18.],
         [18., 18., 18., 18.]]], device='cuda:0')

- Let's illustrate what is happening here. We have
<br>

$$ z = 3y^2 = 3(x+2)^2$$
$$\frac{\partial z_{ijk}}{\partial x_{ijk}} = 6(x_{ijk}+2) |_{x_{ijk}=1}$$
$$\frac{\partial z_{ijk}}{\partial x_{ijk}} = 6(1+2) = 18 $$

We collect all of these into a tensor level equation:

<br>

$$ z = 3(x+2)^2 $$
$$\frac{dz}{dx} = 6(x+2) |_{x=[1]} = 6([1]+2)=[18]$$

where $[1]$ and $[18]$ denote 3-tensors with those values in all entries

In [143]:
# now try a v whose entries differ: 1, 2, ..., 24 laid out as 2 x 3 x 4,
# placed on the same device as x
v = torch.arange(1, 25, device=x.device).reshape(2, 3, 4)

v

tensor([[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]],

        [[13, 14, 15, 16],
         [17, 18, 19, 20],
         [21, 22, 23, 24]]], device='cuda:0')

In [145]:
# rebuild x, y and z from scratch. This is needed because the computational
# graph gets "used up" every time backward() is called.
x = torch.ones(2, 3, 4, device=v.device, requires_grad=True)
y = x + 2
z = y * y * 3

# backpropagate with the new v
z.backward(gradient=v)

In [146]:
# each entry is 18 times the matching entry of v
x.grad

tensor([[[ 18.,  36.,  54.,  72.],
         [ 90., 108., 126., 144.],
         [162., 180., 198., 216.]],

        [[234., 252., 270., 288.],
         [306., 324., 342., 360.],
         [378., 396., 414., 432.]]], device='cuda:0')

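- The key thing to notice is that the tensor $v$ supplies the weights the partial derivatives are multiplied by: each derivative is still evaluated at $x_{ijk}=1$ (giving 18) and is then scaled by the matching entry $v_{ijk}$, so the numeric result stored in `x.grad` is $18\,v$.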

In [154]:
# Earlier, z had to be rebuilt before it could be differentiated again, because
# the computational graph gets used up by every call to backward().

# To differentiate the same z several times, ask PyTorch to keep the graph instead.
x = torch.ones(2, 3, 4, device=v.device, requires_grad=True)
y = x + 2
z = y * y * 3

# backpropagate through z while retaining the graph for later calls
z.backward(gradient=v, retain_graph=True)

x.grad

tensor([[[ 18.,  36.,  54.,  72.],
         [ 90., 108., 126., 144.],
         [162., 180., 198., 216.]],

        [[234., 252., 270., 288.],
         [306., 324., 342., 360.],
         [378., 396., 414., 432.]]], device='cuda:0')

In [155]:
# constant tensors of 2s and of 3s, matching z in shape, dtype and device
v2 = torch.full_like(z, 2)
v3 = torch.full_like(z, 3)

# wipe the gradient left in x by the previous backward() call
x.grad.zero_()

# second pass through the retained graph, this time weighted by v2
z.backward(gradient=v2, retain_graph=True)

x.grad

tensor([[[36., 36., 36., 36.],
         [36., 36., 36., 36.],
         [36., 36., 36., 36.]],

        [[36., 36., 36., 36.],
         [36., 36., 36., 36.],
         [36., 36., 36., 36.]]], device='cuda:0')

In [156]:
# wipe the stored gradient again
x.grad.zero_()

# third pass through the retained graph, weighted by v3
z.backward(gradient=v3, retain_graph=True)

x.grad

tensor([[[54., 54., 54., 54.],
         [54., 54., 54., 54.],
         [54., 54., 54., 54.]],

        [[54., 54., 54., 54.],
         [54., 54., 54., 54.],
         [54., 54., 54., 54.]]], device='cuda:0')

- Notice: we had to clear the gradient every single time we took a derivative. This is because PyTorch **accumulates gradients** in leaf tensors.
- Why does PyTorch do this? 
    - When doing SGD, we usually train the network on mini-batches of examples.
    - All the gradients from the mini-batches must be summed together at the end in order to make one actual descent step.
    - This is why PyTorch automatically saves the gradient information: it assumes we are going to be doing mini-batch gradient descent (which will almost always be the case unless we somehow have only 200 training examples)