<a href="https://colab.research.google.com/github/inderpreetsingh01/PyTorch/blob/main/D2l_1_pytorch_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np

In [None]:
t1 = torch.arange(12)
t1

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [None]:
t1.size()

torch.Size([12])

In [None]:
len(t1)

12

In [None]:
t1.shape

torch.Size([12])

In [None]:
t1.numel()

12

In [None]:
t1.reshape(3,4)

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [None]:
t1

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [None]:
t1.reshape(4, -1)

tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])

In [None]:
# initialising tensor

In [None]:
t2 = torch.ones((2,3))
t2

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [None]:
t3 = torch.zeros((2,3))
t3

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [None]:
# iid samples from gaussian distribution with mean 0 and std 1
t4 = torch.randn((2,3))
t4

tensor([[ 0.9836,  0.3699,  0.9049],
        [-0.3280,  0.5398,  0.7757]])

In [None]:
t5 = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
t5

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [None]:
# transpose
t5.T

tensor([[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]])

In [None]:
# Indexing and slicing

In [None]:
# Slice assignment writes into t5 itself: every element of the first two rows becomes 1
t5[:2, :] = 1
t5

tensor([[1, 1, 1],
        [1, 1, 1],
        [7, 8, 9]])

In [None]:
t5[-1]

tensor([7, 8, 9])

In [None]:
t5[2,1]

tensor(8)

In [None]:
# Operations

In [None]:
# Unary scalar operators (scalar -> scalar) are applied elementwise,
# so torch.exp returns a tensor with the same shape as t5
torch.exp(t5)

tensor([[2.7183e+00, 2.7183e+00, 2.7183e+00],
        [2.7183e+00, 2.7183e+00, 2.7183e+00],
        [1.0966e+03, 2.9810e+03, 8.1031e+03]])

In [None]:
# Binary scalar operators (pair of scalars -> scalar) are likewise applied elementwise.
# Only the value of the last expression (t2 + t4) is displayed by the notebook;
# the other four results are computed and then discarded.
t2-t4
t2*t4
t2/t4
t2**t4
t2 + t4

tensor([[ 0.5357,  2.6226, -0.9734],
        [ 0.2441, -0.0752,  1.7225]])

In [None]:
# Concatenate along axis 0: the rows of t4 are appended below the rows of t2
torch.cat((t2, t4), axis=0)

tensor([[ 1.0000,  1.0000,  1.0000],
        [ 1.0000,  1.0000,  1.0000],
        [-0.4643,  1.6226, -1.9734],
        [-0.7559, -1.0752,  0.7225]])

In [None]:
# Concatenate along axis 1: the columns of t4 are appended to the right of t2
torch.cat((t2, t4), axis=1)

tensor([[ 1.0000,  1.0000,  1.0000, -0.4643,  1.6226, -1.9734],
        [ 1.0000,  1.0000,  1.0000, -0.7559, -1.0752,  0.7225]])

In [None]:
t2==t4

tensor([[False, False, False],
        [False, False, False]])

In [None]:
t2.sum()

tensor(6.)

In [None]:
# Broadcasting: a has shape (3, 1) and b has shape (1, 2); both are expanded
# to (3, 2) before the elementwise addition
a = torch.arange(3).reshape((3, 1))
b = torch.arange(2).reshape((1, 2))
a+b

tensor([[0, 1],
        [1, 2],
        [2, 3]])

In [None]:
a

tensor([[0],
        [1],
        [2]])

In [None]:
b

tensor([[0, 1]])

In [None]:
# In-place operations
# Note that after we run a = a + b, the name a refers to a different object, i.e. id(a) changes.
# That is because Python first evaluates a + b, allocating new memory for the result, and then points a to this new location in memory.
# This might be undesirable for two reasons. First, we do not want to run around allocating memory unnecessarily all the time.
# In machine learning, we often have hundreds of megabytes of parameters and update all of them multiple times per second. Whenever possible,
# we want to perform these updates in place. Second, we might point at the same parameters from multiple variables. If we do not update in place,
# we must be careful to update all of these references, lest we spring a memory leak or inadvertently refer to stale parameters.

In [None]:
# Not in place: a + b produces a new tensor and the name a is rebound to it,
# so id(a) no longer matches the id recorded before the assignment
before = id(a)
a = a+b
id(a) == before

False

In [None]:
# In place via slice assignment: the result of a + b is written into the
# existing tensor, so a is still the same object
before = id(a)
a[:] = a+b
id(a) == before

True

In [None]:
# In place via augmented assignment: += also updates the existing tensor,
# so a is still the same object
before = id(a)
a+=b
id(a) == before

True

In [None]:
# conversion to other object types
a_ = a.numpy()
type(a_)

numpy.ndarray

In [None]:
a = torch.from_numpy(a_)
type(a)

torch.Tensor

In [None]:
# 0-d (scalar) tensor holding a single value
t = torch.tensor(4)
t

tensor(4)

In [None]:
t.item()

4

In [None]:
float(t)

4.0

In [None]:
int(t)

4

In [None]:
# Two ways to build a tensor from a NumPy array:
# torch.from_numpy shares memory with the array, while torch.tensor copies the data
a = np.array([1,2,3])
b = torch.from_numpy(a)
c = torch.tensor(a)

In [None]:
b

tensor([1, 2, 3])

In [None]:
c

tensor([1, 2, 3])

In [None]:
# Reduction
a = torch.tensor([[1,2,3], [4,5,6]])
a

tensor([[1, 2, 3],
        [4, 5, 6]])

In [None]:
# The axis we sum over disappears from the result: shape (2, 3) summed over axis 0 gives (3,)
b = a.sum(axis=0)
b

tensor([5, 7, 9])

In [None]:
c = a.sum(axis=1)
c

tensor([ 6, 15])

In [None]:
# With no axis given, sum() reduces over all axes and returns a 0-d tensor
d = a.sum()
d

tensor(21)

In [None]:
# can reduce multiple axes at once
e = a.sum(axis=[0,1])
e

tensor(21)

In [None]:
a

tensor([[1, 2, 3],
        [4, 5, 6]])

In [None]:
# keepdims=True keeps the summed axis with size 1 instead of dropping it,
# so the result can still be broadcast against a
f = a.sum(axis=0, keepdims=True)
f

tensor([[5, 7, 9]])

In [None]:
# Mean of each column: the column sums in f divided by the number of rows of a
f/a.size(axis=0)

tensor([[2.5000, 3.5000, 4.5000]])

In [None]:
a.sum(axis=1, keepdims=True)

tensor([[ 6],
        [15]])

In [None]:
# Broadcasting with the kept dimensions: f is (1, 3) and the row sums are (2, 1),
# so the division produces a (2, 3) result
g = f/a.sum(axis=1, keepdims=True)
g

tensor([[0.8333, 1.1667, 1.5000],
        [0.3333, 0.4667, 0.6000]])

In [None]:
# cumsum does not reduce: the running sum along axis 0 has the same shape as a
a.cumsum(axis=0)

tensor([[1, 2, 3],
        [5, 7, 9]])

In [None]:
# Dot product: the sum of the elementwise products of two 1-D tensors
a, b = torch.tensor([1, 2, 3]), torch.tensor([4, 5, 6])
dot_product = a.dot(b)
print(dot_product)

tensor(32)


In [None]:
torch.sum(a*b)

tensor(32)

In [None]:
# matrix vector and matrix matrix multiplication

In [None]:
# Matrix-vector product: a is 3x4 and b has 4 elements, so the result has 3
b = torch.arange(4)
a = torch.arange(12).reshape(3, 4)
a.mv(b)

tensor([14, 38, 62])

In [None]:
a@b

tensor([14, 38, 62])

In [None]:
c = a.T
a@c

tensor([[ 14,  38,  62],
        [ 38, 126, 214],
        [ 62, 214, 366]])

In [None]:
# norm

In [None]:
a

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [None]:
# L2 (Euclidean) norm of a vector
a = torch.tensor([1,2,3], dtype=torch.float32)
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement for vectors.
# ord=2 gives the L2 norm, ord=1 the L1 norm, and so on
l2_norm = torch.linalg.vector_norm(a, ord=2)
l2_norm

tensor(3.7417)

In [None]:
# L1 norm (sum of absolute values) via torch.linalg.vector_norm; torch.norm is deprecated
l1_norm = torch.linalg.vector_norm(a, ord=1)
l1_norm

tensor(6.)

In [None]:
# l1 norm
torch.sum(torch.abs(a))

tensor(6.)

In [None]:
# autograd
x = torch.tensor([1.0,2,3])
# Enable gradient tracking on x in place. Tensors built with torch.tensor do not
# track gradients unless asked; True is the default argument of requires_grad_(),
# so x.requires_grad_() would be equivalent.
x.requires_grad_(True)

tensor([1., 2., 3.], requires_grad=True)

In [None]:
x.grad

In [None]:
y = 2*torch.dot(x,x)
y.backward()

In [None]:
# y = 2 * dot(x, x), so dy/dx = 4 * x; with x = [1, 2, 3] that is [4, 8, 12]
x.grad

tensor([ 4.,  8., 12.])

In [None]:
# Note that PyTorch does not automatically reset the gradient buffer when we record a new gradient.
# Instead, the new gradient is added to the already-stored gradient. This behavior comes in handy when we want to optimize the sum of multiple objective functions.
# To reset the gradient buffer, we can call x.grad.zero_() as follows:
x.grad.zero_()

tensor([0., 0., 0.])

In [None]:
y = x.sum()

In [None]:
y.backward()
x.grad

tensor([1., 1., 1.])

In [None]:
# Backward for a non-scalar output: when the output is not a scalar,
# backward() needs an explicit gradient argument
x

tensor([1., 2., 3.], requires_grad=True)

In [None]:
# y is a vector here, so calling backward() without a gradient argument
# raises the RuntimeError recorded in the cell output
y = 2*(x**2)
y.backward()

RuntimeError: grad can be implicitly created only for scalar outputs

In [None]:
x.grad.zero_()

tensor([0., 0., 0.])

In [None]:
# y is a vector, so backward() needs the gradient of the final scalar w.r.t. y.
# ones_like(y) matches y's shape, dtype and device, and makes this call
# equivalent to y.sum().backward()
y.backward(gradient=torch.ones_like(y))

In [None]:
x.grad

tensor([ 4.,  8., 12.])

In [None]:
# Detaching computation
# y = x*x, and z = y*x would be x*x*x with gradient 3*x*x.
# Here we want y to be treated as a constant when computing z, so we take
# u = y.detach(): u has the same value as y but is cut off from the
# computation graph, and z = u*x is built from u rather than y.
# Differentiating z w.r.t. x therefore gives u (not 3*x*x), while y keeps its
# own dependency on x and can still be differentiated separately to get 2*x.
x = torch.tensor([2.0], requires_grad=True)
y = x*x
u = y.detach()
z = u*x
z.backward()
x.grad == u

tensor([True])

In [None]:
z

tensor([8.], grad_fn=<MulBackward0>)

In [None]:
x.grad

tensor([4.])

In [None]:
x.grad.zero_()

tensor([0.])

In [None]:
y.backward()
x.grad

tensor([4.])

In [None]:
# Programming offers us a lot more freedom in how we compute results. For instance, we can make them depend on auxiliary variables or condition choices on intermediate results.
# One benefit of using automatic differentiation is that even if building the computational graph of a function required passing through a maze of Python control flow
# (e.g., conditionals, loops, and arbitrary function calls), we can still calculate the gradient of the resulting variable.