# TORCH 03. Autograd: automatic differentiation
- `autograd` package provides automatic differentiation for all operations on Tensors

## Tensor

In [1]:
import torch
# Show the installed PyTorch version so the recorded outputs can be matched to it
print(torch.__version__)

1.4.0


In [2]:
# Create a 2x2 tensor of ones and set `requires_grad=True` so that autograd
# tracks the computations performed with it
x = torch.ones(2, 2, requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)


In [4]:
# Do a tensor operation
y = x + 2
print(y, end='\n\n')

# y was created as a result of an operation, so it has a `grad_fn`
print(y.grad_fn, end='\n\n')

# Do more operations on y:
# z = 3 * (x + 2)^2 element-wise, and `out` is the mean over the elements of z
z = y * y * 3
out = z.mean()
print(z, out)

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)

<AddBackward0 object at 0x000001C47BFBA160>

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)


In [5]:
# `.requires_grad_( ... )` changes an existing Tensor's `requires_grad` flag in-place.
# A tensor created without specifying `requires_grad` has the flag set to `False`.
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad)    # False: nothing so far asked autograd to track `a`
a.requires_grad_(True)    # Set requires_grad to True in-place
print(a.requires_grad)    # Now True
b = (a * a).sum()
print(b.grad_fn)          # `a` requires grad, so the result `b` has a grad_fn

False
True
<SumBackward0 object at 0x000001C42E1803C8>


## Gradients

In [6]:
# Display the scalar `out` before backpropagating from it
out

tensor(27., grad_fn=<MeanBackward0>)

In [7]:
# Since `out` contains a single scalar,
# `out.backward()` is equivalent to `out.backward(torch.tensor(1.))`.
out.backward()  # run backpropagation

In [8]:
# Print the gradients d(out)/dx that `out.backward()` stored in `x.grad`
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


$$o = {\cfrac{1}{4}}{\sum_{i}{z_{i}}}$$
$$z_{i} = 3(x_{i}+2)^{2}$$
$$z_{i}|_{x_{i}=1}=27$$
$$\text{Therefore,}{\;}\cfrac{{\partial}o}{{\partial}x_{i}}=\cfrac{3}{2}(x_{i}+2)$$
$${\cfrac{{\partial}o}{{\partial}x_{i}}}\bigg{|}_{x_{i}=1}=\cfrac{9}{2}=4.5$$

$$\text{Mathematically, if you have a vector valued function}\;\vec{y}=f(\vec{x}),$$
$$\text{then the gradient of}\;\vec{y}\text{ with respect to}\;\vec{x}\text{ is a Jacobian matrix:}$$
$$J=\begin{pmatrix}
\cfrac{{\partial}y_{1}}{{\partial}x_{1}} & \cdots & \cfrac{{\partial}y_{1}}{{\partial}x_{n}}\\
\vdots & \ddots & \vdots\\
\cfrac{{\partial}y_{m}}{{\partial}x_{1}} & \cdots & \cfrac{{\partial}y_{m}}{{\partial}x_{n}}\\
\end{pmatrix}$$

$$\text{Generally speaking, }\texttt{torch.autograd}\text{ is an engine for computing vector-Jacobian products.}$$
$$\text{That is, given any vector }v=(v_{1}{\quad}v_{2}{\quad}{\cdots}{\quad}v_{m})^{T}\text{, compute the product }v^{T}{\cdot}J$$
$$\text{If }v\text{ happens to be the gradient of a scalar function }l=g\big{(}\vec{y}\big{)}\text{, that is, }v=\bigg{(}\cfrac{{\partial}l}{{\partial}y_{1}}\;\cdots\;\cfrac{{\partial}l}{{\partial}y_{m}}\bigg{)}^{T}\text{,}$$
$$\text{then by the chain rule, the vector-Jacobian product would be the gradient of }l\text{ with respect to }\vec{x}\text{:}$$
$$J^{T}\cdot{v}=\begin{pmatrix}
\cfrac{{\partial}y_{1}}{{\partial}x_{1}} & \cdots & \cfrac{{\partial}y_{m}}{{\partial}x_{1}}\\
\vdots & \ddots & \vdots\\
\cfrac{{\partial}y_{1}}{{\partial}x_{n}} & \cdots & \cfrac{{\partial}y_{m}}{{\partial}x_{n}}\\
\end{pmatrix}\begin{pmatrix}
\cfrac{{\partial}l}{{\partial}y_{1}}\\
\vdots\\
\cfrac{{\partial}l}{{\partial}y_{m}}\\
\end{pmatrix}=\begin{pmatrix}
\cfrac{{\partial}l}{{\partial}x_{1}}\\
\vdots\\
\cfrac{{\partial}l}{{\partial}x_{n}}\\
\end{pmatrix}$$
$$\text{(Note that }v^{T}\cdot{J}\text{ gives a row vector which can be treated as a column vector by taking }{J}^{T}\cdot{v}\text{)}$$
$$\text{This characteristic of vector-Jacobian product makes it very convenient to feed external gradients into a model that has non-scalar output.}$$

In [11]:
# vector-Jacobian product example
# NOTE: x is drawn from `torch.randn` and no seed is set in this cell,
# so the printed values differ from run to run.
x = torch.randn(3, requires_grad=True)
print('x :', x)
y = x * 2
# Keep doubling y until its norm is at least 1000, so y = x * 2**k for some k >= 1
while y.data.norm() < 1000:
    y = y * 2
print('y :', y)

x : tensor([ 0.3164, -1.1508,  1.5400], requires_grad=True)
y : tensor([  323.9894, -1178.4686,  1577.0007], grad_fn=<MulBackward0>)


In [12]:
# y is not a scalar, so
# `torch.autograd` cannot compute the full Jacobian directly,
# but if we just want the vector-Jacobian product,
# simply pass the vector to `backward` as argument.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v)

# y is x times a constant power of two, so x.grad is v scaled by that constant
print(x.grad)

tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])


In [13]:
# Stop autograd from tracking history on Tensors with `requires_grad=True`
# by wrapping the code block in `with torch.no_grad():`
print(x.requires_grad)
print((x ** 2).requires_grad)

# Inside the block, results computed from x no longer require grad
with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False


In [14]:
# Or by using `.detach()` to get a new Tensor with the same content
# but that does not require gradients
print(x.requires_grad)
y = x.detach()
print(y.requires_grad)
# Element-wise comparison confirms that x and y hold the same values
print(x.eq(y).all())

True
False
tensor(True)


## AUTOMATIC DIFFERENTIATION PACKAGE - TORCH.AUTOGRAD

In [26]:
# SOURCE CODE FOR TORCH.AUTOGRAD
import torch
import warnings

from torch.autograd.variable import Variable
from torch.autograd.function import Function, NestedIOFunction
from torch.autograd.gradcheck import gradcheck, gradgradcheck
from torch.autograd.grad_mode import no_grad, enable_grad, set_grad_enabled
from torch.autograd.anomaly_mode import detect_anomaly, set_detect_anomaly
from torch.autograd import profiler

In [27]:
# Names exported by a star-import of this module. 'grad_mode' (previously
# misspelled 'grad_modea') corresponds to the torch.autograd.grad_mode module.
__all__ = ['Variable', 'Function', 'backward', 'grad_mode']

In [29]:
def _make_grads(outputs, grads):
    new_grads = []
    for out, grad in zip(outputs, grads):
        if isinstance(grad, torch.Tensor):
            if not out.shape == grad.shape:
                raise RuntimeError("Mismatch in shape: grad_output["
                                   + str(grads.index(grad)) + "] has a shape of "
                                   + str(grad.shape) + " and output["
                                   + str(outputs.index(out)) + "] has a shape of "
                                   + str(out.shape) + ".")
            new_grads.append(grad)
        elif grad is None:
            if out.requires_grad:
                if out.numel() != 1:
                    '''
                    # Returns the total number of elements in the `input` tensor.
                    >>> a = torch.randn(1, 2, 3, 4, 5)
                    >>> torch.numel(a)
                    120
                    >>> a = torch.zeros(4, 4)
                    >>> torch.numel(a)
                    16
                    '''
                    raise RuntimeError("grad can be implicitly created only for scalar outputs")
                new_grads.append(torch.ones_like(out, memory_format=torch.preserve_format))
            else:
                new_grads.append(None)
        else:
            raise TypeError("gradients can be either Tensors or None, but got " +
                            type(grad).__name__)
    return tuple(new_grads)

In [35]:
def backward(tensors, grad_tensors=None, retain_graph=None, create_graph=False,
             grad_variables=None):
    """
    Computes the sum of gradients of given tensors w.r.t. graph leaves.

    Args:
        tensors: a single Tensor or an iterable of Tensors to differentiate.
        grad_tensors: ``None``, a single Tensor, or an iterable of Tensors /
            ``None`` values, one per entry of ``tensors``. When ``None``,
            every tensor gets ``None`` as its gradient seed.
        retain_graph: forwarded to the execution engine. When ``None`` it
            takes the value of ``create_graph``.
        create_graph: forwarded to the execution engine.
        grad_variables: deprecated alias of ``grad_tensors``; using it emits
            a warning.

    Raises:
        RuntimeError: if both ``grad_tensors`` and ``grad_variables`` are given.
    """
    if grad_variables is not None:
        warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
        if grad_tensors is None:
            grad_tensors = grad_variables
        else:
            raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
                               "arguments both passed to backward(). Please only "
                               "use 'grad_tensors'.")

    # Normalize `tensors` to a tuple so single tensors and iterables are handled alike.
    tensors = (tensors, ) if isinstance(tensors, torch.Tensor) else tuple(tensors)

    # Normalize `grad_tensors` to a list with one entry per tensor.
    if grad_tensors is None:
        grad_tensors = [None] * len(tensors)
    elif isinstance(grad_tensors, torch.Tensor):
        grad_tensors = [grad_tensors]
    else:
        grad_tensors = list(grad_tensors)

    grad_tensors = _make_grads(tensors, grad_tensors)
    if retain_graph is None:
        retain_graph = create_graph

    # Everything above only prepares the arguments; the backward pass itself
    # is executed by the C++ imperative engine.
    Variable._execution_engine.run_backward(
        tensors, grad_tensors, retain_graph, create_graph,
        allow_unreachable=True)  # allow_unreachable flag

In [None]:
def grad(outputs, inputs, grad_outputs=None, retain_graph=None, create_graph=False,
         only_inputs=True, allow_unused=False):
    """
    Computes and returns the sum of gradients of outputs w.r.t. the inputs.
    """
    if not only_inputs:
        warnings.warn("only_inputs argument is deprecated and is ignored now "
                      "(defualts to True). To accumulate gradient for other "
                      "parts of the graph, please use torch.autograd.backward.")
        