In [2]:
import torch
import numpy as np
import torchvision

In [None]:
# What does this mean?
"""If any of tensors are non-scalar (i.e. their data has more than one element) and require gradient,
the function additionally requires specifying grad_tensors. It should be a sequence of matching length, 
that contains gradient of the differentiated function w.r.t. corresponding tensors 
(None is an acceptable value for all tensors that don’t need gradient tensors)"""

In [33]:
"""
A PyTorch model has the parameters() method, which returns every single tensor composing the network.
A PyTorch model has the children() method, which returns all the layers of the model.
Each child (i.e. layer) is composed of zero or more tensors.
Each child also has a parameters() method that returns each tensor composing that layer.
Understanding this structure is important to turn off gradient computation on single layers or sublayers.
"""
# Load a pretrained ResNet-18 and describe its first `layer_max` top-level children.
# NOTE(review): recent torchvision releases deprecate `pretrained=True` in favour of
# the `weights=` argument - confirm the installed version before switching.
model = torchvision.models.resnet18(pretrained=True)
layer_max = 4  # number of top-level children (layers) to describe
# Loop over the layers of the net; layer_count is the 1-based position of child c.
for layer_count, c in enumerate(model.children(), start=1):
    # Stop *before* printing child number layer_max + 1. Checking after the
    # increment (as the previous version did) described one layer too many.
    if layer_count > layer_max:
        break
    print("*"*70)
    print(f"CHILDREN {layer_count} IS :")
    print(c)
    print('\n')
    print(f"size of parameters for child {layer_count} are :")
    # Loop over the tensors composing the layer c (may be empty, e.g. for ReLU).
    for param in c.parameters():
        print(param.shape)
        # param.requires_grad = False  # uncomment to freeze this layer
    print("*"*70)
    print('\n\n')

**********************************************************************
CHILDREN 1 IS :
Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)


size of parameters for child 1 are :
torch.Size([64, 3, 7, 7])
**********************************************************************



**********************************************************************
CHILDREN 2 IS :
BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)


size of parameters for child 2 are :
torch.Size([64])
torch.Size([64])
**********************************************************************



**********************************************************************
CHILDREN 3 IS :
ReLU(inplace)


size of parameters for child 3 are :
**********************************************************************



**********************************************************************
CHILDREN 4 IS :
MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)


size 

Questions:
- what is the right terminology for a tensor in a layer?
- how is a layer defined?
- what is a sub-graph?

In [57]:
"""
Autograd is a reverse-mode automatic differentiation system.
Conceptually, autograd records a graph recording all of the operations that created the data as you execute operations,
giving you a directed acyclic graph whose leaves are the input tensors and roots are the output tensors. 
By tracing this graph from roots to leaves, you can automatically compute the gradients using the chain rule.

Internally, autograd represents this graph as a graph of Function objects (really expressions),
which can be apply() ed to compute the result of evaluating the graph.
When computing the forwards pass, autograd simultaneously performs the requested computations and 
builds up a graph representing the function that computes the gradient 
(the .grad_fn attribute of each torch.Tensor is an entry point into this graph). 
"""
# Accessing the graph of Functions for a sublayer in ResNet, step by step.
# 1) take the first tensor yielded by model.parameters()
first_param = next(iter(model.parameters()))
# 2) index it: the result exposes a callable grad_fn, the entry point into the graph
first_slice = first_param[0]
backward_node = first_slice.grad_fn
# 3) apply that backward node to a random (3, 7, 7) tensor; as the last expression
#    of the cell, the returned value is what the notebook displays
backward_node(torch.rand(3, 7, 7))

tensor([[[[0.4145, 0.7627, 0.5591,  ..., 0.8221, 0.1405, 0.7418],
          [0.8263, 0.3073, 0.5047,  ..., 0.9060, 0.1171, 0.1214],
          [0.1013, 0.0171, 0.9138,  ..., 0.7892, 0.8192, 0.1922],
          ...,
          [0.2389, 0.0933, 0.6571,  ..., 0.9614, 0.1424, 0.6270],
          [0.7308, 0.6663, 0.6745,  ..., 0.3075, 0.4274, 0.2564],
          [0.1157, 0.7707, 0.7036,  ..., 0.0357, 0.6143, 0.5993]],

         [[0.9419, 0.8969, 0.8285,  ..., 0.5001, 0.3291, 0.9373],
          [0.2153, 0.5582, 0.2164,  ..., 0.4124, 0.8528, 0.3696],
          [0.0113, 0.5700, 0.7718,  ..., 0.2349, 0.8837, 0.2239],
          ...,
          [0.1147, 0.1277, 0.8975,  ..., 0.6002, 0.9612, 0.9621],
          [0.6847, 0.6775, 0.1844,  ..., 0.8157, 0.9225, 0.0109],
          [0.6633, 0.3915, 0.7651,  ..., 0.8848, 0.8308, 0.7504]],

         [[0.8452, 0.8360, 0.2734,  ..., 0.7499, 0.2969, 0.0283],
          [0.6142, 0.8519, 0.5852,  ..., 0.3823, 0.1674, 0.0523],
          [0.5229, 0.3796, 0.3087,  ..., 0

# torch.autograd.Function

In [None]:
"""
The Tensor and Function classes are interconnected. 
They build up an acyclic graph, that encodes a complete history of computation. 
Each Tensor has a .grad_fn attribute that references a Function that has created the Tensor 
(except for Tensors created by the user - their grad_fn is None).
"""

In [290]:
def make_graph():
    """Build a fresh computation graph x -> y = 2x -> z = y**2 -> w = sum(z).

    A new graph is needed for every experiment below because backward() frees
    the graph's buffers (unless retain_graph=True) and because x.grad
    accumulates across backward() calls.

    Returns:
        tuple: (x, y, z, w) where x is the leaf tensor [1., 2., 3.] with
        requires_grad=True, y and z are 3-element non-leaf tensors and w is
        the 0-dim (scalar) root.
    """
    x = torch.tensor([1., 2., 3.], requires_grad=True)  # leaf tensor of the Directed Acyclic Graph
    y = 2. * x
    z = y ** 2
    w = z.sum()
    return x, y, z, w


def try_backward(root, gradient=None, retain_graph=None):
    """Run root.backward(...) and print the error instead of aborting the cell.

    Args:
        root: tensor to start the backward pass from.
        gradient: gradient of the differentiated function w.r.t. `root`;
            None is only valid for a scalar root.
        retain_graph: forwarded unchanged to Tensor.backward.

    Returns:
        bool: True if the backward pass succeeded, False if autograd raised
        a RuntimeError (the message is printed between '=' rulers).
    """
    try:
        root.backward(gradient, retain_graph=retain_graph)
    except RuntimeError as err:
        print('='*70)
        print(err)
        print('='*70)
        return False
    return True


x, y, z, w = make_graph()
print(f'operation that created y = {y.grad_fn} is a Function object' )
print(f'operation that created z = {z.grad_fn} is a Function object')
print(f'operation that created w = {w.grad_fn} is a Function object', end='\n' + '*'*70+'\n\n')
print('compute backward pass starting from w')
w.backward()
print(f'gradient of x from w: {x.grad}')

print("""\nBy default, gradients are only retained for leaf variables. 
non-leaf variables’ gradients are not retained to be inspected later. 
This was done by design, to save memory.\n""")
# ref:
# https://discuss.pytorch.org/t/why-cant-i-see-grad-of-an-intermediate-variable/94
print(f'gradient of y: {y.grad}')
print(f'gradient of z: {z.grad}')
print(f'gradient of w: {w.grad}', end = '\n\n')

# The graph was already consumed by w.backward() above (no retain_graph), so
# this second pass is expected to fail; the error is the demonstration.
print('compute backward pass starting from z')
try_backward(z, torch.ones_like(z))

# Same experiment on freshly built graphs, once per possible root.
for root_name in ('z', 'y', 'w'):
    x, y, z, w = make_graph()
    root = {'z': z, 'y': y, 'w': w}[root_name]
    print()
    print(f'compute AFTER REDEFINITION backward pass starting from {root_name}')
    # Non-scalar roots need an explicit gradient of matching shape;
    # the scalar root w uses the default.
    try_backward(root, None if root.dim() == 0 else torch.ones_like(root))
    print(f'gradient of x from {root_name}: {x.grad}')

print('\n' + '*'*70)
print('use retain_graph = True')
print('*'*70+'\n')

x, y, z, w = make_graph()
try_backward(w, retain_graph=True)
print(f'gradient of x from w: {x.grad}')
try_backward(z, torch.ones_like(z), retain_graph=True)
print(f'gradient of x from z: {x.grad},   (.grad accumulates the gradients)'  )
try_backward(y, torch.ones_like(y))
print(f'gradient of x from y: {x.grad}')

print('\n' + '*'*70)
print('after reinitialization using different "gradient" arguments')
print('*'*70+'\n')

# (label, gradient passed to w.backward, text appended after the printed grad)
# w is 0-dim, so the gradient must be 0-dim as well: a shape-[1] tensor is
# rejected with a shape-mismatch error by current PyTorch releases. The
# deliberately mismatching [1,1,1,1] case is kept; try_backward reports the
# error if the installed version rejects it.
scalar_root_cases = [
    ('[1] (default argument)', torch.tensor(1.), ' \n'),
    ('[0]', torch.tensor(0.), ' \n'),
    ('[1,1,1,1]', torch.ones(4), ' \n'),
    ('[2]', torch.tensor(2.), ' \n \n \n'),
]
for label, gradient, tail in scalar_root_cases:
    print(f'scalar root, gradient = {label}')
    x, y, z, w = make_graph()
    try_backward(w, gradient)
    print(f'gradient of x from w: {x.grad}{tail}')

# (label, gradient values passed to z.backward, text appended after the printed grad)
tensor_root_cases = [
    ('[1,1,1]', [1., 1., 1.], ' \n'),
    ('[0,0,0]', [0., 0., 0.], ' \n'),
    ('[2,2,2]', [2., 2., 2.], ''),
]
for label, values, tail in tensor_root_cases:
    print(f'tensorial root, gradient = {label}')
    x, y, z, w = make_graph()
    try_backward(z, torch.tensor(values))
    print(f'gradient of x from z: {x.grad}{tail}')

operation that created y = <MulBackward0 object at 0x12a41cf98> is a Function object
operation that created z = <PowBackward0 object at 0x12a41cf98> is a Function object
operation that created w = <SumBackward0 object at 0x12b0b78d0> is a Function object
**********************************************************************

compute backward pass starting from w
gradient of x from w: tensor([ 8., 16., 24.])

By default, gradients are only retained for leaf variables. 
non-leaf variables’ gradients are not retained to be inspected later. 
This was done by design, to save memory.

gradient of y: None
gradient of z: None
gradient of w: None

compute backward pass starting from z
Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

compute AFTER REDEFINITION backward pass starting from z
gradient of x from z: tensor([ 8., 16., 24.])

compute AFTER REDEFINITION backward pass starting fro

# Extending torch.autograd 

In [291]:
"""from the docs:
Adding operations to autograd requires implementing a new Function subclass for each operation.
Recall that Functions are what autograd uses to compute the results and gradients, and encode the operation history.
Every new function requires you to implement 2 methods:
1)   forward()
2)   backward()"""

# example from programcreek.com

def test_function(self):
    """Check a custom autograd Function and the graph recorded for its gradients.

    NOTE(review): this is a method lifted from a unittest-style test case.
    `self` is the test-case instance, and `self._function_test` and
    `graph_desc` are not defined in this notebook - presumably they come from
    the original test suite; confirm before calling this function.
    """
    class MyFunction(torch.autograd.Function):

        @staticmethod
        def forward(ctx, tensor1, scalar, tensor2):
            # ctx is a context object that can be used to stash information for backward computation
            ctx.scalar = scalar
            ctx.save_for_backward(tensor1, tensor2)
            return tensor1 + scalar * tensor2 + tensor1 * tensor2

        @staticmethod
        def backward(ctx, grad_output):
            # saved_tensors is the supported accessor (saved_variables is deprecated)
            var1, var2 = ctx.saved_tensors
            # NOTE: self is the test case here
            self.assertIsInstance(var1, torch.Tensor)
            self.assertIsInstance(var2, torch.Tensor)
            self.assertIsInstance(grad_output, torch.Tensor)
            # One entry per forward() input, in order:
            #   d/dtensor1 = grad_output * (1 + tensor2)
            #   scalar     -> None (no gradient for the non-tensor argument)
            #   d/dtensor2 = grad_output * (scalar + tensor1)
            return (grad_output + grad_output * var2, None,
                    grad_output * ctx.scalar + grad_output * var1)

    x, y = self._function_test(MyFunction)

    x_grad_desc = graph_desc(x.grad.grad_fn)
    y_grad_desc = graph_desc(y.grad.grad_fn)
    self.assertEqual(
        x_grad_desc,
        'Identity(AddBackward(ExpandBackward(AccumulateGrad()), '
        'MulBackward(ExpandBackward(AccumulateGrad()), AccumulateGrad())))')
    self.assertEqual(
        y_grad_desc,
        'Identity(AddBackward(MulConstantBackward(ExpandBackward(AccumulateGrad())), '
        'MulBackward(ExpandBackward(AccumulateGrad()), AccumulateGrad())))')