# tf.GradientTape 

Documentation [here](https://www.tensorflow.org/api_docs/python/tf/GradientTape). 

A few experiments, mostly following the tf documentation.

In [1]:
import tensorflow as tf
import numpy as np

In [4]:
# Differentiate y = x^2 at x = 5; analytically dy/dx = 2*x = 10.
x = tf.constant(5.0)
with tf.GradientTape() as g:
    # x is a plain tensor, so the tape is told explicitly to track it.
    g.watch(x)
    y = tf.pow(x, 2)
dy_dx = g.gradient(y, x)

# Graph mode: the gradient tensor only gets a value once it is run in a session.
with tf.Session() as sess:
    dy_dx_value = sess.run(dy_dx)
    print(dy_dx_value)

10.0


Tapes can be nested in order to compute higher-order derivatives.

In [21]:
# Second derivative of y = x^3 at x = 4, obtained with two nested tapes.
x = tf.constant(4.0)
with tf.GradientTape() as g:
    g.watch(x)
    with tf.GradientTape() as gg:
        gg.watch(x)
        y = x ** 3
    # The first derivative is taken while the outer tape is still recording,
    # so the outer tape can differentiate it once more below.
    dy_dx = gg.gradient(y, x)   # 3*x^2 = 3*16 = 48
d2y_dx2 = g.gradient(dy_dx, x)  # 6*x = 6*4.0 = 24

# The same two tensors, packed as a tuple, as a list and as a dict:
# `run` returns the values in the same structure the fetches were passed in.
fetch_structures = (
    (dy_dx, d2y_dx2),
    [dy_dx, d2y_dx2],
    {'a': dy_dx, 'b': d2y_dx2},
)
with tf.Session() as sess:
    for fetches in fetch_structures:
        print(sess.run(fetches))

(48.0, 24.0)
[48.0, 24.0]
{'a': 48.0, 'b': 24.0}


As the [documentation](https://www.tensorflow.org/api_docs/python/tf/Session) states:
> The `fetches` argument (first argument of `run`) may be a single graph element, or an arbitrarily nested list, tuple, namedtuple, dict, or OrderedDict containing graph elements at its leaves. 

---

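By default, the resources held by a `GradientTape` are released as soon as `gradient()` is called, so `gradient()` can be called only once per tape. Set `persistent=True` to compute several gradients over the same recorded computation.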

In [24]:
# Demonstrates that a non-persistent tape supports only a single gradient() call.
x = tf.constant(2.0)
with tf.GradientTape(persistent=False) as g:
    g.watch(x)
    y = x**2
    z = y**2
dz_dx = g.gradient(z,x) # 4*x^3 for x = 2: 32 (first call: fine)
dy_dx = g.gradient(y,x) # would be 4, but this second call raises the RuntimeError

# Never reached: the cell already failed on the second gradient() call above.
with tf.Session() as sess:
    print(sess.run((dz_dx, dy_dx)))
    print(sess.run((dz_dx, dy_dx)))

RuntimeError: GradientTape.gradient can only be called once on non-persistent tapes.

In [28]:
# With persistent=True the tape can be queried for several gradients.
x = tf.constant(2.0)
with tf.GradientTape(persistent=True) as g:
    g.watch(x)
    y = x ** 2
    z = y ** 2
dz_dx = g.gradient(z, x)  # dz/dx = 4*x^3 = 32 at x = 2
dy_dx = g.gradient(y, x)  # dy/dx = 2*x = 4 at x = 2

with tf.Session() as sess:
    fetches = (dz_dx, dy_dx)
    # Both gradient tensors exist, so they can be fetched repeatedly: it works.
    for _ in range(2):
        print(sess.run(fetches))

(32.0, 4.0)
(32.0, 4.0)


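Fine-grained control over what is tracked is possible by disabling automatic tracking (`watch_accessed_variables=False`) and watching the desired tensors explicitly: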

In [52]:
# NOTE: despite their names, var_a and var_b are constant tensors, not
# tf.Variable objects; per the GradientTape docs, plain tensors are only
# tracked when they are watched explicitly.
var_a = tf.constant(2.0)
var_b = tf.constant(5.0)

with tf.GradientTape(persistent=True,
                     watch_accessed_variables=False) as tape:
    tape.watch(var_a)  # only var_a is tracked; var_b is deliberately left out
    y = var_a**2
    z = var_b ** 3
dy_da = tape.gradient(y, var_a)  # 2*var_a = 4
dz_db = tape.gradient(z, var_b)  # None, because var_b was never watched

with tf.Session() as sess:
    print(sess.run(dy_da))
    try:
        # Session.run rejects a None fetch with a TypeError; only that
        # expected failure is caught, anything else should surface.
        dz_db_value = sess.run(dz_db)
    except TypeError as e:
        print('var_b variable is not being followed, hence error:')
        print(e)
    else:
        print(dz_db_value)

4.0
var_b variable is not being followed, hence error:
Fetch argument None has invalid type <class 'NoneType'>


---
A note of warning when customizing models:
> Note that when using models you should ensure that your variables exist when using `watch_accessed_variables=False`. Otherwise it's quite easy to make your first iteration not have any gradients.

In [99]:
# Pitfall demo: watching a layer's variables before the layer has been built.
a = tf.keras.layers.Dense(32)
b = tf.keras.layers.Dense(32)
inputs = tf.constant([[float(x) for x in range(10)]])

with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(a.variables) # `a.build` hasn't been called yet
                            # therefore `a.variables` will be empty
    result = b(a(inputs))   # calling `a` creates its variables, too late to be watched
    # Nothing was watched, so (per the TF docs) this yields `None` gradients.
    g = tape.gradient(result, a.variables)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(result))
    print()
    try:
        # Session.run rejects `None` fetches with a TypeError; only that
        # expected failure is caught, anything else should surface.
        grad_values = sess.run(g)
    except TypeError as e:
        print('Nope! Error:')
        print(e)
    else:
        print(grad_values)

[[ 4.921738    5.2549005  -3.4237587   1.0484527  -2.1596549   7.2284975
  -8.753367    2.309942    2.626636    2.2715087   2.1964502  -0.43788818
   0.9118432  -1.8459383   4.7647614  -2.135406    0.27540636 10.666039
  -5.8266163  -0.13711858  2.7537673  -8.153975    0.37899143 -3.478874
   0.4156808  -2.409906   -3.4665046  -0.5386368   0.56853664  1.9909041
  -0.2896165   1.9271399 ]]

Nope! Error:
Fetch argument None has invalid type <class 'NoneType'>


This works when `build(input_shape)` is called beforehand, so that the layer's variables already exist at the time they are watched. [Documentation](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense#build).

In [111]:
# Same two-layer setup, but the first layer is built before its variables
# are handed to the tape.
a2 = tf.keras.layers.Dense(32)
b2 = tf.keras.layers.Dense(32)
inputs2 = tf.constant([list(map(float, range(10)))])

with tf.GradientTape(watch_accessed_variables=False) as tape:
    # Building first creates the layer's variables, so the list passed to
    # `watch` is no longer empty.
    a2.build(inputs2.shape)
    tape.watch(a2.variables)
    result2 = b2(a2(inputs2))
    gg = tape.gradient(result2, a2.variables)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(result2))
    print('-' * 30)

    # One gradient array per variable of `a2`.
    grads = sess.run(gg)
    grad_shapes = [grad_value.shape for grad_value in grads]
    print(grad_shapes)
    print(grads)

[[ -8.556849     4.9956393   -5.173847     1.3807405    4.371144
   -4.902327     2.1342604   -1.0151781   -2.6916428   -0.29353797
    6.632813     4.823284     5.929711    -2.022577     3.9823647
    1.5416613   -2.032748     1.2983477    0.42888513  -2.187703
  -10.97552     -1.0283854    2.004771    -2.4335957    4.043797
    4.5529833    1.2305776   -3.1239045    2.3101783    2.6177657
   -1.1685332   -5.174881  ]]
------------------------------
[(10, 32), (32,)]
[array([[ 0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e

---
The [Jacobian](https://www.tensorflow.org/api_docs/python/tf/GradientTape#jacobian) (the [matrix of all first-order partial derivatives of a vector-valued function](https://en.wikipedia.org/wiki/Jacobian_matrix_and_determinant); it is square only when input and output have the same dimension) can be computed directly, and also [as a batch](https://www.tensorflow.org/api_docs/python/tf/GradientTape#batch_jacobian):

In [150]:
# Compare `gradient` and `jacobian` for y = x*x + cos(z), an element-wise
# function of two watched vectors.
x = tf.constant([1., 2.])
z = tf.constant([2., 3.])

# persistent=True because the tape is queried four times below.
with tf.GradientTape(persistent=True) as g:
    g.watch(x)
    g.watch(z)
    y = x * x + tf.cos(z)

grad_x = g.gradient(y, x)
jacobian_x = g.jacobian(y, x)
grad_z = g.gradient(y, z)
jacobian_z = g.jacobian(y, z)

with tf.Session() as sess:
    print(sess.run(y))
    reports = (
        ('Grad & Jac for x:', grad_x, jacobian_x),
        ('Grad & Jac for z:', grad_z, jacobian_z),
    )
    for title, grad, jac in reports:
        print()
        print(title, sess.run(grad), sess.run(jac), sep='\n')

[0.5838531 3.0100074]

Grad & Jac for x:
[2. 4.]
[[2. 0.]
 [0. 4.]]

Grad & Jac for z:
[-0.9092974 -0.14112  ]
[[-0.9092974 -0.       ]
 [-0.        -0.14112  ]]


In [123]:
# Per-example Jacobians of y = x*x, treating each row of x as one example.
x = tf.constant([[1., 2.], [3., 4.]])

with tf.GradientTape() as g:
    g.watch(x)
    y = x * x

batch_jacobian = g.batch_jacobian(y, x)

with tf.Session() as sess:
    print(sess.run(y))
    print('-' * 30)
    # Expected: one diagonal 2x2 Jacobian (2*x on the diagonal) per row of x:
    #   [[[2, 0],
    #     [0, 4]],
    #    [[6, 0],
    #     [0, 8]]]
    print(sess.run(batch_jacobian))

[[ 1.  4.]
 [ 9. 16.]]
------------------------------
[[[2. 0.]
  [0. 4.]]

 [[6. 0.]
  [0. 8.]]]
