## TensorFlow AutoDiff

In [None]:
import tensorflow as tf
import numpy as np
import tensorflow.keras as K
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def f(w1, w2):
    return 3 * w1 ** 2 + 2 * w1 * w2

In [None]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)

with tf.GradientTape() as tape:
    z = f(w1, w2)
    
    # To save memory one can add stuff inside
    # with tape.stop_recording()

In [4]:
gradiets = tape.gradient(z, [w1, w2])

In [5]:
gradiets

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

## Calling volatile gradient (persistent=False) again leads to an error

In [6]:
 tape.gradient(z, w1)

RuntimeError: A non-persistent GradientTape can only be used to compute one set of gradients (or jacobians)

## Persistent tape can be called multiple times but needs to be manually released

In [None]:
with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

In [None]:
tape.gradient(z, w1)

In [None]:
tape.gradient(z, w2)

In [22]:
del tape

## GradientTape by default tracks only tf.Variable and ignores tf.constant tensors, but it can be forced to watch any tensor and allows to compute gradients w/ respect to them

In [None]:
c1, c2 = tf.constant(5.), tf.constant(3.)

with tf.GradientTape() as tape:
    tape.watch(c1)
    tape.watch(c2)
    z = f(c1, c2)
    

In [None]:
tape.gradient(z, [c1, c2])

## For vector containing losses as individual elements one has to call the jacobian() method of the tape. 
## Otherwise the tape will compute the scalar gradient of the summed vector elements

## It is also possible to exclude parts of the funciton from contributing to gradients: tf.stop_gradient(...) enclosing part of the expression renders it effectively constant

## It has to be done at the level of the function

In [None]:
def f(w1, w2):
    return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 *w2)

In [14]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)

with tf.GradientTape(persistent=True) as tape:
    z = f(w1, w2)

In [15]:
tape.gradient(z, [w1, w2])
del tape

[<tf.Tensor: shape=(), dtype=float32, numpy=36.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=10.0>]

## For a known gradient expression one can form a custom gradient function associated with the given TF function

In [16]:
@tf.custom_gradient
def my_softplus(z):
    def my_softplus_gradients(grads):
        return grads * (1 - 1 / (1 + tf.exp(z)))
    
    result = tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0, z)
    
    return result, my_softplus_gradients

In [23]:
w1, w2 = tf.Variable(5.), tf.Variable(3.)

with tf.GradientTape(persistent=True) as tape:
    z = my_softplus(w1)

In [25]:
tape.gradient(z, [w1, w2])

[<tf.Tensor: shape=(), dtype=float32, numpy=0.9933072>, None]