In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Automatic Gradients
- Automatic differentiation is useful for implementing ML and DL algorithms, where backpropagation plays an important role in training.
- To differentiate automatically, TensorFlow needs to remember what operations happen in what order during the forward pass. 
- Then, during the backward pass, TensorFlow traverses this list of operations in reverse order to compute gradients.

### Gradient Tapes
- TensorFlow provides the tf.GradientTape API for automatic differentiation; that is, computing the gradient of a computation with respect to some inputs, usually tf.Variables. 
- TensorFlow "records" relevant operations executed inside the context of a tf.GradientTape onto a "tape". 
- TensorFlow then uses that tape to compute the gradients of a "recorded" computation using reverse mode differentiation.

In [2]:
### Derivative of a scalar function with respect to a scalar variable
x = tf.Variable(3.0)

# Everything computed inside the tape's context is recorded for later differentiation.
with tf.GradientTape() as tape:
    y = tf.math.pow(x, 2)

# GradientTape.gradient(target, sources) differentiates a target (often a loss)
# with respect to one or more sources (often the model's variables).
dy_dx = tape.gradient(target=y, sources=x)
dy_dx.numpy()

6.0

In [7]:
### Example of derivatives with respect to non-scalar variables (a weight matrix and a bias vector)
w = tf.Variable(tf.random.normal((3,2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
x = [[1.,2.,3.]]

# A non-persistent tape releases its resources after the first gradient() call.
# persistent=True lets the same recorded forward pass be differentiated several times,
# so the computation does not have to be repeated under a second tape.
with tf.GradientTape(persistent=True) as tape:
    y = x@w+b
    loss = tf.reduce_mean(y**2)

# To get the gradient of loss with respect to both variables, you can pass both as sources to the gradient method.
# The tape is flexible about how sources are passed and will accept any nested combination of lists or dictionaries
# and return the gradient structured the same way.
[dl_dw, dl_db] = tape.gradient(loss, [w,b])           # sources as a list -> gradients as a list
grad = tape.gradient(loss,{'w':w,'b':b})              # sources as a dict -> gradients as a dict with the same keys

# Each gradient has the shape of the variable it was taken with respect to.
print(dl_dw.shape, grad['w'].shape)


(3, 2) (3, 2)


In [8]:
# tf.Variables are usually grouped inside a tf.Module or one of its subclasses
# (layers.Layer, keras.Model), which also makes checkpointing and exporting easier.
# Gradients are then typically taken with respect to the model's trainable variables,
# which every tf.Module subclass exposes through its trainable_variables property.
layer = tf.keras.layers.Dense(units=2, activation='relu')
x = tf.constant([[1., 2., 3.]])

# Forward pass: the layer's variables are created and used inside the tape's context.
with tf.GradientTape() as tape:
    y = layer(x)
    loss = tf.reduce_mean(tf.math.pow(y, 2))

# One gradient is returned per trainable variable, in the same order.
grad = tape.gradient(target=loss, sources=layer.trainable_variables)

# Report each variable's name next to the shape of its gradient.
for var, g in zip(layer.trainable_variables, grad):
    print(f'{var.name}, shape: {g.shape}')


dense/kernel:0, shape: (3, 2)
dense/bias:0, shape: (2,)


In [14]:
# The default behavior is to record all operations after accessing a trainable tf.Variable. The reasons for this are:

# - The tape needs to know which operations to record in the forward pass to calculate the gradients in the backward pass.
# - The tape holds references to intermediate outputs, so you don't want to record unnecessary operations.
# - The most common use case involves calculating the gradient of a loss with respect to all of a model's trainable variables.
# For example, the following only yields a gradient for x0: a tf.Tensor is not "watched" by default, a non-trainable
# tf.Variable is not watched either, and x4 never takes part in the computation of y.
x0 = tf.Variable(3.0, name='x0')  # trainable variable: watched automatically
x1 = tf.Variable(3.0, name='x1', trainable=False) # Not trainable, so not watched
x2 = tf.Variable(2.0, name='x2') +1.0 # Variable + constant returns a tensor, not a variable
x3 = tf.constant(3.0, name='x3') # A constant tensor, not a variable
x4 = tf.Variable(4.0, name='x4')  # trainable, but not used in y below

with tf.GradientTape() as tape:
    y = (x0**2) + (x1**2) + (x2**2) + (x3**2)

# Sources without a recorded path to y come back as None instead of raising an error.
grad = tape.gradient(y,[x0,x1,x2,x3,x4])
for g in grad:
    print(g)

# We can list the variables being watched by the tape (here only x0).
# tf.GradientTape provides hooks that give the user control over what is or is not watched.
[var.name for var in tape.watched_variables()]


tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None
None


['x0:0']

In [18]:
# A plain tf.Tensor is not tracked automatically; register it explicitly with GradientTape.watch(x)
# to be able to differentiate with respect to it.
x = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.math.pow(x, 2)

dy_dx = tape.gradient(target=y, sources=x)
print(dy_dx.numpy())

# The opposite control: watch_accessed_variables=False switches off the automatic watching of tf.Variables.
# Two variables take part in the calculation below, but only the one that is watched explicitly (x1)
# gets a gradient; the other comes back as None.
x0 = tf.Variable(0.0)
x1 = tf.Variable(10.0)

with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(x1)
    y0 = tf.math.sin(x0)
    y1 = tf.nn.softplus(x1)
    y = y0 + y1
    ys = tf.reduce_sum(y)

grad = tape.gradient(
    target=ys,
    sources={'x0': x0, 'x1': x1},
)
print('dys/dx0:', grad['x0'])
print('dys/dx1:', grad['x1'].numpy())



6.0
dys/dx0: None
dys/dx1: 0.9999546
