In [3]:
# Automatic Differentiation/Gradient for Deep Learning

"""
Automatic Differentiation (AD) is a technique used to compute derivatives (gradients)
of functions numerically, automatically, and accurately. AD breaks down complex
functions into a series of elementary operations, and using the chain rule,
calculates the gradients of these operations with respect to the input variables.
"""

'\nAutomatic Differentiation (AD) is a technique used to compute derivatives (gradients) \nof functions numerically, automatically, and accurately. AD breaks down complex \nfunctions into a series of elementary operations, and using the chain rule, \ncalculates the gradients of these operations with respect to the input variables.\n'

In [2]:
import tensorflow as tf
import numpy as np
tf.version.VERSION

'2.12.0'

In [5]:
# Gradient Tape : tf.GradientTape
"""
This API allows you to record operations executed within a "tape" context and
then compute gradients of these recorded operations with respect to inputs.
"""
x = tf.Variable(initial_value=3.0)

# Everything computed inside the `with` block is recorded on the tape.
with tf.GradientTape() as tape:
  y = x**2

# After recording, GradientTape.gradient(target, sources) differentiates the
# target (often a loss) with respect to the sources.
diff_yx = tape.gradient(target=y, sources=x)   # dy/dx = 2*x, i.e. 6.0 at x = 3.0
diff_yx

<tf.Tensor: shape=(), dtype=float32, numpy=6.0>

In [8]:
# tf.GradientTape works on all tensors, e.g. y = matmul(x, w) + b
w = tf.Variable(tf.random.normal((3, 2)), name='w')
b = tf.Variable(tf.zeros(2, dtype=tf.float32), name='b')
# Build the input as an explicit float32 tensor. A nested Python list only
# works with `@` through the variable's reflected matmul operator.
x = tf.constant([[1., 2., 3.]])

# persistent=True: the next cell calls tape.gradient() a second time on this
# same tape, which a non-persistent tape does not allow.
with tf.GradientTape(persistent=True) as tape:
  y = x @ w + b
  loss = tf.reduce_mean(y**2)  # mean of the squared outputs

dl_dw, dl_db = tape.gradient(loss, [w, b])
print(w)
dl_dw, dl_db
# Note: The gradient with respect to each source has the shape of the source

<tf.Variable 'w:0' shape=(3, 2) dtype=float32, numpy=
array([[0.33232355, 0.52813953],
       [0.29861686, 0.61077875],
       [0.07380591, 0.11919672]], dtype=float32)>


(<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[1.150975 , 2.1072872],
        [2.30195  , 4.2145743],
        [3.452925 , 6.3218613]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.150975 , 2.1072872], dtype=float32)>)

In [11]:
# The sources may also be passed as a dictionary; the gradients are then
# looked up by the same keys.
my_vars = dict(w=w, b=b)

# `tape` and `loss` come from the previous cell; calling gradient() again on
# that tape relies on it having been created with persistent=True.
grad = tape.gradient(target=loss, sources=my_vars)
grad['b']

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.150975 , 2.1072872], dtype=float32)>

In [12]:
# Gradients with respect to models in ML
"""
In machine learning, trainable variables (also known as model parameters or weights)
are the parameters that a model learns during training in order to optimize its
performance. These variables are adjusted iteratively using optimization algorithms
like gradient descent to minimize a loss function.
"""
# Subclasses of tf.Module (layers.Layer, keras.Model) collect their variables
# in the Module.trainable_variables property.

# Fully connected layer with two units and a ReLU activation.
layer = tf.keras.layers.Dense(units=2, activation='relu')
x = tf.constant([[1., 2., 3.]])

with tf.GradientTape() as tape:
  # Forward pass; the layer's variables are used inside the tape context.
  y = layer(x)
  loss = tf.reduce_mean(y**2)  # mean of the squared outputs

# One gradient per trainable variable, returned in the same order.
grad = tape.gradient(target=loss, sources=layer.trainable_variables)

# Each gradient has the shape of the variable it belongs to.
for var, g in zip(layer.trainable_variables, grad):
  print(f'{var.name}, shape: {g.shape}')

dense/kernel:0, shape: (3, 2)
dense/bias:0, shape: (2,)


In [13]:
# Controlling What the Tape Watches and Gradient Calculation in TensorFlow
"""
Controlling what the tape watches allows you to selectively calculate gradients
with respect to specific variables.
"""
# Note: By default, the tape watches trainable variables to calculate gradients effectively.

# Four candidate sources; only the first is watched automatically.
x0 = tf.Variable(3.0, name='x0')                    # trainable variable
x1 = tf.Variable(3.0, name='x1', trainable=False)   # variable, but not trainable
x2 = tf.Variable(2.0, name='x2') + 1.0              # variable + tensor yields a plain tensor
x3 = tf.constant(3.0, name='x3')                    # constant tensor, not a variable

with tf.GradientTape() as tape:
  # Note that x3 does not take part in y at all.
  y = (x0**2) + (x1**2) + (x2**2)

grad = tape.gradient(y, [x0, x1, x2, x3])

# One line per source: only x0 gets a gradient, the others print None.
print(*grad, sep='\n')

tf.Tensor(6.0, shape=(), dtype=float32)
None
None
None


In [15]:
# tape.watched_variables() lists the variables the tape recorded;
# collect their names for display.
[watched.name for watched in tape.watched_variables()]

['x0:0']

In [19]:
"""
In some scenarios, you might want to control which variables are watched by
the tf.GradientTape context to control gradient computation.

-> Using watch and watch_accessed_variables:

To control what the tape watches, you can use the tape.watch(x) function.
By calling tape.watch(x) on a specific variable x, you explicitly instruct
the tape to watch that variable, even if it's a tensor.
"""

with tf.GradientTape() as tape2:
  # watch() also accepts a list: x1 is the non-trainable variable and x3 the
  # constant tensor from the earlier cell.
  tape2.watch([x1, x3])
  y = (x0**2) + (x1**2) + (x2**2)

grad2 = tape2.gradient(y, [x0, x1, x2, x3])
# x0 and x1 now both get a gradient. x2 stays None because it is an unwatched
# tensor; x3 stays None even though it is watched, since y does not depend on it.
print(*grad2, sep='\n')

tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(6.0, shape=(), dtype=float32)
None
None


In [21]:
# Conversely, to turn off the default behaviour of watching every accessed
# tf.Variable, pass watch_accessed_variables=False when creating the tape.

x0 = tf.Variable(0.0)
x1 = tf.Variable(10.0)

# Nothing is watched automatically; only what is passed to watch() is recorded.
with tf.GradientTape(watch_accessed_variables=False) as tape3:
  tape3.watch(x1)  # watch only x1
  y0 = tf.math.sin(x0)
  y1 = tf.nn.softplus(x1)
  y = y0 + y1
  ys = tf.reduce_sum(y)

grad3 = tape3.gradient(target=ys, sources=dict(x0=x0, x1=x1))
print(grad3['x0'])   # None: x0 was never watched, so no gradient is computed for it
print(grad3['x1'])   # d softplus(x1)/dx1 = sigmoid(x1), close to 1 at x1 = 10

None
tf.Tensor(0.9999546, shape=(), dtype=float32)


In [23]:
# The usage of persistent gradient tapes and Gradients with Respect to Intermediate
# values in TensorFlow
"""
By default, when you compute gradients using tf.GradientTape, the resources held
by the tape are released as soon as the GradientTape.gradient method is called.
However, in some situations, you might want to compute gradients multiple times
over the same computation. This is where persistent gradient tapes come in.
"""

x = tf.constant([1, 3.0])
# persistent=True allows more than one gradient() call on the same tape.
with tf.GradientTape(persistent=True) as tape4:
  tape4.watch(x)   # x is a constant tensor, so it has to be watched explicitly
  y = x * x        # intermediate value: x^2
  z = y * y        # final value: x^4

dz_dx = tape4.gradient(z, x)   # 4 * x^3
dy_dx = tape4.gradient(y, x)   # 2 * x, gradient of the intermediate value
print(dz_dx.numpy())
print(dy_dx.numpy())

"""
Note: Persistent gradient tapes keep resources allocated until you explicitly
release them by deleting the tape.
"""
# Once the persistent tape is no longer needed, drop the reference with
# `del` so the resources it holds can be released.
del tape4

[  4. 108.]
[2. 6.]


In [None]:
# Performance Study
"""
1. Memory Usage with Gradient Tapes:

 Gradient tapes use memory to store intermediate results, both inputs and outputs,
 during the forward pass. These intermediate results are necessary for computing
 gradients during the backward pass (backpropagation).

2. Persistent gradient
 Using persistent=True can be helpful when you need to calculate multiple gradients
 using the same tape. However, this comes with a trade-off: it increases peak memory usage.

 """

In [24]:
# Gradients of Non-Scalar Targets and Element-wise Calculations in TensorFlow

# A gradient represents the rate of change of a scalar function with respect to its inputs.


x = tf.Variable(2.0)
# Persistent, because gradient() is called once per target below.
with tf.GradientTape(persistent=True) as tape5:
  y0 = x**2
  y1 = 1 / x

# Differentiate each target separately: dy0/dx = 2*x, dy1/dx = -1/x**2.
for target in (y0, y1):
  print(tape5.gradient(target, x).numpy())

del tape5

4.0
-0.25


In [25]:
# When gradient() is asked for several targets at once, the result for each
# source is the sum of the gradients of every target.

# Note: The same applies to a non-scalar target, e.g. y = x*[3., 4.] => dy/dx = 3+4

x = tf.Variable(2.0)
with tf.GradientTape() as tape6:
  y0 = x**2   # target 1
  y1 = 1 / x  # target 2

# d(y0 + y1)/dx = 2*x - 1/x**2, which is 4 - 0.25 = 3.75 at x = 2
print(tape6.gradient({'y0': y0, 'y1': y1}, x).numpy())

# Note: If you need a separate gradient for each item, refer to Jacobians.
# (Kept as a comment: a bare string on the last line of a cell would be echoed
# as the cell's result.)

3.75


In [29]:
# Control Flow inside GradientTape
"""
In scenarios like calculating sub-gradients for functions which are not
differentiable at all points, we can calculate gradient using control flow
like if/while with different sub-gradients.
"""

x = tf.constant(1.5)

v0 = tf.Variable(2.0)
v1 = tf.Variable(2.0)

with tf.GradientTape() as tape7:
  tape7.watch(x)
  # Only the branch selected by the condition is executed, so only that
  # branch is recorded on the tape.
  result = 2*v0 if x > 1.0 else v1**2

dv0, dv1 = tape7.gradient(target=result, sources=[v0, v1])

print(dv0)   # the executed branch uses v0, so d(2*v0)/dv0 = 2
print(dv1)   # the v1 branch never ran, so v1 is unconnected: None

tf.Tensor(2.0, shape=(), dtype=float32)
None


In [32]:
# Cases where gradient returns None

# 1. A Variable was replaced with a Tensor
# GradientTape automatically watches a tf.Variable but not a tf.Tensor.

# 2. Calculations were done outside of TensorFlow
# The tape can't record the gradient path once the calculation leaves TensorFlow.

x = tf.Variable([[1.0, 2.0],
                 [3.0, 4.0]], dtype=tf.float32)

with tf.GradientTape() as tape:
  tape.watch(x)
  x2 = x**2

  # NumPy does this step, so the tape does not record it.
  y1 = np.mean(x2, axis=0)

  # reduce_mean receives a NumPy array here; like most ops it turns it into
  # a constant tensor using `tf.convert_to_tensor`.
  y = tf.reduce_mean(y1, axis=0)

# The path from y back to x was interrupted by the NumPy step: prints None.
print(tape.gradient(target=y, sources=x))

None


In [38]:
# 3. Took gradients through an integer or string
# strings and integers are not differentiable
x = tf.constant(10)  # integer constant (dtype int32)

with tf.GradientTape() as g:
  # Watch x explicitly. Without this the result would be None simply because
  # a constant tensor is not watched, which would hide the dtype issue this
  # cell is meant to show. TensorFlow logs a warning that the watched tensor
  # should have a floating dtype.
  g.watch(x)
  y = x * x

print(g.gradient(y, x))   # None, even though x is watched

# Note: Watching alone is not enough here. To get a gradient, x must have a
# floating-point dtype, e.g. tf.constant(10.0).

None


In [39]:
# 4. Took gradients through a stateful object
"""
State stops gradients. When you read from a stateful object, the tape can only
observe the current state, not the history that led to it.

-> A tf.Tensor is immutable. You can't change a tensor once it's created.
   It has a value, but no state. All the operations discussed so far are also
   stateless: the output of a tf.matmul only depends on its inputs.

-> A tf.Variable has internal state—its value. When you use the variable, the
   state is read. It's normal to calculate a gradient with respect to a variable,
   but the variable's state blocks gradient calculations from going farther back.
"""

x0 = tf.Variable(initial_value=3.0)
x1 = tf.Variable(initial_value=0.0)

with tf.GradientTape() as tape:

  # In-place update of x1's state: x1 = x1 + x0.
  x1.assign_add(x0)

  # The tape only sees the read of x1's current value from here on.
  y = x1**2   # numerically (old x1 + x0)**2

# No gradient reaches x0, so this prints None
# (one might have expected dy/dx0 = 2*(x1 + x0)).
print(tape.gradient(target=y, sources=x0))

# Likewise, tf.data.Dataset iterators and tf.queues are stateful and stop
# all gradients on tensors that pass through them.

None


In [40]:
# For the unconnected cases above, gradient() can return zeros instead of None:
# pass unconnected_gradients=tf.UnconnectedGradients.ZERO to tape.gradient().

x = tf.Variable([2., 2.])
y = tf.Variable(3.)

with tf.GradientTape() as tape:
  z = y**2

# z does not depend on x, so the result is a zero tensor with x's shape.
dz_dx = tape.gradient(
    target=z,
    sources=x,
    unconnected_gradients=tf.UnconnectedGradients.ZERO,
)
print(dz_dx)

tf.Tensor([0. 0.], shape=(2,), dtype=float32)
