## 自动求导

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Report the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for lib in (mpl, np, pd, sklearn, tf, keras):
    print(f"{lib.__name__} {lib.__version__}")

2.3.0
sys.version_info(major=3, minor=7, micro=11, releaselevel='final', serial=0)
matplotlib 3.4.2
numpy 1.18.5
pandas 1.3.3
sklearn 1.0
tensorflow 2.3.0
tensorflow.keras 2.4.0


### 1. 近似求导

- 近似求导方法
    1. x 向右移动一个微小量 : x1 = x + eps
    2. x 向左移动一个微小量 : x2 = x - eps
    3. (f(x1) - f(x2)) / (x1 - x2) => 当 x1 - x2 非常小的时候, 该比值就近似等于 f 在 x 处的导数

### 2. tf.GradientTape

In [4]:
# Target function used by the GradientTape examples.
def f(x, y):
    """Return x^2 + y^2 + 1; its partial derivatives are 2x and 2y."""
    x_squared = x ** 2
    y_squared = y ** 2
    return x_squared + y_squared + 1

#### 2.1 求偏导

In [8]:
# Differentiate with tf.GradientTape.
x = tf.Variable(2.0)
y = tf.Variable(3.0)
# Use the default (non-persistent) tape on purpose: this cell demonstrates that
# such a tape can serve only one gradient() call.
with tf.GradientTape() as tape:
    z = f(x, y)

# Partial derivative of z with respect to x: 2x = 4.
dz_x = tape.gradient(z, x)
print(dz_x)
# A second gradient() call on the same non-persistent tape raises RuntimeError.
try:
    dz_y = tape.gradient(z, y)
except RuntimeError as err:
    print(err)

tf.Tensor(4.0, shape=(), dtype=float32)
GradientTape.gradient can only be called once on non-persistent tapes.


#### 2.2 tape 持久化
- 默认情况下 tape 只能调用一次 gradient, 之后就会被回收; 必须设置 persistent=True 才可以多次调用 gradient, 另外用完需要自己回收掉 : del tape

In [14]:
# Differentiate with a persistent tf.GradientTape.
x, y = tf.Variable(2.0), tf.Variable(3.0)
# persistent=True allows gradient() to be called more than once on this tape;
# in exchange, the tape has to be released manually with `del tape`.
with tf.GradientTape(persistent=True) as tape:
    z = f(x, y)

# Both partial derivatives come from the same tape: dz/dx = 2x = 4, dz/dy = 2y = 6.
dz_x = tape.gradient(z, x)
dz_y = tape.gradient(z, y)
# Printed in the order dz/dy, then dz/dx.
print(dz_y, dz_x, sep="\n")
# Release the persistent tape.
del tape

tf.Tensor(6.0, shape=(), dtype=float32)
tf.Tensor(4.0, shape=(), dtype=float32)


#### 2.3 同时求多个偏导数

In [16]:
# Compute several partial derivatives with a single gradient() call.
x = tf.Variable(2.0)
y = tf.Variable(3.0)
# gradient() is called only once in this cell, so the default non-persistent
# tape is sufficient; persistent=True is only needed for repeated calls.
with tf.GradientTape() as tape:
    z = f(x, y)

# Passing a list of sources returns a list of gradients in the same order:
# [dz/dx, dz/dy] = [2x, 2y] = [4, 6].
dz_xy = tape.gradient(z, [x, y])
print(dz_xy)

del tape

[<tf.Tensor: shape=(), dtype=float32, numpy=4.0>, <tf.Tensor: shape=(), dtype=float32, numpy=6.0>]


#### 2.4 tape.watch

In [18]:
# Differentiate with respect to constants by watching them explicitly.
x = tf.constant(2.0)
y = tf.constant(3.0)
# gradient() is called only once in this cell, so the default non-persistent
# tape is sufficient; persistent=True is only needed for repeated calls.
with tf.GradientTape() as tape:
    # x and y are tf.constant tensors rather than tf.Variable objects, so the
    # tape is told explicitly to track them before z is computed.
    tape.watch(x)
    tape.watch(y)
    z = f(x, y)

# Passing a list of sources returns a list of gradients in the same order:
# [dz/dx, dz/dy] = [2x, 2y] = [4, 6].
dz_xy = tape.gradient(z, [x, y])
print(dz_xy)

del tape

[<tf.Tensor: shape=(), dtype=float32, numpy=4.0>, <tf.Tensor: shape=(), dtype=float32, numpy=6.0>]


#### 2.5 两个目标函数对同一个变量求导

In [21]:
x = tf.Variable(5.0)

with tf.GradientTape() as tape:
    z1, z2 = 3 * x, x ** 2

# With a list of targets, the result is the sum of each target's derivative
# with respect to x: dz1/dx + dz2/dx = 3 + 2x = 13.
summed_gradient = tape.gradient([z1, z2], x)
print(summed_gradient)

del tape

tf.Tensor(13.0, shape=(), dtype=float32)


#### 2.6 二阶导数求导

In [29]:
x = tf.Variable(2.0)
y = tf.Variable(3.0)


# NOTE: this rebinds the name `f`, replacing the quadratic f(x, y) defined earlier
# in the notebook; re-running earlier cells afterwards would pick up this version.
def f(x, y):
    """Return x^4 + y^2 + 1."""
    return x ** 4 + y ** 2 + 1


# The outer tape is queried once per first-order gradient below, so it must be
# persistent. The inner tape is queried only once, so the default is enough.
with tf.GradientTape(persistent=True) as out_tape:
    with tf.GradientTape() as inner_tape:
        z = f(x, y)
    # First-order gradients, computed inside the outer tape's context so the
    # outer tape records them: [dz/dx, dz/dy] = [4x^3, 2y] = [32, 6].
    inner_gradients = inner_tape.gradient(z, [x, y])  # returns a list

# Differentiate each first-order gradient again with respect to x and y:
#   d(4x^3)/dx = 12x^2 = 48, d(4x^3)/dy -> None (does not depend on y)
#   d(2y)/dx -> None (does not depend on x), d(2y)/dy = 2
out_gradient = [out_tape.gradient(inner_gradient, [x, y]) for inner_gradient in inner_gradients]
print(inner_gradients)
print(out_gradient)

del inner_tape
del out_tape

[<tf.Tensor: shape=(), dtype=float32, numpy=32.0>, <tf.Tensor: shape=(), dtype=float32, numpy=6.0>]
[[<tf.Tensor: shape=(), dtype=float32, numpy=48.0>, None], [None, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>]]


### 3. 模拟梯度下降

- x.assign_sub(y) => x = x - y (原地更新变量 x)

#### 3.1 模拟简单梯度下降

In [31]:
def f(x):
    """Return x^2 + 2x + 1, the objective minimized by the descent loop below."""
    quadratic_term = x ** 2
    linear_term = 2 * x
    return quadratic_term + linear_term + 1

x = tf.Variable(0.0)
learning_rate = 0.1

# Plain gradient descent: 100 steps of x <- x - learning_rate * dz/dx.
for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    gradient = tape.gradient(z, x)
    # assign_sub subtracts the scaled gradient from x in place.
    x.assign_sub(learning_rate * gradient)
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.9999999>


#### 3.2 使用优化函数

- 对于不带动量的 SGD (学习率相同) : x.assign_sub(learning_rate * dz_dx) 等价于 optimizer.apply_gradients([(dz_dx, x)])

In [34]:
def f(x):
    """Return x^2 + 2x + 1, the objective minimized by the optimizer loop below."""
    quadratic_term = x ** 2
    linear_term = 2 * x
    return quadratic_term + linear_term + 1

x = tf.Variable(0.0)
learning_rate = 0.1

# Stochastic gradient descent optimizer configured with the same learning rate.
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = f(x)
    # apply_gradients takes a list of (gradient, variable) pairs:
    # [(grad01, var01), (grad02, var02), ...]. Here it stands in for the manual
    # update x.assign_sub(learning_rate * dz_dx).
    grads_and_vars = [(tape.gradient(z, x), x)]
    optimizer.apply_gradients(grads_and_vars)
print(x)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=-0.9999999>
