# 梯度下降

## 梯度的定义

$\nabla f = \left(\frac{\partial f}{\partial x_1}; \frac{\partial f}{\partial x_2}; \ldots; \frac{\partial f}{\partial x_n} \right)$

$\theta_{t+1} = \theta_t - \alpha_t \nabla f(\theta_t)$


In [1]:
import tensorflow as tf
from tensorflow import keras


In [2]:
# Scalar constants: x is the input and w is the weight used as the
# differentiation target in the following cells.
x = tf.constant(2.0)
w = tf.constant(1.0)
# Forward pass computed outside of any GradientTape context.
y = tf.multiply(x, w)


In [6]:
# Record the forward pass on a tape so it can be differentiated afterwards.
# w is explicitly registered with the tape via watch() so that gradients
# with respect to it can be requested.
with tf.GradientTape() as tape:
    tape.watch(w)
    y2 = x * w


In [7]:
# This call can only be executed once on this (non-persistent) tape.
# The result is a list with one gradient per entry in `sources`.
tape.gradient(target=y2, sources=[w])


[<tf.Tensor: id=45, shape=(), dtype=float32, numpy=2.0>]

In [None]:
# A persistent tape allows gradient() to be called more than once.
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y2 = x * w

# Both calls are made on the same tape; persistent=True is what permits the
# second call.
first_grads = tape.gradient(y2, [w])
second_grads = tape.gradient(y2, [w])
print("first call: ", first_grads)
print("second call: ", second_grads)

# Drop the reference so the persistent tape's resources can be released.
del tape




In [8]:
# Second-order derivative using nested tapes.
# The inner tape (tape2) records the forward pass for y; the outer tape
# (tape1) is active while the first derivative is computed, so that result
# can itself be differentiated.
w = tf.Variable(1.0) # variables are watched by default
b = tf.Variable(2.0)
x = tf.Variable(3.0)

with tf.GradientTape() as tape1:
    with tf.GradientTape() as tape2:
        y = x * w + b
    # Called after tape2's context has closed but still inside tape1's.
    dy_dw, dy_db = tape2.gradient(y, [w, b])
# y is linear in w, so dy_dw (= x) does not depend on w; the recorded output
# for this second derivative is None.
d2y_dw2 = tape1.gradient(dy_dw, w)

print("dy_dw: ", dy_dw)
print("dy_db: ", dy_db)
print("d2y_dw2: ", d2y_dw2)


dy_dw:  tf.Tensor(3.0, shape=(), dtype=float32)
dy_db:  tf.Tensor(1.0, shape=(), dtype=float32)
d2y_dw2:  None


## 激活函数的梯度


In [10]:
# Evaluate the sigmoid and its gradient at 10 evenly spaced points
# between -10 and 10.
a = tf.linspace(-10.0, 10.0, 10)

# `a` is registered with the tape explicitly so gradients with respect to
# it can be requested.
with tf.GradientTape() as tape:
    tape.watch(a)
    y = tf.math.sigmoid(a)

# The source is passed as a list, so the result is a one-element list.
grads = tape.gradient(y, [a])

print("A: ", a)
print("y: ", y)
print("Gradient of a: ", grads)


A:  tf.Tensor(
[-10.         -7.7777777  -5.5555553  -3.333333   -1.1111107   1.1111116
   3.333334    5.5555563   7.7777786  10.       ], shape=(10,), dtype=float32)
y:  tf.Tensor(
[4.5388937e-05 4.1878223e-04 3.8510561e-03 3.4445226e-02 2.4766389e-01
 7.5233626e-01 9.6555483e-01 9.9614894e-01 9.9958128e-01 9.9995458e-01], shape=(10,), dtype=float32)
Gradient of a:  [<tf.Tensor: id=189, shape=(10,), dtype=float32, numpy=
array([4.5386874e-05, 4.1860685e-04, 3.8362255e-03, 3.3258751e-02,
       1.8632649e-01, 1.8632641e-01, 3.3258699e-02, 3.8362255e-03,
       4.1854731e-04, 4.5416677e-05], dtype=float32)>]


## 损失函数梯度

$\frac{\partial p_i}{\partial a_j} = \frac{\partial \frac{e^{a_i}}{\sum^N_{k=1} e^{a_k}}}{\partial a_j}$

- i=j: $\frac{\partial p_i}{\partial a_j} = p_j(1 - p_j)$
- i≠j: $\frac{\partial p_i}{\partial a_j} = -p_i p_j$


In [2]:
# Gradient of an MSE loss computed on softmax probabilities.
x = tf.random.normal([2, 4])  # inputs, shape [2, 4]
w = tf.random.normal([4, 3])  # weights, shape [4, 3]
b = tf.zeros([3])             # bias, shape [3]
y = tf.constant([2, 0])       # integer labels, one-hot encoded with depth 3 below

with tf.GradientTape() as tape:
    # w and b are explicitly registered with the tape.
    tape.watch([w, b])
    logits = x @ w + b
    prob = tf.nn.softmax(logits, axis=1)
    one_hot_y = tf.one_hot(y, depth=3)
    per_sample_mse = tf.losses.MSE(one_hot_y, prob)
    loss = tf.reduce_mean(per_sample_mse)

grads = tape.gradient(loss, [w, b])
print("w gradients: ", grads[0])
print("b gradients: ", grads[1])


w gradients:  tf.Tensor(
[[-0.00381645  0.12483726 -0.12102084]
 [ 0.00143867 -0.0862134   0.08477473]
 [ 0.0003233  -0.04569282  0.04536951]
 [-0.00371393  0.1185912  -0.11487729]], shape=(4, 3), dtype=float32)
b gradients:  tf.Tensor([-0.00330473  0.12689719 -0.12359247], shape=(3,), dtype=float32)


In [3]:
# Gradient of a categorical cross-entropy loss computed directly from logits.
x = tf.random.normal([2, 4])  # inputs, shape [2, 4]
w = tf.random.normal([4, 3])  # weights, shape [4, 3]
b = tf.zeros([3])             # bias, shape [3]
y = tf.constant([2, 0])       # integer labels, one-hot encoded with depth 3 below

with tf.GradientTape() as tape:
    # w and b are explicitly registered with the tape.
    tape.watch([w, b])
    logits = x @ w + b
    targets = tf.one_hot(y, depth=3)
    # from_logits=True: the raw logits are passed to the loss, with no
    # softmax applied in this cell.
    per_sample_ce = tf.losses.categorical_crossentropy(
        targets, logits, from_logits=True
    )
    loss = tf.reduce_mean(per_sample_ce)

grads = tape.gradient(loss, [w, b])
print("w gradients: ", grads[0])
print("b gradients: ", grads[1])


w gradients:  tf.Tensor(
[[-1.0553846e-01  9.4311126e-02  1.1227328e-02]
 [ 3.3583328e-01 -3.0006656e-01 -3.5766724e-02]
 [-2.4949678e-03  2.6152630e-03 -1.2028698e-04]
 [ 4.9271515e-01 -4.3982694e-01 -5.2888218e-02]], shape=(4, 3), dtype=float32)
b gradients:  tf.Tensor([-0.46352893  0.4142054   0.04932352], shape=(3,), dtype=float32)
