In [1]:
import tensorflow as tf

In [2]:
tf.__version__

'1.10.1'

In [3]:
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [4]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

### Xavier and He Initailization

In [5]:
reset_graph()

n_inputs = 28 * 28
n_hidden1 = 300

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')

ReLU激活函数的初始化方法：

In [6]:
he_init = tf.variance_scaling_initializer()
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
                          kernel_initializer=he_init, name='hidden1')

### Leaky ReLU

In [10]:
reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')

In [11]:
def leaky_relu(z, name=None):
    return tf.maximum(0.01 * z, z, name=name)

hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name='hidden1')

使用Leaky ReLU训练神经网络：

In [31]:
reset_graph()

n_inputs = 28 * 28   # MNIST
n_outputs = 10
n_hidden1 = 300
n_hidden2 = 100

In [32]:
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

In [33]:
with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name='hidden1')
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=leaky_relu, name='hidden2')
    logits = tf.layers.dense(hidden2, n_outputs, name='outputs')

In [34]:
with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

In [35]:
learning_rate = 0.01

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

In [36]:
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [37]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [38]:
# Load the data
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

In [39]:
X_train.shape

(60000, 28, 28)

In [40]:
X_train.dtype

dtype('uint8')

In [41]:
y_train.shape

(60000,)

In [42]:
y_test.dtype

dtype('uint8')

In [43]:
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.
y_train = y_train.astype(np.int32)
y_test = y_train.astype(np.int32)
X_val, X_train = X_train[:5000], X_train[5000:]
y_val, y_train = y_train[:5000], y_train[5000:]

In [44]:
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch

In [45]:
n_epochs = 40
batch_size = 64

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if epoch % 5 == 0:
            acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            acc_val = accuracy.eval(feed_dict={X: X_val, y: y_val})
            print(epoch, 'batch acc:', acc_batch, 'val acc:', acc_val)
    save_path = saver.save(sess, './my_model_final.ckpt')

0 batch acc: 0.890625 val acc: 0.8976
5 batch acc: 0.9375 val acc: 0.9432
10 batch acc: 0.921875 val acc: 0.9598
15 batch acc: 0.9375 val acc: 0.9672
20 batch acc: 0.984375 val acc: 0.9716
25 batch acc: 1.0 val acc: 0.9752
30 batch acc: 0.984375 val acc: 0.9752
35 batch acc: 0.953125 val acc: 0.9774


In [84]:
x = np.arange(8.)
np.split(x, 4)

[array([0., 1.]), array([2., 3.]), array([4., 5.]), array([6., 7.])]

In [87]:
10 // 3

3

### ELU

In [48]:
def elu(z, alpha=1):
    return np.where(z < 0, alpha * (np.exp(z) - 1), 1)

In [46]:
reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')

In [47]:
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name='hidden1')

### SELU

In [51]:
reset_graph()

n_inputs = 28 * 28
n_outputs = 10

n_hidden1 = 300
n_hidden2 = 100

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.selu, name='hidden1')
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.selu, name='hidden2')
    logits = tf.layers.dense(hidden2, n_outputs, name='outputs')

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

learning_rate = 0.01

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [54]:
# Scale the inputs to mean 0 and standard deviation 1:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_val_scaled = scaler.transform(X_val)

n_epochs = 40
batch_size = 64

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            X_batch_scaled = scaler.transform(X_batch)
            sess.run(training_op, feed_dict={X: X_batch_scaled, y: y_batch})
        if epoch % 5 == 0:
            acc_batch = accuracy.eval(feed_dict={X: X_batch_scaled, y: y_batch})
            acc_val = accuracy.eval(feed_dict={X: X_val_scaled, y: y_val})
            print(epoch, 'batch acc:', acc_batch, 'val acc:', acc_val)

0 batch acc: 0.9375 val acc: 0.9146
5 batch acc: 0.9375 val acc: 0.9518
10 batch acc: 0.984375 val acc: 0.9622
15 batch acc: 0.953125 val acc: 0.9654
20 batch acc: 1.0 val acc: 0.9676
25 batch acc: 1.0 val acc: 0.9686
30 batch acc: 1.0 val acc: 0.9694
35 batch acc: 1.0 val acc: 0.97


### Batch Normalization 批量归一化

In [56]:
reset_graph()

n_inputs = 28 * 28
n_outputs = 10
n_hidden1 = 300
n_hidden2 = 100

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None,), name='y')

training = tf.placeholder_with_default(False, shape=(), name='training')

hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
bn1_act = tf.nn.elu(bn1)

hidden2 = tf.layers.dense(X, n_hidden2, name='hidden2')
bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
bn2_act = tf.nn.elu(bn2)

logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
logits = tf.layers.batch_normalization(logits_before_bn, training=training,
                                       momentum=0.9)

In [57]:
reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
training = tf.placeholder_with_default(False, shape=(), name='training')

In [58]:
# To avoid repeating the same params over and over again
from functools import partial

my_batch_norm_layer = partial(tf.layers.batch_normalization,
                              training=training, momentum=0.9)

hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
bn1 = my_batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)

hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
bn2 = my_batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)

logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
logits = my_batch_norm_layer(logits_before_bn)

**Build a neural net for MNIST, using the ELU activation and Batch Normalization at each layer:**

In [61]:
reset_graph()

batch_norm_momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope('dnn'):
    he_init = tf.variance_scaling_initializer()
    
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    
    hidden1 = my_dense_layer(X, n_hidden1, name='hidden1')
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name='hidden2')
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name='outputs')
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

We need to explicitly run the extra update operations needed by batch normalization.

In [62]:
n_epochs = 20
batch_size = 128

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_val, y: y_val})
        print(epoch, 'val acc:', acc_val)

0 val acc: 0.8618
1 val acc: 0.8936
2 val acc: 0.9068
3 val acc: 0.9152
4 val acc: 0.9176
5 val acc: 0.921
6 val acc: 0.925
7 val acc: 0.9286
8 val acc: 0.9276
9 val acc: 0.9312
10 val acc: 0.9314
11 val acc: 0.9338
12 val acc: 0.933
13 val acc: 0.936
14 val acc: 0.9376
15 val acc: 0.9398
16 val acc: 0.9414
17 val acc: 0.9436
18 val acc: 0.9442
19 val acc: 0.9468


> 因为网络比较浅，所以效果不是最好。Batch Norm和ELU对深层网络会有非常明显的效果。

```python
# Could also make the training op depend on the update op:
with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        training_op = optimizer.minimize(loss)
```

This way, u would just have to evaluate the training_op during training, TF would automatically run the update op as well:

```python
sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})
```

Note that the list of trainable variables is shorter than the list of all global variables. This is because the moving averages are non-trainable variables. If u want to reuse a pretrained nn, u must not forget these non-trainable variables.

模型中可训练参数：

In [63]:
[v.name for v in tf.trainable_variables()]

['hidden1/kernel:0',
 'hidden1/bias:0',
 'batch_normalization/gamma:0',
 'batch_normalization/beta:0',
 'hidden2/kernel:0',
 'hidden2/bias:0',
 'batch_normalization_1/gamma:0',
 'batch_normalization_1/beta:0',
 'outputs/kernel:0',
 'outputs/bias:0',
 'batch_normalization_2/gamma:0',
 'batch_normalization_2/beta:0']

模型中所有参数：

In [64]:
[v.name for v in tf.global_variables()]

['hidden1/kernel:0',
 'hidden1/bias:0',
 'batch_normalization/gamma:0',
 'batch_normalization/beta:0',
 'batch_normalization/moving_mean:0',
 'batch_normalization/moving_variance:0',
 'hidden2/kernel:0',
 'hidden2/bias:0',
 'batch_normalization_1/gamma:0',
 'batch_normalization_1/beta:0',
 'batch_normalization_1/moving_mean:0',
 'batch_normalization_1/moving_variance:0',
 'outputs/kernel:0',
 'outputs/bias:0',
 'batch_normalization_2/gamma:0',
 'batch_normalization_2/beta:0',
 'batch_normalization_2/moving_mean:0',
 'batch_normalization_2/moving_variance:0']

### Gradient Clipping 梯度剪裁

减轻梯度爆炸问题的技术——在方向传播的过程中简单地裁剪梯度，从而保证不会超过阈值（这个对于循环神经网络非常有效）。

In [75]:
reset_graph()

n_inputs = 28 * 28  # MNIST
n_hidden1 = 300
n_hidden2 = 50
n_hidden3 = 50
n_hidden4 = 50
n_hidden5 = 50
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int32, shape=(None), name='y')

with tf.name_scope('dnn'):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name='hidden1')
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name='hidden2')
    hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name='hidden3')
    hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name='hidden4')
    hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name='hidden5')
    logits = tf.layers.dense(hidden5, n_outputs, name='outputs')

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

In [76]:
learning_rate = 0.01

在TF中，优化器的`minimize()`函数同时负责计算和应用梯度，所以必须先调用优化器的`compute_gradients()`方法，然后调用`clip_by_value()`方法创建一个剪裁梯度的操作，最后调用`apply_gradients()`方法来应用裁剪后的梯度：

In [77]:
threshold = 0.01

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
              for grad, var in grads_and_vars]
training_op = optimizer.apply_gradients(capped_gvs)

In [78]:
with tf.name_scope('eval'):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name='acc')

In [79]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [80]:
n_epochsn_epochs = 20
batch_size = 128

In [82]:
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: X_val, y: y_val})
        print(epoch, 'val acc:', acc_val)
    save_path = saver.save(sess, './my_model_final.ckpt')

0 val acc: 0.4832
1 val acc: 0.8518
2 val acc: 0.8944
3 val acc: 0.9152
4 val acc: 0.9234
5 val acc: 0.9282
6 val acc: 0.9316
7 val acc: 0.9384
8 val acc: 0.943
9 val acc: 0.9452
10 val acc: 0.9476
11 val acc: 0.9498
12 val acc: 0.9526
13 val acc: 0.9538
14 val acc: 0.9542
15 val acc: 0.958
16 val acc: 0.96
17 val acc: 0.961
18 val acc: 0.9616
19 val acc: 0.9642
