# 自编码网络
自编码是一种无监督学习方式，其学到的编码（内部表示）的维度一般比输入小得多，因此常用于降维问题。自编码是一种有效的特征检测工具，可用于深度网络的预训练。此外自编码网络还可以用于随机生成与输入数据极其相似的新数据，我们称为生成模型。因此其典型应用包括：降维，特征提取，无监督预训练，生成模型。<br/>
自编码网络通常由两部分构成：编码器（也称识别网络）将输入转化成内部表示（其维数通常低于输入的维数），解码器（也称生成网络）将内部表示转变为输出。其网络结构类似于多层感知器MLP，但输入神经元与输出神经元的数目必须相等。输出通常也称重建，损失函数称为重建损失。由于内部表示的维度通常低于输入的维度，而损失函数要求输入输出相去不远，因而迫使网络学习输入最重要的特征。
```python
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

# Linear autoencoder: every layer has activation_fn=None and the loss is the
# MSE between the reconstruction and the input.
n_input=28*28  # must equal the width of the flattened MNIST images fed below
n_hidden=2
n_output=n_input  # the reconstruction has the same width as the input
learning_rate=0.01

X=tf.placeholder(tf.float32,shape=[None,n_input])
hidden=fully_connected(X,n_hidden,activation_fn=None)
output=fully_connected(hidden,n_output,activation_fn=None)
reconstruction_loss=tf.reduce_mean(tf.square(output-X))
optimizer=tf.train.AdamOptimizer(learning_rate)
training_op=optimizer.minimize(reconstruction_loss)
init=tf.global_variables_initializer()

from tensorflow.examples.tutorials.mnist import input_data
from sklearn.model_selection import train_test_split as tts
mnist = input_data.read_data_sets("/tmp/data/")
# train_test_split returns (X_train, X_test, y_train, y_test), in that order.
X_train,X_test,y_train,y_test=tts(mnist.train.images,mnist.train.labels,test_size=0.2)
n_iteration=1000
coding=hidden  # the hidden layer's output (not a loss) is the coding
with tf.Session() as sess:
    init.run()
    for i in range(n_iteration):
        # Full-batch training: the whole training split is fed at every step.
        training_op.run(feed_dict={X:X_train})
    # Project the held-out split onto the n_hidden-dimensional coding.
    coding_val=coding.eval(feed_dict={X:X_test})
```
当activation_fn=None（即所有神经元都是线性的）且损失函数为MSE时，该自编码器实际执行的就是PCA。


In [1]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.model_selection import train_test_split as tts

# Layer sizes: 784 -> 2 -> 784, all layers linear.
n_input = 28 * 28
n_hidden = 2
n_output = n_input
learning_rate = 0.01

# Graph: encoder and decoder are single dense layers without activation.
X = tf.placeholder(tf.float32, shape=[None, n_input])
hidden = fully_connected(X, n_hidden, activation_fn=None)
output = fully_connected(hidden, n_output, activation_fn=None)

# Mean squared reconstruction error, minimized with Adam.
reconstruction_loss = tf.reduce_mean(tf.square(output - X))
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(reconstruction_loss)
init = tf.global_variables_initializer()

# Data: the MNIST train/test image arrays as shipped by the tutorial loader.
mnist = input_data.read_data_sets("/tmp/data/")
X_train = mnist.train.images
X_test = mnist.test.images

n_iteration = 1000
coding = hidden  # the hidden layer's output is used as the coding

with tf.Session() as sess:
    sess.run(init)
    # Every step feeds the complete training set (no mini-batches).
    for i in range(n_iteration):
        sess.run(training_op, feed_dict={X: X_train})
    # Encode the test images with the trained encoder.
    coding_val = sess.run(coding, feed_dict={X: X_test})

  from ._conv import register_converters as _register_converters


Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /tmp/data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


KeyboardInterrupt: 

# 堆栈自编码网络
堆栈自编码网络通常是对称的，且深度有限以避免过拟合。

<img src='https://raw.githubusercontent.com/hzg0601/python/master/stacked_autoencoder.png' width='500'>

In [16]:

import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
tf.reset_default_graph()

# Symmetric stack: 784 -> 300 -> 150 -> 300 -> 784.
n_input = 28 * 28
n_hidden1 = 300
n_hidden2 = 150
n_hidden3 = n_hidden1
n_output = n_input

learning_rate = 0.01
l2_reg = 0.001

X = tf.placeholder(tf.float32, shape=[None, n_input])

# arg_scope supplies these keyword arguments to every fully_connected call
# inside the block; the output layer overrides the activation explicitly.
with tf.contrib.framework.arg_scope(
        [fully_connected],
        activation_fn=tf.nn.elu,
        weights_initializer=tf.contrib.layers.variance_scaling_initializer(),
        weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg)):
    hidden1 = fully_connected(X, n_hidden1)
    hidden2 = fully_connected(hidden1, n_hidden2)
    hidden3 = fully_connected(hidden2, n_hidden3)
    output = fully_connected(hidden3, n_output, activation_fn=None)

# Total loss = MSE reconstruction loss + the l2 penalties that the layers
# registered in the REGULARIZATION_LOSSES collection.
reconstruction_loss = tf.reduce_mean(tf.square(output - X))
reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
loss = tf.add_n([reconstruction_loss] + reg_losses)

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

n_epochs = 5
batch_size = 150

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        # Floor division: number of whole mini-batches per epoch.
        n_batches = mnist.train.num_examples // batch_size
        for i in range(n_batches):
            # Labels are returned too but are not used for training.
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            training_op.run(feed_dict={X: X_batch})

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


# 权重关联
对于对称自编码器，可以将解码层的权重与编码层的权重相关联，这种方法可以将权重的数量减半，从而加速训练，减少过拟合的风险。记$W_L$为第L层的连接权重，假设模型共N层（不含输入层），则第$\frac{N}{2}$层即编码层，第N层为输出层，解码层的权重可定义为：$W_{N-L+1}=W_L^T$（其中$L=1,2,\dots,\frac{N}{2}$）。用fully_connected函数定义关联权重比较困难，可以手动定义：

In [None]:
import tensorflow as tf

# Symmetric stack 784 -> 300 -> 150 -> 300 -> 784 with tied decoder weights.
n_inputs=28*28
n_hidden1=300
n_hidden2=150
n_hidden3=n_hidden1
n_outputs=n_inputs

# Hyper-parameters are defined here so the cell does not depend on values left
# over from an earlier cell (same values as the stacked autoencoder cell).
learning_rate=0.01
l2_reg=0.001

activation=tf.nn.elu
regularizer=tf.contrib.layers.l2_regularizer(l2_reg)
initializer=tf.contrib.layers.variance_scaling_initializer()

X=tf.placeholder(tf.float32,shape=(None,n_inputs))
weights1_init=initializer([n_inputs,n_hidden1])
weights2_init=initializer([n_hidden1,n_hidden2])

# Only the encoder weights are variables; the decoder weights are their
# transposes, so they are tied and add no extra trainable parameters.
weights1=tf.Variable(weights1_init,dtype=tf.float32,name='weights1')
weights2=tf.Variable(weights2_init,dtype=tf.float32,name='weights2')
weights3=tf.transpose(weights2,name='weights3')
weights4=tf.transpose(weights1,name='weights4')

# Biases are not tied: one vector per layer, sized to that layer's output.
biases1=tf.Variable(tf.zeros(n_hidden1),name='biases1')
biases2=tf.Variable(tf.zeros(n_hidden2),name='biases2')
biases3=tf.Variable(tf.zeros(n_hidden3),name='biases3')
biases4=tf.Variable(tf.zeros(n_outputs),name='biases4')

hidden1=activation(tf.matmul(X,weights1)+biases1)
hidden2=activation(tf.matmul(hidden1,weights2)+biases2)
hidden3=activation(tf.matmul(hidden2,weights3)+biases3)
outputs=tf.matmul(hidden3,weights4)+biases4  # linear output layer

reconstruction_loss=tf.reduce_mean(tf.square(outputs-X))
# Regularize each tied weight matrix once (weights3/weights4 are transposes).
reg_loss=regularizer(weights1)+regularizer(weights2)
loss=reconstruction_loss+reg_loss

optimizer=tf.train.AdamOptimizer(learning_rate)
training_op=optimizer.minimize(loss)

init=tf.global_variables_initializer()

# 每次训练一个自编码器
一次性训练整个堆栈网络非常耗时，因此常常是每次只训练一个浅层自编码器，然后将它们堆叠成一个网络。
<img width='500' height='400' src='https://raw.githubusercontent.com/hzg0601/python/master/stacked_autoencoder_2.png'>

第一阶段学习重建输入，第二阶段学习重建上一个的隐层。执行的方法是：一，每个阶段用不同的计算图；或者，二，用同一个计算图，如下图所示：
<img width='500' height='400' src='https://raw.githubusercontent.com/hzg0601/python/master/stacked_autoencoder_3.png'>


In [None]:
optimizer=tf.train.AdamOptimizer(learning_rate)
with tf.name_scope('phase1'):
    # Phase 1 skips hidden2/hidden3 and reconstructs X directly from hidden1.
    phase1_outputs=tf.matmul(hidden1,weights4)+biases4
    # Squared error: subtract first, then square.
    phase1_reconstruction_loss=tf.reduce_mean(tf.square(phase1_outputs-X))
    phase1_reg_loss=regularizer(weights1)+regularizer(weights4)  # hidden layer 1 and output layer
    phase1_loss=phase1_reconstruction_loss+phase1_reg_loss
    phase1_training_op=optimizer.minimize(phase1_loss)

with tf.name_scope('phase2'):
    # Phase 2 trains hidden3 to reconstruct the activations of hidden1.
    phase2_reconstruction_loss=tf.reduce_mean(tf.square(hidden3-hidden1))
    phase2_reg_loss=regularizer(weights2)+regularizer(weights3)
    phase2_loss=phase2_reconstruction_loss+phase2_reg_loss
    # Restrict the optimizer to these variables; weights1 and biases1 are left
    # out, which freezes the first hidden layer during phase 2.
    train_vars=[weights2,biases2,weights3,biases3]
    phase2_training_op=optimizer.minimize(phase2_loss,var_list=train_vars)
    #不包括weights1,biases1,从而冻结第一层
    
    

In [None]:
import matplotlib.pyplot as plt


def plot_image(image,shape=(28,28)):
    """Draw `image`, reshaped to `shape`, on the current axes and hide the axes."""
    plt.imshow(image.reshape(shape),cmap='Greys',interpolation='nearest')
    plt.axis('off')


# Visualize reconstructions: original digits in the left column, their
# reconstructions in the right column.
n_test_digits=2
X_test=mnist.test.images[:n_test_digits]
with tf.Session() as sess:
    [...]  # train the autoencoder
    outputs_val=outputs.eval(feed_dict={X:X_test})
for digit_index in range(n_test_digits):
    plt.subplot(n_test_digits,2,digit_index*2+1)
    plot_image(X_test[digit_index])
    plt.subplot(n_test_digits,2,digit_index*2+2)
    plot_image(outputs_val[digit_index])


# Visualize features.
# One approach: for each neuron in each hidden layer, find the training
# instances that activate it the most; this is most useful for the upper
# hidden layers. The code below instead plots the incoming weights of the
# first five neurons of hidden layer 1 as images.
with tf.Session() as sess:
    [...]  # train the autoencoder
    weights1_val=weights1.eval()
for i in range(5):
    plt.subplot(1,5,i+1)
    plot_image(weights1_val.T[i])  # row i of the transpose = weights into neuron i

# Another approach is to feed the autoencoder a random input image, measure the
# activation of the neuron you are interested in, and then backpropagate to
# adjust the image so that the neuron activates more strongly. After several
# iterations (gradient ascent) the image gradually becomes the one that excites
# the neuron most. This is a technique for visualizing neurons.

# 用堆栈自编码网络进行无监督预训练
迁移学习的优势在于，你的神经网络不必学习所有底层特征，只需复用已存在的底层检测器。同样地，如果你拥有大量的无标记样本，你可以先训练一个堆栈自编码网络，然后复用其低层网络，用以创建符合任务需要的神经网络，再用有标记样本继续训练。
<img width='500' src='https://raw.githubusercontent.com/hzg0601/python/master/unsupervised_pretraining_2.png'>

# 过完备自编码（编码层维度大于或等于输入维度）
## 降噪自编码
对有噪音的输入进行去噪恢复，防止自编码器简单复制输入。噪音可以是高斯噪音，也可以是输入的随机变换，如dropout。
<img width='500' src='https://raw.githubusercontent.com/hzg0601/python/master/denoising_autoencoder.png'>

In [None]:
# Variant 1: corrupt the input with Gaussian noise.
X=tf.placeholder(tf.float32,shape=[None,n_inputs])
# tf.shape(X) is evaluated at run time; X.get_shape() would raise an error here
# because it returns the static shape [None,n_inputs].
X_noisy=X+tf.random_normal(tf.shape(X))
[...]
hidden1=activation(tf.matmul(X_noisy,weights1)+biases1)
[...]
# The loss compares the reconstruction with the clean input X, not X_noisy.
reconstruction_loss=tf.reduce_mean(tf.square(outputs-X))
[...]



# Variant 2: corrupt the input with dropout.
from tensorflow.contrib.layers import dropout
keep_prob=0.7
is_training=tf.placeholder_with_default(False,shape=(),name='is_training')
X=tf.placeholder(tf.float32,shape=[None,n_inputs])
X_drop=dropout(X,keep_prob,is_training=is_training)
[...]
hidden1=activation(tf.matmul(X_drop,weights1)+biases1)
[...]
# As above, the target is the clean input X.
reconstruction_loss=tf.reduce_mean(tf.square(outputs-X))
[...]
# During training, set is_training to True (it defaults to False).
sess.run(training_op,feed_dict={X:X_batch,is_training:True})

# 稀疏自编码
在损失函数中添加正则项，使激活神经元的个数大幅减少。为获得稀疏表示，我们必须首先度量每次训练迭代中编码层的实际稀疏度：即在整个训练批次上计算编码层每个神经元的平均激活值，因此批的大小不能太小。一旦获得每个神经元的平均激活值，就要为过度活跃的神经元增加稀疏损失项，通常使用Kullback-Leibler散度，因为它的梯度表现更好。对任意两个离散概率分布P和Q，K-L散度定义为：
$$D_{KL}(P\|Q)=\sum_{i}P(i)\log\frac{P(i)}{Q(i)}$$
对于稀疏自编码，我们希望测度的是目标概率p和真实概率q之间的散度，可表示为
$$D_{KL}(p\|q)=p\log\frac{p}{q}+(1-p)\log\frac{1-p}{1-q}$$
为区分重建损失和稀疏损失的相对重要性，可以在稀疏损失项上加入稀疏权重超参数。


In [None]:
def kl_divergence(p,q):
    """Return the KL divergence between a target probability `p` and an actual probability `q`.

    Computed element-wise as p*log(p/q)+(1-p)*log((1-p)/(1-q)).
    """
    return p*tf.log(p/q)+(1-p)*tf.log((1-p)/(1-q))
learning_rate=0.01
sparsity_target=0.1
sparsity_weight=0.2

[...]

optimizer=tf.train.AdamOptimizer(learning_rate)
# Mean activation of each hidden1 neuron over the batch (axis 0).
hidden1_mean=tf.reduce_mean(hidden1,axis=0)
sparsity_loss=tf.reduce_sum(kl_divergence(sparsity_target,hidden1_mean))
reconstruction_loss=tf.reduce_mean(tf.square(outputs-X))
# sparsity_weight sets the relative importance of the sparsity penalty.
loss=reconstruction_loss+sparsity_loss*sparsity_weight
training_op=optimizer.minimize(loss)
# The coding layer's activations must lie between 0 and 1, otherwise the KL
# divergence returns NaN. One way to guarantee this is to use the logistic
# (sigmoid) activation for the coding layer when building the model.
hidden1=tf.nn.sigmoid(tf.matmul(X,weights1)+biases1)
# To speed up training, the MSE loss can be replaced with cross entropy, whose
# gradients behave better; this requires the inputs to be normalized and the
# output layer to use the sigmoid function as well.
[...]
logits=tf.matmul(hidden1,weights2)+biases2
outputs=tf.nn.sigmoid(logits)
reconstruction_loss=tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=X,logits=logits))



# 变分自编码（variational autoencoders）
变分自编码器是概率自编码器，因为它们的输出有随机的成分；也是生成自编码器，即可以生成看起来像是从原始样本集中抽取的新数据。这两个特征使得它们与RBM非常像，只是训练更容易，抽样也更快。
<img width='500' src='https://raw.githubusercontent.com/hzg0601/python/master/variational_autoencoder.png'>


隐层1、2构成编码器，隐层4、5构成解码器。与其他自编码器不同的是，第三层并非直接产生编码结果，而是输出编码均值$\mu$和标准差$\sigma$，真实编码是从均值为$\mu$、标准差为$\sigma$的高斯分布中随机采样得到的，而后解码器正常解码。尽管输入的分布可能非常复杂，变分自编码器仍可以输出与输入类似的样本。在训练阶段，损失函数迫使编码在编码空间（潜空间）中逐渐聚拢成近似球形的高斯点云，从而训练后只需从高斯分布中随机采样编码并解码即可得到新样本。<br/>
其损失函数由两部分构成：第一部分是重建损失；第二部分是潜在损失（latent loss），迫使自编码器产生的编码看起来像是从简单高斯分布中抽样得到的，因此用KL散度度量目标分布（即高斯分布）与编码实际分布之间的差异。由于高斯噪音限制了可传递至编码层的信息量，其数学形式会比较复杂。潜在损失的代码如下：
```python 
eps=1e-10  # smoothing term that keeps the argument of tf.log away from zero
# Every term, including -1 and -log(sigma^2), belongs inside the sum.
latent_loss=0.5*tf.reduce_sum(
    tf.square(hidden3_sigma)+tf.square(hidden3_mean)
    -1-tf.log(eps+tf.square(hidden3_sigma)))
```
另一种变体训练编码器输出$\gamma=\log(\sigma^2)$而非$\sigma$，从而$\sigma=\exp(\frac{\gamma}{2})$，这种变化会使编码器更容易捕捉不同尺度的$\sigma$，因此收敛速度更快，代码如下：
```python
# Latent loss written in terms of hidden3_gamma instead of hidden3_sigma.
latent_loss=0.5*tf.reduce_sum(
    tf.exp(hidden3_gamma)
    +tf.square(hidden3_mean)
    -1
    -hidden3_gamma
)
```


In [4]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
tf.reset_default_graph()

n_inputs=28*28
n_hidden1=500
n_hidden2=500
n_hidden3=20  # coding layer
n_hidden4=n_hidden2
n_hidden5=n_hidden1
n_outputs=n_inputs

# Must be positive: a negative rate would make the optimizer step uphill.
learning_rate=0.001
X=tf.placeholder(tf.float32,shape=[None,n_inputs])
with tf.contrib.framework.arg_scope(
        [fully_connected],activation_fn=tf.nn.elu,
        weights_initializer=tf.contrib.layers.variance_scaling_initializer()):
    # Encoder.
    hidden1=fully_connected(X,n_hidden1)
    hidden2=fully_connected(hidden1,n_hidden2)
    # The coding layer outputs a mean and gamma (sigma = exp(gamma/2)) rather
    # than the coding itself.
    hidden3_mean=fully_connected(hidden2,n_hidden3,activation_fn=None)
    hidden3_gamma=fully_connected(hidden2,n_hidden3,activation_fn=None)
    hidden3_sigma=tf.exp(0.5*hidden3_gamma)
    # The coding is the mean plus sigma-scaled standard Gaussian noise.
    noise=tf.random_normal(tf.shape(hidden3_sigma),dtype=tf.float32)
    hidden3=hidden3_mean+hidden3_sigma*noise
    # Decoder.
    hidden4=fully_connected(hidden3,n_hidden4)
    hidden5=fully_connected(hidden4,n_hidden5)
    logits=fully_connected(hidden5,n_outputs,activation_fn=None)
    outputs=tf.sigmoid(logits)

# Cost = cross-entropy reconstruction loss + latent loss (gamma form).
reconstruction_loss=tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=X,logits=logits))
latent_loss=0.5*tf.reduce_sum(tf.exp(hidden3_gamma)+tf.square(hidden3_mean)-1-hidden3_gamma)
cost=reconstruction_loss+latent_loss
optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op=optimizer.minimize(cost)

init=tf.global_variables_initializer()


In [5]:
import numpy as np
import matplotlib.pyplot as plt
n_digits=60
n_epochs=50
batch_size=150

# The default graph is deliberately not reset here: init, training_op, outputs
# and hidden3 used below must already exist in it.

from tensorflow.examples.tutorials.mnist import input_data
mnist=input_data.read_data_sets("MNIST_data/", one_hot=True)

def plot_image(image,shape=(28,28)):
    """Draw `image`, reshaped to `shape`, on the current axes and hide the axes."""
    plt.imshow(image.reshape(shape),cmap='Greys',interpolation='nearest')
    plt.axis('off')
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        n_batches=mnist.train.num_examples//batch_size
        for iteration in range(n_batches):
            # Labels are returned too but are not used for training.
            X_batch,y_batch=mnist.train.next_batch(batch_size)
            sess.run(training_op,feed_dict={X:X_batch})
    # Generate digits: draw random Gaussian codings and feed them directly into
    # hidden3, so only the layers after hidden3 are evaluated.
    codings_rnd=np.random.normal(size=[n_digits,n_hidden3])
    outputs_val=outputs.eval(feed_dict={hidden3:codings_rnd})
# Lay the generated digits out 10 per row, with just enough rows to hold them.
n_cols=10
n_rows=(n_digits+n_cols-1)//n_cols
for iteration in range(n_digits):
    plt.subplot(n_rows,n_cols,iteration+1)
    plot_image(outputs_val[iteration])

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


KeyboardInterrupt: 

# 其他自编码器
## 收缩自编码Contractive autoencoder(CAE)
## 堆栈卷积自编码stacked convolutional autoencoder
## 生成式随机网络generative stochastic network
## winner-take-all 自编码
## 对抗式自编码adversarial autoencoder