# copynet 

# pointer-generator network

In [1]:
import tensorflow as tf

## PGN公式构建

In [None]:
class Pointer(tf.keras.layers.Layer):
    """Compute the generation probability ``p_gen`` of the pointer-generator.

    Three single-unit dense projections -- one each for the decoder state,
    the context vector and the decoder input -- are summed and passed
    through a sigmoid.
    """

    def __init__(self):
        super().__init__()
        # One Dense(1) projection per term of the p_gen formula.
        self.w_s_reduce = tf.keras.layers.Dense(1)
        self.w_i_reduce = tf.keras.layers.Dense(1)
        self.w_c_reduce = tf.keras.layers.Dense(1)

    def call(self, context_vector, state, dec_inp):
        """Return ``sigmoid(w_s(state) + w_c(context_vector) + w_i(dec_inp))``."""
        state_term = self.w_s_reduce(state)
        context_term = self.w_c_reduce(context_vector)
        input_term = self.w_i_reduce(dec_inp)
        return tf.nn.sigmoid(state_term + context_term + input_term)

## 整体构建PGN模型的思路

In [None]:
class PGN(tf.keras.Model):
    """Pointer-generator network built from an encoder, attention, decoder and pointer layer.

    NOTE(review): Encoder, BahdanauAttention, Decoder and _calc_final_dist are
    not defined in this cell; presumably they are provided elsewhere -- confirm.
    """
  
    def __init__(self, params):
        # Build the model skeleton.
        # NOTE(review): `params` is accepted but never used in this constructor.
        super(PGN, self).__init__()
        self.encoder = Encoder()  # encoder layer
        self.attention = BahdanauAttention()  # attention layer
        self.decoder = Decoder()  # decoder layer
        self.pointer = Pointer()  # layer that produces p_gen
    
    def call_encoder(self, enc_inp):
        # Run the encoder starting from its own initial hidden state.
        # Returns (enc_hidden, enc_output) -- note the order is swapped
        # relative to what the encoder call returns.
        enc_hidden = self.encoder.initialize_hidden_state()
        enc_output, enc_hidden = self.encoder(enc_inp, enc_hidden)
        return enc_hidden, enc_output
    
    def call(self, enc_output, dec_hidden, enc_inp, enc_extended_inp,  dec_inp, batch_oov_len):
        # Decoding logic: step through dec_inp one time step at a time and
        # collect the per-step outputs.
        predictions = []
        attentions = []
        p_gens = []
        # Initial context vector, computed from the incoming decoder hidden state.
        context_vector, _ = self.attention(dec_hidden, enc_output)
        for t in range(dec_inp.shape[1]):
            # When predicting word by word, first call the decoder to obtain the prediction `pred`.
            dec_x, pred, dec_hidden = self.decoder(tf.expand_dims(dec_inp[:, t],1), dec_hidden, enc_output, context_vector)
            # Run attention again with the updated hidden state; `attn` is the attention distribution (alpha).
            context_vector, attn = self.attention(dec_hidden, enc_output)
            # Obtain p_gen for this step.
            p_gen = self.pointer(context_vector, dec_hidden, tf.squeeze(dec_x, axis=1))
      
            predictions.append(pred)
            attentions.append(attn)
            p_gens.append(p_gen)
        # Compute the final distribution.
        # Previously `pred` alone was enough; now a final distribution is computed and used instead.
        # NOTE(review): _calc_final_dist() is called with no arguments, so
        # predictions, attentions, p_gens, enc_inp, enc_extended_inp and
        # batch_oov_len are never consumed in this method -- presumably they
        # should be passed in; confirm against the helper's signature.
        final_dists = _calc_final_dist()
        return final_dists

# coverage

In [None]:
def _coverage_loss(attn_dists, padding_mask):
    """Compute the coverage loss from the per-step attention distributions.

    This is an additional loss term. The original author's note: train.py
    already defines ``loss_function`` for training; place this function
    below it, and have ``train_step`` build a ``total_loss`` that is the sum
    of the two losses.

    Args:
        attn_dists: sequence of attention distributions, one per decoder
            step (each noted by the original author as shape
            (batch_size, attn_length)).
        padding_mask: mask forwarded unchanged to ``_mask_and_avg``.

    Returns:
        Whatever ``_mask_and_avg`` returns for the per-step coverage losses.
    """
    # Coverage starts at zero and has the same shape as one attention distribution.
    running_coverage = tf.zeros_like(attn_dists[0])
    # One entry per decoder step (original note: each of shape (batch_size)).
    step_losses = []
    for attn in attn_dists:
        # Penalise attention mass that overlaps with what was already covered.
        overlap = tf.minimum(attn, running_coverage)
        step_losses.append(tf.reduce_sum(overlap, [1]))
        # Fold this step's attention into the coverage seen so far.
        running_coverage += attn
    return _mask_and_avg(step_losses, padding_mask)

In [None]:
# Coverage-aware attention scoring; runs only when coverage is enabled and a
# previous coverage value is available.
if use_coverage and prev_coverage is not None:
    # Shapes per the original author's notes:
    #   self.W_s(enc_output)              [batch_sz, max_len, units]
    #   self.W_h(hidden_with_time_axis)   [batch_sz, 1, units]
    #   self.W_c(prev_coverage)           [batch_sz, max_len, units]
    #   score                             [batch_sz, max_len, 1]
    score = self.V(tf.nn.tanh(self.W_s(enc_output) + self.W_h(hidden_with_time_axis) + self.W_c(prev_coverage)))
    # Cast the encoder padding mask to the score dtype so the two can be multiplied.
    mask = tf.cast(enc_pad_mask, dtype=score.dtype)
    # Drop the trailing axis, multiply by the mask, then restore the axis.
    # NOTE(review): assuming enc_pad_mask is 0 at padded positions, this sets
    # their scores to 0 rather than a large negative value, so they still get
    # non-zero weight from the softmax below -- confirm this is intended.
    masked_score = tf.squeeze(score, axis=-1) * mask
    masked_score = tf.expand_dims(masked_score, axis=2)
    # Softmax over axis 1; attention_weights shape (batch_size, max_len, 1) per the original note.
    attention_weights = tf.nn.softmax(masked_score, axis=1)

    # Add this step's attention weights to the previous coverage.
    coverage = attention_weights + prev_coverage
    # This part is mainly a modification made inside the Attention layer's call method.


In [None]:
class Pointer(tf.keras.layers.Layer):
    """Layer that computes the generation probability ``pgen``.

    Inputs (shapes as noted by the original author):
        context_vector: [batch_sz, enc_units]
        dec_hidden: [batch_sz, dec_units]
        dec_inp: [batch_sz, 1, embedding_dim + enc_units]

    Output:
        pgen: sigmoid of the three summed single-unit projections.
    """

    def __init__(self):
        super().__init__()
        # One Dense(1) projection per term of the pgen formula.
        self.w_s_reduce = tf.keras.layers.Dense(1)
        self.w_i_reduce = tf.keras.layers.Dense(1)
        self.w_c_reduce = tf.keras.layers.Dense(1)

    def call(self, context_vector, dec_hidden, dec_inp):
        """Return ``sigmoid(w_s(dec_hidden) + w_c(context_vector) + w_i(dec_inp))``."""
        # Remove axis 1 from the decoder input before projecting it.
        flat_inp = tf.squeeze(dec_inp, axis=1)
        hidden_term = self.w_s_reduce(dec_hidden)
        context_term = self.w_c_reduce(context_vector)
        input_term = self.w_i_reduce(flat_inp)
        return tf.nn.sigmoid(hidden_term + context_term + input_term)

# 先验知识其他技巧

In [None]:
class ScaleShift(Layer):
    """Scale-and-shift layer: ``exp(log_scale) * inputs + shift``.

    Both weights are initialised to zeros, so the initial scale factor is
    ``exp(0)`` and the initial shift is 0.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the two weights, sized 1 on every axis except the last."""
        leading_ones = (1,) * (len(input_shape) - 1)
        param_shape = leading_ones + (input_shape[-1],)
        self.log_scale = self.add_weight(
            name='log_scale',
            shape=param_shape,
            initializer='zeros',
        )
        self.shift = self.add_weight(
            name='shift',
            shape=param_shape,
            initializer='zeros',
        )

    def call(self, inputs):
        """Apply the scale and shift to ``inputs``."""
        # The scale is stored in log space and exponentiated on use.
        scale = K.exp(self.log_scale)
        return scale * inputs + self.shift

- RNN（LSTM/GRU）参数初始化

- 梯度裁剪（gradient clipping）

- Dropout/L2 regularization
在decoder模型框架定义里边先过一个GRU，再过一个Dropout。
L2 regularization：还没加，可以自己试试看看

- learning rate

- 双向lstm、gru

- 数据预处理

- encoder和decoder的embedding共享参数（也就是用一套词向量）

- 采用预训练词向量

- batch size调小，16，32，64

- hidden states 128，256

- Adam beta1=0.9， beta2=0.999， epsilon=1e-8

- learning rate=0.0001， 0.001

- clipping gradient = 2.0  
梯度裁剪（gradient clipping）的参考值
通常在选择优化器的地方进行设置



In [1]:
import tensorflow as tf
# Print the built-in documentation for the Bidirectional RNN wrapper.
help(tf.keras.layers.Bidirectional)

Help on class Bidirectional in module tensorflow.python.keras.layers.wrappers:

class Bidirectional(Wrapper)
 |  Bidirectional(layer, merge_mode='concat', weights=None, backward_layer=None, **kwargs)
 |  
 |  Bidirectional wrapper for RNNs.
 |  
 |  Arguments:
 |    layer: `Recurrent` instance.
 |    merge_mode: Mode by which outputs of the
 |      forward and backward RNNs will be combined.
 |      One of {'sum', 'mul', 'concat', 'ave', None}.
 |      If None, the outputs will not be combined,
 |      they will be returned as a list.
 |    backward_layer: Optional `Recurrent` instance to be used to handle
 |      backwards input processing. If `backward_layer` is not provided,
 |      the layer instance passed as the `layer` argument will be used to
 |      generate the backward layer automatically.
 |      Note that the provided `backward_layer` layer should have properties
 |      matching those of the `layer` argument, in particular it should have the
 |      same values for `state