##### 1. encoder

In [None]:
class EncoderSeq(nn.Module):
    ...
    def forward(self,
                input_seqs,
                input_lengths,
                batch_graph,
                hidden=None):
        """Embed the tokens, run the recurrent layer on the packed batch, then apply the graph module.

        Shapes below follow the shape notes that accompany this code:

        Args:
            input_seqs:    token ids, [seq_len, batch_size]
            input_lengths: true length of every sequence, [batch_size]
            batch_graph:   adjacency tensors, [batch_size, 5, seq_len, seq_len]
            hidden:        optional initial state passed straight to ``self.gru_pade``

        Returns:
            (pade_outputs, problem_output) where
            pade_outputs   is [seq_len, batch_size, hidden_size] (graph-refined token states) and
            problem_output is [batch_size, hidden_size] (one summary vector per sequence).
        """
        rnn_utils = torch.nn.utils.rnn
        width = self.hidden_size

        # [seq_len, batch_size, embedding_size]
        token_vecs = self.em_dropout(self.embedding(input_seqs))

        # Pack so the recurrent layer skips padded positions, then unpack again.
        packed_in = rnn_utils.pack_padded_sequence(token_vecs, input_lengths)
        packed_out, _final_state = self.gru_pade(packed_in, hidden)
        rnn_out, _lengths = rnn_utils.pad_packed_sequence(packed_out)
        # rnn_out: [seq_len, batch_size, 2*hidden_size]

        # First `hidden_size` channels vs. the remaining channels of the RNN output.
        first_half = rnn_out[:, :, :width]
        second_half = rnn_out[:, :, width:]

        # Summary vector: last time step of the first half + first time step of the second half.
        # NOTE(review): index -1 is the last *padded* step, so for sequences shorter than the
        # longest one in the batch this reads a padding position -- confirm this is intended.
        problem_output = first_half[-1] + second_half[0]  # [batch_size, hidden_size]

        # Fold both halves into a single hidden_size-wide representation.  S x B x H
        summed = first_half + second_half

        # The graph module returns (adjacency, features); only the features are used here.
        _, graph_out = self.gcn(summed, batch_graph)  # [batch_size, seq_len, hidden_size]

        # Back to time-major layout: [seq_len, batch_size, hidden_size]
        pade_outputs = graph_out.transpose(0, 1)

        return pade_outputs, problem_output

**(1) Node Representation**  
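**Module:** BiLSTM neural network (note: the `EncoderSeq` code above calls a recurrent layer named `gru_pade`, which suggests a bidirectional GRU rather than an LSTM)  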
**Output:**  
$$H = \{h_{1}, ..., h_{N}\} \in R^{N \times d}, N = m + l$$
$d$ denotes the dimension of hidden vectors  
$m$ represents the number of words  
$l$ represents the number of quantities  

In [None]:
class LayerNorm(nn.Module):
    """Normalize the last dimension to zero mean / unit std, then apply a learned scale and shift.

    Args:
        features: size of the last dimension (length of the scale and shift vectors).
        eps: small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # learned scale, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # learned shift, starts at 0
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` computed over the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_out).

    Args:
        d_model: size of the input's last dimension.
        d_ff: width of the hidden layer.
        d_out: size of the output's last dimension.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff ,d_out, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_out)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the two linear layers with ReLU and dropout in between."""
        hidden = self.w_1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [None]:
class Graph_Module(nn.Module):
    """Graph encoder: several GCN heads, concatenated, followed by norm + residual and a feed-forward sub-layer.

    NOTE(review): this is an abridged excerpt. ``__init__`` takes no arguments yet reads
    ``indim``, ``hiddim``, ``outdim``, ``dropout`` and ``self.d_k``, none of which are defined
    in the visible code, and it does not call ``super().__init__()``. ``clones`` and
    ``self.get_adj`` are not shown here either -- confirm against the full source.
    """

    def __init__(self):
        # 4 GCN networks -- presumably ``clones`` yields N=4 GCN heads, one per
        # adjacency matrix in the list built by forward(); TODO confirm.
        self.graph = clones(module=GCN(in_feat_dim=indim,
                                       nhid=hiddim,
                                       out_feat_dim=self.d_k,
                                       dropout=dropout),
                            N=4)

        self.feed_foward = PositionwiseFeedForward(indim, hiddim, outdim, dropout)
        self.norm = LayerNorm(outdim)

    def forward(self, graph_nodes, graph):
        """Run every GCN head on its adjacency matrix and fuse the results.

        Shapes follow the shape notes that accompany this code:

        Args:
            graph_nodes: node features, [seq_len, batch_size, hidden_size]
                (or already batch-first, see the layout check below).
            graph:       stacked adjacency matrices, [batch_size, 5, seq_len, seq_len];
                         may be an empty tensor.

        Returns:
            (adj, graph_encode_features): ``adj`` is ``graph.float()`` when ``graph`` is
            non-empty, otherwise the result of ``self.get_adj``; the features are
            [batch_size, seq_len, hidden_size].
        """
        # Layout check: if the leading dims disagree, graph_nodes is taken to be
        # time-major and is flipped to [batch_size, seq_len, hidden_size].
        if graph_nodes.size(0) != graph.size(0):
            graph_nodes = graph_nodes.transpose(0, 1)

        # graph.numel() is the number of elements in the tensor; 0 means no graph was given.
        if graph.numel():
            adj = graph.float()  # [batch_size, 5, seq_len, seq_len]
            comparison = adj[:, 1, :]  # Quantity Comparison Graph
            cell = adj[:, 4, :]        # Quantity Cell Graph
            per_head_adj = [comparison, comparison, cell, cell]
        else:
            # No graph supplied: derive one adjacency from the nodes and share it across heads.
            adj = self.get_adj(graph_nodes)
            per_head_adj = [adj] * 4

        # One output per head, each [batch_size, seq_len, head_dim]; the concatenated width
        # must equal hidden_size for the residual addition below to work.
        head_outputs = []
        for head, head_adj in zip(self.graph, per_head_adj):
            head_outputs.append(head(graph_nodes, head_adj))
        merged = torch.cat(head_outputs, dim=2)

        # Norm & Add => Z => Z^
        g_feature = self.norm(merged) + graph_nodes  # [batch_size, seq_len, hidden_size]

        # Feed-forward sub-layer with residual connection => Z-
        graph_encode_features = self.feed_foward(g_feature) + g_feature

        return adj, graph_encode_features



**Graph Transformer:**  
  
**Graph_Module**:  
**inputs:**  
* adjacency matrices of multiple graphs $\{A_{k}\}_{k=1}^{K}$
* initial node embeddings $H$
* Quantity Comparison Graph $adj[:, 1, :]$
* Quantity Cell Graph $adj[:, 4, :]$
  
**outputs**: 
* adjacency matrices of multiple graphs $\{A_{k}\}_{k=1}^{K}$
* graph representation $z_{g}$
  
**model**:  
* for each graph $A_{k}$ in $\{A_{k}\}_{k=1}^{K}$ (here $K=4$), run a separate GCN and concatenate the $K$ outputs
* each GCN output = $[batch\_size, seq\_len, outdim]$, and $outdim=hidden\_size / 4$  
$$Z = \overset{K}{\underset{k=1}{\parallel}} GCN(A_{k}, H)$$
* $\parallel$ denote the concatenation of the K GCN heads.  
* GCN layer = Layer Normalization layer + Residual Connection
$$\hat{Z} = Z + LayerNorm(Z)$$
* Feed-Forward Network FFN (a two-layer feed-forward network with a ReLU between the layers)  
$$FFN(x) = max(0, x W_{f1} + b_{f1}) W_{f2} + b_{f2}$$
* feed-forward network sub-layer
$$\bar{Z} = \hat{Z} + LayerNorm(FFN(\hat{Z}))$$
* **(not mentioned)** a min-pooling operation over all learned node representations, whose result is then fed into a fully connected neural network (FC) to generate the graph representation
$$Z_{g} = FC(MinPool(\bar{Z}))$$


In [None]:
class GCN(nn.Module):
    """Two stacked graph-convolution layers with ReLU and dropout in between.

    NOTE(review): ``__init__`` is abridged (``...``); ``in_feat_dim``, ``nhid``,
    ``out_feat_dim`` and ``dropout`` are not defined in the visible code.
    """

    def __init__(self):
        ...
        self.gc1 = GraphConvolution(in_feat_dim, nhid)
        self.gc2 = GraphConvolution(nhid, out_feat_dim)
        self.dropout = dropout  # dropout probability used in forward()

    def forward(self, x, adj):
        """Return ``gc2(dropout(relu(gc1(x, adj))), adj)``; the same ``adj`` feeds both layers."""
        hidden = self.gc1(x, adj)
        hidden = F.relu(hidden)
        # Functional dropout: only active while the module is in training mode.
        hidden = F.dropout(hidden, self.dropout, training=self.training)
        return self.gc2(hidden, adj)

**GCN**  
  
**Inputs:**  
* adjacency matrix $A_{k}$, which represents the graph structure
* feature matrix $X$, which holds the input features of all nodes
  
**Outputs:**  
* graph node feature
  
**model**:  
$$GCN(A_{k}, X) = GConv_{2}(A_{k}, GConv_{1}(A_{k}, X))$$

In [None]:
import math
import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


# Graph_Conv
class GraphConvolution(Module):
    """
    Single graph-convolution layer computing ``adj @ (input @ weight)`` plus an optional bias.

    Similar to the simple GCN layer of https://arxiv.org/abs/1609.02907

    Args:
        in_features: size of the last dimension of the input node features.
        out_features: size of the last dimension of the output.
        bias: when False, no bias parameter is created and ``self.bias`` is None.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialized; reset_parameters() below fills in the values.
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            # Register the name so ``self.bias`` exists and reads as None.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Fill weight (and bias, if present) uniformly in ±1/sqrt(out_features)."""
        bound = 1. / math.sqrt(self.weight.size(1))
        for param in (self.weight, self.bias):
            if param is not None:
                param.data.uniform_(-bound, bound)

    def forward(self, input, adj):
        """Project the node features, then aggregate them through the adjacency matrix.

        Per the shape notes accompanying this code, ``input`` is
        [batch_size, seq_len, in_features] and the result is
        [batch_size, seq_len, out_features].
        """
        projected = torch.matmul(input, self.weight)   # input * weight
        aggregated = torch.matmul(adj, projected)      # adj * input * weight

        if self.bias is None:
            return aggregated
        return aggregated + self.bias


**GraphConv**  
  
**input:**  
* graph node representations $X = input$
* adjacency matrix $A_{k} = adj$
    
**output:**  
* graph updated feature 
  
**model:**  
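$$GConv(A_{k}, X) = relu(A_{k} X^{T} W_{gk})$$  
where $W_{gk} \in R^{d \times d_{k}}$ and $d_k = d/K$.  
Note: the `GraphConvolution.forward` code above computes `adj @ (input @ weight) + bias`, with no transpose of the input and no activation; the `relu` is applied by the caller (`GCN.forward`, after the first layer only).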

#### 2. **prediction**

#### 3. **generate**

#### 4. **merge**