In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import Field, BucketIterator
import numpy as np

import spacy
import en_core_web_sm
import de_core_news_sm
spacy_en = en_core_web_sm.load()
spacy_de = de_core_news_sm.load()

In [66]:
class SelfAttention(nn.Module):
    """Self-Attention block.
    
    """
    
    def __init__(self, d_k, d_q, d_v, d_temp, dropout=0.0):
        super(SelfAttention, self).__init__()
        
        self.k_linear = nn.Linear(d_k, d_temp)
        self.q_linear = nn.Linear(d_q, d_temp)
        self.v_linear = nn.Linear(d_v, d_v)
        self.layer_norm = nn.LayerNorm(d_v)
        
        self.scale = d_v**(-0.5)
        
    def forward(self, k, q, v, attn_mask=None):
        """
        Shape:
            k: [B, S, d_k];
            q: [B, L, d_q];
            v: [B, S, d_v];
            attn_mask: [L, S], L is the target sequence length, S is the source sequence
        length, optional.
        """
        residual = q # residual: [B, S, d_v]
        
        # Linear before scaled dot-product attntion
        _k = self.k_linear(k)
        _q = self.q_linear(q)
        _v = self.v_linear(v)
        
        # scaled dot-product attntion
        attention = torch.bmm(_k, _q.transpose(1,2)) * self.scale # attention: [B, S, L]
        
        # mask
        if attn_mask is not None: 
            attention = attention.masked_fill_(attn_mask, -np.inf)
            
        attn_weight = F.softmax(attention, dim=1) # attn_weight: [B, S, L]
        output = torch.bmm(attn_weight.transpose(1,2), v) # output: [B, L, d_embed]
        
        # add residual and norm layer
        output = self.layer_norm(output + residual)
        
        return output, attn_weight

In [80]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention, based on SelfAttention.
    
    """
    def __init__(self, d_input, d_temp, nhead, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        
        self.nhead = nhead
        d_k, d_q, d_v = d_input, d_input, d_input
        self.self_attn_list = nn.ModuleList(SelfAttention(d_k, d_q, d_v, d_temp) for _ in range(nhead))
        self.fc_layer = nn.Linear(d_input*nhead, d_input)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, k, q, v, attn_mask=None):
        output_list, attn_weight_list = [], []
        for i in range(self.nhead):
            # # output: [B, L, d_embed], attn_weight: [B, S, L]
            if attn_mask is not None:
                output_list_i, attn_weight_i = self.self_attn_list[i](k, q, v, attn_mask)
            else:
                output_list_i, attn_weight_i = self.self_attn_list[i](k, q, v)
            output_list_i, attn_weight_i = self.self_attn_list[i](k, q, v)
            output_list.append(output_list_i)
            attn_weight_list.append(attn_weight_i)
        output_concat = torch.cat(output_list, dim=-1) # output_concat: [B, L, d_embed*nhead]
        output = self.dropout(self.activation(self.fc_layer(output_concat))) # output: [B, L, d_embed]
        
        return output, attn_weight_list

In [68]:
class FeedForward(nn.Module):
    """Feed forward in Encoder and Decoder layer.
    """
    def __init__(self, d_input, d_output, d_ff):
        super(FeedForward, self).__init__()
        
        self.linear_1 = nn.Conv1d(d_input, d_ff, 1)
        self.linear_2 = nn.Conv1d(d_ff, d_input, 1)
        self.relu = nn.ReLU()
        self.layer_norm = nn.LayerNorm(d_output)
        
    def forward(self, x):
        """
        Args:
            x: [B, L, d_embed]
        """
        # feed forward
        output = self.linear_2(self.relu(self.linear_1(x)))
        
        # add residual and norm layer
        output = self.layer_norm(output + x)
        
        return output

In [81]:
class EncoderLayer(nn.Module):
    """Encoder sub-layer.
    Args:
        d_input: the dimension of embedded input.
        d_output: the dimensiof of output.
        d_temp: the dimension of multi-head linear's output.
        nhead: the number of heads in the multiheadattention models (default=8).
        d_ff: the dimension of the feedforward network model (default=2048).
    """
    def __init__(self, d_input, d_output, d_temp=2048, nhead=8, d_ff=2048, dropout=0.0):
        
        self.multi_head_attn = MultiHeadAttention(d_input, d_temp, nhead, dropout)
        self.feed_forward = FeedForward(d_input, d_output, d_ff)
        
    def forward(self, input, attn_mask=None):
        output_attn, attn_weight = self.multi_head_attn(input, input, input, attn_mask)
        output = self.feed_forward(output_attn)
        
        return output, attn_weight

In [86]:
class EncoderStack(nn.Module):
    r"""The encoder stack.
    
    Args:
        d_input: the dimension of embedded input.
        d_output: the dimensiof of output.
        d_temp: the dimension of multi-head linear's output.
        num_encoder: the number of encoder layers.
        nhead: the number of heads in the multiheadattention models (default=8).
        d_ff: the dimension of the feedforward network model (default=2048).
    """
    def __init__(self, d_input, d_output, d_temp=2048, num_encoder=6, nhead=8, d_ff=2048,
                dropout=0.0):
        super(EncoderStack, self).__init__()
        
        self.num_encoder = num_encoder
        self.encoder_stack = nn.Sequential(
            EncoderLayer(d_input, d_output, d_temp, nhead, d_ff, dropout) 
            for _ in range(num_encoder))
        
        self._reset_parameters()
    
    def forward(self, input_embedding, mask=None):
        r"""Pass the input through the encoder layers in turn.

        Args:
            src: the sequnce to the encoder (required).
            mask: the mask for the src sequence (optional).

        Shape:
            - src: :math:`(S, N, E)`.
        """
        output = input_embedding
        for i in range(self.num_encoder):
            output, _ = self.encoder_stack[i](output, mask)
        
        return output
    
    def _reset_parameters(self):
        r"""Initiate parameters in the EncoderStack model."""

        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

In [87]:
class DecoderLayer(nn.Module):
    """Decoder sub-layer.
    """
    def __init__(self, d_input, d_output, d_temp=2048, nhead=8, d_ff=2048, dropout=0.0):
        super(DecoderLayer, self).__init__()
        
        self.masked_multi_head_attn = MultiHeadAttention(d_input, d_temp, nhead, dropout)
        self.multi_head_attn = MultiHeadAttention(d_input, d_temp, nhead, dropout)
        self.feed_forward = FeedForward(d_input, d_output, d_ff)
        
    def forward(self, input, encoder_output, mask=None):
        output_1, _ = self.masked_multi_head_attn(input, mask)
        output_2, _ = self.multi_head_attn(encoder_output, encoder_output, output_1)
        output = self.feed_forward(output_2)
        
        return output

In [88]:
class DecoderStack(nn.Module):
    r"""In order to translate non-autoregressively and parallelize the decoding process, 
    The decoder is different with the original Transformer network. Mainly in the following
    aspects:
        1. mask out each query position only from attending to itself.
        
    Args:
        
    
    """
    def __init__(self, d_input, d_output, d_temp=2048, num_decoder=6, nhead=8, d_ff=2048,
                dropout=0.1):
        super(DecoderStack, self).__init__()
        
        self.num_decoder = num_decoder
        self.decoder_stack = nn.Sequential(
            DecoderLayer(d_input, d_output, d_temp, nhead, d_ff, dropout)
            for _ in range(num_decoder))
        
        self._reset_parameters()
        
    def forward(self, output_embedding, encoder_output, mask=None):
        output = output_embedding
        for i in range(self.num_decoder):
            output = self.decoder_stack[i](output, encoder_output, mask)
        
        return output
        
    def _reset_parameters(self):
        r"""Initiate parameters in the DecoderStack model."""

        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

In [89]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_embed, max_seq_len):
        super(PositionalEncoding, self).__init__()
        
        position_encoding = np.array([
          [pos / np.pow(10000, 2.0 * (j // 2) / d_embed) for j in range(d_embed)]
          for pos in range(max_seq_len)])

        position_encoding[:, 0::2] = np.sin(position_encoding[:, 0::2])
        position_encoding[:, 1::2] = np.cos(position_encoding[:, 1::2])

        # 在PE矩阵的第一行，加上一行全是0的向量，代表这`PAD`的positional encoding
        # 在word embedding中也经常会加上`UNK`，代表位置单词的word embedding，两者十分类似
        # 那么为什么需要这个额外的PAD的编码呢？很简单，因为文本序列的长度不一，我们需要对齐，
        # 短的序列我们使用0在结尾补全，我们也需要这些补全位置的编码，也就是`PAD`对应的位置编码
        pad_row = torch.zeros([1, d_model])
        position_encoding = torch.cat((pad_row, position_encoding))
        
        # 嵌入操作，+1是因为增加了`PAD`这个补全位置的编码，
        # Word embedding中如果词典增加`UNK`，我们也需要+1。看吧，两者十分相似
        self.position_encoding = nn.Embedding(max_seq_len + 1, d_model)
        self.position_encoding.weight = nn.Parameter(position_encoding,
                                                     requires_grad=False)


SyntaxError: invalid syntax (<ipython-input-89-0f063d199bfb>, line 2)

In [96]:
diag_ones = np.array([1]*10)
tgt_mask = np.diag(diag_ones)
tgt_mask = tgt_mask.float().masked_fill(tgt_mask, float('-inf'))

AttributeError: 'numpy.ndarray' object has no attribute 'float'

In [None]:
class NAT(nn.Module):
    """Non-autoregressive transformer.
    """
    def __init__(self)

In [95]:
diag_ones = torch.ones((q.size(0), q.size(1))) # diag_ones: [B, L]
attn_mask = torch.diag_embed(diag_ones).byte() # attn_mask: [B, L, L]

diag_ones = torch.ones((2, 5))
mask = torch.diag_embed(diag_ones).byte()
# attn_mask = torch.from_numpy(mask).byte()
attn_weight = torch.rand(2, 5, 5) # [B, S, L]
# attn_mask = attn_mask.repeat(2, 1, 1)
attn_weight = attn_weight.masked_fill_(mask, -np.inf)
attn_weight = F.softmax(attn_weight, dim=1)



In [94]:
mask.float().masked_fill(mask, float('-inf'))



tensor([[[-inf, 0., 0., 0., 0.],
         [0., -inf, 0., 0., 0.],
         [0., 0., -inf, 0., 0.],
         [0., 0., 0., -inf, 0.],
         [0., 0., 0., 0., -inf]],

        [[-inf, 0., 0., 0., 0.],
         [0., -inf, 0., 0., 0.],
         [0., 0., -inf, 0., 0.],
         [0., 0., 0., -inf, 0.],
         [0., 0., 0., 0., -inf]]])

In [25]:
mask = np.array([[1 for _ in range(4)] for _ in range(3)])
bt = torch.from_numpy(mask).byte()

In [26]:
bt

tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]], dtype=torch.uint8)

In [18]:
bt.masked_fill?

In [4]:
class FertilityPredictor(nn.Module):
    pass

In [5]:
class TranslationPredictor(nn.Module):
    pass