In [1]:
# Standard library (note: lxml below is a third-party package)
import os
import re
import random
from lxml import etree

In [2]:
# Restrict CUDA to the physical GPU with index 3. This cell runs before the
# cell that imports torch, so the setting is in place when torch is loaded.
# NOTE(review): the index is machine-specific — adjust for other hosts.
os.environ.update(CUDA_VISIBLE_DEVICES='3')

In [3]:
# Scientific computing
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
# NLP-related
import jieba
import torchtext
from nltk import word_tokenize
from nltk.translate import bleu_score

In [5]:
def word_tokenize_zh(input):
    '''Tokenize a Chinese string with jieba.

    Inputs:
        input: text to segment
    Outputs:
        list of the segments produced by jieba.cut, in order
    '''
    # jieba.cut is consumed lazily here; materialise it into a list.
    return [segment for segment in jieba.cut(input)]

In [6]:
# Source side is tokenized with NLTK's word_tokenize, target side with the
# jieba-based word_tokenize_zh defined above.
SRC = torchtext.data.Field(tokenize=word_tokenize)
TRG = torchtext.data.Field(tokenize=word_tokenize_zh)

# Parallel training corpus: `path` is the common file prefix and `exts` the
# per-language suffixes, paired in order with `fields`.
train_data = torchtext.datasets.TranslationDataset(
    fields=(SRC, TRG),
    exts=('.en', '.zh'),
    path='data/news-commentary-v12.zh-en',
)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.769 seconds.
Prefix dict has been built succesfully.


In [7]:
# Sanity check: size of the loaded training dataset.
print(len(train_data))

227383


In [8]:
# Development set, loaded with the same fields (and therefore the same
# tokenizers) as the training data.
dev_data = torchtext.datasets.TranslationDataset(
    fields=(SRC, TRG),
    exts=('.en', '.zh'),
    path='data/newsdev2017-enzh',
)

In [9]:
# Sanity check: size of the loaded development dataset.
print(len(dev_data))

2002


In [10]:
# Test set, loaded with the same fields (and therefore the same tokenizers)
# as the training data.
test_data = torchtext.datasets.TranslationDataset(
    fields=(SRC, TRG),
    exts=('.en', '.zh'),
    path='data/newstest2017-enzh',
)

In [11]:
# Sanity check: size of the loaded test dataset.
print(len(test_data))

2001


In [16]:
# Build the source vocabulary; min_freq=5 is the minimum token count
# required for a token to be included.
# NOTE(review): dev and test sentences also feed the counts here — confirm
# that building the vocabulary on evaluation data is intended.
SRC.build_vocab(
    train_data.src,
    dev_data.src,
    test_data.src,
    min_freq=5,
)

In [15]:
# Build the target vocabulary; min_freq=5 is the minimum token count
# required for a token to be included.
# NOTE(review): dev and test sentences also feed the counts here — confirm
# that building the vocabulary on evaluation data is intended.
TRG.build_vocab(
    train_data.trg,
    dev_data.trg,
    test_data.trg,
    min_freq=5,
)

In [26]:
# Number of distinct tokens in each vocabulary's `freqs` table.
# NOTE(review): presumably `freqs` holds the raw counts before the min_freq
# cut-off, so these numbers may differ from len(vocab) — confirm.
print(len(SRC.vocab.freqs))
print(len(TRG.vocab.freqs))

95167
91404


In [29]:
# Attach pretrained GloVe vectors (840B tokens, 300-d per the file name) to
# the source vocabulary.
# NOTE(review): absolute, user-specific path — make it configurable before
# sharing the notebook.
SRC.vocab.load_vectors(
    torchtext.vocab.Vectors(
        '/home/zyc/Downloads/glove.840B.300d.txt'
    )
)

100%|█████████▉| 2195167/2196018 [04:23<00:00, 8545.75it/s]

In [35]:
# Attach pretrained vectors ('dim300' per the file name) to the target
# vocabulary.
# NOTE(review): absolute, user-specific path — make it configurable before
# sharing the notebook.
TRG.vocab.load_vectors(
    torchtext.vocab.Vectors(
        '/home/zyc/Downloads/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5'
    )
)

In [None]:
class Encoder(nn.Module):
    '''Encoder (bi-GRU over pretrained word embeddings)
    Params:
        pretrained_embed: FloatTensor [V*E] used to initialise the embedding
        padding_idx: index of the padding token
        fix: if true the embedding weights are frozen, otherwise fine-tuned
        hidden_size: hidden size of the GRU (per direction)
        n_layers: number of stacked GRU layers
        dropout: dropout between stacked GRU layers (ignored for one layer)
    '''
    def __init__(self, pretrained_embed, padding_idx, fix, hidden_size,
                 n_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        # forward() needs hidden_size to split the two GRU directions.
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # from_pretrained() freezes the weights by default, so `fix` has to be
        # forwarded explicitly; otherwise fix=False would still be frozen.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embed,
                                                      freeze=bool(fix))
        self.embedding.padding_idx = padding_idx

        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer.
        self.gru = nn.GRU(self.embedding.embedding_dim, hidden_size, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0,
                          bidirectional=True)

    def forward(self, src, hidden=None):
        '''
        Inputs:
            src: input word index [T*B]
            hidden: h_t-1 (num_layers * num_directions, batch, hidden_size)
        Outputs:
            output: [T*B*H], forward and backward directions summed
            hidden: h_t (num_layers * num_directions, batch, hidden_size)
        '''
        embedded = self.embedding(src)
        outputs, hidden = self.gru(embedded, hidden)

        # Sum bi-GRU outputs: the last dim is [forward H | backward H]
        output = (outputs[:, :, :self.hidden_size] +
                  outputs[:, :, self.hidden_size:])
        return output, hidden


class ConcatAttn(nn.Module):
    '''Attention(concat): score(h, s) = v . tanh(W [h; s])
    Params:
        hidden_size: hidden size
    '''
    def __init__(self, hidden_size):
        super(ConcatAttn, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        # Uniform init in [-1/sqrt(H), 1/sqrt(H)]; written with ** so that no
        # extra import is needed (the bare `sqrt` name was never defined).
        stdv = 1.0 / (hidden_size ** 0.5)
        self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-stdv, stdv))

    def forward(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [1*B*H] (for a stacked state [L*B*H] the last layer is used)
            encoder_output: [T*B*H]
        Outputs:
            energy: normalised weights [B*T]
        '''
        steps = encoder_output.size(0)

        # Expand hidden [B*H] -> [B*1*H] -> [B*T*H]
        query = hidden[-1].unsqueeze(1).expand(-1, steps, -1)

        # Transfer encoder_output to [B*T*H]
        keys = encoder_output.transpose(0, 1)

        # Calculate energy and normalise over the time axis  [B*T]
        attn_energy = self.score(query, keys)
        return F.softmax(attn_energy, dim=1)

    def score(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [B*T*H]
            encoder_output: [B*T*H]
        Outputs:
            attn_energy: weights [B*T]
        '''
        # Project vectors [B*T*2H] -> [B*T*H]. The tanh is required: with a
        # purely linear score the hidden-state term is the same for every time
        # step and cancels in the softmax, so the weights would ignore hidden.
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_output], 2)))

        # [B*T*H] . [H] -> [B*T]
        return energy.matmul(self.v)

        
class BilinearAttn(nn.Module):
    '''Attention(bilinear): score(h, s) = h . (W s + b)
    Params:
        hidden_size: hidden size
    '''
    def __init__(self, hidden_size):
        super(BilinearAttn, self).__init__()
        self.hidden_size = hidden_size
        self.bilinear = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [1*B*H] (for a stacked state [L*B*H] the last layer is used)
            encoder_output: [T*B*H]
        Outputs:
            energy: normalised weights [B*T]
        '''
        # [T*B*H] -> [T*B*H] -> [B*H*T]
        wh = self.bilinear(encoder_output).permute(1, 2, 0)

        # Last layer only, so the bmm result always has a singleton middle dim
        # [B*H] -> [B*1*H] x [B*H*T] => [B*1*T] -> [B*T]
        query = hidden[-1].unsqueeze(1)
        score = query.bmm(wh).squeeze(1)

        # Normalise over the time axis explicitly (implicit dim is deprecated)
        return F.softmax(score, dim=1)
    


class Decoder(nn.Module):
    '''Decoder (unidirectional GRU with attention over the encoder output)
    Params:
        pretrained_embed: FloatTensor [V*E] used to initialise the embedding;
            V is also used as the size of the output layer
        padding_idx: index of the padding token
        hidden_size: hidden size (must match the encoder output size)
        n_layers: number of stacked GRU layers
        dropout: dropout on the embedded input and between stacked GRU layers
        fix: if true (default) the embedding weights are frozen
        attention: module mapping (hidden [1*B*H], encoder_output [T*B*H]) to
            weights [B*T]; defaults to BilinearAttn(hidden_size)
    '''
    def __init__(self, pretrained_embed, padding_idx, hidden_size,
                 n_layers=1, dropout=0.2, fix=True, attention=None):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # `fix` used to be an undefined name here; it is now a keyword whose
        # default matches from_pretrained()'s own default (frozen weights).
        self.embedding = nn.Embedding.from_pretrained(pretrained_embed,
                                                      freeze=bool(fix))
        self.embedding.padding_idx = padding_idx

        # Out-of-place dropout, so the embedding output is never overwritten.
        self.dropout = nn.Dropout(dropout)

        if attention is None:
            attention = BilinearAttn(hidden_size)
        self.attention = attention

        # Input at each step is [embedded word ; attended context].
        # nn.GRU applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer.
        self.gru = nn.GRU(self.embedding.embedding_dim + hidden_size,
                          hidden_size, n_layers,
                          dropout=dropout if n_layers > 1 else 0.0)

        # Predict over the target vocabulary (one row per word in the table).
        self.out = nn.Linear(hidden_size * 2, self.embedding.num_embeddings)

    def forward(self, input, hidden, encoder_output):
        '''
        Inputs:
            input: [B]
            hidden: [L*B*H] with L = n_layers ([1*B*H] by default)
            encoder_output: [T*B*H]
        Outputs:
            output: log-probabilities of prediction [B*V]
            hidden: [L*B*H]
        '''
        # [B] -> [B*E] -> [1*B*E]
        embedded = self.dropout(self.embedding(input)).unsqueeze(0)

        # Attention weights from the top layer state; accept [B*T] or [B*1*T]
        attn_weights = self.attention(hidden[-1:], encoder_output)
        if attn_weights.dim() == 2:
            attn_weights = attn_weights.unsqueeze(1)

        # Apply weights  [B*1*T] x [B*T*H] -> [B*1*H] -> [1*B*H]
        context = attn_weights.bmm(encoder_output.transpose(0, 1)).transpose(0, 1)

        # Combine embedded input and attended context  [1*B*(E+H)]
        rnn_input = torch.cat([embedded, context], 2)
        output, hidden = self.gru(rnn_input, hidden)

        # [1*B*H] -> [B*H]
        output = output.squeeze(0)
        context = context.squeeze(0)

        # Output log-probabilities
        output = self.out(torch.cat([output, context], 1))
        output = F.log_softmax(output, dim=1)
        return output, hidden