In [1]:
# python自带
import os
import re
import random
from lxml import etree

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

In [3]:
# 科学计算
import numpy as np
import torch
import torch.nn a s nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
# NLP 相关
import jieba
import torchtext
from nltk import word_tokenize
from nltk.translate import bleu_score

In [6]:
def word_tokenize_zh(input):
    return list(jieba.cut(input))

In [28]:
SRC = torchtext.data.Field(tokenize=word_tokenize, eos_token='<eos>')
TRG = torchtext.data.Field(tokenize=word_tokenize_zh, init_token='<sos>', eos_token='<eos>')

train_data = torchtext.datasets.TranslationDataset(
    path='data/news-commentary-v12.zh-en',
    exts=('.en', '.zh'),
    fields=(SRC, TRG)
)


In [29]:
print(len(train_data))

227383


In [30]:
dev_data = torchtext.datasets.TranslationDataset(
    path='data/newsdev2017-enzh',
    exts=('.en', '.zh'),
    fields=(SRC, TRG)
)

In [31]:
print(len(dev_data))

2002


In [32]:
test_data = torchtext.datasets.TranslationDataset(
    path='data/newstest2017-enzh',
    exts=('.en', '.zh'),
    fields=(SRC, TRG)
)

In [33]:
print(len(test_data))

2001


In [34]:
SRC.build_vocab(train_data.src, dev_data.src, test_data.src, min_freq=5)

In [35]:
TRG.build_vocab(train_data.trg, dev_data.trg, test_data.trg, min_freq=5)

In [36]:
print(len(SRC.vocab.__dict__['freqs']))
print(len(TRG.vocab.__dict__['freqs']))

95171
91404


In [37]:
SRC.vocab.load_vectors(torchtext.vocab.Vectors('data/glove.840B.300d.txt'))

In [38]:
TRG.vocab.load_vectors(torchtext.vocab.Vectors('data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5'))

In [143]:
class Encoder(nn.Module):
    '''Encoder(bi-GRU)
    '''
    def __init__(self, pretrained_embed, padding_idx, fix, hidden_size,
                 n_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(pretrained_embed)
        self.embedding.padding_idx = padding_idx
        if fix:
            self.embedding.weight.requires_grad = False
        
        self.gru = nn.GRU(self.embedding.embedding_dim, hidden_size, n_layers,
                            dropout=dropout, bidirectional=True)

    def forward(self, src, hidden=None):
        '''
        Inputs:
            src: input word index
            hidden: h_t-1 (num_layers * num_directions, batch, hidden_size)
        Outputs:
            output: [T*B*H]
            hidden: h_t
        '''
        embeded = self.embedding(src)
        outputs, hidden = self.gru(embeded, hidden)
        
        # Sum bi-lstm outputs
        output = (outputs[:, :, :self.hidden_size] + 
                   outputs[:, :, self.hidden_size:])
        return output, hidden


class ConcatAttn(nn.Module):
    '''Attention(concat)
    Params:
        hidden_size: hidden size
    '''
    def __init__(self, hidden_size):
        super(ConcatAttn, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1.0 / sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)
    
    def forward(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [1*B*H] 
            encoder_output: [T*B*H]
        Outputs:
            energy: normalised weights [B*1*T]
        '''
        # Expand hidden [1*B*H] -> [T*B*H] -> [B*T*H]
        hidden = hidden.repeat(encoder_output.size(0), 1, 1).transpose(0, 1)

        # Transfer encoder_output to [B*T*H]
        encoder_output = encoder_output.transpose(0, 1)

        # Calculate energy and normalise  [B*1*T]
        attn_energy = self.score(hidden, encoder_output)
        return F.softmax(attn_energy, dim=2)

    def score(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [B*T*H]
            encoder_output: [B*T*H]
        Outputs:
            attn_energy: weights [B*T]
        '''
        # Project vectors [B*T*2H] -> [B*T*H] -> [B*H*T]
        energy = self.attn(torch.cat([hidden, encoder_output], 2))
        energy = energy.transpose(1, 2)
        
        # Expend v  [H] -> [B*H] -> [B*1*H]
        v = self.v.repeat(encoder_output.size(0), 1).unsqueeze(1)
        
        # [B*1*H] * [B*H*T] -> [B*1*T]
        attn_energy = torch.bmm(v, energy)
        return attn_energy

        
class BilinearAttn(nn.Module):
    '''Attention(bilinear)
    Params:
        hidden_size: hidden size
    '''
    def __init__(self, hidden_size):
        super(BilinearAttn, self).__init__()
        self.hidden_size = hidden_size
        self.bilinear = nn.Linear(hidden_size, hidden_size)

    
    def forward(self, hidden, encoder_output):
        '''
        Inputs:
            hidden: [1*B*H] 
            encoder_output: [T*B*H]
        Outputs:
            energy: normalised weights [B*1*T]
        '''
        # [T*B*H] -> [T*B*H] -> [B*H*T]
        wh = self.bilinear(encoder_output).permute(1, 2, 0)
        
        # [1*B*H] -> [B*1*H] x [B*H*T] => [B*1*T]
        score = hidden.transpose(0, 1).bmm(wh)
        
        return F.softmax(score, dim=2)
    


class Decoder(nn.Module):
    '''Decoder(bi-GRU)
    '''
    def __init__(self, pretrained_embed, padding_idx, hidden_size, fix, output_size,
                 n_layers=1, dropout=0.2):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained_embed)
        self.embedding.padding_idx = padding_idx
        if fix:
            self.embedding.weight.requires_grad = False
            
        self.dropout = nn.Dropout(dropout, inplace=True)
        
        self.attention = BilinearAttn(hidden_size)
        
        self.gru = nn.GRU(
            self.embedding.embedding_dim,
            hidden_size,
            n_layers,
            dropout=dropout
        )
        
        self.linear1 = nn.Linear(hidden_size * 2, hidden_size * 2, bias=False)
        self.linear2 = nn.Linear(hidden_size * 2, output_size, bias=False)

    def forward(self, input, hidden, encoder_output):
        '''
        Inputs:
            input: [B]
            hidden: [layers*B*H]
            encoder_output: [T*B*H]
        Outputs:
            p: [B*O]
            hidden: [layers*B*H]
        '''
        # [B] -> [B*E] -> [1*B*E]
        embeded = self.embedding(input).unsqueeze(0) 
            
        # [1*B*H], [layers*B*H]
        output, hidden = self.gru(embeded, hidden)
        
        # ht: [B*H]  the last layer
        ht = hidden[-1, :, :]
        
        # [1*B*T] and [T*B*H] -> [B*1*T]
        attn_weights = self.attention(ht.unsqueeze(0), encoder_output)
        
        # [B*1*T] x [B*T*H] => [B*1*H] -> [B*H]
        c = attn_weights.bmm(encoder_output.transpose(0, 1)).squeeze(1)
        
        # concat c and h => [B*2H] => [B*H] 
        attn_vector = F.tanh(self.linear1(
            torch.cat([c, ht], dim=1)
        ))
        
        # [B*H] -> [B*O]
        p = F.softmax(self.linear2(attn_vector), dim=1)

        return p, hidden

In [150]:
batch_size = 16

In [151]:
train_iter = torchtext.data.BucketIterator(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True
)

dev_iter = torchtext.data.BucketIterator(
    dataset=dev_data,
    batch_size=batch_size,
    shuffle=True
)

test_iter = torchtext.data.BucketIterator(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=True
)

In [152]:
EOS_TOKEN = 2

In [153]:
def training(epoch, encoder, decoder, encoder_optimizer, decoder_optimizer ,criterion, eval_steps, train_iter, dev_iter, device):
    encoder.to(device)
    decoder.to(device)
    
    step = 0
    train_loss = 0.0
    lowest_loss = 1e5
    
    encoder.train()
    decoder.train()
    for e in range(epoch):
        train_iter.init_epoch()
        for train_batch in iter(train_iter):
            step += 1
            
            # [T*B]
            src = train_batch.src.to(device)
            trg = train_batch.trg.to(device)
            
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            
            # encoder
            encoder_output, hidden = encoder(src)
            
            # decoder            
            hidden = hidden[:decoder.n_layers]
            decoder_input = trg[0] # SOS
            
            
            loss = 0.0
            for i in range(trg.size(0) - 1):
                p, hidden = decoder(
                    decoder_input, hidden, encoder_output
                )
                loss += criterion(p, trg[i+1])
                decoder_input = trg[i+1]

                
            loss.backward()
            train_loss += loss.item()
            
            encoder_optimizer.step()
            decoder_optimizer.step()
            
            if (step % eval_steps) == 0:
                with torch.no_grad():
                    encoder.eval()
                    decoder.eval()
                    
                    dev_iter.init_epoch()
                    dev_loss = 0.0
                    dev_step = 0
                    for dev_batch in iter(dev_iter):
                        dev_step += 1
                        dev_src = dev_batch.src.to(device)
                        dev_trg = dev_batch.trg.to(device)
                        
                        encoder_output, hidden = encoder(src)
                        hidden = hidden[:decoder.n_layers]
                        decoder_input = trg[0]
                        for i in range(trg.size(0) - 1):
                            p, hidden = decoder(
                                decoder_input, hidden, encoder_output
                            )
                            dev_loss += criterion(p, trg[i+1])
                            decoder_input = trg[i+1] 
                    train_loss /= eval_steps
                    dev_loss /= dev_step
                    print("epoch {0} steps {1} train_loss {2} dev_loss{3}".format(
                        e, step, train_loss, dev_loss
                    ))
                    
                    if dev_loss < lowest_loss:
                        dev_loss = lowest_loss
                        save(
                            encoder=encoder,
                            decoder=decoder,
                            info={'steps':step, 'epoch':e, 'train_loss':train_loss, 'dev_loss':dev_loss}
                        )
                    
                    train_loss = 0.0
                    encoder.train()
                    decoder.train()

def save(encoder, decoder, info):
    torch.save(info, 'best_model.info')
    torch.save(encoder, 'best_encoder.m')
    torch.save(decoder, 'best_decoder.m')
    
def load():
    encoder = torch.load('best_encoder.m')
    decoder = torch.load('best_decoder.m')
    info = torch.load('best_model.info')
    return encoder, decoder, info

In [154]:
choise = "cuda" if torch.cuda.is_available() else "cpu"
print(choise + " is available")
device = torch.device(choise)

cuda is available


In [155]:
encoder = Encoder(pretrained_embed=SRC.vocab.vectors, padding_idx=SRC.vocab.stoi[SRC.pad_token], fix=True, hidden_size=128, dropout=0.3, n_layers=4)
decoder = Decoder(pretrained_embed=TRG.vocab.vectors, padding_idx=TRG.vocab.stoi[TRG.pad_token], fix=True, hidden_size=128, dropout=0.3, n_layers=4, output_size=len(TRG.vocab.freqs))

encoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()), lr=1e-3)
decoder_optimizer = optim.Adam(filter(lambda p: p.requires_grad, decoder.parameters()), lr=1e-3)

criterion = nn.NLLLoss()


training(epoch=20, encoder=encoder, decoder=decoder, encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, 
         criterion=criterion, eval_steps=500, train_iter=train_iter, dev_iter=dev_iter, device=device)

RuntimeError: CUDA error: device-side assert triggered