In [None]:
import numpy as np
import chainer
from chainer import Chain
from chainer import Variable
from chainer import functions as F
from chainer import links as L
from chainer import cuda
from chainer import optimizers

import random
import spacy
from tqdm import tqdm
# spaCy pipeline used by tokenize() to split sentences into tokens.
nlp = spacy.load('ja_ginza_nopn')

# Set to True to build input arrays with CuPy on GPU device 0.
GPU = False

# xp is the array module (NumPy or CuPy) used when creating input arrays.
if GPU:
    chainer.cuda.get_device(0).use()
xp = chainer.cuda.cupy if GPU else np

In [None]:
class CopyNetEncoder(Chain):
    """One-step LSTM encoder cell operating on word IDs.

    Links:
        xh: word-ID embedding (EmbedID with ignore_label=-1).
        hh: maps the embedded input to the 4 * hidden_size LSTM input.
        hy: maps the previous hidden state to the 4 * hidden_size LSTM input.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        gate_size = 4 * hidden_size
        super().__init__(
            xh=L.EmbedID(vocab_size, embed_size, ignore_label=-1),
            hh=L.Linear(embed_size, gate_size),
            hy=L.Linear(hidden_size, gate_size),
        )

    def __call__(self, x, c, h):
        """Run one LSTM step.

        Args:
            x: word IDs for the current time step.
            c: previous LSTM cell state.
            h: previous LSTM hidden state.

        Returns:
            Whatever F.lstm returns for the step (callers unpack it as
            ``c, h``).
        """
        embedded = F.tanh(self.xh(x))
        lstm_input = self.hh(embedded) + self.hy(h)
        return F.lstm(c, lstm_input)

In [None]:
class CopyNetDecoder(Chain):
    """One-step LSTM decoder cell conditioned on two attention contexts.

    Links:
        ye: word-ID embedding (EmbedID with ignore_label=-1).
        eh, hh, fh, bh: projections of the embedded input, the previous
            hidden state, and the forward / backward context vectors to the
            4 * hidden_size LSTM input.
        he, ey: two-layer output head mapping the new hidden state to
            vocabulary scores.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        gate_size = 4 * hidden_size
        super().__init__(
            ye=L.EmbedID(vocab_size, embed_size, ignore_label=-1),
            eh=L.Linear(embed_size, gate_size),
            hh=L.Linear(hidden_size, gate_size),
            fh=L.Linear(hidden_size, gate_size),
            bh=L.Linear(hidden_size, gate_size),
            he=L.Linear(hidden_size, embed_size),
            ey=L.Linear(embed_size, vocab_size),
        )

    def __call__(self, y, c, h, f, b):
        """Run one decoding step.

        Args:
            y: word IDs fed to the decoder at this step.
            c: previous LSTM cell state.
            h: previous LSTM hidden state.
            f: context vector built from the forward encoder states.
            b: context vector built from the backward encoder states.

        Returns:
            Tuple ``(t, c, h)``: vocabulary scores (pre-softmax), the new
            cell state and the new hidden state.
        """
        embedded = F.tanh(self.ye(y))
        lstm_input = self.eh(embedded) + self.hh(h) + self.fh(f) + self.bh(b)
        next_c, next_h = F.lstm(c, lstm_input)
        scores = self.ey(F.tanh(self.he(next_h)))
        return scores, next_c, next_h

In [None]:
class CopyNetAttention(Chain):
    """Additive attention over the forward and backward encoder states.

    For every source position the score is
    ``he(tanh(eh(f) + hh(b) + hw(h)))``; the scores are normalised with a
    softmax across positions and used to build one weighted sum of the
    forward states and one of the backward states.
    """

    def __init__(self, hidden_size, GPU):
        super(CopyNetAttention, self).__init__(
            eh = L.Linear(hidden_size, hidden_size),
            hh = L.Linear(hidden_size, hidden_size),
            hw = L.Linear(hidden_size, hidden_size),
            he = L.Linear(hidden_size, 1)
        )

        self.hidden_size = hidden_size

        # Keep the array module under a private name that is set in both
        # modes: the previous code assigned `self.xp` on the GPU path (a name
        # chainer's Link already exposes as a property) and only defined
        # `self.np` on the CPU path, so GPU mode could not run.
        self._array_module = chainer.cuda.cupy if GPU else np

    def __call__(self, fs, bs, h):
        """Compute attention contexts and raw scores.

        Args:
            fs: list of forward encoder hidden states, one per source
                position.
            bs: list of backward encoder hidden states, aligned with ``fs``.
            h: current decoder hidden state.

        Returns:
            Tuple ``(att_f, att_b, att)``: the attention-weighted sum of the
            forward states, the attention-weighted sum of the backward
            states, and the unnormalised scores concatenated along axis 1
            (one column per source position).
        """
        batch_size = h.data.shape[0]
        array_module = self._array_module

        # The decoder-state term is the same for every source position.
        # It must go through `hw` (hidden -> hidden); `he` maps to a single
        # unit and is reserved for the final score projection.
        decoder_term = self.hw(h)

        att = []
        ws = []
        sum_w = Variable(
            array_module.zeros((batch_size, 1), dtype='float32'))

        for f, b in zip(fs, bs):
            score = self.he(F.tanh(self.eh(f) + self.hh(b) + decoder_term))
            att.append(score)
            w = F.exp(score)
            ws.append(w)
            sum_w = sum_w + w

        att_f = Variable(
            array_module.zeros((batch_size, self.hidden_size), dtype='float32'))
        att_b = Variable(
            array_module.zeros((batch_size, self.hidden_size), dtype='float32'))

        for f, b, w in zip(fs, bs, ws):
            alpha = w / sum_w
            att_f = att_f + F.reshape(
                F.batch_matmul(f, alpha), (batch_size, self.hidden_size))
            # The backward context is built from the backward states `b`
            # (it was previously accumulated from `f`, duplicating att_f).
            att_b = att_b + F.reshape(
                F.batch_matmul(b, alpha), (batch_size, self.hidden_size))

        att = F.concat(att, axis=1)
        return att_f, att_b, att

In [None]:
class CopyNet(Chain):
    """Bidirectional-encoder / attention-decoder model with a copy gate.

    The model keeps the encoder states (``fs``, ``bs``) and the decoder state
    (``c``, ``h``) as instance attributes between calls: call ``reset()``,
    then ``encode()`` once, then ``decode()`` once per output step.
    """

    def __init__(self, vocab_size, hidden_size, embed_size, batch_size, GPU=False):

        super(CopyNet, self).__init__(
            f_encoder = CopyNetEncoder(vocab_size, embed_size, hidden_size),
            b_encoder = CopyNetEncoder(vocab_size, embed_size, hidden_size),

            attention = CopyNetAttention(hidden_size, GPU),
            decoder   = CopyNetDecoder(vocab_size, embed_size, hidden_size),
            predictor = L.Linear(hidden_size, 1)
        )

        if GPU:
            chainer.cuda.get_device(0).use()

        # Keep the array module under a private name that is set in both
        # modes: the previous code assigned `self.xp` on the GPU path (a name
        # chainer's Link already exposes as a property) and only defined
        # `self.np` on the CPU path, so GPU mode could not run.
        self._array_module = chainer.cuda.cupy if GPU else np

        self.vocab_size  = vocab_size
        self.hidden_size = hidden_size
        self.embed_size  = embed_size
        # Default batch size, used by reset() and when encode() gets no input.
        self.batch_size  = batch_size

        self.fs = []
        self.bs = []

        self.c = None
        self.h = None

    def _zero_state(self, batch_size):
        """Return a fresh all-zero (batch_size, hidden_size) float32 Variable."""
        return Variable(self._array_module.zeros(
            (batch_size, self.hidden_size), dtype='float32'))

    def encode(self, fs):
        """Encode the source sequence in both directions.

        Appends one hidden state per step to ``self.fs`` (forward order) and
        ``self.bs`` (aligned with ``self.fs``), then zero-initialises the
        decoder state. The lists are appended to, not cleared, so call
        ``reset()`` before encoding a new batch.

        Args:
            fs: list of Variables of word IDs, one per source time step.
        """
        # Size the recurrent state from the actual input so that a batch
        # smaller than the constructor's batch_size (e.g. single-sentence
        # prediction or a short final mini-batch) gets matching shapes.
        if len(fs) > 0:
            batch_size = fs[0].data.shape[0]
        else:
            batch_size = self.batch_size

        c = self._zero_state(batch_size)
        h = self._zero_state(batch_size)

        for word in fs:
            c, h = self.f_encoder(word, c, h)
            self.fs.append(h)

        c = self._zero_state(batch_size)
        h = self._zero_state(batch_size)

        for word in reversed(fs):
            c, h = self.b_encoder(word, c, h)
            # Insert at the front so bs[k] lines up with source position k.
            self.bs.insert(0, h)

        self.c = self._zero_state(batch_size)
        self.h = self._zero_state(batch_size)

    def decode(self, w):
        """Run one decoder step.

        Args:
            w: Variable of word IDs fed to the decoder at this step.

        Returns:
            Tuple ``(gate, att, t)``: the output of the ``predictor`` link on
            the new hidden state (callers apply a sigmoid to it), the raw
            attention scores over source positions, and the decoder's
            vocabulary scores.
        """
        att_f, att_b, att = self.attention(self.fs, self.bs, self.h)

        t, self.c, self.h = self.decoder(w, self.c, self.h, att_f, att_b)

        return self.predictor(self.h), att, t

    def reset(self):
        """Drop stored encoder states, zero the decoder state and gradients."""
        self.fs = []
        self.bs = []

        self.c = self._zero_state(self.batch_size)
        self.h = self._zero_state(self.batch_size)

        self.zerograds()

In [None]:
def forward(enc_words, dec_words, model):
    """Compute the training loss for one mini-batch.

    Arguments

        enc_words -> source word IDs, time-major: enc_words[t][i] is the ID
                     at source position t of example i
        dec_words -> target word IDs in the same time-major layout; -1 marks
                     padding and is excluded from the loss
        model     -> model providing reset(), encode() and decode()

    Returns

        Scalar Variable: for every target step, the negated, per-step-averaged
        sum of the gated generation term, the gated copy term and the log
        probability of the gate choice, accumulated over all steps.
    """

    batch_size = len(enc_words[0])

    # Per-example lookup {word ID -> first source position}. enc_words is
    # time-major, so it has to be transposed before it can be indexed by the
    # example index i; indexing the untransposed array with i looked at one
    # time step across the batch instead of example i's own sentence.
    copy_positions = []
    for source in zip(*enc_words):
        first_seen = {}
        for pos, token in enumerate(source):
            first_seen.setdefault(int(token), pos)
        copy_positions.append(first_seen)

    enc_vars = [Variable(xp.array(row, dtype='int32')) for row in enc_words]

    model.reset()
    model.encode(enc_vars)

    loss = Variable(xp.zeros((), dtype='float32'))
    # The first decoder input is ID 0 for every example.
    t = Variable(xp.zeros((batch_size,), dtype='int32'))

    for dec in dec_words:
        lambda_, att, y = model.decode(t)

        # Teacher forcing: the gold IDs of this step feed the next step.
        t = Variable(xp.array(dec, dtype='int32'))
        s = F.log_softmax(y)

        att_s = F.log_softmax(att)

        lambda_s = F.reshape(F.sigmoid(lambda_), (batch_size,))

        Pg = Variable(xp.zeros((), dtype='float32'))
        Pc = Variable(xp.zeros((), dtype='float32'))
        Ep = Variable(xp.zeros((), dtype='float32'))

        Cnt = 0

        for i, word in enumerate(dec):
            word = int(word)

            if word == -1:
                continue

            copy_gate = F.get_item(lambda_s, i)

            Pg += F.get_item(F.get_item(s, i), word) * F.reshape(1.0 - copy_gate, ())
            Cnt += 1

            pos = copy_positions[i].get(word)

            if pos is not None:
                # The target word occurs in example i's source sentence.
                Pc += F.get_item(F.get_item(att_s, i), pos) * F.reshape(copy_gate, ())
                Ep += F.log(copy_gate)
            else:
                Ep += F.log(1.0 - copy_gate)

        # Builtin max: Cnt is a plain int, and an array-module max over a
        # Python list is not portable between NumPy and CuPy.
        scale = -1.0 / max(1, Cnt)

        Pg *= scale
        Pc *= scale
        Ep *= scale

        loss += Pg + Pc + Ep
    return loss

In [None]:
def predict(model, enc_words, batch_col_size):
    """Decode one output sequence by sampling the copy/generate gate.

    Arguments

        model          -> model providing reset(), encode() and decode()
        enc_words      -> source word IDs, indexed as enc_words[position][0]
                          (one row per source position, single example)
        batch_col_size -> maximum number of decoding steps

    Returns

        (result, modes): the emitted word IDs and, per step, "Gen" or "Copy"
        telling which branch produced the ID. Decoding stops after ID 0 has
        been emitted.
    """
    result = []  # emitted word IDs
    modes = []   # "Gen" or "Copy" for each emitted ID

    model.reset()

    source_ids = enc_words
    model.encode([Variable(xp.array(step, dtype='int32')) for step in source_ids])

    # Decoding starts from ID 0.
    prev = Variable(xp.array([0], dtype='int32'))

    for _ in range(batch_col_size):
        gate, att, scores = model.decode(prev)

        copy_prob = F.sigmoid(gate).data[0][0]
        gen_dist = F.softmax(scores)

        # Draw 0 (generate) or 1 (copy) with probability copy_prob for copy.
        choice = xp.random.choice(2, 1, p=[1.0 - copy_prob, copy_prob])[0]

        if choice == 0:
            label = gen_dist.data.argmax()
            mode = "Gen"
        else:
            position = F.softmax(att).data.argmax()
            label = source_ids[position][0]
            mode = "Copy"

        result.append(label)
        modes.append(mode)
        prev = Variable(xp.array([label], dtype='int32'))

        if label == 0:
            break

    return result, modes

In [None]:
def train(data,
          vocab,
          id2wd,
          batch_col_size,
          embed_size=300,
          hidden_size=150,
          batch_size=16,
          epoch_num=60):
    """Train a CopyNet on paired ID sequences and return it.

    Arguments

        data           -> [sources, targets], two equally long lists of
                          ID sequences
        vocab          -> word -> ID mapping; only its size is used here
        id2wd          -> ID -> word mapping (not used by this function)
        batch_col_size -> padded sequence length (not used by this function)
        embed_size, hidden_size -> model dimensions
        batch_size     -> number of examples per mini-batch
        epoch_num      -> number of passes over the data

    Returns

        The trained CopyNet model.
    """
    sample_count = len(data[0])
    data = np.array(data)

    model = CopyNet(len(vocab), hidden_size, embed_size, batch_size)

    optimizer = optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(5))

    for epoch in tqdm(range(epoch_num)):
        # Loss accumulated over the mini-batches of this epoch only.
        total_loss = 0

        order = np.random.permutation(sample_count)

        for start in range(0, sample_count, batch_size):
            picked = order[start:start + batch_size]

            # Transpose to time-major layout: one row per time step.
            enc = xp.array(data[0][picked]).T
            dec = xp.array(data[1][picked]).T

            loss = forward(enc, dec, model)

            loss.backward()
            loss.unchain_backward()
            total_loss += loss.data
            optimizer.update()

        if (epoch + 1) % 10 == 0:
            print(str(epoch) + "Epoch | total_loss : " + str(total_loss))

    return model

In [None]:
# Parallel toy corpus: data[0] holds the source sentences and data[1] the
# target sentences, paired by index (data[0][k] is trained to produce
# data[1][k]).
data = [
    [
        "私は怪しい日本語翻訳機を発明した。",
        "仕事が大変な時はしっかり休んで体のストレスを溜めないようにする。",
        "ガキが。。。舐めてると潰すぞ。",
        "あなた、私のフォロワーが18億人を突破しました！！ありがとう！！",
        "イトーヨーカドー",
        "Twitterデビュー！",
        "Huawei",
        "あなた、私は感謝するが、私は負けない！",
        "中国国民党",
        "北方領土を不正に占領している国はどの国でしょう？"
    ],
    [
        "贵樣！我は正レい日本語翻译机を发明した！",
        "贵樣！ 仕事か大變の时はレつかリ休んて身體の疲ねどヌトレヌを贮めないよラすゐ。",
        "カギが・・・舐ぬてゐと溃ずそ",
        "贵樣！私のフ口ワ一か18億人を突破レだ！！あリかどラ！！",
        "亻卜一彐一力卜一",
        "微博デ匕ュ一！",
        "华为",
        "贵樣！感谢ずゑが、わたレは负げない！",
        "中国共产党",
        "北方领土を不法に佔拠レでいゑのはとの國でレょラ？",
    ]
]

In [None]:
def tokenize(seq):
    """Split text into surface-form tokens using the module-level nlp pipeline.

    Sentences are visited in document order and their tokens are returned as
    one flat list of strings.
    """
    return [token.orth_ for sent in nlp(seq).sents for token in sent]

In [None]:
def create_vocab_dict(x, y):
    """Build word <-> ID mappings from two collections of sentences.

    IDs 0 and 1 are reserved for "<eos>" and "<unk>"; every other token gets
    the next free ID in order of first appearance (all of x, then all of y).

    Returns:
        (word -> ID dict, ID -> word dict)
    """
    word2id = {"<eos>": 0, "<unk>": 1}

    for corpus in (x, y):
        for sentence in corpus:
            for token in tokenize(sentence):
                # setdefault leaves existing entries alone; new tokens get
                # the current size of the vocabulary as their ID.
                word2id.setdefault(token, len(word2id))

    id2word = dict((idx, word) for word, idx in word2id.items())
    return word2id, id2word

In [None]:
def seq2id(vocab, seq):
    """Tokenize a sentence and map each token to its ID.

    Tokens missing from vocab are mapped to 1 (the "<unk>" ID).
    """
    return [vocab.get(token, 1) for token in tokenize(seq)]

In [None]:
def seq2train(vocab, seq, batch_col_size):
    """Convert a sentence into a padded ID sequence.

    The token IDs are followed by 0 (end of sequence) and then padded with -1
    up to batch_col_size entries. Sequences that are already that long or
    longer are returned unpadded and untruncated.
    """
    ids = seq2id(vocab, seq) + [0]

    missing = batch_col_size - len(ids)
    if missing >= 1:
        ids.extend([-1] * missing)

    return ids

In [None]:
def id2seq(id2wd, seq):
    """Map IDs back to words.

    Known IDs become their word, the padding ID -1 is dropped (unless it is
    itself a key of id2wd), and any other unknown ID becomes "<unk>".
    """
    return [
        id2wd.get(index, "<unk>")
        for index in seq
        if index in id2wd or index != -1
    ]

In [None]:
def make_train_data(data, vocab, batch_col_size):
    """Convert [source sentences, target sentences] into padded ID sequences.

    Returns:
        [x, y] where x and y are the lists of seq2train() outputs for
        data[0] and data[1] respectively.
    """
    sources = [seq2train(vocab, sentence, batch_col_size) for sentence in data[0]]
    targets = [seq2train(vocab, sentence, batch_col_size) for sentence in data[1]]
    return [sources, targets]

In [None]:
# Build the vocabulary from both sides of the corpus.
vocab, id2wd = create_vocab_dict(data[0], data[1])

# Every sentence is converted to IDs and padded to 40 entries.
train_data = make_train_data(data, vocab, 40)

# Train on the padded data; 40 is passed again as the padded length.
model = train(
    train_data,
    vocab,
    id2wd,
    40,
    embed_size=300,
    hidden_size=150,
    batch_size=5,
    epoch_num=60,
)

In [None]:
# Encode one training sentence as a padded ID sequence of length 40.
seq = seq2train(vocab, "私は怪しい日本語翻訳機を発明した。",40)
# predict() indexes its input as [position][0], hence the transpose of the
# single-row array; at most 40 output steps are decoded.
res, mode = predict(model, xp.array([seq]).T, 40)
# Join the decoded words into one string; as the cell's last expression it is
# displayed as the cell output.
"".join(id2seq(id2wd, res))