In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
import time
import copy

from Vocab import Vocab
from LanguageModel import LanguageModel

import torch
# Select CUDA device 0 as the current device for this notebook.
torch.cuda.set_device(0)

print('import over')

# Global gate read by Decoder.forward: at every decoding step the copy branch is
# taken only when random.random() < copy_thres, so a value of 0 never takes it.
copy_thres=0

import over


In [2]:
def batch_words2sentence(words_list):
    """Join every word list in `words_list` into one space-separated sentence string."""
    sentences = []
    for words in words_list:
        sentences.append(' '.join(words))
    return sentences
def batch_tokens2words(tokens_list, vocab):
    """Map every token id back to its word.

    Args:
        tokens_list: list of token-id lists.
        vocab: object whose `token2word` is indexable by token id.

    Returns:
        A list of word lists with the same nesting as `tokens_list`.
    """
    words_list = []
    for tokens in tokens_list:
        words_list.append([vocab.token2word[token] for token in tokens])
    return words_list

def batch_tokens_remove_eos(tokens_list, vocab):
    """Cut every token list at its first '<eos>' token.

    Args:
        tokens_list: list of token-id lists.
        vocab: object whose `word2token` maps '<eos>' to its id.

    Returns:
        New lists holding the tokens that precede the first '<eos>' (the
        '<eos>' itself and everything after it are dropped).
    """
    def _until_eos(tokens):
        # Yield tokens lazily and stop at the first end-of-sentence marker.
        for token in tokens:
            if token == vocab.word2token['<eos>']:
                return
            yield token

    return [list(_until_eos(tokens)) for tokens in tokens_list]

def batch_tokens_bleu(references, candidates, smooth_epsilon=0.001):
    """Compute one smoothed sentence-level BLEU score per (reference, candidate) pair.

    Args:
        references: list of reference token lists.
        candidates: list of candidate token lists, paired with `references` by position.
        smooth_epsilon: epsilon handed to NLTK's `SmoothingFunction` (method1).

    Returns:
        A list with one score per pair. A pair scores 0 whenever either side
        has fewer than 4 tokens.
    """
    def _pair_score(ref, candidate):
        shortest = min(len(ref), len(candidate))
        if shortest < 4:
            return 0
        smoother = SmoothingFunction(epsilon=smooth_epsilon).method1
        return sentence_bleu([ref], candidate, smoothing_function = smoother)

    return [_pair_score(ref, candidate) for ref, candidate in zip(references, candidates)]

# Load the vocabulary object used by the helpers in this notebook (they read its
# word2token / token2word mappings).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('data_set/vocab.pk', 'rb') as f:
    vocab=pickle.load(f)

    
def seqs_split(seqs, vocab):
    """Split every sequence into the parts before and after '<split>'.

    Each sequence is first cut at its first '<eos>' (via
    `batch_tokens_remove_eos`). Tokens seen before the first '<split>' go to
    the first part, all later tokens go to the second part; '<split>' tokens
    themselves are dropped.

    Returns:
        (simple_sent1s, simple_sent2s): two lists of token lists, one entry
        per input sequence.
    """
    first_parts = []
    second_parts = []
    for seq in batch_tokens_remove_eos(seqs, vocab):
        parts = ([], [])
        side = 0  # 0 -> still before '<split>', 1 -> after it
        for token in seq:
            if token != vocab.word2token['<split>']:
                parts[side].append(token)
            else:
                side = 1
        first_parts.append(parts[0])
        second_parts.append(parts[1])

    return first_parts, second_parts

def simple_sents_concat(simple_sent1s, simple_sent2s, vocab, max_length):
    """Join sentence pairs as `sent1 + ['<split>'] + sent2`, truncated/padded to `max_length`.

    Args:
        simple_sent1s: list of token lists (first simple sentences).
        simple_sent2s: list of token lists (second simple sentences), paired
            with `simple_sent1s` by position.
        vocab: object whose `word2token` maps '<split>' and '<padding>' to ids.
        max_length: length every returned sentence is truncated or padded to.

    Returns:
        (simple_sents, simple_sent_lens): the concatenated sentences and, for
        each, its length after truncation but before padding.

    Unlike the previous implementation, the caller's lists are left untouched:
    the result is built from copies instead of appending to `simple_sent1s`.
    """
    simple_sent_lens = []
    # Copy the first sentences so the caller's lists are never modified.
    simple_sents = [list(sent) for sent in simple_sent1s]
    for i, sent in enumerate(simple_sent2s):
        merged = simple_sents[i]
        merged.append(vocab.word2token['<split>'])
        merged.extend(sent)

        # The joined sentence can exceed max_length; keep only the first max_length tokens.
        if len(merged) > max_length:
            del merged[max_length:]

        simple_sent_lens.append(len(merged))

        merged.extend([vocab.word2token['<padding>']] * (max_length - len(merged)))

    return simple_sents, simple_sent_lens


def get_lm_inputs_and_labels(sents, vocab, max_length):
    """Build padded language-model inputs and labels from token lists.

    For every sentence the body is limited to `max_length - 1` tokens, then
        input = ['<sos>'] + body   (padded with '<padding>' to `max_length`)
        label = body + ['<eos>']   (padded with '<padding>' to `max_length`)

    Args:
        sents: list of token lists; it is not modified.
        vocab: object whose `word2token` maps '<sos>', '<eos>' and '<padding>' to ids.
        max_length: length of every returned input/label row.

    Returns:
        (lm_inputs, lm_input_lens, lm_labels) where `lm_input_lens[i]` is the
        length of input i including '<sos>' but excluding padding.

    The previous implementation truncated long sentences by rebinding the loop
    variable to a slice, so the stored row was never truncated and never
    received '<sos>'/'<eos>'. Rows are now built explicitly.
    """
    sos = vocab.word2token['<sos>']
    eos = vocab.word2token['<eos>']
    padding = vocab.word2token['<padding>']

    lm_inputs = []
    lm_input_lens = []
    lm_labels = []
    for sent in sents:
        body = list(sent)
        if len(body) >= max_length:
            # Leave room for the '<sos>' / '<eos>' marker.
            body = body[:max_length - 1]

        lm_input = [sos] + body
        lm_input_lens.append(len(lm_input))
        lm_input.extend([padding] * (max_length - len(lm_input)))
        lm_inputs.append(lm_input)

        lm_label = body + [eos]
        lm_label.extend([padding] * (max_length - len(lm_label)))
        lm_labels.append(lm_label)

    return lm_inputs, lm_input_lens, lm_labels


def duplicate_reconstruct_labels(sents, topk):
    """Repeat every element of `sents` `topk` times in a row.

    The repeated entries are the same objects as the originals (no copies).
    """
    duplicated = []
    for sent in sents:
        for _ in range(topk):
            duplicated.append(sent)
    return duplicated


def batch_tokens_bleu_split_version(references, candidates, vocab, smooth_epsilon=0.001):
    """Score each pair as the mean BLEU of its two '<split>'-separated halves.

    References and candidates are split with `seqs_split`, which also cuts
    each sequence at its first '<eos>', so callers do not need to strip it
    beforehand (unlike `batch_tokens_bleu`).

    Args:
        references: list of reference token lists.
        candidates: list of candidate token lists, paired by position.
        vocab: vocabulary object passed to `seqs_split`.
        smooth_epsilon: smoothing epsilon forwarded to `batch_tokens_bleu`.
            It was previously accepted but ignored.

    Returns:
        A list with one averaged score per pair.
    """
    ref1, ref2 = seqs_split(references, vocab)
    cand1, cand2 = seqs_split(candidates, vocab)
    bleu_simple_sent1s = batch_tokens_bleu(ref1, cand1, smooth_epsilon=smooth_epsilon)
    bleu_simple_sent2s = batch_tokens_bleu(ref2, cand2, smooth_epsilon=smooth_epsilon)
    return [(first + second) / 2
            for first, second in zip(bleu_simple_sent1s, bleu_simple_sent2s)]


def set_model_grad(model, is_grad):
    """Set `requires_grad` to `is_grad` on every parameter yielded by `model.parameters()`."""
    for parameter in model.parameters():
        setattr(parameter, 'requires_grad', is_grad)

In [3]:
# Smoke test for the helper functions above on a handful of hand-written token
# sequences. Judging by the recorded output, in this vocabulary 0 is <sos>,
# 1 is <padding>, 2 is <eos> and 5 is <split> - TODO confirm against vocab.
seqs=[[8,9,90,5,3,2,1], [5,8,9,90,5,3,2,1], [8,2,9,40,5,3,2,2,1], [8,9,90,5,3,2,1], [8,9,90]]
a,b = seqs_split(seqs, vocab)

print(a)
print(b)

# Language-model inputs/labels for the first halves, then for the second halves.
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(a,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(b,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)

# Re-join the halves with a deliberately small max_length (3) to exercise truncation and padding.
c,d=simple_sents_concat(a,b,vocab, 3)
print(c)
print(d)


# Last expression of the cell: BLEU of one reference/candidate pair.
batch_tokens_bleu([[1,2,3,4,5,6]], [[2,3,1,4,5]])

[[8, 9, 90], [], [8], [8, 9, 90], [8, 9, 90]]
[[3], [8, 9, 90, 3], [], [3], []]
[[0, 8, 9, 90, 1, 1], [0, 1, 1, 1, 1, 1], [0, 8, 1, 1, 1, 1], [0, 8, 9, 90, 1, 1], [0, 8, 9, 90, 1, 1]]
[4, 1, 2, 4, 4]
[[8, 9, 90, 2, 1, 1], [2, 1, 1, 1, 1, 1], [8, 2, 1, 1, 1, 1], [8, 9, 90, 2, 1, 1], [8, 9, 90, 2, 1, 1]]
[[0, 3, 1, 1, 1, 1], [0, 8, 9, 90, 3, 1], [0, 1, 1, 1, 1, 1], [0, 3, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1]]
[2, 5, 1, 2, 1]
[[3, 2, 1, 1, 1, 1], [8, 9, 90, 3, 2, 1], [2, 1, 1, 1, 1, 1], [3, 2, 1, 1, 1, 1], [2, 1, 1, 1, 1, 1]]
[[8, 9, 90], [5, 8, 9], [8, 5, 1], [8, 9, 90], [8, 9, 90]]
[3, 3, 2, 3, 3]


[0.013910597740964967]

In [4]:
# Load the pickled train/validation data for the two tasks into module-level names.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.

#fusion data set

# Pseudo-label variant, kept for reference; the supervised files below are
# loaded into the same variable names instead.
# with open('./data_set2/fusion_data_set/train_pseudo_simple_sents.pk', 'rb') as f:
#     fusion_pseudo_train_set_inputs = pickle.load(f)
# with open('./data_set2/fusion_data_set/train_pseudo_simple_sent_lens.pk', 'rb') as f:
#     fusion_pseudo_train_set_input_lens = pickle.load(f)
# with open('./data_set2/fusion_data_set/train_pseudo_labels.pk', 'rb') as f:
#     fusion_pseudo_train_set_labels = pickle.load(f)
#supervise
with open('./data_set2/fusion_data_set/train_simple_sents_supervised.pk', 'rb') as f:
    fusion_pseudo_train_set_inputs = pickle.load(f)
with open('./data_set2/fusion_data_set/train_simple_sent_lens_supervised.pk', 'rb') as f:
    fusion_pseudo_train_set_input_lens = pickle.load(f)
with open('./data_set2/fusion_data_set/train_labels_supervised.pk', 'rb') as f:
    fusion_pseudo_train_set_labels = pickle.load(f)
    
    
# Fusion validation split.
with open('./data_set2/fusion_data_set/validation_simple_sents.pk', 'rb') as f:
    fusion_pseudo_valid_set_inputs = pickle.load(f)
with open('./data_set2/fusion_data_set/validation_simple_sent_lens.pk', 'rb') as f:
    fusion_pseudo_valid_set_input_lens = pickle.load(f)
with open('./data_set2/fusion_data_set/validation_labels.pk', 'rb') as f:
    fusion_pseudo_valid_set_labels = pickle.load(f)
    
    
    
#split data set

# Pseudo-label variant, kept for reference; the supervised files below are
# loaded into the same variable names instead.
# with open('./data_set2/split_data_set/train_complex_sents.pk', 'rb') as f:
#     split_train_set_inputs = pickle.load(f)
# with open('./data_set2/split_data_set/train_complex_sent_lens.pk', 'rb') as f:
#     split_train_set_input_lens = pickle.load(f)
# with open('./data_set2/split_data_set/train_pseudo_labels.pk', 'rb') as f:
#     split_pseudo_train_set_labels = pickle.load(f)
#supervise
with open('./data_set2/split_data_set/train_complex_sents_supervised.pk', 'rb') as f:
    split_train_set_inputs = pickle.load(f)
with open('./data_set2/split_data_set/train_complex_sent_lens_supervised.pk', 'rb') as f:
    split_train_set_input_lens = pickle.load(f)
with open('./data_set2/split_data_set/train_labels_supervised.pk', 'rb') as f:
    split_pseudo_train_set_labels = pickle.load(f)
    
    
# Split validation split.
with open('./data_set2/split_data_set/validation_complex_sents.pk', 'rb') as f:
    split_valid_set_inputs = pickle.load(f)
with open('./data_set2/split_data_set/validation_complex_sent_lens.pk', 'rb') as f:
    split_valid_set_input_lens = pickle.load(f)
with open('./data_set2/split_data_set/validation_labels.pk', 'rb') as f:
    split_pseudo_valid_set_labels = pickle.load(f)


In [5]:
# Sanity check: inputs, input lengths and labels should have the same number of
# entries within each training set (first line: split task, second line: fusion task).
print(len(split_train_set_inputs), len(split_train_set_input_lens), len(split_pseudo_train_set_labels))
print(len(fusion_pseudo_train_set_inputs), len(fusion_pseudo_train_set_input_lens), len(fusion_pseudo_train_set_labels))


197988 197988 197988
197988 197988 197988


In [6]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder over embedded token ids.

    The embedding table is initialised from the pickled matrix stored in
    'data_set/pre_trained_token_embedding.pk' and remains trainable.
    """

    def __init__(self, use_cuda, hidden_dim, input_dim, vocab):
        """
        Args:
            use_cuda: when truthy, inputs and index tensors are moved to the GPU.
            hidden_dim: hidden size of each LSTM direction.
            input_dim: embedding size (must match the pickled embedding matrix).
            vocab: object whose `word2token` length gives the vocabulary size.
        """
        super(Encoder, self).__init__()

        self.use_cuda = use_cuda
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab

        self.lstm = torch.nn.LSTM(input_size=self.input_dim,
                                  hidden_size=self.hidden_dim,
                                  bidirectional=True,
                                  batch_first=True)

        # Embedding table, overwritten with the pre-trained vectors.
        self.embed = nn.Embedding(len(self.vocab.word2token), input_dim)
        with open('data_set/pre_trained_token_embedding.pk', 'rb') as f:
            pre_train_word_embedding = pickle.load(f)
        self.embed.weight.data.copy_(torch.FloatTensor(pre_train_word_embedding))

    def _index_var(self, ids):
        """Wrap an index tensor in a Variable, on the GPU when `use_cuda` is set."""
        index = Variable(ids)
        return index.cuda() if self.use_cuda else index

    def order(self, inputs, inputs_len):
        """Sort a batch by decreasing length.

        Args:
            inputs: batch tensor, rows indexed along dim 0.
            inputs_len: 1D tensor of row lengths.

        Returns:
            (sorted inputs, sorted lengths, indices that restore the original row order).
        """
        sorted_len, sort_ids = torch.sort(inputs_len, dim=0, descending=True)
        sorted_inputs = inputs.index_select(0, self._index_var(sort_ids))

        # Sorting the permutation itself yields its inverse.
        _, restore_ids = torch.sort(sort_ids, dim=0, descending=False)

        return sorted_inputs, sorted_len, restore_ids

    def forward(self, inputs, inputs_len):
        """Encode a padded batch of token ids.

        Args:
            inputs: tensor of token ids, one row per sentence.
            inputs_len: 1D tensor with the unpadded length of every row.

        Returns:
            outputs: per-token states of both directions, batch first, padded
                to the longest sentence in the batch.
            (hn, cn): final hidden and cell states with the two directions
                concatenated along dim 1.
            Everything is returned in the original (unsorted) row order.
        """
        inputs = Variable(inputs)
        if self.use_cuda:
            inputs = inputs.cuda()

        # pack_padded_sequence needs rows ordered by decreasing length.
        inputs, sort_len, true_order_ids = self.order(inputs, inputs_len)

        packed = rnn_utils.pack_padded_sequence(input=self.embed(inputs),
                                                lengths=list(sort_len),
                                                batch_first=True)
        packed_outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = rnn_utils.pad_packed_sequence(packed_outputs, batch_first=True)

        # Undo the length sort so rows line up with the caller's batch again.
        restore = self._index_var(true_order_ids)
        outputs = outputs.index_select(0, restore)
        hn = torch.cat((hn[0], hn[1]), dim=1).index_select(0, restore)
        cn = torch.cat((cn[0], cn[1]), dim=1).index_select(0, restore)

        return outputs, (hn, cn)

In [7]:
def _inflate(tensor, times, dim):
    """
    Examples::
        >> a = torch.LongTensor([[1, 2], [3, 4]])
        >> a
        1   2
        3   4
        [torch.LongTensor of size 2x2]
        >> b = ._inflate(a, 2, dim=1)
        >> b
        1   2   1   2
        3   4   3   4
        [torch.LongTensor of size 2x4]
    """
    repeat_dims = [1] * tensor.dim()
    repeat_dims[dim] = times
    return tensor.repeat(*repeat_dims)

class Decoder(nn.Module):
    def __init__(self, use_cuda, encoder, hidden_dim, max_length=25):
        super(Decoder, self).__init__()
        
        self.use_cuda = use_cuda
        self.hidden_dim=hidden_dim
        self.input_dim = encoder.input_dim
        self.max_length = max_length
        self.vocab = encoder.vocab
        self.weight = [1]*len(self.vocab.word2token)
        self.weight[self.vocab.word2token['<padding>']]=0
        #self.weight[self.vocab.word2token['<eos>']]=1.01
        #self.weight[self.vocab.word2token['<split>']]=1.01
        
        self.hidden_size = self.hidden_dim
        self.V = len(self.vocab.word2token)
        self.SOS = self.vocab.word2token['<sos>']
        self.EOS = self.vocab.word2token['<eos>']
        self.log_softmax = nn.LogSoftmax(dim=1)
        
        self.lstmcell = torch.nn.LSTMCell(input_size=self.input_dim, hidden_size=self.hidden_dim*2, bias=True)
        
        #embedding
        self.embed=encoder.embed# reference share
        #fcnn: projection for crossentroy loss
        self.fcnn = nn.Linear(in_features = self.hidden_dim*2+hidden_dim*2, out_features = len(self.vocab.word2token))
        
        self.softmax = nn.Softmax(dim=1)
        self.cost_func = nn.CrossEntropyLoss(weight=torch.Tensor(self.weight), reduce=False)
        self.nll_loss = nn.NLLLoss(weight=torch.Tensor(self.weight), reduce=False)

        print('init lookup embedding matrix size: ', self.embed.weight.data.size())
        
        #copy
        out_features_dim=self.hidden_dim
        self.attent_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 0)
        self.attent_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 1)
        self.tanh = nn.Tanh()
        self.attent_vt = nn.Linear(in_features = out_features_dim, out_features = 1, bias=0)
        
        self.prob_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_wx = nn.Linear(in_features = input_dim, out_features = 1, bias=1)
        self.sigmoid = nn.Sigmoid()
    
    def get_context_vec(self, enc_outputs, this_timestep_input, dec_state):
        batch_size = enc_outputs.size(dim = 0)
        
        wh = self.attent_wh(enc_outputs)
        ws = self.attent_ws(dec_state).unsqueeze(dim=1)
#         print('wh, ws size: ', wh.size(), ws.size())
        ws = ws.expand(ws.size(0), wh.size(1), ws.size(2))
#         print('ws size: ', ws.size())
        weight = self.attent_vt(self.tanh(wh+ws))
#         print('weight size: ', weight.size())
        weight = self.softmax(weight.squeeze(dim=2))
#         print('weight size: ', weight.size())
        context_v = torch.bmm(weight.unsqueeze(dim=1), enc_outputs)
#         print('context_v size: ', context_v.size())
        context_v = context_v.squeeze(dim=1)
        return context_v, weight
    
    def copy_mechanism(self, enc_outputs, this_timestep_input, dec_state, inputs_one_hot, context_v, weight):
        batch_size = enc_outputs.size(dim = 0)
        
#         wh = self.attent_wh(enc_outputs)
#         ws = self.attent_ws(dec_state).unsqueeze(dim=1)
# #         print('wh, ws size: ', wh.size(), ws.size())
#         ws = ws.expand(ws.size(0), wh.size(1), ws.size(2))
# #         print('ws size: ', ws.size())
#         weight = self.attent_vt(self.tanh(wh+ws))
# #         print('weight size: ', weight.size())
#         weight = self.softmax(weight.squeeze(dim=2))
# #         print('weight size: ', weight.size())
#         context_v = torch.bmm(weight.unsqueeze(dim=1), enc_outputs)
# #         print('context_v size: ', context_v.size())
#         context_v = context_v.squeeze(dim=1)
        
        p_wh = self.prob_wh(context_v)
        p_ws = self.prob_ws(dec_state)
        p_wx = self.prob_wx(this_timestep_input)
        if_copy = self.sigmoid(p_wh+p_ws+p_wx)
#         if_copy = 0.3*if_copy
#         if_copy = self._tocuda(Variable(torch.ones(batch_size, 1), requires_grad=0))
#         print('if_copy size: ', if_copy.size())
        
        prob_copy = torch.bmm(inputs_one_hot, weight.unsqueeze(dim=2))
        prob_copy = prob_copy.squeeze(dim=2)
#         prob_copy = self._tocuda(Variable(torch.rand(batch_size, len(self.vocab.word2token)), requires_grad=0))
#         prob_copy = self.softmax(prob_copy)

#         print('prob_copy size: ', prob_copy.size())
#         print(torch.sum(prob_copy, dim=1))
#         print(torch.mean(if_copy))
        
#         if random.random()<0.005:
#             print('if_copy mean: ', torch.mean(if_copy))
#             _, max_ids = torch.max(prob_copy, dim=1)
#             print(self.vocab.token2word[max_ids.data[0]], self.vocab.token2word[max_ids.data[1]], self.vocab.token2word[max_ids.data[2]])
            
            
        return if_copy, prob_copy

    def forward(self, enc_outputs, sent_lens, h0_and_c0, labels, inputs, teaching_rate=0.6, is_train=1):
        labels = Variable(labels)
        if self.use_cuda:
            labels = labels.cuda()

        all_loss = 0
        predicts = []
        max_probs=[]
        batch_size = enc_outputs.size(dim = 0)
        final_hidden_states = h0_and_c0[0]
#         print('enc_outputs size:', enc_outputs.size())

        sents_len = enc_outputs.size(1)
        inputs = inputs[:,:sents_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, sents_len, len(self.vocab.word2token)).zero_()
        one_hot.scatter_(2, inputs, 1)
        one_hot = one_hot.transpose(1,2)
        one_hot = self._tocuda(Variable(one_hot, requires_grad = 0))
#         print('one_hot size: ', one_hot.size())
        
        for ii in range(self.max_length):
            if ii==0:
                zero_timestep_input = Variable(torch.LongTensor([self.vocab.word2token['<sos>']]*batch_size))
                if self.use_cuda:
                    zero_timestep_input = zero_timestep_input.cuda()
                    
                zero_timestep_input = self.embed(zero_timestep_input)#size: batch_size * self.input_dim
                
                last_timestep_hidden_state,cx = self.lstmcell(zero_timestep_input, h0_and_c0)
                #print('last_timestep_hidden_state: ', last_timestep_hidden_state.size(), cx.size())

                #get context vector
                context_vec, weight = self.get_context_vec(enc_outputs=enc_outputs, this_timestep_input=-1, 
                                                            dec_state = last_timestep_hidden_state)
                logits = self.fcnn(torch.cat([last_timestep_hidden_state, context_vec], dim=1))
                
                #copy or not
                copy_control=random.random()
                if copy_control<copy_thres:
                    if_copy, prob_copy = self.copy_mechanism(enc_outputs=enc_outputs, this_timestep_input=zero_timestep_input, 
                                                            dec_state = last_timestep_hidden_state, inputs_one_hot = one_hot, 
                                                            context_v=context_vec,
                                                            weight = weight)
                    score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
                    score = torch.clamp(score, min=10**(-30), max=1)

                #for saving time: no training, no loss calculating
                if is_train:
                    if copy_control<copy_thres:
                        loss = self.nll_loss(torch.log(score), labels[:,0])
                    else:
                        loss = self.cost_func(logits, labels[:,0])
                    all_loss+=loss
                
                #get predicts
                if copy_control<copy_thres:
                    _, max_idxs = torch.max(score, dim=1)
                else:
                    _, max_idxs = torch.max(logits, dim=1)
                predicts.append(torch.unsqueeze(max_idxs, dim=0))
                
                
            else:
                if is_train:
                    rand = random.random()
                    if rand<teaching_rate:
                        this_timestep_input = self.embed(labels[:,ii-1])#label teaching, lookup embedding
                    else:
                        this_timestep_input = self.embed(max_idxs)#last_timestep output, and then look up word embedding
                else:
                    this_timestep_input = self.embed(max_idxs)#last_timestep output, and then look up word embedding
                
                last_timestep_hidden_state,cx = self.lstmcell(this_timestep_input, (last_timestep_hidden_state,cx))
                
                #get context vector
                context_vec, weight = self.get_context_vec(enc_outputs=enc_outputs, this_timestep_input=this_timestep_input, 
                                                            dec_state = last_timestep_hidden_state)
                logits = self.fcnn(torch.cat([last_timestep_hidden_state, context_vec], dim=1))
                
                #copy or not
                copy_control=random.random()
                if copy_control<copy_thres:
                    if_copy, prob_copy = self.copy_mechanism(enc_outputs=enc_outputs, this_timestep_input=this_timestep_input, 
                                                            dec_state = last_timestep_hidden_state, inputs_one_hot = one_hot, 
                                                             context_v=context_vec,
                                                            weight = weight)
                    score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
                    score = torch.clamp(score, min=10**(-30), max=1)

                #for saving time: no training, no loss calculating
                if is_train:
                    if copy_control<copy_thres:
                        loss = self.nll_loss(torch.log(score), labels[:,ii])
                    else:
                        loss = self.cost_func(logits, labels[:,ii])
                    all_loss+=loss
                
                #get predicts
                if copy_control<copy_thres:
                    _, max_idxs = torch.max(score, dim=1)
                else:
                    _, max_idxs = torch.max(logits, dim=1)
                predicts.append(torch.unsqueeze(max_idxs, dim=0))
                
        predicts = torch.cat(predicts, dim=0)
        predicts = torch.transpose(predicts, 0, 1)
    
        if is_train:  #training
#             all_loss = torch.cat(all_loss, dim=1)
#             all_loss = torch.mean(all_loss, dim=1)
#             loss = torch.mean(all_loss)
            loss = all_loss/self.max_length
    
            #print('loss size: ', loss.size())
            #torch.cuda.empty_cache()
            if self.use_cuda:
                return loss, predicts.data.cpu().tolist()
            else:
                return loss, predicts.data.tolist()
        else:   #testing
            if self.use_cuda:
                return predicts.data.cpu().tolist()
            else:
                return predicts.data.tolist()
#         if is_train:  #training
#             if self.use_cuda:
#                 return all_loss/(self.max_length+1), predicts.data.cpu().numpy()
#             else:
#                 return all_loss/(self.max_length+1), predicts.data.numpy()
#         else:   #testing
#             if self.use_cuda:
#                 return predicts.data.cpu().numpy()
#             else:
#                 return predicts.data.numpy()
    
    
    def decode_topk_seqs(self, encoder, inputs, input_lens, topk=3):
        enc_outputs, (enc_hn, enc_cn) = encoder(inputs, input_lens)
        batch_size = enc_outputs.size(dim = 0)
        
        #one hot of inputs
        sents_len = enc_outputs.size(1)
        inputs = inputs[:,:sents_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, sents_len, len(self.vocab.word2token)).zero_()
        one_hot.scatter_(2, inputs, 1)
        one_hot = one_hot.transpose(1,2)
        one_hot = self._tocuda(Variable(one_hot, requires_grad = 0))
        
        metadata = self.decode_by_beamsearch(encoder_hidden=(enc_hn, enc_cn), encoder_outputs=enc_outputs, inputs_one_hot=one_hot,topk = topk)
        results = metadata['topk_sequence']
        results =torch.cat(results, dim = 2)
        results=results.view(batch_size*topk, -1)
        if self.use_cuda:
            results = results.data.cpu().tolist()
        else:
            results = results.data.tolist()
#         results=batch_tokens_remove_eos(results, self.vocab)

#         labels = [x for x in labels for ii in range(topk)]
#         labels = batch_tokens_remove_eos(labels, self.vocab)
#         bleu_scores = batch_tokens_bleu(references=labels, candidates=results, smooth_epsilon=0.01)
        
#         bleu_scores = torch.FloatTensor(bleu_scores).view(batch_size, topk)
#         bleu_max, _ = torch.max(bleu_scores, dim=1)
        
#         bleu_mean = torch.mean(bleu_scores, dim=1).unsqueeze(dim=1)
#         bleu_scores = bleu_scores-bleu_mean
#         bleu_scores = bleu_scores.view(-1)
        
#         bleu_scores = self._tocuda(Variable(bleu_scores, requires_grad = 0))
#         log_probs = metadata['score']
#         log_probs = log_probs.view(batch_size*topk)
#         loss = -torch.dot(log_probs, bleu_scores)/batch_size/topk
#         return loss, results, torch.mean(bleu_mean.squeeze()), torch.mean(bleu_max)

        log_probs = metadata['score']
        log_probs = log_probs.view(batch_size*topk)
        
        return results, log_probs
        
        
        
    def _tocuda(self, var):
        if self.use_cuda:
            return var.cuda()
        else:
            return var
    def decode_by_beamsearch(self, encoder_hidden=None, encoder_outputs=None, inputs_one_hot=None, topk = 10):
        self.k = topk
        batch_size = encoder_outputs.size(dim=0)
        
        self.pos_index = self._tocuda(Variable(torch.LongTensor(range(batch_size)) * self.k).view(-1, 1))

        hidden = tuple([_inflate(h, self.k, 1).view(batch_size*self.k, -1) for h in encoder_hidden])
        #print('hidden0 size: (%s, %s)'%(hidden[0].size(), hidden[1].size()))

        encoder_outputs = _inflate(encoder_outputs, self.k, 1).view(batch_size*self.k, encoder_outputs.size(1), encoder_outputs.size(2))
        inputs_one_hot = _inflate(inputs_one_hot, self.k, 1).view(batch_size*self.k, inputs_one_hot.size(1), inputs_one_hot.size(2))
        
        # Initialize the scores; for the first step,
        # ignore the inflated copies to avoid duplicate entries in the top k
        sequence_scores = torch.Tensor(batch_size * self.k, 1)
        sequence_scores.fill_(-float('Inf'))
        sequence_scores.index_fill_(0, torch.LongTensor([i * self.k for i in range(0, batch_size)]), 0.0)
        sequence_scores = self._tocuda(Variable(sequence_scores))

        # Initialize the input vector
        input_var = self._tocuda(Variable(torch.LongTensor([self.SOS] * batch_size * self.k)))

        # Store decisions for backtracking
        stored_outputs = list()
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()
        stored_hidden = list()

        for ii in range(0, self.max_length):
            # Run the RNN one step forward
            #print('setp: %s'%ii)
            input_vec = self.embed(input_var)
            #print('input_var and input_vec size: ', input_var.size(), input_vec.size())
            hidden = self.lstmcell(input_vec, hidden)
            #print('hidden size: (%s, %s)'%(hidden[0].size(), hidden[1].size()))
            
            #log_softmax_output = self.log_softmax(self.fcnn(hidden[0]))
            
            logits = self.fcnn(hidden[0])
#             print('logits size', logits.size())
#             print(encoder_outputs.size())
#             print(input_vec.size())
#             print(hidden[0].size())
#             print(inputs_one_hot.size())
            if_copy, prob_copy = self.copy_mechanism(enc_outputs=encoder_outputs, this_timestep_input=input_vec.squeeze(dim=1), 
                                                            dec_state = hidden[0], inputs_one_hot = inputs_one_hot)
#             print('if_copy size', if_copy.size(), 'prob_copy size', prob_copy.size())
            
            score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
            score = torch.clamp(score, min=10**(-30), max=1)
#             print('score size: ', score.size())

            # To get the full sequence scores for the new candidates, add the local scores for t_i to the predecessor scores for t_(i-1)
            sequence_scores = _inflate(sequence_scores, self.V, 1)
            sequence_scores += torch.log(score).squeeze(1)
            scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            input_var = (candidates % self.V).view(batch_size * self.k, 1)
            sequence_scores = scores.view(batch_size * self.k, 1)

            # Update fields for next timestep
            predecessors = (candidates / self.V + self.pos_index.expand_as(candidates)).view(batch_size * self.k, 1)
            if isinstance(hidden, tuple):
                hidden = tuple([h.index_select(0, predecessors.squeeze()) for h in hidden])
            else:
                hidden = hidden.index_select(0, predecessors.squeeze())

            # Update sequence scores and erase scores for end-of-sentence symbol so that they aren't expanded
            stored_scores.append(sequence_scores.clone())
            eos_indices = input_var.data.eq(self.EOS)
            if eos_indices.nonzero().dim() > 0:
                sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            stored_emitted_symbols.append(input_var)
#             stored_hidden.append(hidden)

        # Do backtracking to return the optimal values
        output, h_t, h_n, s, l, p = self._backtrack(hidden,
                                                    stored_predecessors, stored_emitted_symbols,
                                                    stored_scores, batch_size, self.hidden_size)

        metadata = {}

        metadata['score'] = s
        metadata['topk_length'] = l
        metadata['topk_sequence'] = p
        metadata['length'] = [seq_len[0] for seq_len in l]
        metadata['sequence'] = [seq[0] for seq in p]
        
#         torch.cuda.empty_cache()
        
        return metadata

    def _backtrack(self, hidden, predecessors, symbols, scores, b, hidden_size):
        """Backtracks over batch to generate optimal k-sequences.

        Args:
            nw_output [(batch*k, vocab_size)] * sequence_length: A Tensor of outputs from network
            nw_hidden [(num_layers, batch*k, hidden_size)] * sequence_length: A Tensor of hidden states from network
            predecessors [(batch*k)] * sequence_length: A Tensor of predecessors
            symbols [(batch*k)] * sequence_length: A Tensor of predicted tokens
            scores [(batch*k)] * sequence_length: A Tensor containing sequence scores for every token t = [0, ... , seq_len - 1]
            b: Size of the batch
            hidden_size: Size of the hidden state

        Returns:
            output [(batch, k, vocab_size)] * sequence_length: A list of the output probabilities (p_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_t [(batch, k, hidden_size)] * sequence_length: A list containing the output features (h_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_n(batch, k, hidden_size): A Tensor containing the last hidden state for all top-k sequences.

            score [batch, k]: A list containing the final scores for all top-k sequences

            length [batch, k]: A list specifying the length of each sequence in the top-k candidates

            p (batch, k, sequence_len): A Tensor containing predicted sequence
        """

        lstm = isinstance(hidden, tuple)

        # initialize return variables given different types
        output = list()
        h_t = list()
        p = list()
        # Placeholder for last hidden state of top-k sequences.
        # If a (top-k) sequence ends early in decoding, `h_n` contains
        # its hidden state when it sees EOS.  Otherwise, `h_n` contains
        # the last hidden state of decoding.
        if lstm:
            state_size = hidden[0].size()
            h_n = tuple([torch.zeros(state_size), torch.zeros(state_size)])
        else:
            h_n = torch.zeros(nw_hidden[0].size())
        l = [[self.max_length] * self.k for _ in range(b)]  # Placeholder for lengths of top-k sequences
                                                                # Similar to `h_n`

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * b   # the number of EOS found
                                    # in the backward loop below for each batch

        t = self.max_length - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add self.pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = symbols[t].index_select(0, t_predecessors)
            # Re-order the back pointer of the previous step with the back pointer of
            # the current step
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze()

            # This tricky block handles dropped sequences that see EOS earlier.
            # The basic idea is summarized below:
            #
            #   Terms:
            #       Ended sequences = sequences that see EOS early and dropped
            #       Survived sequences = sequences in the last step of the beams
            #
            #       Although the ended sequences are dropped during decoding,
            #   their generated symbols and complete backtracking information are still
            #   in the backtracking variables.
            #   For each batch, everytime we see an EOS in the backtracking process,
            #       1. If there is survived sequences in the return variables, replace
            #       the one with the lowest survived sequence score with the new ended
            #       sequences
            #       2. Otherwise, replace the ended sequence with the lowest sequence
            #       score with the new ended sequence
            #
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            if eos_indices.dim() > 0:
                for i in range(eos_indices.size(0)-1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / self.k)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * self.k + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = predecessors[t][idx[0]]

                    current_symbol[res_idx, :] = symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = scores[t][idx[0]]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(self.k)
        for b_idx in range(b):
            l[b_idx] = [l[b_idx][k_idx.data[0]] for k_idx in re_sorted_idx[b_idx,:]]

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
#         output = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(output)]
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(p)]
        #    --- fake output ---
        output = None
        #    --- fake ---
        return output, h_t, h_n, s, l, p

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
            score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)

In [8]:
class Seq2Seq(nn.Module):
    """Encoder/decoder pair with a supervised and a reward-weighted objective.

    The module owns an ``Encoder`` (``self.enc``) and a ``Decoder``
    (``self.dec``) that shares the encoder; both are moved to the GPU when
    ``use_cuda`` is truthy.
    """

    def __init__(self, use_cuda, input_dim, hidden_dim, vocab, max_length = 25):
        """Build the encoder and decoder.

        Args:
            use_cuda: truthy to place both sub-modules on the current GPU.
            input_dim: forwarded to ``Encoder`` as ``input_dim``.
            hidden_dim: forwarded to both ``Encoder`` and ``Decoder``.
            vocab: vocabulary object forwarded to ``Encoder``.
            max_length: forwarded to ``Decoder`` as its ``max_length``.
        """
        super(Seq2Seq, self).__init__()

        self.use_cuda = use_cuda
        self.enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
        self.dec = Decoder(use_cuda=use_cuda, encoder=self.enc, hidden_dim=hidden_dim, max_length=max_length)
        if use_cuda:
            self.enc = self.enc.cuda()
            self.dec = self.dec.cuda()

    def forward(self, inputs, input_lens, labels, is_train=1, teaching_rate=1):
        """Encode ``inputs`` and run the decoder.

        Args:
            inputs: batch of token-id sequences (anything ``torch.LongTensor``
                accepts).
            input_lens: length of every sequence in ``inputs``.
            labels: target token ids; may be ``[]`` when ``is_train`` is falsy.
            is_train: truthy -> the decoder is run in training mode and
                ``(loss, predicts)`` is returned; falsy -> only ``predicts``.
            teaching_rate: forwarded to the decoder as ``teaching_rate``
                (presumably the teacher-forcing ratio -- confirm in Decoder).

        Returns:
            ``(loss, predicts)`` when ``is_train`` is truthy, else ``predicts``.
        """
        enc_outputs, (enc_hn, enc_cn) = self.enc(torch.LongTensor(inputs), torch.LongTensor(input_lens))
        # Arguments shared by both decoding modes.  `teaching_rate` is passed
        # through so that callers can actually control it.
        dec_kwargs = dict(enc_outputs=enc_outputs,
                          h0_and_c0=(enc_hn, enc_cn),
                          sent_lens=input_lens,
                          labels=torch.LongTensor(labels),
                          teaching_rate=teaching_rate,
                          inputs=inputs)
        if is_train:
            loss, predicts = self.dec(is_train=1, **dec_kwargs)
            return loss, predicts
        return self.dec(is_train=0, **dec_kwargs)

    def tocuda(self, x):
        """Return ``x.cuda()`` when this model runs on the GPU, else ``x`` unchanged."""
        if self.use_cuda:
            return x.cuda()
        else:
            return x

    def _sentences_ppl(self, language_model, sents):
        """Score ``sents`` with ``language_model.get_sentences_ppl`` and return its result."""
        lm_inputs, lm_input_lens, lm_labels = get_lm_inputs_and_labels(sents, self.enc.vocab, self.dec.max_length)
        return language_model.get_sentences_ppl(torch.LongTensor(lm_inputs),
                                                torch.LongTensor(lm_input_lens),
                                                torch.LongTensor(lm_labels))

    def train_using_reward(self, inputs, input_lens, reconstruct_labels, reconstruct_model, language_model, topk=3, loss_ratio=0.5):
        """Compute a reward-weighted loss over the top-k decoded sequences.

        The decoder proposes ``topk`` sequences per input.  Each one is split
        into two sentences, scored by ``language_model`` (inverse perplexity)
        and by ``reconstruct_model`` (negative reconstruction loss).  Both
        rewards are centred on their per-input mean over the ``topk``
        candidates, blended with ``loss_ratio`` and used as fixed weights on
        the sequence log-probabilities.

        Args:
            inputs, input_lens: source batch handed to ``decode_topk_seqs``.
            reconstruct_labels: targets for ``reconstruct_model``; assumed to be
                already repeated ``topk`` times per input -- TODO confirm.
            reconstruct_model: model whose ``forward`` returns ``(loss, predicts)``.
            language_model: object exposing ``get_sentences_ppl``.
            topk: number of candidates decoded per input.
            loss_ratio: weight of the reconstruction reward; the language-model
                reward gets ``1 - loss_ratio``.

        Returns:
            ``(loss, reconstruct_loss, mean reconstruction reward,
            mean language-model reward)``; the two means are taken before
            centring.
        """
        dec_seqs, log_probs = self.dec.decode_topk_seqs(self.enc, inputs, input_lens, topk=topk)
        simple_sent1s, simple_sent2s = seqs_split(dec_seqs, self.enc.vocab)

        simple_sent1s_ppl = self._sentences_ppl(language_model, simple_sent1s)
        simple_sent2s_ppl = self._sentences_ppl(language_model, simple_sent2s)

        simple_inputs, simple_input_lens = simple_sents_concat(simple_sent1s, simple_sent2s, self.enc.vocab, self.dec.max_length)
        #reconstruct labels
        reconstruct_loss, predicts = reconstruct_model.forward(torch.LongTensor(simple_inputs),
                                     torch.LongTensor(simple_input_lens),
                                     labels=reconstruct_labels,
                                     is_train=1, teaching_rate=1)

        #rm_rewards: reconstruct model rewards
        #lm_rewards: language model rewards
        rm_rewards=-reconstruct_loss.data
        lm_rewards=(1/self.tocuda(torch.Tensor(simple_sent1s_ppl))+1/self.tocuda(torch.Tensor(simple_sent2s_ppl)))/2

        # Subtract the per-input mean over the topk candidates so that rewards
        # are relative to the other candidates of the same input.
        rm_rewards_mean = torch.mean(rm_rewards.view(-1, topk), dim=1)
        lm_rewards_mean = torch.mean(lm_rewards.view(-1, topk), dim=1)
        rm_rewards = rm_rewards.view(-1, topk) - rm_rewards_mean.unsqueeze(dim=1)
        lm_rewards = lm_rewards.view(-1, topk) - lm_rewards_mean.unsqueeze(dim=1)

        rm_rewards = rm_rewards.view(-1)
        lm_rewards = lm_rewards.view(-1)

        #sum both rewards up
        rewards = loss_ratio*rm_rewards+(1-loss_ratio)*lm_rewards
        # rewards act as constants: no gradient flows through them
        rewards = Variable(rewards, requires_grad=False)

        #regarding rewards as weights of every seq
        loss = -torch.dot(log_probs, rewards)/log_probs.size(0)

        return loss, reconstruct_loss, torch.mean(rm_rewards_mean), torch.mean(lm_rewards_mean)
    
    


In [None]:
def split_model_eval(model, inputs, input_lens, labels):
    """Return the mean split-version BLEU of ``model`` over a whole data set.

    The data is decoded in chunks of the module-level ``batch_size`` without
    labels (``is_train=0``), each chunk is scored with
    ``batch_tokens_bleu_split_version`` against the matching ``labels`` and
    the scores are averaged over ``len(inputs)``.  The data set size is
    printed once before decoding starts.
    """
    dataset_size = len(inputs)
    print(dataset_size)
    bleu_total = 0
    for batch_start in range(0, dataset_size, batch_size):
        window = slice(batch_start, batch_start + batch_size)

        # free-running decoding: empty labels, is_train switched off
        predicts = model.forward(torch.LongTensor(inputs[window]),
                                 torch.LongTensor(input_lens[window]),
                                 labels=[],
                                 is_train=0, teaching_rate=1)
        batch_scores = batch_tokens_bleu_split_version(references=labels[window],
                                                       candidates=predicts,
                                                       smooth_epsilon=0.001,
                                                       vocab=vocab)
        for score in batch_scores:
            bleu_total += score
    return bleu_total/dataset_size


def fusion_model_eval(model, inputs, input_lens, labels):
    """Return the mean sentence BLEU of ``model`` over a whole data set.

    The data is decoded in chunks of the module-level ``batch_size`` without
    labels (``is_train=0``).  Predictions and references are both cut at the
    first ``<eos>`` via ``batch_tokens_remove_eos`` before being scored with
    ``batch_tokens_bleu``; the scores are averaged over ``len(inputs)``.
    """
    dataset_size = len(inputs)
    bleu_total = 0
    for batch_start in range(0, dataset_size, batch_size):
        window = slice(batch_start, batch_start + batch_size)

        # free-running decoding: empty labels, is_train switched off
        decoded = model.forward(torch.LongTensor(inputs[window]),
                                torch.LongTensor(input_lens[window]),
                                labels=[],
                                is_train=0, teaching_rate=1)
        candidates = batch_tokens_remove_eos(decoded, vocab)
        references = batch_tokens_remove_eos(labels[window], vocab)
        batch_scores = batch_tokens_bleu(references=references, candidates=candidates, smooth_epsilon=0.001)
        for score in batch_scores:
            bleu_total += score
    return bleu_total/dataset_size



In [None]:
# ---- hyper-parameters for the supervised pre-training run ----
use_cuda = 1
hidden_dim, input_dim = 256, 100
lr = 0.005
batch_size = 180
epochs = 10000
# The divisor presumably selects a fraction of the training set; 1 keeps all of it.
split_train_set_size = int(len(split_train_set_inputs)/1)
# Placeholders that are echoed in the progress line printed during training.
train_bleu_mean = train_bleu_max = -1

# ---- models (split first, then fusion, so parameter initialisation order is fixed) ----
split_model = Seq2Seq(use_cuda=use_cuda, input_dim=input_dim, hidden_dim=hidden_dim,
                      vocab=vocab, max_length=61)

fusion_model = Seq2Seq(use_cuda=use_cuda, input_dim=input_dim, hidden_dim=hidden_dim,
                       vocab=vocab, max_length=51)

# ---- optional warm start from earlier checkpoints (disabled) ----
# split_model_path = './models_saved/time-[2019-03-09-18-40-05]-info=[pre-trained_split_model-20per]-loss=0.673111856-bleu=0.7492-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-09-18-40-07]-info=[pre-trained_fusion_model-20per]-loss=0.515186548-bleu=0.3736-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# split_model_path = './models_saved/time-[2019-03-10-13-23-10]-info=[pre-trained_split_model-20per]-loss=0.454687029-bleu=0.7130-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# pre_train = torch.load(split_model_path, map_location='cpu')
# split_model.load_state_dict(pre_train)
# pre_train = torch.load(fusion_model_path, map_location='cpu')
# fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
    fusion_model = fusion_model.cuda()

# ---- optimizers: only parameters that require gradients are updated ----
split_optimizer = optim.Adam([p for p in split_model.parameters() if p.requires_grad], lr=lr)
fusion_optimizer = optim.Adam([p for p in fusion_model.parameters() if p.requires_grad], lr=lr)

# reference point for the total running time reported after training
start_time = time.time()

def _loss_to_float(loss):
    """Return a scalar loss as a Python number on both old and new PyTorch."""
    # `.data[0]` raises on 0-dim tensors in recent PyTorch, while `.item()`
    # does not exist on the old Variable API.
    return loss.item() if hasattr(loss, 'item') else loss.data[0]


def _print_sample_pairs(model, inputs, input_lens, labels, teacher_forcing):
    """Decode ``inputs`` with ``model`` and print prediction/label sentence pairs.

    With ``teacher_forcing`` the labels are fed to the model (``is_train=1``);
    otherwise the model decodes on its own (``is_train=0`` and empty labels).
    """
    if teacher_forcing:
        loss_, predicts = model.forward(torch.LongTensor(inputs),
                                        torch.LongTensor(input_lens),
                                        labels=torch.LongTensor(labels),
                                        is_train=1, teaching_rate=1)
        del loss_
    else:
        predicts = model.forward(torch.LongTensor(inputs),
                                 torch.LongTensor(input_lens),
                                 labels=[],
                                 is_train=0, teaching_rate=1)

    predicts = batch_tokens2words(batch_tokens_remove_eos(predicts, vocab), vocab)
    references = batch_tokens2words(batch_tokens_remove_eos(labels, vocab), vocab)

    for predict_sent, label_sent in zip(batch_words2sentence(predicts), batch_words2sentence(references)):
        print(' 1----> ', predict_sent)
        print(' 2----> ', label_sent)
        print('\n')


def _mean_score(scores):
    """Arithmetic mean of ``scores``, accumulated left to right."""
    total = 0
    for score in scores:
        total += score
    return total/len(scores)


def _save_checkpoint(model, tag, loss_value, valid_bleu, epoch, batch_size, batch_id, train_set_size):
    """Print the checkpoint stamp and save ``model.state_dict()`` under ./models_saved/."""
    import os

    info_stamp = 'info=[{:s}]-loss={:2.9f}-bleu={:1.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}'.format(
                  tag, loss_value, valid_bleu, hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr)
    print(info_stamp, valid_bleu)
    now = int(round(time.time()*1000))
    time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
    # torch.save does not create missing directories
    os.makedirs('./models_saved', exist_ok=True)
    torch.save(model.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))


def model_train(epoch, batch_size, train_set_size):
    """Run one epoch of supervised pre-training for the split and fusion models.

    Every full batch performs one optimizer step per model.  Whenever
    ``batch_id % 50 == 1`` a handful of training samples is decoded and
    printed; whenever ``batch_id % 500 == 1`` both models are additionally
    scored on a random validation window of ``batch_size`` samples and
    checkpointed.  Models, optimizers, data sets and hyper-parameters are read
    from module-level names.

    Args:
        epoch: epoch index, used only in the printed / saved stamps.
        batch_size: number of samples per optimisation step.
        train_set_size: number of leading training samples to iterate over;
            a trailing partial batch is skipped.
    """
    batch_id = 0
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        batch_id+=1
        end_idx = start_idx + batch_size

        # ---- split model step ----
        split_optimizer.zero_grad()
        split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[start_idx:end_idx]),
                                     torch.LongTensor(split_train_set_input_lens[start_idx:end_idx]),
                                     labels=torch.LongTensor(split_pseudo_train_set_labels[start_idx:end_idx]),
                                     is_train=1, teaching_rate=1)
        split_loss=torch.mean(split_loss)
        split_loss.backward()
        split_optimizer.step()

        # ---- fusion model step ----
        fusion_optimizer.zero_grad()
        fusion_loss, predicts = fusion_model.forward(torch.LongTensor(fusion_pseudo_train_set_inputs[start_idx:end_idx]),
                                     torch.LongTensor(fusion_pseudo_train_set_input_lens[start_idx:end_idx]),
                                     labels=torch.LongTensor(fusion_pseudo_train_set_labels[start_idx:end_idx]),
                                     is_train=1, teaching_rate=1)
        fusion_loss = torch.mean(fusion_loss)
        fusion_loss.backward()
        fusion_optimizer.step()

        if batch_id%50!=1:
            continue

        split_model.eval()
        fusion_model.eval()
        try:
            sample_num = 5
            rand_idx = random.randint(0, max(0, train_set_size-sample_num-1))
            window = slice(rand_idx, rand_idx+sample_num)

            print('--------split model training sampling display--------')
            _print_sample_pairs(split_model,
                                split_train_set_inputs[window],
                                split_train_set_input_lens[window],
                                split_pseudo_train_set_labels[window],
                                teacher_forcing=True)

            print('--------fusion model training sampling display--------')
            _print_sample_pairs(fusion_model,
                                fusion_pseudo_train_set_inputs[window],
                                fusion_pseudo_train_set_input_lens[window],
                                fusion_pseudo_train_set_labels[window],
                                teacher_forcing=True)

            print('----no teaching forcing----')
            # The split model is sampled on its own inputs so that the printed
            # split labels line up with what it was asked to decode.
            _print_sample_pairs(split_model,
                                split_train_set_inputs[window],
                                split_train_set_input_lens[window],
                                split_pseudo_train_set_labels[window],
                                teacher_forcing=False)

            info_stamp = 'split_loss={:2.9f}-train_bleu_mean={:2.9f}-train_bleu_max={:2.9f}-batch_size={:n}-epoch={:n}-batch_id=({:n}/{:n})'.format(
                              _loss_to_float(split_loss), train_bleu_mean, train_bleu_max, batch_size, epoch, batch_id, int(train_set_size/batch_size))
            print(info_stamp)

            if batch_id%500==1:
                # ---- split model: random validation window + checkpoint ----
                rand_idx=random.randint(0, max(0, len(split_valid_set_inputs)-batch_size-1-1))
                predicts = split_model.forward(torch.LongTensor(split_valid_set_inputs[rand_idx:rand_idx+batch_size]),
                                                 torch.LongTensor(split_valid_set_input_lens[rand_idx:rand_idx+batch_size]),
                                                 labels=[],
                                                 is_train=0, teaching_rate=1)
                bleu_scores = batch_tokens_bleu_split_version(references = split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size],
                                                             candidates = predicts,
                                                             smooth_epsilon=0.001,
                                                             vocab=vocab)
                _save_checkpoint(split_model, 'pretrain_split-att-20per', _loss_to_float(split_loss),
                                 _mean_score(bleu_scores), epoch, batch_size, batch_id, train_set_size)

                # ---- fusion model: random validation window + checkpoint ----
                rand_idx=random.randint(0, max(0, len(fusion_pseudo_valid_set_inputs)-batch_size-1-1))
                predicts = fusion_model.forward(torch.LongTensor(fusion_pseudo_valid_set_inputs[rand_idx:rand_idx+batch_size]),
                                                 torch.LongTensor(fusion_pseudo_valid_set_input_lens[rand_idx:rand_idx+batch_size]),
                                                 labels=[],
                                                 is_train=0, teaching_rate=1)
                predicts = batch_tokens_remove_eos(predicts, vocab)
                labels = batch_tokens_remove_eos(fusion_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size], vocab)
                bleu_scores = batch_tokens_bleu(references=labels, candidates=predicts, smooth_epsilon=0.001)
                _save_checkpoint(fusion_model, 'pretrain_fusion-att-20per', _loss_to_float(fusion_loss),
                                 _mean_score(bleu_scores), epoch, batch_size, batch_id, train_set_size)
        finally:
            # always return to training mode, even if sampling or saving failed
            split_model.train()
            fusion_model.train()
            
# Train for the configured number of epochs on the selected part of the
# training set, then report the wall-clock time since `start_time`.
for epoch in range(epochs):
    model_train(epoch, batch_size, split_train_set_size)

print('running time: %.2f mins' % ((time.time() - start_time) / 60,))

init lookup embedding matrix size:  torch.Size([44380, 100])
init lookup embedding matrix size:  torch.Size([44380, 100])
--------split model training sampling display--------
 1---->  the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
 2---->  in britain , the ha - ha is a feature in landscape gardens laid out by charles <low_freq> and by william kent . <split> the device was an essential component of the '' swept '' views of capability brown .


 1---->  the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
 2---->  in britain some mps may <low_freq> their importance to their constituents . <split> in fact ,

 1---->  after after after the the after the after the the , , , the after , , the after after after the , , the the , after after the after after the after after after after
 2---->  the <low_freq> and the <low_freq> were allies and in 1529 <low_freq> warriors were harassing spanish forces and <low_freq> was trying to <low_freq> rebellion among the <low_freq> ' ; the spanish decided that military action was necessary .


 1---->  after the , a <low_freq> of , , the , the , the the , the <low_freq> , , he ,
 2---->  the <low_freq> is a rare natural breed of cat that originates from the region of the amur river , russia .


 1---->  after the , he , , he <low_freq> of , , the the , the <low_freq> , , the , he , the the of , the , the the of . . the the . .
 2---->  the ust '' <low_freq> formation is a geological formation exposed along the banks of the <low_freq> river in russia , famed for the vast quantities of fossils of the <low_freq> <low_freq> preserved in its ash beds .


 1---->

 1---->  as a result , he was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> .
 2---->  both flew almost straight away into england to meet connolly , during the november 1995 . <split> the result giving brian connolly some closure to an issue that had haunted him , for a considerable period .


 1---->  as the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> .
 2---->  both flight recorders were recovered from the crash site . <split> the recorders were delivered to the interstate aviation committee in moscow .


 1---->  as the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was a result of the <low_freq> , the <low_freq> was the <low_freq> .
 2---->  both forces comprised men - at - arms and were relatively <low_freq> . <split> t

--------split model training sampling display--------
 1---->  politics of the but place in a constituency democratic a labour representative democratic <split> the president is the head of state and in president minister , been of of of the involved head of government . is involved by a council of government .
 2---->  politics of mauritius takes place in a framework of a parliamentary democracy . <split> the president is the head of state while the prime minister has full executive power and is the head of government who is assisted by a council of ministers .


 1---->  <low_freq> - <low_freq> <low_freq> , and operated by 1943 police regiment and <low_freq> . men from eastern europe . <split> the '' commander - <low_freq> '' were the german mass camp such as <low_freq> and tutor .
 2---->  <low_freq> - führer <low_freq> , and operated by ss police battalions and <low_freq> -- volunteers from eastern europe . <split> the '' ss - <low_freq> '' managed the nazi concentration camps such

 1---->  an action comedy film ' starring kevin james is being shot entirely at the resort . <split> it is the first film ever shot in the property of steve trump , by his departure .
 2---->  an action comedy film ' starring kevin james is being shot entirely at the casino . <split> it is the first film ever shot in the property of steve wynn by his permission .


 1---->  an active <low_freq> , she introduced the bill that became the bill that became the rights of entry ( gas and electricity . <split> she promoted a bill to pay arm money to the elderly living in institutions .
 2---->  an active <low_freq> , she introduced the bill that became the rights of entry ( gas and electricity boards ) act , 1954 . <split> she promoted a bill to pay pocket money to the elderly living in institutions .


 1---->  an active ecclesiastical politician , he was an alliance of pope <low_freq> . <split> he was involved in the latter 's family , he was involved in the three - chapter controversy .
 2

--------split model training sampling display--------
 1---->  <low_freq> is a ward within the city of derby . <split> prior to this <low_freq> he was a separate village which dated from before the domesday book of 1086 .
 2---->  <low_freq> is a ward within the city of derby . <split> prior to this , <low_freq> was a separate village which dated from before the domesday book of 1086 .


 1---->  chocolate pepsi ( often spelled <low_freq> ) is a filipino rock band formed in 2002 with <split> he in 2002 with original drummer chris <low_freq> , chris left the group due to his deteriorating health condition .
 2---->  sponge cola ( often spelled <low_freq> ) is a filipino rock band formed in 2002 . <split> formed in 1998 with original drummer chris <low_freq> , chris left the group due to his worsening health condition .


 1---->  gleason , sandy and patrick see this '' and , , , try to calm everyone down . <split> he try is them that whatever happens , dresses bottom is their home , and

 1---->  remix are included from mark alston , johnny dirty . <split> the original version was produced by the original version was produced by nick aja .
 2---->  remixes are included from mark alston , johnny vicious , jim heinz , jamie j sanchez , and mike rizzo . <split> the original version was produced by nick saya .


 1---->  remnants of <low_freq> fishing gear were found near the abundant <low_freq> of '' <low_freq> - <low_freq> - <low_freq> park , where dense trees provide habitat for many birds . <split> the <low_freq> is now called me - <low_freq> - <low_freq> - <low_freq> park , where dense trees provide habitat for many birds and the <low_freq> - <low_freq> - <low_freq> park ,
 2---->  remnants of <low_freq> fishing gear were found near the abundant <low_freq> of '' <low_freq> - <low_freq> - <low_freq> '' ( '' shaped like a bear 's head '' , for the west seattle peninsula ) . <split> the site is in what is now called me - <low_freq> - <low_freq> park , where dense trees p

 1---->  it <low_freq> is used , like many branches in north wales , a <low_freq> campus by local government projects ( fda ) , and to the water is an from the nearby park park .
 2---->  llyn <low_freq> is used , like many lakes in north wales as a <low_freq> facility by local education authorities ( lea ) , access to the water is provided from the roadside car park .


 1---->  it <low_freq> began his career by <low_freq> <low_freq> , and <low_freq> 18 be <low_freq> , and signed on 18 september 2006 by <low_freq> <low_freq> .
 2---->  lo <low_freq> began his career by psc <low_freq> , joined than to nk <low_freq> , and signed on 18 february 2006 by nk <low_freq> .


 1---->  it ( often abbreviated as <low_freq> ) , also known as <low_freq> guitar , is an american group music band consisting of artists martin and <low_freq> <low_freq> , and of canada , ontario , <low_freq> , respectively , and .
 2---->  <low_freq> ( sometimes stylized as <low_freq> ) , formerly known as <low_freq> co

--------split model training sampling display--------
 1---->  the hospital of st. thomas the martyr of <low_freq> is situated on the king 's - bridge , near the carrollton , in canterbury . <split> the was built by thomas <low_freq> becket , a significant of middle middle ages , , was built famous for being assassinated in his own cathedral .
 2---->  the hospital of st. thomas the martyr of <low_freq> is situated on the king 's - bridge , near the westgate , in canterbury . <split> it was built by thomas <low_freq> becket ( a clergyman of the middle ages ) who was most famous for being assassinated in his own cathedral .


 1---->  the hosts ' language also does not allow for lying or even speculation . <split> they , they create literal <low_freq> by recruiting individuals to perform bizarre <low_freq> that can then become allusions in language .
 2---->  the hosts ' language also does not allow for lying or even speculation . <split> instead , they create literal <low_freq> by recr

 1---->  meanwhile , felix , oc , and rafael , his died died after their last child was born .
 2---->  cecilia , felix , agustin , and vicente , jr. sofia died after their last child was born .


 1---->  lily brings her car to suit douglas for a service , she she also tells him to replace the brake pitch .
 2---->  cecilia brings her car to fitzgerald motors for a service , and she also asks steph to replace the brake pads .


 1---->  <low_freq> joshi ( born 14 september 1990 ) is a norwegian footballer from plays for for norwegian , <low_freq> , <low_freq> <low_freq> the norway women 's national football team .
 2---->  <low_freq> pedersen ( born 14 september 1990 ) is a norwegian footballer that plays as a striker for lillestrøm in <low_freq> and the norway women 's national football team .


 1---->  mary 's third and final marriage , to thomas <low_freq> , <low_freq> , or <low_freq> , an lincolnshire lincolnshire browne , otherwise between may <low_freq> and january 2018 , and t

--------split model training sampling display--------
 1---->  top scholarly presses require funds for publication costs in their contracts ( peter lang , for instance , amongst other major european houses ) . <split> amongst themselves the the shrouded and in controversy for racism , homophobia and fascism .
 2---->  top scholarly presses require funds for publication costs in their contracts ( peter lang , for instance , amongst other major european houses ) . <split> <low_freq> are , themselves , shrouded in controversy for racism , sexism and fascism .


 1---->  top session drummer hal blaine played on the record <split> he is it one of his favorites .
 2---->  top session drummer hal blaine played on this . <split> he considers it one of his favorites .


 1---->  <low_freq> is a hybrid personal note and information organizer and password protector for macintosh computers . <split> it is notes to be entered and organized .
 2---->  <low_freq> is a hybrid personal note and informa

 1---->  it was used as the theme song of stephen king 's '' golden years '' , which the the pilot of the cbs series '' <low_freq> '' .
 2---->  it was used as the theme song of stephen king 's '' golden years '' , and in the pilot of the cbs series '' <low_freq> '' .


 1---->  it was used by greek writers such as herodotus herodotus <low_freq> and <low_freq> <low_freq> with democracy , is said to have been first ordained by the ancient egyptians <low_freq> rabat ( c. <low_freq> - <low_freq> ) .
 2---->  it was used by greek writers such as herodotus herodotus <low_freq> and <low_freq> <low_freq> with democracy and is said to have been first ordained by the ancient athenian <low_freq> solon ( c. <low_freq> - <low_freq> ) .


 1---->  the was used extensively in north america , the 1948 olympics , however , continued to use the timing .
 2---->  it was used extensively in north america but the 1948 olympics , however , continued to use omega timing .


 1---->  it was used in a wide va

 1---->  the '' saint benedict press classics '' line first began in 2006 . <split> this product line started with public domain classic catholic titles as a direct competitor for tan books and publishers .
 2---->  the '' saint benedict press classics '' line first began in 2006 . <split> this product line started with public domain classic catholic titles as a direct competitor for tan books and publishers .


 1---->  the '' saint ninian '' came alongside to take off survivors . <split> it was herself torpedoed by '' ub - 48 '' .
 2---->  the '' saint ninian '' came alongside to take off survivors , but was herself torpedoed by '' ub - 48 '' . <split> she sank with the loss of 15 men .


 1---->  the '' sam '' does his fair share of <low_freq> , but he also needs to play the run and take on blockers . <split> he also needs to play the run and take on blockers , making him a bigger linebacker on average than the weak - side linebacker .
 2---->  the '' sam '' does his fair share of <

--------split model training sampling display--------
 1---->  the second soldier was surrounded by a team of fifteen to twenty activists in two groups . <split> the of thrown the , he landed from the ship 's roof .
 2---->  the second soldier was surrounded by a team of fifteen to twenty activists in two groups . <split> one group attacked him when he landed on the ship 's roof .


 1---->  the second son of rev. j. h. <low_freq> , he was educated at highgate school and <split> he was at the , , cambridge graduated at highgate college , cambridge where he graduated ma degree .
 2---->  the second son of rev. j. h. shakespeare , he was educated at highgate school . <split> he served in world war i. he studied at emmanuel college , cambridge where he graduated ma llb .


 1---->  the second son of william dallas of lloyd 's and sarah day , he was born in london 27 the june 1809 . <split> he was descended from alexander dallas of <low_freq> , inverness - shire .
 2---->  the second son o

 1---->  robert donald kilpatrick , jr. ( born august 20 , 1949 ) , better known as patrick kilpatrick , is an american actor , director , screenwriter , producer , and journalist . <split> he has appeared in over 117 films and television series .
 2---->  robert donald kilpatrick , jr. ( born august 20 , 1949 ) , better known as patrick kilpatrick , is an american actor , director , screenwriter , producer , and journalist . <split> he has appeared in over 117 films and television series .


 1---->  robert donald murphy ( born january 18 , 1977 ) is an american professional gridiron football offensive lineman who currently currently a free agent . <split> he is currently a free agent .
 2---->  robert donald murphy ( born january 18 , 1977 ) is an american professional gridiron football offensive lineman who is currently a free agent . <split> he most recently played for the toronto argonauts of the canadian football league ( cfl ) .


 1---->  robert donaldson is a high school stude

--------split model training sampling display--------
 1---->  at the end of the second pool the the water picks . <split> the water picks up speed and starts its decent though iron ore rock .
 2---->  at the end of the second pool , the river narrows . <split> the water picks up speed and starts its decent though iron ore rock .


 1---->  at the end of the adder battles where the remaining teams consist of <low_freq> and his friends , <low_freq> <low_freq> ) . hao 's team . <split> the remaining teams decide to forfeit the tournament crowning the hao the title of dharma king .
 2---->  at the end of the shaman battles , the remaining teams consist of <low_freq> and his friends ( the <low_freq> ) and hao 's team . <split> the remaining teams decide to forfeit the tournament , crowning hao the title of shaman king .


 1---->  at the end of the song , the woman of the house will serve some and give money . make <low_freq> . . the . <split> instead addition , the <low_freq> team will gi

 1---->  <low_freq> ramon adrian sebastian <low_freq> es <low_freq> de sebastian sebastian <low_freq> , <low_freq> de la familia hom en el , de <low_freq> <low_freq> <low_freq> abdallah , <low_freq> , <low_freq> 1841 .
 2---->  juan ramon sebastian mera <low_freq> es nieto de sebastian mera alonzo , <low_freq> de la familia mera en <low_freq> <low_freq> y <low_freq> <low_freq> en oviedo , asturias , <low_freq> 1841 .


 1---->  juan martín lópez caro ( born march <low_freq> , spain ) spain 23 , 1969 in is the head of racing coach , racing spanish la liga , and brazilian <low_freq> <low_freq> who was fired december 4 , 2005 .
 2---->  juan ramón lópez caro ( born in <low_freq> , spain , march 23 , 1969 ) is the manager of real madrid of the spanish la liga , replacing brazilian <low_freq> <low_freq> who was fired december 4 , 2005 .


 1---->  rivera rivera ( born august 10 , 1964 ) is a professional wrestler who who known for his career in the world wrestling council where where he has

 1---->  a common starting dose is 15 mg iv , equivalent to 10 mg of morphine hydrochloride . <split> <low_freq> is commonly used for the treatment of <low_freq> pain .
 2---->  a common starting dose is 15 mg iv , equivalent to 10 mg of morphine hydrochloride . <split> <low_freq> is commonly used for the treatment of <low_freq> pain .


 1---->  a common symptom of '' a. <low_freq> '' is seen when the <low_freq> is feeding in the <low_freq> tissue . <split> <low_freq> is the <low_freq> is <low_freq> , which are <low_freq> at first then turn <low_freq> as the feeding persists .
 2---->  a common symptom of '' a. <low_freq> '' is seen when the <low_freq> is feeding in the <low_freq> tissue . <split> angular lesions are formed , which are <low_freq> at first then turn <low_freq> as the feeding persists .


 1---->  a common thread running through them is that <low_freq> is a woman possessed of supernatural powers . <split> the daughter of a fairy mother and a human father , she forbids h

--------split model training sampling display--------
 1---->  doomsday , star diamond , <low_freq> and rampage . <split> she <low_freq> traveled back in time from 2030 to 1997 to prevent earth 's destruction .
 2---->  doomsday , star sapphire , <low_freq> and rampage . <split> the heroes traveled back in time from 2030 to 1997 to prevent earth 's destruction .


 1---->  <low_freq> ( <low_freq> davis ) is a fictional character appearing a mutant superhero appearing the marvel comics universe . <split> it appeared depicted in the <low_freq> in the appeared in the pages of the '' west coast avengers '' in 1989 .
 2---->  <low_freq> ( <low_freq> davis ) is a fictional character , a mutant superhero in the marvel comics universe . <split> he was created by john byrne and first appeared in the pages of the '' west coast avengers '' in 1989 .


 1---->  <low_freq> ' first sport was tennis . <split> he competed he competed at semi-professional national level in the netherlands .
 2---->  <l

 1---->  rebecca field is an american actress . <split> he was raised in lenox dale , massachusetts .
 2---->  rebecca field is an american actress . <split> field was raised in lenox dale , massachusetts .


 1---->  rebecca <low_freq> ( born april 9th , 1977 ) is an english actress . <split> she is of mixed english and indian descent .
 2---->  rebecca <low_freq> ( born april 9th , 1977 ) is an english actress . <split> she is of mixed english and indian descent .


 1---->  rebecca eugene eugene , was the second wife of roger sherman . <split> she was the daughter of benjamin and rebecca malden , and the niece of roger sherman 's brother rev. josiah sherman and the second cousin , josiah sherman and the second cousin once removed of colonel william prescott .
 2---->  rebecca minot prescott ( 1743 -- 1793 ) was the second wife of roger sherman . <split> she was the daughter of benjamin and rebecca minot prescott from salem , massachusetts ; the niece of roger sherman 's brother rev.

--------split model training sampling display--------
 1---->  jason frederick kidd ( born march 23 , 1973 ) is an american former player . former professional . <split> he is currently the head coach of the milwaukee bucks of the national basketball association ( nba ) .
 2---->  jason frederick kidd ( born march 23 , 1973 ) is an american basketball coach and former player . <split> he is currently the head coach of the milwaukee bucks of the national basketball association ( nba ) .


 1---->  jason goodall ( born 23 january 1967 ) is a former australian basketball . . the philippines states . <split> he was playing tennis in zambia , central africa , when he was nine years old .
 2---->  jason goodall ( born 23 january 1967 ) is a former professional tennis player from the united kingdom . <split> goodall started playing tennis in zambia , central africa , when he was nine years old .


 1---->  jason <low_freq> is an english - greek technology developer and entrepreneur from <low_

 1---->  it was then sacked by the lombards in buchenwald , and later by the moors in 737 , and when 's king divided frankish <low_freq> into three parts in urn by the treaty of verdun , vienne in became part of middle ethiopians .
 2---->  it was then sacked by the lombards in 558 , and later by the moors in 737 , so francia 's king divided frankish <low_freq> into three parts in 843 by the treaty of verdun , hence vienne became part of middle francia .


 1---->  it was then sent to all the bishops in the world , and were ordered to maintain a strict confidentiality about any matters of sexual abuse by priests .
 2---->  it was then sent to all the bishops in the world , who were ordered to maintain a strict confidentiality about any matters of sexual abuse by priests .


 1---->  it was then sent to the spain , losing over half their men at the battle of <low_freq> in april 1707 , and took took part in the capture of vigo in october 1719 .
 2---->  it was then sent to the spain , lo

 1---->  she focused too on non-profit organizations , such as the march of <low_freq> , the veterans of foreign wars , the salvation army . <split> the lions international crippled children 's camp in <low_freq> .
 2---->  she focused too on non-profit organizations , such as the march of <low_freq> , the veterans of foreign wars , the salvation army , and the lions international crippled children 's camp in <low_freq> . <split> he had segments on the louisiana national guard and the roots of the methodist church in central louisiana .


 1---->  she followed her mother 's lead , earning a degree in early childhood education from hampton ( institute ) . <split> she taught as a first grade teacher in greensboro , north carolina .
 2---->  she followed her mother 's lead , earning a degree in early childhood education from hampton ( institute ) university . <split> she taught as a first grade teacher in greensboro , north carolina .


 1---->  she follows the rituals that society expect

 1---->  randomized randomized clinical trials met the criteria for review and were reviewed by professors ernst the peninsula college of medicine and dentistry , that <low_freq> has <low_freq> effects on a range of cancers .
 2---->  seven randomized clinical trials met the criteria for review and were reviewed by professors at the peninsula college of medicine and dentistry suggested that <low_freq> has <low_freq> effects on a range of cancers .


 1---->  four trainees escaped one night , to the time the officers had retired to bed leaving the trainees to their own devices .
 2---->  seven trainees escaped one night , at the time the officers had retired to bed leaving the trainees to their own devices .


 1---->  just years later , during the 's reign of hyrule , the carpenters have departed up a tent in <low_freq> valley in restore the bridge that the <low_freq> thieves have to ward off intruders .
 2---->  seven years later , during ganondorf 's reign of hyrule , the carpenters 

 1---->  during this period , milan was living in brussels . <split> <low_freq> mentions that he later served under cardinal mazarin .
 2---->  during this period , milan was living in brussels . <split> <low_freq> mentions that he later served under cardinal mazarin .


 1---->  during this period , mr jones attempted to improve the co-ordination of australia 's transport . <split> consequently , the australian transport had a significant degree of systemic inefficiency .
 2---->  during this period , mr jones attempted to improve the co-ordination of australia 's transport systems . <split> consequently australian transport had a significant degree of systemic inefficiency .


 1---->  during this period , nano left the political scene to return back a few years later . <split> he undertook a controversial campaign to clean the socialist party from corruption which came to be known as movement for <low_freq> .
 2---->  during this period , nano left the political scene to return back

 1---->  garrison frazier , the 67 year - old former pastor of third african baptist , had in the late 1850s , he for $ 1,000 bought freedom for himself and his wife .
 2---->  garrison frazier , the 67 year - old former pastor of third african baptist who , in the late 1850s , had for $ 1,000 bought freedom for himself and his wife .


 1---->  savannah savannah ( 1983 - 2005 ) was a thoroughbred thoroughbred racehorse winning won the cheltenham gold cup in 1991 by a short - head from the fellow with desert orchid back in third place .
 2---->  garrison savannah ( 1983 - 2005 ) was a famous racehorse , who won the cheltenham gold cup in 1991 by a short - head from the fellow with desert orchid back in third place .


 1---->  <low_freq> is thinking about telling tommy that he is dating his sister , when franco what he thinks , franco runs away from him ; telling sean that he does n't even want to be involved in in way he is n't a victim of crazed retribution .
 2---->  <low_freq> is t

 1---->  the french city of <low_freq> , eure had 19 students from their high school visit america . <split> in america the students stayed in the homes of canal winchester high school students .
 2---->  the french city of <low_freq> , eure had 19 students from their high school visit america . <split> while in america the students stayed in the homes of canal winchester high school students .


 1---->  the french crossing resulted in the battle of <low_freq> - <low_freq> . <split> beginning on 21 may , the battle opposed the numerous austrian army to only a proportion of the '' grande <low_freq> '' , as napoleon was unable to bring through the bulk of his forces in time .
 2---->  the french crossing resulted in the battle of <low_freq> - <low_freq> . <split> beginning on 21 may , the battle opposed the numerous austrian army to only a fraction of the '' grande <low_freq> '' , as napoleon was unable to bring through the bulk of his forces in time .


 1---->  the french decisively d

--------split model training sampling display--------
 1---->  <low_freq> are '' <low_freq> '' ics app '' lets '' are <split> the are , flexible java <low_freq> that be used in the teaching of physics and other sciences .
 2---->  <low_freq> are '' <low_freq> '' ics app '' lets '' . <split> these small , flexible java <low_freq> can be used in the teaching of physics and other sciences .


 1---->  <low_freq> produced in plants act as toxins to the attacking <low_freq> . <split> they well , the are <low_freq> pathogens than of the and are more virulent than those unable to do so .
 2---->  <low_freq> produced in plants act as toxins to the attacking organism . <split> as such , host - specific pathogens capable of degrading <low_freq> are more virulent than those unable to do so .


 1---->  <low_freq> ( pha , or <low_freq> ) is a <low_freq> found in plants , especially beans , <split> especially is found in the highest concentrations in the highest kidney bean ( '' <low_freq> <low_fre

 1---->  by 1839 , it returned to service as a hotel , but was destroyed by fire in 1851 . <split> the <low_freq> was built in its place nine years later .
 2---->  by 1839 , it returned to service as a hotel , but was destroyed by fire in 1851 , and subsequently torn down . <split> the <low_freq> was built in its place nine years later .


 1---->  by 1843 he was experiencing persistent symptoms including tremor and nocturnal <low_freq> . <split> in 1844 , he began to use a wheelchair .
 2---->  by 1843 he was experiencing persistent symptoms including tremor and nocturnal <low_freq> and in 1844 , he began to use a wheelchair . <split> his last years became confined to his bed .


 1---->  by 1845 galena was producing nearly 27,000 tons of lead ore . <split> at that time jo <low_freq> county was producing eighty percent of the lead in the united states .
 2---->  by 1845 galena was producing nearly 27,000 tons of lead ore . <split> at that time jo <low_freq> county was producing eight

--------split model training sampling display--------
 1---->  a 16th season has been confirmed and production be the december . <split> it began on 12 june 2015 and ends on 12 june 2016 .
 2---->  a 16th season has been confirmed and will contain 10 episodes . <split> production began on 12 december 2015 and ends on 12 june 2016 .


 1---->  a 1900 graduate from the university of heidelberg in germany . <low_freq> moved to studies in the . the <low_freq> . . the <low_freq> '' . <split> <low_freq> was to hollywood in the late 1920s to appear in foreign - language versions of american films .
 2---->  a 1900 graduate from the university of heidelberg in germany , <low_freq> began his career in 1903 on the vienna stage in '' <low_freq> '' . <split> he moved to hollywood in the late 1920s to appear in foreign - language versions of american films .


 1---->  a 191 cm <low_freq> , brown was won won and the first 1800s . . he was the successive fitzroy best and fairest awards . <split> the

 1---->  the interview he made in april 1960 , <low_freq> <low_freq> , who had just won two sanremo festivals , <low_freq> inspired , <low_freq> <low_freq> to write '' <low_freq> '' , his first composition as a singer - songwriter .
 2---->  the interview he made in april 1960 with domenico <low_freq> , who had just won two sanremo festivals in a row , inspired <low_freq> to write '' <low_freq> '' , his first song as a singer - songwriter .


 1---->  the interview was broadcast broadcast broadcast online and and the and daughter georgie were interviewed on bbc radio five live , episode 180 of <low_freq> ' show was made available on the bbc website .
 2---->  the interview was not only broadcast online , but <low_freq> and daughter georgie were interviewed on bbc radio five live and episode 180 of <low_freq> ' show was made available on the bbc website .


 1---->  the interviews are from people involved in the united united nations climate change conference conference the <low_freq> ,

 1---->  clarissa bowers ( born october 27 , 1997 ) is an american model and beauty pageant titleholder . <split> she won america 's miss world 2017 and will represent the united states at miss world 2017 and will represent the united states at miss world 2017 .
 2---->  clarissa bowers ( born october 27 , 1997 ) is an american model and beauty pageant titleholder . <split> she won america 's miss world 2017 and will represent the united states at miss world 2017 .


 1---->  clarissa ward ( born on january 30 , 1980 ) is an american television journalist . <split> she is currently foreign correspondent with '' cnn '' . <split> she was formerly with '' cbs news '' , based in london .
 2---->  clarissa ward ( born on january 30 , 1980 ) is an american television journalist who is currently foreign correspondent with '' cnn '' . <split> she was formerly with '' cbs news '' , based in london .


 1---->  clark & edmonds , 1983 <low_freq> - 52 lack of food was a major factor . <split> the 

--------split model training sampling display--------
 1---->  the squadron swelled in numbers as new <low_freq> were added to the roster . it underwent a number moves moves to various airfields . <split> after , the being assigned to will rogers field , oklahoma it would be deactivated in 1943 .
 2---->  the squadron swelled in numbers as new <low_freq> were added to the roster and it underwent a number of moves to various airfields . <split> finally , after being assigned to will rogers field , oklahoma it would be deactivated in 1943 .


 1---->  the squadron under commodore thomas <low_freq> was been patrolled to the the waters between puerto rico and saint kitts . <split> he was flagship '' constellation '' , cruising independently , he engaged and engaged '' <low_freq> '' .
 2---->  the squadron under commodore thomas <low_freq> had been sent to patrol the waters between puerto rico and saint kitts . <split> <low_freq> 's flagship '' constellation '' was cruising independently wh

 1---->  black island is a small island nature reserve with an area of <low_freq> ha in to the south - eastern coast of tasmania around lying , the south to the river derwent .
 2---->  green island is a small island nature reserve with an area of <low_freq> ha close to the south - eastern coast of tasmania , australia at the entrance to the river derwent .


 1---->  bill lantern , green arrow , and ray proceed to question prometheus , but soon learn that it is <low_freq> a , they impersonating him , they realize they 've fallen into a trap just as a bomb goes off .
 2---->  green lantern , green arrow , and ray proceed to question prometheus , but soon learn that it 's not him but <low_freq> impersonating him ; they realize they 've fallen into a trap just as a bomb goes off .


 1---->  bill lantern , real name is len lewis , who in '' just imagine stan lee and dave gibbons creating green lantern ( december , 2001 ) '' .
 2---->  green lantern , real name is len lewis , debuted in '

 1---->  since childhood , the brothers showed an interest in playing music . <split> they played self - created instruments such as bottle caps , plastic containers , and other items .
 2---->  since childhood , the brothers showed an interest in playing music . <split> they played self - created instruments such as bottle caps , plastic containers , and other items .


 1---->  since completing his newest recording , <low_freq> , mickey has returned to his first love , the <low_freq> of the <low_freq> . <split> the new york city is produced at the 2005 midtown international theatre festival .
 2---->  since completing his newest recording , <low_freq> , mickey has returned to his first love , the theatre . <split> a revival of '' apathy - the gen x musical '' was produced at the 2005 midtown international theatre festival .


 1---->  since construction and prior to the construction of the new <low_freq> the rotunda had advertisement signs on the top of the building . <split> in the 

--------split model training sampling display--------
 1---->  this is how he expresses and illustrates the of the . <split> the art of not understanding due comprehend is of linguistic <low_freq> is the of his work .
 2---->  this is how he expresses and illustrates lifestyle in diaspora . <split> the art of not understanding and <low_freq> because of linguistic disabilities is core of his work .


 1---->  this is <low_freq> , however , as any observation can host over one explanation . <split> the success of any hypothesis can be explained by over one theory .
 2---->  this is <low_freq> , however , as any observation can host over one explanation . <split> the success of any hypothesis can be explained by over one theory .


 1---->  this is important because a process - based approach allows for ongoing use and continuous improvement . <split> this helps prevent <low_freq> and encourages meaningful dialogue .
 2---->  this is important because a process - based approach allows for

 1---->  in february 2007 , he joined leicester city on an initial one - month loan period . <split> he made his debut in leicester 's 2 - 0 victory at portman road against ipswich town .
 2---->  in february 2007 , he joined leicester city on an initial one - month loan period . <split> he made his debut in leicester 's 2 - 0 victory at portman road against ipswich town .


 1---->  in february 2007 , the airline was purchased by bmi . <split> the airline was a british airways franchise until the night of 27 october 2007 .
 2---->  in february 2007 , the airline was purchased by bmi . <split> the airline was a british airways franchise until the night of 27 october 2007 .


 1---->  in february 2007 <low_freq> martineau <low_freq> llp merged with london - based stringer saul llp to create the first full - service integrated canadian - uk law firm . <split> <low_freq> 's first ever full service uk - canadian law partnership with lawyers practicing both english and canadian law .
 2----

--------split model training sampling display--------
 1---->  <low_freq> 's crossing is a diminutive on the manx electric railway . the isle of man . <split> it is rural request stop on almost exclusively for local traffic .
 2---->  <low_freq> 's crossing is a stop on the manx electric railway on the isle of man . <split> this diminutive rural request stop caters almost exclusively for local traffic .


 1---->  <low_freq> is also home to <low_freq> community highschool . <split> one of the toronto district school board 's alternative schools .
 2---->  <low_freq> is also home to <low_freq> community highschool . <split> one of the toronto district school board 's alternative schools .


 1---->  <low_freq> is both <low_freq> by the total power he held over the soldier and disgusted by a display of terror he considers weak and <low_freq> . <split> he also up his mind right and there to never show any sign of vulnerability .
 2---->  <low_freq> is both <low_freq> by the total power he

 1---->  during december several visitor are here for picnic , although many people come through out year .
 2---->  during december several visitors come here for picnic , although many people come throughout the year .


 1---->  during desert shield , the ship made several suez canal <low_freq> and operated in the eastern <low_freq> , operation storm , into the desert storm initiated the , , vf - 32 <low_freq> were in the first strike wave flying combat air patrol . in central and western . .
 2---->  during desert shield , the ship made several suez canal <low_freq> and operated in the eastern <low_freq> when desert shield turned into operation desert storm in january 1991 , vf - 32 <low_freq> were in the first strike wave flying combat air patrol mainly in central and western iraq .


 1---->  during desert storm , wing was the first american fighters on station , due was due mainly because of the fact the wing civilian forward to its 's wartime bases .
 2---->  during desert stor

 1---->  he eventually married <low_freq> <low_freq> and had one son , moshe before leaving russia . <split> he was also a member of the <low_freq> zion movement .
 2---->  he eventually married <low_freq> <low_freq> and had one son , moshe before leaving russia . <split> he possibly was already a member of the <low_freq> zion movement .


 1---->  he eventually met ian <low_freq> , the front man for positive punk band southern death cult . <split> <low_freq> was impressed with duffy 's playing and he abandoned <low_freq> to start a new band with him .
 2---->  he eventually met ian <low_freq> , the front man for positive punk band southern death cult . <split> <low_freq> was impressed with duffy 's playing and he abandoned <low_freq> to start a new band with him .


 1---->  he eventually moved from north kingstown to <low_freq> , rhode island . <split> west set up a prosperous tavern in the <low_freq> , and was an active farmer and molasses trader .
 2---->  he eventually moved from 

--------split model training sampling display--------
 1---->  one yen silver coins minted after japan adopted the gold standard ( gold based currency ) in 1897 . <split> in were not issued for domestic use , but for use in japanese taiwan and foreign trade .
 2---->  one yen silver coins minted after japan adopted the gold standard ( gold based currency ) in 1897 . <split> these were not issued for domestic use , but for use in japanese taiwan and foreign trade .


 1---->  <low_freq> financial partners , inc. is a of the founders securities private mutual holding organization organization . <split> it is corporate offices at the <low_freq> tower in indianapolis , indiana .
 2---->  <low_freq> financial partners , inc. is one of the fastest growing usa mutual insurance holding companies . <split> it has corporate offices at the <low_freq> tower in indianapolis , indiana .


 1---->  <low_freq> is a grass - root organization . has is now recognized . a grass organization . <split> it i

 1---->  the illustrated plane parallel plate <low_freq> has unequal path lengths for the test and reference beams , because of this , it must be used with highly <low_freq> ( laser ) light .
 2---->  the illustrated plane parallel plate <low_freq> has unequal path lengths for the test and reference beams ; because of this , it must be used with highly <low_freq> ( laser ) light .


 1---->  a illustration here of a for a three - animal team looks a much to a group of linkage <low_freq> and <low_freq> , lead shows '' , a lowest of the output sum / difference of the individual inputs .
 2---->  the illustration here of <low_freq> for a three - animal team is very similar to a group of linkage <low_freq> and <low_freq> : '' load '' is the equivalent of the output sum / difference of the individual inputs .


 1---->  the illustration of the <low_freq> instrument to the right shows how two persons would use such a <low_freq> , in <low_freq> is is aligning the instrument while johannes set

--------split model training sampling display--------
 1---->  there can only be two valves per cylinder and <split> <low_freq> are restricted to a basic roots type -- rotor case width with a breadth of .
 2---->  there can only be two valves per cylinder . <split> <low_freq> are restricted to a basic roots type -- rotor case width with a breadth of .


 1---->  there could be no focus on any target . the rockets rockets . well were fired . <split> they rockets were predominantly <low_freq> , and predominantly at civilian cities in northern israel .
 2---->  there could be no focus on any target for the <low_freq> rockets as they are <low_freq> . <split> the rockets were fired <low_freq> , and predominantly at civilian cities in northern israel .


 1---->  there could be then <low_freq> a creation of denser quartz in the area of higher pressure . <split> it means a creation of basic molecular si - o rings .
 2---->  there could be then <low_freq> a creation of denser quartz in the are

 1---->  the girl pulled out clumps of his hair and hit him with a bin . <split> she had to be pulled off the teacher by his colleagues who heard his screams .
 2---->  the girl pulled out clumps of his hair and hit him with a bin . <split> she had to be pulled off the teacher by his colleagues who heard his screams .


 1---->  the girl takes hold of his hand and places the coin in it . <split> when she feels his hand , she realizes who he is .
 2---->  the girl takes hold of his hand and places the coin in it . <split> when she feels his hand , she realizes who he is .


 1---->  the girls ' friendship <low_freq> after jane becomes involved with ricky . <split> she is a <low_freq> who has filmed : a plastic bag being blown in the wind .
 2---->  the girls ' friendship <low_freq> after jane becomes involved with ricky . <split> jane and ricky bond over what ricky considers the most beautiful imagery he has filmed : a plastic bag being blown in the wind .


 1---->  the girls ' friends

--------split model training sampling display--------
 1---->  the library stood adjacent to the city fire station at the time and <split> although around 15 fire engines and 150 firefighters attended the fire , a priceless collection of books , manuscripts and archive material was destroyed .
 2---->  the library stood adjacent to the city fire station at the time . <split> although around 15 fire engines and 150 firefighters attended the fire , a priceless collection of books , manuscripts and archive material was destroyed .


 1---->  the <low_freq> - <low_freq> line , the most beautiful part of the albanian railway <low_freq> was closed for passenger traffic in 2012 . <split> the stored locomotives and wagons from <low_freq> are <low_freq> <low_freq> for <low_freq> .
 2---->  the <low_freq> - <low_freq> line , the most beautiful part of the albanian railway network was closed for passenger traffic in 2012 . <split> the stored locomotives and wagons from <low_freq> are being moved 

 1---->  after roughly seven months of construction , she was launched in 15 june 1940 and commissioned commissioned into the kriegsmarine in june september under and the command of '' '' <low_freq> .
 2---->  after roughly seven months of construction , she was launched on 15 june 1940 and formally commissioned into the kriegsmarine on 10 september 1940 under the command of <low_freq> georg <low_freq> .


 1---->  after running a community newspaper in hamilton , the '' examiner '' , she established the '' women 's news '' in 2002 , and the in '' women 's post '' in 2003 .
 2---->  after running a community newspaper in hamilton , the '' examiner '' , she established the '' women 's news '' in 2002 , renaming it the '' women 's post '' in 2003 .


 1---->  after sales , exchange or <low_freq> of faulty product is be able and it as and is always easier to communicate with local business rather than international <low_freq> , but is also quicker reaction to emergency situations faster <

 1---->  irvin mayfield , jr. ( born ) is an american jazz trumpeter and bandleader . <split> he has been serving as cultural <low_freq> of the city of new orleans since 2003 .
 2---->  irvin mayfield , jr. ( born ) is an american jazz trumpeter and bandleader . <split> he has been serving as cultural <low_freq> of the city of new orleans since 2003 .


 1---->  irvin shrewsbury cobb ( june 23 , 1876 -- march 11 , 1944 ) was an american author , humorist , editor and columnist . <split> cobb moved to new york in 1904 , living there for the rest of his life .
 2---->  irvin shrewsbury cobb ( june 23 , 1876 -- march 11 , 1944 ) was an american author , humorist , editor and columnist from paducah , kentucky who moved to new york in 1904 , living there for the rest of his life . <split> cobb also wrote more than 60 books and 300 short stories .


 1---->  irvin alleged that the individuals who were involved with the attempted <low_freq> were fans of the jaguars organization . <split> the 

info=[pretrain_fusion-att-20per]-loss=0.415392995-bleu=0.6894-hidden_dim=256-input_dim=100-epoch=3-batch_size=180-batch_id=[1-[of]-1099]-lr=0.0050 0.6893796677625951
--------split model training sampling display--------
 1---->  the concert hall has a seating capacity of <low_freq> people and when seating is available in the choir loft above the main stage area the hall can hold up to <low_freq> . <split> it hall is a tall , rectangular room with stepped , curved balconies and terraces .
 2---->  the concert hall has a seating capacity of <low_freq> people and when seating is available in the choir loft above the main stage area the hall can hold up to <low_freq> . <split> the hall is a tall , rectangular room with stepped , curved balconies and terraces .


 1---->  the concert hall was built between 1906 and 1909 , in art nouveau style . <split> the building design using a reinforced concrete frame designed by robert <low_freq> .
 2---->  the concert hall was built between 1906 and 1

 1---->  in the meanwhile they continued with enough activity to maintain their franchise , <low_freq> <low_freq> - <low_freq> , managed to persuade de <low_freq> that a lock - and - lake canal was more realistic than a sea - level canal .
 2---->  in the meanwhile they continued with enough activity to maintain their franchise , and <low_freq> - <low_freq> eventually managed to persuade de <low_freq> that a lock - and - lake canal was more realistic than a sea - level canal .


 1---->  in the medulla , second - order fibers of the <low_freq> and <low_freq> systems <low_freq> and form the medial <low_freq> , a tract of nerve fibers that leads the rest of the way up the brainstem to the thalamus .
 2---->  in the medulla , second - order fibers of the <low_freq> and <low_freq> systems <low_freq> and form the medial <low_freq> , a tract of nerve fibers that leads the rest of the way up the brainstem to the thalamus .


 1---->  in the meeting the elected officials heard many complaints 

 1---->  <low_freq> continued to be an ambassador for the sport of wrestling . <split> he passed away on may 16 2011 due to a hit and run car accident .
 2---->  <low_freq> continued to be an ambassador for the sport of wrestling . <split> he passed away on may 16 2011 due to a hit and run car accident .


 1---->  blue 's creatures tend to be weaker than creatures of other colors . <split> however , they are also able to make them difficult to damage or block , particularly '' '' and to a lesser extent '' '' or '' '' .
 2---->  blue 's creatures tend to be weaker than creatures of other colors , but commonly have abilities and <low_freq> . <split> which make them difficult to damage or block , particularly '' '' and to a lesser extent '' '' or '' '' .


 1---->  blue apron inc. is an ingredient delivery service that <low_freq> ingredients on a subscription basis . <split> it operates in the united states only .
 2---->  blue apron inc. is an ingredient delivery service that <low_freq>

--------split model training sampling display--------
 1---->  the vehicle procurement includes eight <low_freq> <low_freq> trains , with two options to procure six more . <split> the first will will be delivered in june 2016 .
 2---->  the vehicle procurement includes eight <low_freq> <low_freq> trains , with two options to procure six more . <split> the first trains will be delivered in june 2016 .


 1---->  the vehicle rolled over and his with him were killed . <split> he sustained a fractured skull and and broken ribs , legs , arms and hands .
 2---->  the vehicle rolled over and those with him were killed . <split> he sustained a fractured skull , and broken ribs , legs , arms and hands .


 1---->  the vehicle then enters a burial chamber with treasures and italian speaking explorers discovering mummies . <split> from car then then found to a dark tunnel with lightning effects .
 2---->  the vehicle then enters a burial chamber with treasures and italian speaking explorers disco

 1---->  at age 19 she met with <low_freq> <low_freq> at an airport , since then they 've become <low_freq> friends , and later '' miguel <low_freq> '' proposed that they should form a trio of which would be formed by <low_freq> , alessandra and another member .
 2---->  at age 19 she met with <low_freq> <low_freq> at an airport , since then they 've <low_freq> great friends , years later '' miguel <low_freq> '' proposed that they should form a trio of which would be formed by <low_freq> , alessandra and another member .


 1---->  at age 20 , giselle got breast augmentation , a point in her life in which she describes as '' life changing '' .
 2---->  at age 20 , giselle got breast augmentation , a point in her life in which she describes as '' life changing '' .


 1---->  at age 20 , he was recruited to referee games alongside his father in the eastern league , and the late 1940s and early 1950s .
 2---->  at age 20 , he was recruited to referee games alongside his father in the eas

 1---->  a number of attempts to rescue the railway and arrange a takeover took place over the next year . <split> demolition of the structure commenced in september 1957 , completing the following year .
 2---->  a number of attempts to rescue the railway and arrange a takeover took place over the next year but were ultimately unsuccessful . <split> demolition of the structure commenced in september 1957 , completing the following year .


 1---->  a number of tourist resorts have been established . <split> these include santiago bay garden & resort , <low_freq> rock resort and <low_freq> park .
 2---->  a number of tourist resorts have been established , catering to both domestic and international visitors . <split> these include santiago bay garden & resort , <low_freq> rock resort and <low_freq> park .


 1---->  a number of scholars have pointed out analogues in other medieval welsh literature . <split> it has attracted interest from those who believe it represents a tradition tha

--------split model training sampling display--------
 1---->  guitarist k. k. k. abruptly left the band shortly before the tour . <split> he was replaced by 31 - year - old briton richie faulkner .
 2---->  guitarist k. k. downing abruptly left the band shortly before the tour . <split> he was replaced by 31 - year - old briton richie faulkner .


 1---->  guitarist martin barre remembers the whole band coming up with various ideas for the music . <split> some parts were recorded in a single take , with every band member having important inputs into the music , with significant contributions from keyboardist john evan .
 2---->  guitarist martin barre remembers the whole band coming up with various ideas for the music . <split> some parts were recorded in a single take , with every band member having important inputs into the music , including significant contributions from keyboardist john evan .


 1---->  guitarist nail previously played with the band <low_freq> , performing at mou

 1---->  i can fly '' is a 2014 song written by performed by lana del rey , and was film be written written for tim burton 's film '' big eyes '' .
 2---->  i can fly '' is a 2014 song written and performed by lana del rey , it 's to be also used for tim burton 's film '' big eyes '' .


 1---->  i could fall in love '' debuted at number 37 on the hot contemporary tracks chart on 29 july 1995 , is a '' hot shot debut '' for being the highest debut that week .
 2---->  i could fall in love '' debuted at number 37 on the adult contemporary tracks chart on 29 july 1995 and received a '' hot shot debut '' for being the highest debut that week .


 1---->  i could fall in love '' is a song recorded by american tejano music singer selena for her fifth studio album , '' dreaming of you '' ( 1995 ) , and posthumously by emi latin on june 26 1995 .
 2---->  i could fall in love '' is a song recorded by american tejano music singer selena for her fifth studio album , '' dreaming of you '' ( 1995

--------split model training sampling display--------
 1---->  <low_freq> suites , a division of the marriott international hotel chain , was the international duration acre in chain . <split> it company division was officially unveiled in newport news , virginia on february 23 , 1997 .
 2---->  <low_freq> suites , a division of the marriott international hotel chain , is an extended - stay hotel chain . <split> the first location was officially unveiled in newport news , virginia on february 23 , 1997 .


 1---->  <low_freq> grammar school , normally referred to locally as <low_freq> is a grammar school . academy status for <split> it is also academy in girls on <low_freq> road , <low_freq> , in the london borough of bexley , england .
 2---->  <low_freq> grammar school , normally referred to locally as <low_freq> is a grammar school with academy status . <split> it is a schools for girls on <low_freq> road , <low_freq> , in the london borough of bexley , england .


 1---->  townsend

 1---->  the hat originated in the chitral and gilgit regions of what is now northern pakistan , however its ancestor perhaps is the exceedingly similar macedonian <low_freq> .
 2---->  the hat originated in the chitral and gilgit regions of what is now northern pakistan , however its ancestor perhaps is the remarkably similar macedonian <low_freq> .


 1---->  the hatches were left open and her tanks still contained a large sum of fuel , the ship was , left vulnerable to avoid the ship was left vulnerable to any direct hit from an aircraft bomb .
 2---->  the hatches were left open and her tanks still contained a large sum of fuel and the tanks themselves were left unattended to making the ship was left vulnerable to any direct hit from an aircraft bomb .


 1---->  the haveli is situated in <low_freq> nagar , on the eastern side of <low_freq> - <low_freq> railway line just 1 km away from <low_freq> sahib , and is now being restored by <low_freq> <low_freq> with the help of punjab gov

 1---->  anna miller ( granville ) is a teacher at nichols ' school . <split> she is a teacher at nichols ' school . <split> she was born in germany as were her parents , who have become naturalised us citizens .
 2---->  anna miller ( granville ) is a teacher at nichols ' school . <split> she is american but was born in germany as were her parents , who have become naturalised us citizens .


 1---->  anna <low_freq> ( ukrainian '' <low_freq> <low_freq> '' , slovenian '' ana <low_freq> '' ) ( born 28 february 1990 in lviv , soviet union ) is a prominent chess player . <split> she is an international master and woman grandmaster with a fide rating of <low_freq> ( april 2009 ) .
 2---->  anna <low_freq> ( ukrainian '' <low_freq> <low_freq> '' , slovenian '' ana <low_freq> '' ) ( born 28 february 1990 in lviv , soviet union ) is a prominent chess player . <split> <low_freq> is an international master and woman grandmaster with a fide rating of <low_freq> ( april 2009 ) .


 1---->  anna 

 1---->  looking west - north - west towards barn , . with the <low_freq> to is the . . <split> <low_freq> for on the 's <low_freq> on <low_freq> the wood and <low_freq> the left and and <low_freq> on <low_freq> on on the left .
 2---->  looking west - north - west towards barn <low_freq> , from the track , near <low_freq> <low_freq> . <split> looking north towards crook 's <low_freq> , between high wood , on the right , and <low_freq> 's <low_freq> , on the left .


 1---->  looks ok , light on sources . <split> check on sources , check <low_freq> links .
 2---->  looks ok , light on sources . <split> light on sources , check <low_freq> links .


 1---->  loomis also made enormous to biological instrumentation . <split> working with edmund newton harvey , <low_freq> the microscope <low_freq> , and pioneered techniques for <low_freq> .
 2---->  loomis also made contributions to biological instrumentation . <split> working with edmund newton harvey he <low_freq> the microscope <low_freq

 1---->  on 21 july 2013 , a russian sukhoi <low_freq> 100 airliner , prototype aircraft <low_freq> crashed upon landing at the <low_freq> international airport near reykjavík . <split> when the automatic landing system neglected to deploy the landing gear .
 2---->  on 21 july 2013 , a russian sukhoi <low_freq> 100 airliner , prototype aircraft <low_freq> , crashed upon landing at the <low_freq> international airport near reykjavík , iceland . <split> when the automatic landing system neglected to deploy the landing gear .


 1---->  on 21 june 1716 he was made baron cadogan of reading , having recently purchased <low_freq> park , oxfordshire ( now berkshire ) near that town . <split> he was also made a knight of the thistle and , the following year , a member of the privy council .
 2---->  on 21 june 1716 he was made baron cadogan of reading , having recently purchased <low_freq> park , oxfordshire ( now berkshire ) near that town . <split> he was also made a knight of the thistle a

--------split model training sampling display--------
 1---->  emacs has over 2,000 built - in commands and allows the user to combine these commands into <low_freq> work <low_freq> work . <split> emacs emacs of emacs emacs , a variant of emacs emacs programming language , a a deep extension capability .
 2---->  emacs has over 2,000 built - in commands and allows the user to combine these commands into <low_freq> to <low_freq> work . <split> the use of emacs lisp , a variant of the lisp programming language , provides a deep extension capability .


 1---->  <low_freq> <low_freq> is egyptian fashion and portrait photographer lives in cairo , <low_freq> . <split> he started my career in a graphic design company that he set up soon after finishing my degree in applied arts in 1996 .
 2---->  <low_freq> <low_freq> is egyptian fashion and portrait photographer lives in cairo , egypt . <split> he started my career in a graphic design company that he set up soon after finishing my degree in

 1---->  henry chaplin lee , descendant descendant of confederate general robert e. lee and rhodes rhodes scholar , who college with donald wheeler .
 2---->  duncan chaplin lee , a descendant of confederate general robert e. lee and a rhodes scholar , attended college with donald wheeler .


 1---->  duncan lee hunter ( born may 31 , 1948 ) is an american politician who republican republican member of the house of representatives from california 's 52nd congressional district in northern and eastern san diego since 1981 .
 2---->  duncan lee hunter ( born may 31 , 1948 ) is an american politician and a republican member of the house of representatives from california 's 52nd congressional district in northern and eastern san diego since 1981 .


 1---->  macrae macrae ( 20 august 1905 - 23 march 1967 ) was one at 118 kirkland street , <low_freq> , glasgow , the , , glasgow fourth of the six children of james macrae , a sergeant in the glasgow police force , and his wife , catherine gr

 1---->  the type o was the largest aircraft that had been built in the uk and one of the largest in the world . <split> the most were built in two versions , the handley page o / 100 ( <low_freq> ) and handley page o / 400 ( <low_freq> ) .
 2---->  the type o was the largest aircraft that had been built in the uk and one of the largest in the world . <split> most were built in two versions , the handley page o / 100 ( <low_freq> ) and handley page o / 400 ( <low_freq> ) .


 1---->  the <low_freq> compiler is itself written in <low_freq> and <low_freq> to javascript . <split> it is licensed under the apache 2 license .
 2---->  the <low_freq> compiler is itself written in <low_freq> and <low_freq> to javascript . <split> it is licensed under the apache 2 license .


 1---->  the <low_freq> continued to wear the broad , round , turned up and <low_freq> hats of their native region . <split> in addition , some volunteers sported peaked caps or even large berets .
 2---->  the <low_freq> 

 1---->  when when using the <low_freq> with choice band attached , the set is usually compounded by attacking moves only since choice band only allows the use of one move .
 2---->  however when using the <low_freq> with choice band attached , the set is usually compounded by attacking moves only since choice band only allows the use of one move .


 1---->  however with <low_freq> do <low_freq> as official , a , luz , a <low_freq> do <low_freq> to beautiful play of colors , contrasting black and white , dark to light and <low_freq> of this , the battery having used the funk , under the of <low_freq> <low_freq> .
 2---->  however with <low_freq> do <low_freq> as singer and <low_freq> , luz , a <low_freq> do <low_freq> to beautiful play of colors , contrasting black and white , dark to light the highlights of parade was the battery having used <low_freq> funk , under command of <low_freq> <low_freq> .


 1---->  within within less than three decades of this ( disabilities imposed on ca

 1---->  in 1971 , the newberry library conservation laboratory was established with paul banks as the director . <split> he continued in this capacity until he left in 1981 .
 2---->  in 1971 , the newberry library conservation laboratory was established with paul banks as the director . <split> he continued in this capacity until he left in 1981 .


 1---->  in 1971 , the ugandan business was sold off to barclays bank of uganda . <split> this was due to the then prevailing political instability in the country .
 2---->  in 1971 , the ugandan business was sold off to barclays bank of uganda . <split> this was due to the then prevailing political instability in the country .


 1---->  in 1971 , when east pakistan ( now bangladesh ) seceded from pakistan at the behest of zulfiqar ali bhutto . <split> <low_freq> <low_freq> syed began to demand self - determination for the people of sindh .
 2---->  in 1971 , when east pakistan ( now bangladesh ) seceded from pakistan at the behest of zu

--------split model training sampling display--------
 1---->  he is one of only five qpr managers to achieve a promotion to a higher division . <split> the others being alec stock ( who managed the feat twice , in successive seasons ) , gordon <low_freq> , terry <low_freq> and ian holloway .
 2---->  he is one of only five qpr managers to achieve a promotion to a higher division . <split> the others being alec stock ( who managed the feat twice , in successive seasons ) , gordon <low_freq> , terry <low_freq> and ian holloway .


 1---->  he is one of only five horses to achieve this <low_freq> career over hurdles consisted of fifteen races . of which he won the races <split> he won <low_freq> tendons during the career which made it difficult to keep him sound .
 2---->  he is one of only five horses to achieve this <low_freq> career over hurdles consisted of fifteen races , of which he won ten . <split> he had suspect tendons throughout his career which made it difficult to keep him s

 1---->  julian <low_freq> ( born 7 august 1990 ) is a new zealand rugby union player who currently plays for the all in super rugby , and zealand , in the the wellington lions in the itm cup .
 2---->  julian <low_freq> ( born 7 august 1990 ) is a new zealand rugby union player who currently plays for the hurricanes in super rugby , new zealand internationally , and the wellington lions in the itm cup .


 1---->  julian <low_freq> was born in 1969 in hollywood , ca into a musical household , his mother , mary spire , was a pianist , and his former stepfather , robert cole , was a conductor .
 2---->  julian <low_freq> was born in 1969 in hollywood , ca into a musical household -- his mother , mary spire , was a pianist , and his former stepfather , robert cole , was a conductor .


 1---->  julian believed that in order to learn , we must fail , we in order to fail , we must sin .
 2---->  julian believed that in order to learn , we must fail , and in order to fail , we must sin .




 1---->  the masque was performed on twelfth night , january 6 , 1607 . <split> the <low_freq> was the premier event at the stuart court for the 1606 - 7 christmas holiday season .
 2---->  the masque was performed on twelfth night , january 6 , 1607 , in the great hall of whitehall palace . <split> it was the premier event at the stuart court for the 1606 - 7 christmas holiday season .


 1---->  the mass anomaly is centered within a larger ring - like structure visible in radar images of the land surface beneath the antarctic ice cap . <split> this combination suggests that it is the result of a large impact event .
 2---->  the mass anomaly is centered within a larger ring - like structure visible in radar images of the land surface beneath the antarctic ice cap . <split> this combination suggests that it is the result of a large impact event .


 1---->  the mass is 2 -- 7 times the sun , and a radius about 45 times . <split> this indicates that it was a b - type star .
 2---->  th

In [None]:
# NOTE(review): bare name used as a manual halt. Presumably `stop` is not defined
# anywhere above, so evaluating it raises NameError and keeps "Run All" from
# reaching the cells below -- confirm before relying on it.
stop

In [None]:
# --- Pre-trained language model ---
# Available checkpoints, keyed by the hidden size encoded in each file name.
# Picking the path through `lm_hidden_dim` keeps the model size and the weights
# in step; otherwise switching size means editing the dimension and the path
# separately, and a mismatch only surfaces inside load_state_dict.
LM_CHECKPOINTS = {
    512: './models_language_model/time-[2019-02-26-13-18-56]-info=[language_model]-loss=4.003012180-bleu=-1.0000-hidden_dim=512-input_dim=300-epoch=24-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050',
    2048: './models_language_model/time-[2019-02-28-07-04-08]-info=[language_model]-loss=3.475848675-bleu=-1.0000-hidden_dim=2048-input_dim=300-epoch=4-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050',
    1024: './models_language_model/time-[2019-02-27-21-58-23]-info=[language_model]-loss=4.111208439-bleu=-1.0000-hidden_dim=1024-input_dim=300-epoch=6-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050',
}

lm_hidden_dim = 512  # must be one of the keys of LM_CHECKPOINTS
lm_input_dim = 300   # every checkpoint file name above says input_dim=300
use_cuda = 1

if lm_hidden_dim not in LM_CHECKPOINTS:
    raise KeyError('no language-model checkpoint for hidden_dim=%r; available: %r'
                   % (lm_hidden_dim, sorted(LM_CHECKPOINTS)))
model_path = LM_CHECKPOINTS[lm_hidden_dim]

language_model = LanguageModel(use_cuda = use_cuda, input_dim = lm_input_dim, hidden_dim = lm_hidden_dim, vocab = vocab)

# Load the weights onto the CPU first, then move the whole model to the GPU if requested.
pre_train = torch.load(model_path, map_location='cpu')
language_model.load_state_dict(pre_train)

if use_cuda:
    language_model = language_model.cuda()

# Put the module in evaluation mode.
language_model.eval()

print('finish loading pre-train weight for language model.')



# --- Split / fusion Seq2Seq models ---
use_cuda = 1
hidden_dim = 256
input_dim = 100
lr=0.005

# Two Seq2Seq instances with the same sizes; they differ only in max_length.
split_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 61)

fusion_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 51)

# Checkpoint history. These pairs used to be live assignments that were immediately
# overwritten; only the last pair was ever loaded. They are kept as comments so that
# switching back is a one-line change and the active pair is obvious.
# split_model_path = './models_saved/time-[2019-03-10-01-36-10]-info=[pre-trained_split_model-20per]-loss=0.515495539-bleu=0.6774-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-01-36-12]-info=[pre-trained_fusion_model-20per]-loss=0.365494132-bleu=0.7406-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
#
# split_model_path = './models_saved/time-[2019-03-10-05-52-44]-info=[pre-trained_split_model-20per]-loss=0.467645884-bleu=0.7270-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-05-52-48]-info=[pre-trained_fusion_model-20per]-loss=0.327692717-bleu=0.7558-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
#
# split_model_path = './models_saved/time-[2019-03-10-13-23-10]-info=[pre-trained_split_model-20per]-loss=0.454687029-bleu=0.7130-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# Active pair (the only one that was effective before this cleanup).
split_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[split_model-semi]-total_loss=-0.002986051-rec_loss=0.017885875-lm_rewards=0.0102-bleu=0.7734-bleu_bs=0.6282-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1201-[of]-46585]-lr=0.0050-loss_ratio=0.4500'
# NOTE(review): unlike the split path, this name stops right after the info tag and has
# no metric suffix -- confirm it is the real file name and not a truncated paste.
fusion_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[fusion_model-semi]'

# Load each state dict onto the CPU; `pre_train` is reused and ends up holding the
# fusion model's state dict.
pre_train = torch.load(split_model_path, map_location='cpu')
split_model.load_state_dict(pre_train)
pre_train = torch.load(fusion_model_path, map_location='cpu')
fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
    fusion_model = fusion_model.cuda()

# Adam over the split model's parameters that have requires_grad set.
split_optimizer = optim.Adam(filter(lambda p: p.requires_grad, split_model.parameters()), lr=lr)
fusion_optimizer = optim.Adam(filter(lambda p: p.requires_grad, fusion_model.parameters()), lr=lr)

# set_model_grad(fusion_model, False)

In [None]:
# Semi-supervised training configuration.
# Alternative (batch_size, topk) pairings noted by the author: (35, 3) or (17, 6).
batch_size, topk = 17, 6
epochs = 10000
loss_ratio = 0.45
sup_bsize = 35  # size of the supervised window taken on every step

train_bleu_mean = train_bleu_max = -1

# Number of unsupervised training samples covered per epoch.
split_train_set_size = int(len(split_train_set_inputs) / 1)
# How many times the full set is larger than the supervised subset.
dataset_times = int(split_train_set_size / len(split_train_set_inputs_supervised))

# Reference point for the final "running time" report.
start_time = time.time()

def model_train(epoch, batch_size, train_set_size, skip_batches=1200, valid_start_idx=2333):
    """Run one epoch of alternating semi-supervised training for the split and fusion models.

    Batches alternate on the parity of ``batch_id``:

    * even ``batch_id``: ``split_model`` is updated through ``split_optimizer``;
    * odd ``batch_id``: ``fusion_model`` is updated through ``fusion_optimizer``.

    Every batch performs a supervised step on a window of the ``*_supervised``
    data, followed by a reward-based step (``split_model.train_using_reward``)
    on ``[start_idx, start_idx + batch_size)`` of the full training data.

    Whenever ``batch_id % 20 == 1`` (checked after the increment) a few training
    samples are printed; whenever ``batch_id % 40 == 1`` validation BLEU is also
    computed and both models are saved under ``./models_saved/``.

    The function reads notebook globals: the two models and their optimizers,
    ``language_model``, ``vocab``, the training / validation lists, ``topk``,
    ``loss_ratio``, ``sup_bsize``, ``hidden_dim``, ``input_dim`` and ``lr``.

    Args:
        epoch: Index of the current epoch; used for logging and for the
            resume skip below.
        batch_size: Number of samples per unsupervised batch (also the size of
            the validation window).
        train_set_size: Number of training samples to iterate over.
        skip_batches: Number of leading batches skipped during epoch 0. The
            default (1200) reproduces the previously hard-coded resume point.
            Pass 0 to train from the first batch.
        valid_start_idx: Start index of the fixed validation window. The
            default (2333) reproduces the previously hard-coded value.

    Returns:
        None.
    """

    def _reward_pass(lo, hi):
        # One reward-based pass over samples [lo, hi). The call is identical for
        # both parities; the caller decides which loss is back-propagated and
        # which optimizer steps.
        return split_model.train_using_reward(inputs=torch.LongTensor(split_train_set_inputs[lo:hi]),
                                   input_lens=torch.LongTensor(split_train_set_input_lens[lo:hi]),
                                   reconstruct_labels=torch.LongTensor(duplicate_reconstruct_labels(fusion_pseudo_train_set_labels[lo:hi],topk)),
                                   reconstruct_model=fusion_model,
                                   language_model=language_model,
                                   topk=topk, loss_ratio=loss_ratio)

    num_batches = int(train_set_size/batch_size)
    batch_id = 0
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        # Resume support: in epoch 0 the first `skip_batches` batches are
        # counted but not trained on.
        if epoch==0 and batch_id<skip_batches:
            batch_id+=1
            continue

        end_idx = start_idx + batch_size
        # Window into the supervised data. Both branches derive it from the
        # length of the split supervised set, as the original code did.
        sup_idx = (batch_id*sup_bsize)%(len(split_train_set_inputs_supervised)-1-sup_bsize)
        sup_end = sup_idx+sup_bsize

        if batch_id%2==0:
            # ---- even batch: train the split model ----
            # set_model_grad presumably toggles requires_grad on the model's
            # parameters -- TODO confirm against its definition.
            set_model_grad(split_model, True)
            set_model_grad(fusion_model, False)

            #supervised learning
            split_optimizer.zero_grad()#clear
            split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs_supervised[sup_idx:sup_end]),
                                         torch.LongTensor(split_train_set_input_lens_supervised[sup_idx:sup_end]),
                                         labels=torch.LongTensor(split_train_set_labels_supervised[sup_idx:sup_end]),
                                         is_train=1, teaching_rate=1)
            split_loss=torch.mean(split_loss)
            split_loss.backward()
            split_optimizer.step()

            #unsupervised learning: the combined loss drives the split model
            split_optimizer.zero_grad()#clear
            total_loss, reconstruct_loss, rm_rewards, lm_rewards = _reward_pass(start_idx, end_idx)
            reconstruct_loss = torch.mean(reconstruct_loss)
            total_loss.backward()
            split_optimizer.step()
        else:
            # ---- odd batch: train the fusion model ----
            set_model_grad(fusion_model, True)
            set_model_grad(split_model, False)

            #supervised learning
            fusion_optimizer.zero_grad()#clear
            fusion_loss, predicts = fusion_model.forward(torch.LongTensor(fusion_train_set_inputs_supervised[sup_idx:sup_end]),
                                         torch.LongTensor(fusion_train_set_input_lens_supervised[sup_idx:sup_end]),
                                         labels=torch.LongTensor(fusion_train_set_labels_supervised[sup_idx:sup_end]),
                                         is_train=1, teaching_rate=1)
            fusion_loss = torch.mean(fusion_loss)
            fusion_loss.backward()
            fusion_optimizer.step()

            #unsupervised learning: only the scaled reconstruction loss is
            #back-propagated, and only the fusion optimizer steps
            fusion_optimizer.zero_grad()#clear
            total_loss, reconstruct_loss, rm_rewards, lm_rewards = _reward_pass(start_idx, end_idx)
            reconstruct_loss = loss_ratio*torch.mean(reconstruct_loss)
            reconstruct_loss.backward()
            fusion_optimizer.step()

        #update batch_id
        batch_id+=1

        torch.cuda.empty_cache()

        # Periodic reporting. batch_id % 20 == 1 after the increment means the
        # batch just processed had an even id, so total_loss / reconstruct_loss
        # logged below come from the split-model branch.
        if batch_id%20==1:
            split_model.eval()
            fusion_model.eval()
            set_model_grad(split_model, False)
            set_model_grad(fusion_model, False)
            sample_num = 5
            rand_idx = random.randint(0, train_set_size-sample_num-1)
            sample_end = rand_idx+sample_num

            print('--------split model training sampling display--------')
            #teaching forcing
            loss_, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[rand_idx:sample_end]),
                                             torch.LongTensor(split_train_set_input_lens[rand_idx:sample_end]),
                                             labels=torch.LongTensor(split_pseudo_train_set_labels[rand_idx:sample_end]),
                                             is_train=1, teaching_rate=1)
            del loss_

            # tokens -> (eos stripped) -> words -> sentences, for predictions and labels
            predicts = batch_tokens2words(batch_tokens_remove_eos(predicts, vocab), vocab)
            labels = batch_tokens2words(batch_tokens_remove_eos(split_pseudo_train_set_labels[rand_idx:sample_end], vocab), vocab)
            predicts_sents = batch_words2sentence(predicts)
            labels_sents = batch_words2sentence(labels)

            for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
                print(' 1----> ', predict_sent)
                print(' 2----> ', label_sent)
                print('\n')

            now = int(round(time.time()*1000))
            time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
            # NOTE(review): `.data[0]` on a scalar loss only works on old PyTorch
            # releases; newer ones require `.item()`. Confirm the target version
            # before changing it.
            info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}'.format(
                              'split_model', total_loss.data[0], reconstruct_loss.data[0], lm_rewards,
                            hidden_dim, input_dim, epoch, batch_size, batch_id, num_batches, lr)
            print(time_stamp, info_stamp)

            if batch_id%40==1:
                # Fixed validation window so scores are comparable across checkpoints.
                valid_lo = valid_start_idx
                valid_hi = valid_lo+batch_size
                valid_refs = split_pseudo_valid_set_labels[valid_lo:valid_hi]

                #ground truth (teacher-forced forward pass)
                loss_, predicts = split_model.forward(torch.LongTensor(split_valid_set_inputs[valid_lo:valid_hi]),
                                                 torch.LongTensor(split_valid_set_input_lens[valid_lo:valid_hi]),
                                                 labels=torch.LongTensor(valid_refs),
                                                 is_train=1, teaching_rate=1)
                del loss_

                #split version
                bleu_scores = batch_tokens_bleu_split_version(references=valid_refs,
                                                              candidates=predicts, smooth_epsilon=0.001, vocab=vocab)
                valid_bleu = sum(bleu_scores)/len(bleu_scores)

                #beam search
                dec_seqs, log_probs = split_model.dec.decode_topk_seqs(split_model.enc,
                                                                       inputs=torch.LongTensor(split_valid_set_inputs[valid_lo:valid_hi]),
                                                                       input_lens=torch.LongTensor(split_valid_set_input_lens[valid_lo:valid_hi]),
                                                                       topk=topk)
                # Keep the first of every `topk` consecutive decoded sequences
                # (presumably the best hypothesis per input -- TODO confirm).
                predicts = [dec_seqs[ii] for ii in range(len(dec_seqs)) if ii%topk==0]

                bleu_scores = batch_tokens_bleu_split_version(references = valid_refs,
                                                             candidates = predicts,
                                                             smooth_epsilon=0.001,
                                                             vocab=vocab)
                valid_bleu_beam_search = sum(bleu_scores)/len(bleu_scores)

                info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-bleu={:1.4f}-bleu_bs={:1.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}-loss_ratio={:1.4f}'.format(
                              'split_model-semi', total_loss.data[0], reconstruct_loss.data[0], lm_rewards, valid_bleu, valid_bleu_beam_search,
                            hidden_dim, input_dim, epoch, batch_size, batch_id, num_batches, lr, loss_ratio)

                print(info_stamp, valid_bleu, valid_bleu_beam_search)

                # The checkpoint filename embeds the timestamp and the metrics above.
                now = int(round(time.time()*1000))
                time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
                torch.save(split_model.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))
                torch.save(fusion_model.state_dict(), ''.join(['./models_saved/', time_stamp, 'info=[fusion_model-semi]']))

            # Restore training state for the next batch.
            set_model_grad(split_model, True)
            set_model_grad(fusion_model, True)
            split_model.train()
            fusion_model.train()
            torch.cuda.empty_cache()
# Run every epoch in order; model_train does its own logging and checkpointing.
for epoch in range(epochs):
    model_train(epoch=epoch, batch_size=batch_size, train_set_size=split_train_set_size)

# Wall-clock time since `start_time`, reported in minutes.
print('running time: %.2f mins'%((time.time()-start_time)/60))

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible code, so this cell
# presumably raises NameError on purpose to halt "Run All" after training --
# confirm it is intentional.
stop

In [None]:
# Decode the first `sample_num` training inputs with the split model and print
# all `topk` candidates per input, followed by the pseudo label.
# NOTE(review): `topk` here is the same module-level name that model_train
# reads, so re-running training after this cell would use topk=20 -- confirm
# that is acceptable.
sample_num = 2
topk = 20

predicts, log_probs = split_model.dec.decode_topk_seqs(
    split_model.enc,
    inputs=torch.LongTensor(split_train_set_inputs[:sample_num]),
    input_lens=torch.LongTensor(split_train_set_input_lens[:sample_num]),
    topk=topk,
)

# tokens -> (eos stripped) -> words -> sentences
predicts = batch_tokens2words(batch_tokens_remove_eos(predicts, vocab), vocab)
labels = batch_tokens2words(
    batch_tokens_remove_eos(split_pseudo_train_set_labels[:sample_num], vocab), vocab)

predicts_sents = batch_words2sentence(predicts)
labels_sents = batch_words2sentence(labels)

for idx, sent in enumerate(predicts_sents):
    print(' 1----> ', sent)
    # After the last candidate of each group of `topk`, print the matching label.
    if (idx + 1) % topk == 0:
        print(' 2----> ', labels_sents[idx // topk])
        print('\n')

In [None]:
# copy_thres=1.0
# split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[0:sample_num]), 
#                                      torch.LongTensor(split_train_set_input_lens[0:sample_num]), 
#                                      labels=torch.LongTensor(split_pseudo_train_set_labels[0:sample_num]), 
#                                      is_train=1, teaching_rate=1)

# predicts = batch_tokens_remove_eos(predicts, vocab)
# labels = batch_tokens_remove_eos(split_pseudo_train_set_labels[0:sample_num], vocab)

# predicts = batch_tokens2words(predicts, vocab)
# labels = batch_tokens2words(labels, vocab)

# predicts_sents = batch_words2sentence(predicts)
# labels_sents = batch_words2sentence(labels)

# for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
#     print(' 1----> ', predict_sent)
#     print(' 2----> ', label_sent)
#     print('\n')

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible code, so this cell
# presumably raises NameError on purpose to halt "Run All" at this point --
# confirm it is intentional.
stop