In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
import time
import copy

from Vocab import Vocab
from LanguageModel import LanguageModel

import torch
# Pin the process to GPU 1 only when such a device actually exists.  The
# models in this notebook take a `use_cuda` flag, so the notebook should still
# start on CPU-only or single-GPU machines instead of failing here.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)

print('import over')

# Copy-mechanism threshold read by Decoder.forward: at every decoding step a
# fresh random.random() draw is compared against it and the copy distribution
# is mixed in when the draw is smaller.  random.random() is in [0, 1), so a
# value of 1 means the copy mechanism is always used.
copy_thres=1

import over


In [2]:
def batch_words2sentence(words_list):
    """Turn every list of words in `words_list` into one space-joined string.

    Returns a list with one sentence string per input word list.
    """
    sentences = []
    for words in words_list:
        sentences.append(' '.join(words))
    return sentences
def batch_tokens2words(tokens_list, vocab):
    """Map token ids back to words.

    para: tokens_list is a list[list] of tokens; vocab exposes `token2word`,
          which is indexed by token.
    return: list[list] of words, same nesting as tokens_list.
    """
    words_list = []
    for tokens in tokens_list:
        words = []
        for token in tokens:
            words.append(vocab.token2word[token])
        words_list.append(words)
    return words_list

def batch_tokens_remove_eos(tokens_list, vocab):
    """Cut every token sequence at its first '<eos>' token.

    para: tokens_list is a list[list] of tokens; vocab exposes `word2token`.
    return: list[list] holding, per sequence, the tokens that precede the first
            '<eos>' (the '<eos>' itself and everything after it are dropped).
            Sequences without '<eos>' are copied whole.
    """
    def _until_eos(tokens):
        kept = []
        for token in tokens:
            if token == vocab.word2token['<eos>']:
                return kept
            kept.append(token)
        return kept

    return [_until_eos(tokens) for tokens in tokens_list]

def batch_tokens_bleu(references, candidates, smooth_epsilon=0.001):
    """Sentence-level BLEU for each (reference, candidate) pair.

    para: references and candidates are list[list]; they are paired with zip.
          smooth_epsilon is the epsilon given to nltk's SmoothingFunction
          (method1 smoothing).
    return: list with one score per pair.  A pair scores 0 when either side
            has fewer than 4 tokens.
    """
    def _pair_bleu(ref, candidate):
        shortest = min(len(ref), len(candidate))
        if shortest < 4:
            return 0
        smoother = SmoothingFunction(epsilon=smooth_epsilon).method1
        return sentence_bleu([ref], candidate, smoothing_function = smoother)

    return [_pair_bleu(ref, candidate) for ref, candidate in zip(references, candidates)]

# Load the pickled vocabulary object.  The helpers in this cell read its
# `word2token` and `token2word` attributes.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at a file from a trusted source.
with open('data_set/vocab.pk', 'rb') as f:
    vocab=pickle.load(f)

    
def seqs_split(seqs, vocab):
    """Split every sequence into the parts before and after '<split>'.

    Each sequence is first cut at its first '<eos>' (via
    batch_tokens_remove_eos).  Tokens seen before the first '<split>' go to the
    first sentence, all later tokens go to the second one; '<split>' tokens
    themselves are never kept.

    return: (simple_sent1s, simple_sent2s), two list[list] aligned with seqs.
    """
    first_sents = []
    second_sents = []
    for seq in batch_tokens_remove_eos(seqs, vocab):
        halves = ([], [])
        which = 0
        for token in seq:
            if token == vocab.word2token['<split>']:
                which = 1
                continue
            halves[which].append(token)
        first_sents.append(halves[0])
        second_sents.append(halves[1])

    return first_sents, second_sents

def simple_sents_concat(simple_sent1s, simple_sent2s, vocab, max_length):
    """Join sentence pairs as `sent1 + ['<split>'] + sent2`, fixed to max_length.

    Every joined sentence is truncated to `max_length` tokens and then
    right-padded with '<padding>' up to `max_length`.

    Unlike the previous implementation, the input lists are left untouched:
    the result is built from copies, so `simple_sent1s` can safely be reused by
    the caller afterwards.

    para: simple_sent1s / simple_sent2s are list[list] of tokens, paired by
          position; vocab exposes `word2token`.
    return: (simple_sents, simple_sent_lens) where simple_sent_lens[i] is the
            length of sentence i after truncation and before padding.
    """
    split_token = vocab.word2token['<split>']
    padding_token = vocab.word2token['<padding>']

    # Work on copies so the caller's sentences are never modified.
    simple_sents = [list(sent) for sent in simple_sent1s]
    simple_sent_lens = []
    for i, second in enumerate(simple_sent2s):
        # If the pieces are long the concatenation can exceed max_length,
        # hence the truncation.
        merged = (simple_sents[i] + [split_token] + list(second))[:max_length]
        simple_sent_lens.append(len(merged))
        merged.extend([padding_token] * (max_length - len(merged)))
        simple_sents[i] = merged

    return simple_sents, simple_sent_lens


def get_lm_inputs_and_labels(sents, vocab, max_length):
    """Build language-model inputs and labels of fixed length `max_length`.

    For every sentence:
      * input  = ['<sos>'] + tokens, right-padded with '<padding>'
      * label  = tokens + ['<eos>'], right-padded with '<padding>'
    Sentences with `max_length` or more tokens are first truncated to
    `max_length - 1` tokens so the added '<sos>' / '<eos>' still fits.

    The previous implementation rebound its loop variable when truncating, so
    over-long sentences were returned untruncated and without '<sos>'/'<eos>'
    while the reported length claimed otherwise; new lists are built here
    instead.  `sents` itself is not modified.

    return: (lm_inputs, lm_input_lens, lm_labels); lm_input_lens[i] is the
            length of input i including '<sos>' and excluding padding.
    """
    sos_token = vocab.word2token['<sos>']
    eos_token = vocab.word2token['<eos>']
    padding_token = vocab.word2token['<padding>']

    lm_inputs = []
    lm_input_lens = []
    lm_labels = []
    for sent in sents:
        # Sentences shorter than max_length are unaffected by this slice.
        body = list(sent[:max_length-1])

        model_input = [sos_token] + body
        lm_input_lens.append(len(model_input))
        model_input.extend([padding_token] * (max_length - len(model_input)))
        lm_inputs.append(model_input)

        label = body + [eos_token]
        label.extend([padding_token] * (max_length - len(label)))
        lm_labels.append(label)

    return lm_inputs, lm_input_lens, lm_labels


def duplicate_reconstruct_labels(sents, topk):
    """Repeat every element of `sents` `topk` times, keeping the order.

    The repeats of one element are adjacent, e.g. [a, b] with topk=2 becomes
    [a, a, b, b].  The repeated entries are the same objects, not copies.
    """
    duplicated = []
    for sent in sents:
        duplicated.extend([sent] * topk)
    return duplicated


def batch_tokens_bleu_split_version(references, candidates, vocab, smooth_epsilon=0.001):
    """BLEU averaged over the two simple sentences of each sequence.

    References and candidates are each split with seqs_split, which also cuts
    a sequence at its first '<eos>', so callers do not have to strip '<eos>'
    beforehand (this differs from calling batch_tokens_bleu directly).  The
    first halves and the second halves are scored separately with
    batch_tokens_bleu and the two scores are averaged per sample.

    `smooth_epsilon` is now forwarded to batch_tokens_bleu; it used to be
    accepted and silently ignored.

    return: list with one averaged BLEU value per sample.
    """
    ref1, ref2 = seqs_split(references, vocab)
    cand1, cand2 = seqs_split(candidates, vocab)
    bleu_simple_sent1s = batch_tokens_bleu(ref1, cand1, smooth_epsilon=smooth_epsilon)
    bleu_simple_sent2s = batch_tokens_bleu(ref2, cand2, smooth_epsilon=smooth_epsilon)
    return [(first + second) / 2
            for first, second in zip(bleu_simple_sent1s, bleu_simple_sent2s)]


def set_model_grad(model, is_grad):
    """Set `requires_grad` to `is_grad` on every parameter of `model`.

    `model` only needs a `parameters()` method yielding objects with a
    writable `requires_grad` attribute.
    """
    parameters = model.parameters()
    for parameter in parameters:
        parameter.requires_grad = is_grad

In [3]:
# Smoke test of the helper functions on hand-written token sequences.
# Judging by the printed results, in the loaded vocab 0 is '<sos>', 1 is
# '<padding>', 2 is '<eos>' and 5 is '<split>'.
seqs=[[8,9,90,5,3,2,1], [5,8,9,90,5,3,2,1], [8,2,9,40,5,3,2,2,1], [8,9,90,5,3,2,1], [8,9,90]]
# a: tokens before '<split>', b: tokens after it (both cut at the first '<eos>').
a,b = seqs_split(seqs, vocab)

print(a)
print(b)

# Language-model inputs / lengths / labels for the first halves ...
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(a,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)
# ... and for the second halves.
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(b,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)

# Re-join the halves with max_length=3 to exercise truncation and padding.
c,d=simple_sents_concat(a,b,vocab, 3)
print(c)
print(d)


# Last expression of the cell: its value is shown as the cell output.
batch_tokens_bleu([[1,2,3,4,5,6]], [[2,3,1,4,5]])

[[8, 9, 90], [], [8], [8, 9, 90], [8, 9, 90]]
[[3], [8, 9, 90, 3], [], [3], []]
[[0, 8, 9, 90, 1, 1], [0, 1, 1, 1, 1, 1], [0, 8, 1, 1, 1, 1], [0, 8, 9, 90, 1, 1], [0, 8, 9, 90, 1, 1]]
[4, 1, 2, 4, 4]
[[8, 9, 90, 2, 1, 1], [2, 1, 1, 1, 1, 1], [8, 2, 1, 1, 1, 1], [8, 9, 90, 2, 1, 1], [8, 9, 90, 2, 1, 1]]
[[0, 3, 1, 1, 1, 1], [0, 8, 9, 90, 3, 1], [0, 1, 1, 1, 1, 1], [0, 3, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1]]
[2, 5, 1, 2, 1]
[[3, 2, 1, 1, 1, 1], [8, 9, 90, 3, 2, 1], [2, 1, 1, 1, 1, 1], [3, 2, 1, 1, 1, 1], [2, 1, 1, 1, 1, 1]]
[[8, 9, 90], [5, 8, 9], [8, 5, 1], [8, 9, 90], [8, 9, 90]]
[3, 3, 2, 3, 3]


[0.013910597740964967]

In [4]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): pickle.load can execute arbitrary code; only use it on
    trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# ---- fusion data set -------------------------------------------------------
# The supervised training files are loaded here.  The variable names still say
# "pseudo": the pseudo-label alternative used the files
# train_pseudo_simple_sents.pk / train_pseudo_simple_sent_lens.pk /
# train_pseudo_labels.pk from the same directory.
fusion_pseudo_train_set_inputs = _load_pickle('./data_set2/fusion_data_set/train_simple_sents_supervised.pk')
fusion_pseudo_train_set_input_lens = _load_pickle('./data_set2/fusion_data_set/train_simple_sent_lens_supervised.pk')
fusion_pseudo_train_set_labels = _load_pickle('./data_set2/fusion_data_set/train_labels_supervised.pk')

fusion_pseudo_valid_set_inputs = _load_pickle('./data_set2/fusion_data_set/validation_simple_sents.pk')
fusion_pseudo_valid_set_input_lens = _load_pickle('./data_set2/fusion_data_set/validation_simple_sent_lens.pk')
fusion_pseudo_valid_set_labels = _load_pickle('./data_set2/fusion_data_set/validation_labels.pk')


# ---- split data set --------------------------------------------------------
# Supervised training files again; the pseudo-label alternative used
# train_complex_sents.pk / train_complex_sent_lens.pk / train_pseudo_labels.pk
# from the same directory.
split_train_set_inputs = _load_pickle('./data_set2/split_data_set/train_complex_sents_supervised.pk')
split_train_set_input_lens = _load_pickle('./data_set2/split_data_set/train_complex_sent_lens_supervised.pk')
split_pseudo_train_set_labels = _load_pickle('./data_set2/split_data_set/train_labels_supervised.pk')

split_valid_set_inputs = _load_pickle('./data_set2/split_data_set/validation_complex_sents.pk')
split_valid_set_input_lens = _load_pickle('./data_set2/split_data_set/validation_complex_sent_lens.pk')
split_pseudo_valid_set_labels = _load_pickle('./data_set2/split_data_set/validation_labels.pk')


In [5]:
# Sanity check: inputs, input lengths and labels of each training set should
# all have the same number of entries.
print(*map(len, (split_train_set_inputs, split_train_set_input_lens, split_pseudo_train_set_labels)))
print(*map(len, (fusion_pseudo_train_set_inputs, fusion_pseudo_train_set_input_lens, fusion_pseudo_train_set_labels)))


197988 197988 197988
197988 197988 197988


In [6]:
class Encoder(nn.Module):
    """Bidirectional single-layer LSTM encoder over a token-embedding table.

    The embedding table has one row per entry of `vocab.word2token` and is
    initialised from 'data_set/pre_trained_token_embedding.pk'; it stays
    trainable.
    """

    def __init__(self, use_cuda, hidden_dim, input_dim, vocab):
        """
        use_cuda:   when truthy, inputs and index tensors are moved to the GPU.
        hidden_dim: hidden size of each LSTM direction.
        input_dim:  embedding size (the LSTM input size).
        vocab:      object exposing `word2token`.
        """
        super(Encoder, self).__init__()

        self.use_cuda = use_cuda
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab

        self.lstm = torch.nn.LSTM(input_size=self.input_dim,
                                  hidden_size=self.hidden_dim,
                                  bidirectional=True,
                                  batch_first=True)

        # Embedding table, overwritten with the pre-trained vectors.
        # NOTE(review): pickle.load can execute arbitrary code; the file must
        # come from a trusted source.
        self.embed = nn.Embedding(len(self.vocab.word2token), input_dim)
        with open('data_set/pre_trained_token_embedding.pk', 'rb') as f:
            pre_train_word_embedding = pickle.load(f)
        self.embed.weight.data.copy_(torch.FloatTensor(pre_train_word_embedding))

    def _as_index(self, ids):
        """Wrap an index tensor in a Variable, on the GPU when use_cuda is set."""
        index = Variable(ids)
        if self.use_cuda:
            index = index.cuda()
        return index

    def order(self, inputs, inputs_len):
        """Sort a batch by decreasing length.

        inputs: tensor whose first dimension is the batch; inputs_len: 1D tensor.
        Returns (sorted inputs, sorted lengths, ids that restore the original
        batch order when used with index_select on dim 0).
        """
        sorted_lens, sort_ids = torch.sort(inputs_len, dim=0, descending=True)
        sorted_inputs = inputs.index_select(0, self._as_index(sort_ids))
        _, true_order_ids = torch.sort(sort_ids, dim=0, descending=False)
        return sorted_inputs, sorted_lens, true_order_ids

    def forward(self, inputs, inputs_len):
        """Encode a padded batch of token ids.

        Returns (outputs, (hn, cn)), all in the original batch order:
        outputs is batch-first and padded to the longest sentence of the
        batch; hn / cn hold hn[0] and hn[1] (resp. cn[0], cn[1]) of the LSTM
        concatenated along the feature dimension.
        """
        inputs = Variable(inputs)
        if self.use_cuda:
            inputs = inputs.cuda()

        # pack_padded_sequence wants the batch sorted by decreasing length.
        inputs, sort_len, true_order_ids = self.order(inputs, inputs_len)
        in_vecs = self.embed(inputs)
        packed = rnn_utils.pack_padded_sequence(input=in_vecs, lengths=list(sort_len), batch_first=True)

        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = rnn_utils.pad_packed_sequence(outputs)

        # Everything below is still in sorted order; undo the sort.
        restore = self._as_index(true_order_ids)

        # pad_packed_sequence returned time-major data, hence the transpose.
        outputs = outputs.transpose(0, 1).index_select(0, restore)
        hn = torch.cat((hn[0], hn[1]), dim=1).index_select(0, restore)
        cn = torch.cat((cn[0], cn[1]), dim=1).index_select(0, restore)

        return outputs, (hn, cn)

In [7]:
def _inflate(tensor, times, dim):
    """
    Examples::
        >> a = torch.LongTensor([[1, 2], [3, 4]])
        >> a
        1   2
        3   4
        [torch.LongTensor of size 2x2]
        >> b = ._inflate(a, 2, dim=1)
        >> b
        1   2   1   2
        3   4   3   4
        [torch.LongTensor of size 2x4]
    """
    repeat_dims = [1] * tensor.dim()
    repeat_dims[dim] = times
    return tensor.repeat(*repeat_dims)

class Decoder(nn.Module):
    def __init__(self, use_cuda, encoder, hidden_dim, max_length=25):
        super(Decoder, self).__init__()
        
        self.use_cuda = use_cuda
        self.hidden_dim=hidden_dim
        self.input_dim = encoder.input_dim
        self.max_length = max_length
        self.vocab = encoder.vocab
        self.weight = [1]*len(self.vocab.word2token)
        self.weight[self.vocab.word2token['<padding>']]=0
        #self.weight[self.vocab.word2token['<eos>']]=1.01
        #self.weight[self.vocab.word2token['<split>']]=1.01
        
        self.hidden_size = self.hidden_dim
        self.V = len(self.vocab.word2token)
        self.SOS = self.vocab.word2token['<sos>']
        self.EOS = self.vocab.word2token['<eos>']
        self.log_softmax = nn.LogSoftmax(dim=1)
        
        self.lstmcell = torch.nn.LSTMCell(input_size=self.input_dim, hidden_size=self.hidden_dim*2, bias=True)
        
        #embedding
        self.embed=encoder.embed# reference share
        #fcnn: projection for crossentroy loss
        self.fcnn = nn.Linear(in_features = self.hidden_dim*2+hidden_dim*2, out_features = len(self.vocab.word2token))
        
        self.softmax = nn.Softmax(dim=1)
        self.cost_func = nn.CrossEntropyLoss(weight=torch.Tensor(self.weight), reduce=False)
        self.nll_loss = nn.NLLLoss(weight=torch.Tensor(self.weight), reduce=False)

        print('init lookup embedding matrix size: ', self.embed.weight.data.size())
        
        #copy
        out_features_dim=self.hidden_dim
        self.attent_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 0)
        self.attent_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 1)
        self.tanh = nn.Tanh()
        self.attent_vt = nn.Linear(in_features = out_features_dim, out_features = 1, bias=0)
        
        self.prob_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_wx = nn.Linear(in_features = input_dim, out_features = 1, bias=1)
        self.sigmoid = nn.Sigmoid()
    
    def get_context_vec(self, enc_outputs, this_timestep_input, dec_state):
        batch_size = enc_outputs.size(dim = 0)
        
        wh = self.attent_wh(enc_outputs)
        ws = self.attent_ws(dec_state).unsqueeze(dim=1)
#         print('wh, ws size: ', wh.size(), ws.size())
        ws = ws.expand(ws.size(0), wh.size(1), ws.size(2))
#         print('ws size: ', ws.size())
        weight = self.attent_vt(self.tanh(wh+ws))
#         print('weight size: ', weight.size())
        weight = self.softmax(weight.squeeze(dim=2))
#         print('weight size: ', weight.size())
        context_v = torch.bmm(weight.unsqueeze(dim=1), enc_outputs)
#         print('context_v size: ', context_v.size())
        context_v = context_v.squeeze(dim=1)
        return context_v, weight
    
    def copy_mechanism(self, enc_outputs, this_timestep_input, dec_state, inputs_one_hot, context_v, weight):
        batch_size = enc_outputs.size(dim = 0)
        
#         wh = self.attent_wh(enc_outputs)
#         ws = self.attent_ws(dec_state).unsqueeze(dim=1)
# #         print('wh, ws size: ', wh.size(), ws.size())
#         ws = ws.expand(ws.size(0), wh.size(1), ws.size(2))
# #         print('ws size: ', ws.size())
#         weight = self.attent_vt(self.tanh(wh+ws))
# #         print('weight size: ', weight.size())
#         weight = self.softmax(weight.squeeze(dim=2))
# #         print('weight size: ', weight.size())
#         context_v = torch.bmm(weight.unsqueeze(dim=1), enc_outputs)
# #         print('context_v size: ', context_v.size())
#         context_v = context_v.squeeze(dim=1)
        
        p_wh = self.prob_wh(context_v)
        p_ws = self.prob_ws(dec_state)
        p_wx = self.prob_wx(this_timestep_input)
        if_copy = self.sigmoid(p_wh+p_ws+p_wx)
#         if_copy = 0.3*if_copy
#         if_copy = self._tocuda(Variable(torch.ones(batch_size, 1), requires_grad=0))
#         print('if_copy size: ', if_copy.size())
        
        prob_copy = torch.bmm(inputs_one_hot, weight.unsqueeze(dim=2))
        prob_copy = prob_copy.squeeze(dim=2)
#         prob_copy = self._tocuda(Variable(torch.rand(batch_size, len(self.vocab.word2token)), requires_grad=0))
#         prob_copy = self.softmax(prob_copy)

#         print('prob_copy size: ', prob_copy.size())
#         print(torch.sum(prob_copy, dim=1))
#         print(torch.mean(if_copy))
        
#         if random.random()<0.005:
#             print('if_copy mean: ', torch.mean(if_copy))
#             _, max_ids = torch.max(prob_copy, dim=1)
#             print(self.vocab.token2word[max_ids.data[0]], self.vocab.token2word[max_ids.data[1]], self.vocab.token2word[max_ids.data[2]])
            
            
        return if_copy, prob_copy

    def forward(self, enc_outputs, sent_lens, h0_and_c0, labels, inputs, teaching_rate=0.6, is_train=1):
        """Greedy step-by-step decoding, with loss computation when training.

        enc_outputs: batch-first encoder outputs.
        sent_lens:   not used by this method.
        h0_and_c0:   (h, c) state fed to the LSTM cell at the first step.
        labels:      target token ids; column `ii` is the target of step `ii`.
        inputs:      source token ids; only the first enc_outputs.size(1)
                     columns are used to build the copy one-hot tensor.
        teaching_rate: while training, probability of feeding the gold token
                     of the previous step instead of the previous prediction.
        is_train:    when truthy the loss is computed and returned.

        Returns `(loss, predicts)` when is_train is truthy, else `predicts`.
        `predicts` is a nested list with one row of max_length token ids per
        sample; `loss` is the sum of the per-step losses divided by
        max_length (the loss modules are built with reduce=False, so it holds
        one value per sample).
        """
        labels = Variable(labels)
        if self.use_cuda:
            labels = labels.cuda()

        batch_size = enc_outputs.size(dim = 0)
        src_len = enc_outputs.size(1)
        vocab_size = len(self.vocab.word2token)

        # One-hot encoding of the source tokens, laid out (batch, vocab, src_len)
        # so that a bmm with the attention weights yields the copy distribution.
        src_tokens = inputs[:,:src_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, src_len, vocab_size).zero_()
        one_hot.scatter_(2, src_tokens, 1)
        one_hot = self._tocuda(Variable(one_hot.transpose(1,2), requires_grad = 0))

        all_loss = 0
        predicts = []
        for ii in range(self.max_length):
            # ---- choose the input of this step and advance the LSTM cell ----
            if ii == 0:
                sos_tokens = Variable(torch.LongTensor([self.vocab.word2token['<sos>']]*batch_size))
                if self.use_cuda:
                    sos_tokens = sos_tokens.cuda()
                step_input = self.embed(sos_tokens)
                dec_hidden, dec_cell = self.lstmcell(step_input, h0_and_c0)
                # get_context_vec ignores this argument; the first step has
                # always passed the placeholder -1.
                attention_input = -1
            else:
                # The random draw only happens while training.
                if is_train and random.random() < teaching_rate:
                    step_input = self.embed(labels[:,ii-1])   # teacher forcing
                else:
                    step_input = self.embed(max_idxs)         # previous prediction
                dec_hidden, dec_cell = self.lstmcell(step_input, (dec_hidden, dec_cell))
                attention_input = step_input

            # ---- attention and vocabulary logits ----
            context_vec, weight = self.get_context_vec(enc_outputs=enc_outputs,
                                                       this_timestep_input=attention_input,
                                                       dec_state=dec_hidden)
            logits = self.fcnn(torch.cat([dec_hidden, context_vec], dim=1))

            # ---- optionally mix in the copy distribution ----
            use_copy = random.random() < copy_thres
            if use_copy:
                if_copy, prob_copy = self.copy_mechanism(enc_outputs=enc_outputs,
                                                         this_timestep_input=step_input,
                                                         dec_state=dec_hidden,
                                                         inputs_one_hot=one_hot,
                                                         context_v=context_vec,
                                                         weight=weight)
                score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
                # Keep the probabilities away from 0 before taking the log.
                score = torch.clamp(score, min=10**(-30), max=1)

            # ---- loss (skipped entirely outside training) ----
            if is_train:
                if use_copy:
                    loss = self.nll_loss(torch.log(score), labels[:,ii])
                else:
                    loss = self.cost_func(logits, labels[:,ii])
                all_loss+=loss

            # ---- greedy prediction ----
            _, max_idxs = torch.max(score if use_copy else logits, dim=1)
            predicts.append(torch.unsqueeze(max_idxs, dim=0))

        # (max_length, batch) -> (batch, max_length), then plain Python lists.
        predicts = torch.transpose(torch.cat(predicts, dim=0), 0, 1).data
        if self.use_cuda:
            predicts = predicts.cpu()
        predicts = predicts.tolist()

        if is_train:
            return all_loss/self.max_length, predicts
        return predicts
    
    
    def decode_topk_seqs(self, encoder, inputs, input_lens, topk=3):
        enc_outputs, (enc_hn, enc_cn) = encoder(inputs, input_lens)
        batch_size = enc_outputs.size(dim = 0)
        
        #one hot of inputs
        sents_len = enc_outputs.size(1)
        inputs = inputs[:,:sents_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, sents_len, len(self.vocab.word2token)).zero_()
        one_hot.scatter_(2, inputs, 1)
        one_hot = one_hot.transpose(1,2)
        one_hot = self._tocuda(Variable(one_hot, requires_grad = 0))
        
        metadata = self.decode_by_beamsearch(encoder_hidden=(enc_hn, enc_cn), encoder_outputs=enc_outputs, inputs_one_hot=one_hot,topk = topk)
        results = metadata['topk_sequence']
        results =torch.cat(results, dim = 2)
        results=results.view(batch_size*topk, -1)
        if self.use_cuda:
            results = results.data.cpu().tolist()
        else:
            results = results.data.tolist()
#         results=batch_tokens_remove_eos(results, self.vocab)

#         labels = [x for x in labels for ii in range(topk)]
#         labels = batch_tokens_remove_eos(labels, self.vocab)
#         bleu_scores = batch_tokens_bleu(references=labels, candidates=results, smooth_epsilon=0.01)
        
#         bleu_scores = torch.FloatTensor(bleu_scores).view(batch_size, topk)
#         bleu_max, _ = torch.max(bleu_scores, dim=1)
        
#         bleu_mean = torch.mean(bleu_scores, dim=1).unsqueeze(dim=1)
#         bleu_scores = bleu_scores-bleu_mean
#         bleu_scores = bleu_scores.view(-1)
        
#         bleu_scores = self._tocuda(Variable(bleu_scores, requires_grad = 0))
#         log_probs = metadata['score']
#         log_probs = log_probs.view(batch_size*topk)
#         loss = -torch.dot(log_probs, bleu_scores)/batch_size/topk
#         return loss, results, torch.mean(bleu_mean.squeeze()), torch.mean(bleu_max)

        log_probs = metadata['score']
        log_probs = log_probs.view(batch_size*topk)
        
        return results, log_probs
        
        
        
    def _tocuda(self, var):
        if self.use_cuda:
            return var.cuda()
        else:
            return var
    def decode_by_beamsearch(self, encoder_hidden=None, encoder_outputs=None, inputs_one_hot=None, topk = 10):
        """Beam-search decoding with attention and the copy mechanism.

        encoder_hidden:  (h, c) initial decoder state, one row per sample.
        encoder_outputs: batch-first encoder outputs.
        inputs_one_hot:  (batch, vocab, src_len) one-hot of the source tokens.
        topk:            beam width.

        Returns a dict with the keys 'score', 'topk_length', 'topk_sequence',
        'length' and 'sequence', filled from the result of self._backtrack.

        Each step now mirrors Decoder.forward.  Compared with the previous
        version of this method:
          * the attention context is computed and concatenated to the hidden
            state before `fcnn` (whose input size is hidden + context);
          * `copy_mechanism` receives its required `context_v` / `weight`;
          * the emitted symbols are flattened before the embedding lookup so
            the LSTM cell always gets a 2-D input;
          * predecessor indices use floor division, so they stay integer on
            PyTorch versions where `/` is true division.

        Side effects: sets self.k and self.pos_index, which _backtrack reads.
        """
        self.k = topk
        batch_size = encoder_outputs.size(dim=0)

        # Row offset of each sample's first beam in the flattened
        # (batch * k) layout.
        self.pos_index = self._tocuda(Variable(torch.LongTensor(range(batch_size)) * self.k).view(-1, 1))

        # Replicate state, encoder outputs and one-hot inputs k times per
        # sample; the k copies of one sample occupy consecutive rows.
        hidden = tuple([_inflate(h, self.k, 1).view(batch_size*self.k, -1) for h in encoder_hidden])
        encoder_outputs = _inflate(encoder_outputs, self.k, 1).view(batch_size*self.k, encoder_outputs.size(1), encoder_outputs.size(2))
        inputs_one_hot = _inflate(inputs_one_hot, self.k, 1).view(batch_size*self.k, inputs_one_hot.size(1), inputs_one_hot.size(2))

        # Initialize the scores; for the first step,
        # ignore the inflated copies to avoid duplicate entries in the top k
        sequence_scores = torch.Tensor(batch_size * self.k, 1)
        sequence_scores.fill_(-float('Inf'))
        sequence_scores.index_fill_(0, torch.LongTensor([i * self.k for i in range(0, batch_size)]), 0.0)
        sequence_scores = self._tocuda(Variable(sequence_scores))

        # Initialize the input vector
        input_var = self._tocuda(Variable(torch.LongTensor([self.SOS] * batch_size * self.k)))

        # Store decisions for backtracking
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()

        for ii in range(0, self.max_length):
            # input_var is (batch*k,) at the first step and (batch*k, 1)
            # afterwards; flatten it so the embedding is always 2-D.
            input_vec = self.embed(input_var.view(-1))
            hidden = self.lstmcell(input_vec, hidden)

            # Same scoring pipeline as in forward(): attention, vocabulary
            # logits, then the copy-gated mixture.
            context_vec, weight = self.get_context_vec(enc_outputs=encoder_outputs,
                                                       this_timestep_input=input_vec,
                                                       dec_state=hidden[0])
            logits = self.fcnn(torch.cat([hidden[0], context_vec], dim=1))
            if_copy, prob_copy = self.copy_mechanism(enc_outputs=encoder_outputs,
                                                     this_timestep_input=input_vec,
                                                     dec_state=hidden[0],
                                                     inputs_one_hot=inputs_one_hot,
                                                     context_v=context_vec,
                                                     weight=weight)

            score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
            score = torch.clamp(score, min=10**(-30), max=1)

            # To get the full sequence scores for the new candidates, add the local scores for t_i to the predecessor scores for t_(i-1)
            sequence_scores = _inflate(sequence_scores, self.V, 1) + torch.log(score)
            scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            input_var = (candidates % self.V).view(batch_size * self.k, 1)
            sequence_scores = scores.view(batch_size * self.k, 1)

            # Update fields for next timestep.  A candidate index encodes
            # beam * V + token, so floor division recovers the beam.
            predecessors = (candidates // self.V + self.pos_index.expand_as(candidates)).view(batch_size * self.k, 1)
            hidden = tuple([h.index_select(0, predecessors.view(-1)) for h in hidden])

            # Update sequence scores and erase scores for end-of-sentence symbol so that they aren't expanded
            stored_scores.append(sequence_scores.clone())
            eos_indices = input_var.data.eq(self.EOS)
            # A mask without any set entry leaves the scores unchanged.
            sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            stored_emitted_symbols.append(input_var)

        # Do backtracking to return the optimal values
        output, h_t, h_n, s, l, p = self._backtrack(hidden,
                                                    stored_predecessors, stored_emitted_symbols,
                                                    stored_scores, batch_size, self.hidden_size)

        metadata = {}

        metadata['score'] = s
        metadata['topk_length'] = l
        metadata['topk_sequence'] = p
        metadata['length'] = [seq_len[0] for seq_len in l]
        metadata['sequence'] = [seq[0] for seq in p]

        return metadata

    def _backtrack(self, hidden, predecessors, symbols, scores, b, hidden_size):
        """Backtracks over batch to generate optimal k-sequences.

        Args:
            nw_output [(batch*k, vocab_size)] * sequence_length: A Tensor of outputs from network
            nw_hidden [(num_layers, batch*k, hidden_size)] * sequence_length: A Tensor of hidden states from network
            predecessors [(batch*k)] * sequence_length: A Tensor of predecessors
            symbols [(batch*k)] * sequence_length: A Tensor of predicted tokens
            scores [(batch*k)] * sequence_length: A Tensor containing sequence scores for every token t = [0, ... , seq_len - 1]
            b: Size of the batch
            hidden_size: Size of the hidden state

        Returns:
            output [(batch, k, vocab_size)] * sequence_length: A list of the output probabilities (p_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_t [(batch, k, hidden_size)] * sequence_length: A list containing the output features (h_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_n(batch, k, hidden_size): A Tensor containing the last hidden state for all top-k sequences.

            score [batch, k]: A list containing the final scores for all top-k sequences

            length [batch, k]: A list specifying the length of each sequence in the top-k candidates

            p (batch, k, sequence_len): A Tensor containing predicted sequence
        """

        lstm = isinstance(hidden, tuple)

        # initialize return variables given different types
        output = list()
        h_t = list()
        p = list()
        # Placeholder for last hidden state of top-k sequences.
        # If a (top-k) sequence ends early in decoding, `h_n` contains
        # its hidden state when it sees EOS.  Otherwise, `h_n` contains
        # the last hidden state of decoding.
        if lstm:
            state_size = hidden[0].size()
            h_n = tuple([torch.zeros(state_size), torch.zeros(state_size)])
        else:
            h_n = torch.zeros(nw_hidden[0].size())
        l = [[self.max_length] * self.k for _ in range(b)]  # Placeholder for lengths of top-k sequences
                                                                # Similar to `h_n`

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * b   # the number of EOS found
                                    # in the backward loop below for each batch

        t = self.max_length - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add self.pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = symbols[t].index_select(0, t_predecessors)
            # Re-order the back pointer of the previous step with the back pointer of
            # the current step
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze()

            # This tricky block handles dropped sequences that see EOS earlier.
            # The basic idea is summarized below:
            #
            #   Terms:
            #       Ended sequences = sequences that see EOS early and dropped
            #       Survived sequences = sequences in the last step of the beams
            #
            #       Although the ended sequences are dropped during decoding,
            #   their generated symbols and complete backtracking information are still
            #   in the backtracking variables.
            #   For each batch, everytime we see an EOS in the backtracking process,
            #       1. If there is survived sequences in the return variables, replace
            #       the one with the lowest survived sequence score with the new ended
            #       sequences
            #       2. Otherwise, replace the ended sequence with the lowest sequence
            #       score with the new ended sequence
            #
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            if eos_indices.dim() > 0:
                for i in range(eos_indices.size(0)-1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / self.k)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * self.k + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = predecessors[t][idx[0]]

                    current_symbol[res_idx, :] = symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = scores[t][idx[0]]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(self.k)
        for b_idx in range(b):
            l[b_idx] = [l[b_idx][k_idx.data[0]] for k_idx in re_sorted_idx[b_idx,:]]

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
#         output = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(output)]
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(p)]
        #    --- fake output ---
        output = None
        #    --- fake ---
        return output, h_t, h_n, s, l, p

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
            score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)

In [8]:
class Seq2Seq(nn.Module):
    """Encoder/decoder pair with a supervised forward pass and a reward-driven loss.

    Attributes:
        use_cuda: Truthy when the sub-modules and reward tensors live on the GPU.
        enc: The ``Encoder`` instance (also handed to the decoder).
        dec: The ``Decoder`` instance.
    """

    def __init__(self, use_cuda, input_dim, hidden_dim, vocab, max_length = 25):
        """Build the encoder and decoder and optionally move them to the GPU.

        Args:
            use_cuda: Truthy to call ``.cuda()`` on both sub-modules.
            input_dim: Forwarded to ``Encoder`` as ``input_dim``.
            hidden_dim: Forwarded to both ``Encoder`` and ``Decoder``.
            vocab: Forwarded to ``Encoder`` as ``vocab``.
            max_length: Forwarded to ``Decoder`` as ``max_length``.
        """
        super(Seq2Seq, self).__init__()

        self.use_cuda = use_cuda
        self.enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
        self.dec = Decoder(use_cuda=use_cuda, encoder=self.enc, hidden_dim=hidden_dim, max_length=max_length)
        if use_cuda:
            self.enc = self.enc.cuda()
            self.dec = self.dec.cuda()

    def forward(self, inputs, input_lens, labels, is_train=1, teaching_rate=1):
        """Encode ``inputs`` and run the decoder.

        Args:
            inputs: Token ids accepted by ``torch.LongTensor``.
            input_lens: Sequence lengths accepted by ``torch.LongTensor``.
            labels: Target token ids (may be an empty list when decoding only).
            is_train: Truthy to return ``(loss, predicts)``; falsy to return
                only ``predicts``.
            teaching_rate: Forwarded to the decoder. Previously this argument
                was ignored and the decoder always received ``1``.

        Returns:
            ``(loss, predicts)`` when ``is_train`` is truthy, else ``predicts``.
        """
        enc_outputs, (enc_hn, enc_cn) = self.enc(torch.LongTensor(inputs), torch.LongTensor(input_lens))
        decoded = self.dec(enc_outputs = enc_outputs,
                           h0_and_c0=(enc_hn, enc_cn),
                           sent_lens=input_lens,
                           labels=torch.LongTensor(labels),
                           is_train=1 if is_train else 0,
                           teaching_rate = teaching_rate,
                           inputs = inputs
                           )
        if is_train:
            loss, predicts = decoded
            return loss, predicts
        return decoded

    def tocuda(self, x):
        """Return ``x.cuda()`` when ``use_cuda`` is set, otherwise ``x`` unchanged."""
        if self.use_cuda:
            return x.cuda()
        else:
            return x

    def train_using_reward(self, inputs, input_lens, reconstruct_labels, reconstruct_model, language_model, topk=3, loss_ratio=0.5):
        """Compute a reward-weighted loss over ``topk`` decoded candidates.

        Each decoded candidate is split into two sentences, scored by
        ``language_model`` (reciprocal perplexity) and by ``reconstruct_model``
        (negated reconstruction loss). Both rewards are centred on the mean of
        their ``topk`` group and mixed with ``loss_ratio``.

        Args:
            inputs, input_lens: Passed to ``self.dec.decode_topk_seqs``.
            reconstruct_labels: Labels for ``reconstruct_model.forward``.
                Presumably one row per decoded candidate (batch*topk) -- TODO confirm.
            reconstruct_model: Model whose ``forward`` returns ``(loss, predicts)``.
            language_model: Object exposing ``get_sentences_ppl``.
            topk: Number of candidates per input; used to group the rewards.
            loss_ratio: Weight of the reconstruction reward; the language-model
                reward gets ``1 - loss_ratio``.

        Returns:
            ``(loss, reconstruct_loss, mean reconstruction reward, mean
            language-model reward)``; the two means are taken before centring.
        """
        dec_seqs, log_probs = self.dec.decode_topk_seqs(self.enc, inputs, input_lens, topk=topk)
        simple_sent1s, simple_sent2s = seqs_split(dec_seqs, self.enc.vocab)

        lm_input1s, lm_input1_lens, lm_label1s = get_lm_inputs_and_labels(simple_sent1s, self.enc.vocab, self.dec.max_length)
        simple_sent1s_ppl = language_model.get_sentences_ppl(torch.LongTensor(lm_input1s),
                                                      torch.LongTensor(lm_input1_lens),
                                                      torch.LongTensor(lm_label1s)
                                                    )
        lm_input2s, lm_input2_lens, lm_label2s = get_lm_inputs_and_labels(simple_sent2s, self.enc.vocab, self.dec.max_length)
        simple_sent2s_ppl = language_model.get_sentences_ppl(torch.LongTensor(lm_input2s),
                                                      torch.LongTensor(lm_input2_lens),
                                                      torch.LongTensor(lm_label2s)
                                                    )

        simple_inputs, simple_input_lens = simple_sents_concat(simple_sent1s, simple_sent2s, self.enc.vocab, self.dec.max_length)
        # Feed the two split sentences back through the reconstruction model.
        reconstruct_loss, predicts = reconstruct_model.forward(torch.LongTensor(simple_inputs),
                                     torch.LongTensor(simple_input_lens),
                                     labels=reconstruct_labels,
                                     is_train=1, teaching_rate=1)

        # rm_rewards: reconstruct model rewards (lower loss -> higher reward)
        # lm_rewards: language model rewards (mean reciprocal perplexity)
        rm_rewards=-reconstruct_loss.data
        lm_rewards=(1/self.tocuda(torch.Tensor(simple_sent1s_ppl))+1/self.tocuda(torch.Tensor(simple_sent2s_ppl)))/2

        # Subtract the mean of each topk group so rewards are relative to
        # the other candidates decoded for the same input.
        rm_rewards_mean = torch.mean(rm_rewards.view(-1, topk), dim=1)
        lm_rewards_mean = torch.mean(lm_rewards.view(-1, topk), dim=1)
        rm_rewards = rm_rewards.view(-1, topk) - rm_rewards_mean.unsqueeze(dim=1)
        lm_rewards = lm_rewards.view(-1, topk) - lm_rewards_mean.unsqueeze(dim=1)

        rm_rewards = rm_rewards.view(-1)
        lm_rewards = lm_rewards.view(-1)

        # sum both rewards up
        rewards = loss_ratio*rm_rewards+(1-loss_ratio)*lm_rewards
        rewards = Variable(rewards, requires_grad=False)

        # regarding rewards as weights of every seq
        loss = -torch.dot(log_probs, rewards)/log_probs.size(0)

        return loss, reconstruct_loss, torch.mean(rm_rewards_mean), torch.mean(lm_rewards_mean)
    
    


In [None]:
def split_model_eval(model, inputs, input_lens, labels):
    """Average split-version BLEU of ``model`` over a dataset, decoded without labels.

    Reads the module-level ``batch_size`` and ``vocab``. Prints the dataset
    size, then decodes batch by batch and scores every batch with
    ``batch_tokens_bleu_split_version``.

    Args:
        model: Object whose ``forward`` returns predictions when ``is_train=0``.
        inputs: Input token-id sequences.
        input_lens: Length of each input sequence.
        labels: Reference sequences aligned with ``inputs``.

    Returns:
        Sum of all per-sample BLEU scores divided by ``len(inputs)``.
    """
    dataset_size = len(inputs)
    print(dataset_size)
    bleu_total = 0
    for batch_start in range(0, dataset_size, batch_size):
        batch_end = batch_start + batch_size
        # Decode without teacher forcing: no labels are supplied.
        predicts = model.forward(torch.LongTensor(inputs[batch_start:batch_end]),
                                 torch.LongTensor(input_lens[batch_start:batch_end]),
                                 labels=[],
                                 is_train=0, teaching_rate=1)
        batch_scores = batch_tokens_bleu_split_version(references=labels[batch_start:batch_end],
                                                       candidates=predicts,
                                                       smooth_epsilon=0.001,
                                                       vocab=vocab)
        for score in batch_scores:
            bleu_total += score
    return bleu_total/dataset_size


def fusion_model_eval(model, inputs, input_lens, labels):
    """Average BLEU of ``model`` over a dataset, decoded without labels.

    Reads the module-level ``batch_size`` and ``vocab``. Predictions and
    references are truncated at ``<eos>`` via ``batch_tokens_remove_eos``
    before being scored with ``batch_tokens_bleu``.

    Args:
        model: Object whose ``forward`` returns predictions when ``is_train=0``.
        inputs: Input token-id sequences.
        input_lens: Length of each input sequence.
        labels: Reference token-id sequences aligned with ``inputs``.

    Returns:
        Sum of all per-sample BLEU scores divided by ``len(inputs)``.
    """
    dataset_size = len(inputs)
    bleu_total = 0
    for batch_start in range(0, dataset_size, batch_size):
        batch_end = batch_start + batch_size
        # Decode without teacher forcing: no labels are supplied.
        raw_predicts = model.forward(torch.LongTensor(inputs[batch_start:batch_end]),
                                     torch.LongTensor(input_lens[batch_start:batch_end]),
                                     labels=[],
                                     is_train=0, teaching_rate=1)
        candidates = batch_tokens_remove_eos(raw_predicts, vocab)
        references = batch_tokens_remove_eos(labels[batch_start:batch_end], vocab)
        batch_scores = batch_tokens_bleu(references=references, candidates=candidates, smooth_epsilon=0.001)
        for score in batch_scores:
            bleu_total += score
    return bleu_total/dataset_size



In [None]:
# --- Run configuration -------------------------------------------------
use_cuda = 1
hidden_dim, input_dim = 256, 100
lr = 0.005
batch_size = 100
epochs = 10000
# Only the first half of the split training set is iterated per epoch.
split_train_set_size = len(split_train_set_inputs) // 2
# Placeholder values; model_train prints them in its progress line.
train_bleu_mean = train_bleu_max = -1

# --- Models ------------------------------------------------------------
split_model = Seq2Seq(use_cuda=use_cuda, input_dim=input_dim, hidden_dim=hidden_dim,
                      vocab=vocab, max_length=61)

fusion_model = Seq2Seq(use_cuda=use_cuda, input_dim=input_dim, hidden_dim=hidden_dim,
                       vocab=vocab, max_length=51)

# Optional warm start from previously saved checkpoints (disabled).
# #pre train para
# split_model_path = './models_saved/time-[2019-03-09-18-40-05]-info=[pre-trained_split_model-20per]-loss=0.673111856-bleu=0.7492-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-09-18-40-07]-info=[pre-trained_fusion_model-20per]-loss=0.515186548-bleu=0.3736-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# split_model_path = './models_saved/time-[2019-03-10-13-23-10]-info=[pre-trained_split_model-20per]-loss=0.454687029-bleu=0.7130-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# pre_train = torch.load(split_model_path, map_location='cpu')
# split_model.load_state_dict(pre_train)
# pre_train = torch.load(fusion_model_path, map_location='cpu')
# fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
    fusion_model = fusion_model.cuda()

# --- Optimizers (only parameters that require gradients) ---------------
split_optimizer = optim.Adam(
    (param for param in split_model.parameters() if param.requires_grad), lr=lr)
fusion_optimizer = optim.Adam(
    (param for param in fusion_model.parameters() if param.requires_grad), lr=lr)

# Wall-clock reference for the running-time report after training.
start_time = time.time()

def _loss_scalar(loss):
    """Return a one-element loss as a Python number on old and new PyTorch."""
    # 0-dim tensors (PyTorch >= 0.4) expose .item(); legacy Variables need .data[0].
    if hasattr(loss, 'item'):
        return loss.item()
    return loss.data[0]


def _print_sample_pairs(predicts, label_tokens):
    """Print each prediction ('1---->') above its reference ('2---->') as text."""
    predicts = batch_tokens_remove_eos(predicts, vocab)
    labels = batch_tokens_remove_eos(label_tokens, vocab)

    predicts_sents = batch_words2sentence(batch_tokens2words(predicts, vocab))
    labels_sents = batch_words2sentence(batch_tokens2words(labels, vocab))

    for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
        print(' 1----> ', predict_sent)
        print(' 2----> ', label_sent)
        print('\n')


def _mean_score(scores):
    """Arithmetic mean of ``scores``, accumulated left to right."""
    total = 0
    for score in scores:
        total += score
    return total / len(scores)


def _save_checkpoint(model, info_stamp):
    """Save ``model.state_dict()`` under ./models_saved/ with a time-stamped name."""
    now = int(round(time.time()*1000))
    time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
    torch.save(model.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))


def model_train(epoch, batch_size, train_set_size):
    """Run one epoch of supervised training for the split and fusion models.

    For every full batch, one optimizer step is taken on ``split_model`` and
    one on ``fusion_model``. Every 50 batches (``batch_id % 50 == 1``) a few
    sampled predictions are printed; every 500 batches
    (``batch_id % 500 == 1``) both models are scored on a random validation
    batch and check-pointed to ``./models_saved/``.

    Reads module-level state: the two models and optimizers, the train and
    validation data lists, ``vocab``, ``hidden_dim``, ``input_dim``, ``lr``,
    ``train_bleu_mean`` and ``train_bleu_max``.

    Args:
        epoch: Epoch number, used only in log lines and checkpoint names.
        batch_size: Number of samples per optimisation step.
        train_set_size: Number of leading training samples to iterate over;
            a trailing partial batch is skipped.
    """
    checkpoint_fmt = 'info=[{:s}]-loss={:2.9f}-bleu={:1.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}'

    batch_id = 0
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        batch_id+=1
        end_idx = start_idx + batch_size

        # ---- split model step ----
        split_optimizer.zero_grad()
        split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[start_idx:end_idx]),
                                     torch.LongTensor(split_train_set_input_lens[start_idx:end_idx]),
                                     labels=torch.LongTensor(split_pseudo_train_set_labels[start_idx:end_idx]),
                                     is_train=1, teaching_rate=1)
        split_loss=torch.mean(split_loss)
        split_loss.backward()
        split_optimizer.step()

        # ---- fusion model step ----
        fusion_optimizer.zero_grad()
        fusion_loss, predicts = fusion_model.forward(torch.LongTensor(fusion_pseudo_train_set_inputs[start_idx:end_idx]),
                                     torch.LongTensor(fusion_pseudo_train_set_input_lens[start_idx:end_idx]),
                                     labels=torch.LongTensor(fusion_pseudo_train_set_labels[start_idx:end_idx]),
                                     is_train=1, teaching_rate=1)
        fusion_loss = torch.mean(fusion_loss)
        fusion_loss.backward()
        fusion_optimizer.step()

        # Everything below is periodic reporting only.
        if batch_id%50!=1:
            continue

        split_model.eval()
        fusion_model.eval()

        sample_num = 5
        rand_idx = random.randint(0, train_set_size-sample_num-1)
        sample = slice(rand_idx, rand_idx+sample_num)

        print('--------split model training sampling display--------')
        # teacher forcing
        loss_, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[sample]),
                                         torch.LongTensor(split_train_set_input_lens[sample]),
                                         labels=torch.LongTensor(split_pseudo_train_set_labels[sample]),
                                         is_train=1, teaching_rate=1)
        del loss_
        _print_sample_pairs(predicts, split_pseudo_train_set_labels[sample])

        print('--------fusion model training sampling display--------')
        loss_, predicts = fusion_model.forward(torch.LongTensor(fusion_pseudo_train_set_inputs[sample]),
                                         torch.LongTensor(fusion_pseudo_train_set_input_lens[sample]),
                                         labels=torch.LongTensor(fusion_pseudo_train_set_labels[sample]),
                                         is_train=1, teaching_rate=1)
        del loss_
        _print_sample_pairs(predicts, fusion_pseudo_train_set_labels[sample])

        # Free decoding with the split model. The original fed the fusion
        # model's inputs here while comparing against split labels; the
        # split model's own inputs are the ones that match those labels.
        print('----no teaching forcing----')
        predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[sample]),
                                         torch.LongTensor(split_train_set_input_lens[sample]),
                                         labels=[],
                                         is_train=0, teaching_rate=1)
        _print_sample_pairs(predicts, split_pseudo_train_set_labels[sample])

        info_stamp = 'split_loss={:2.9f}-train_bleu_mean={:2.9f}-train_bleu_max={:2.9f}-batch_size={:n}-epoch={:n}-batch_id=({:n}/{:n})'.format(
                          _loss_scalar(split_loss), train_bleu_mean, train_bleu_max, batch_size, epoch, batch_id, int(train_set_size/batch_size))
        print(info_stamp)

        # ---- validation on one random batch + checkpoint ----
        if batch_id%500==1:
            rand_idx=random.randint(0, len(split_valid_set_inputs)-batch_size-1-1)
            predicts = split_model.forward(torch.LongTensor(split_valid_set_inputs[rand_idx:rand_idx+batch_size]),
                                             torch.LongTensor(split_valid_set_input_lens[rand_idx:rand_idx+batch_size]),
                                             labels=[],
                                             is_train=0, teaching_rate=1)
            bleu_scores = batch_tokens_bleu_split_version(references = split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size],
                                                         candidates = predicts,
                                                         smooth_epsilon=0.001,
                                                         vocab=vocab)
            valid_bleu = _mean_score(bleu_scores)

            info_stamp = checkpoint_fmt.format(
                          'pre-trained_split_model-10per', _loss_scalar(split_loss), valid_bleu, hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr)
            print(info_stamp, valid_bleu)
            _save_checkpoint(split_model, info_stamp)

            # fusion eval
            rand_idx=random.randint(0, len(fusion_pseudo_valid_set_inputs)-batch_size-1-1)
            predicts = fusion_model.forward(torch.LongTensor(fusion_pseudo_valid_set_inputs[rand_idx:rand_idx+batch_size]),
                                             torch.LongTensor(fusion_pseudo_valid_set_input_lens[rand_idx:rand_idx+batch_size]),
                                             labels=[],
                                             is_train=0, teaching_rate=1)
            predicts = batch_tokens_remove_eos(predicts, vocab)
            labels = batch_tokens_remove_eos(fusion_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size], vocab)

            bleu_scores = batch_tokens_bleu(references=labels, candidates=predicts, smooth_epsilon=0.001)
            valid_bleu = _mean_score(bleu_scores)

            info_stamp = checkpoint_fmt.format(
                          'pre-trained_fusion_model-10per', _loss_scalar(fusion_loss), valid_bleu, hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr)
            print(info_stamp, valid_bleu)
            _save_checkpoint(fusion_model, info_stamp)

        # Back to training mode for the next optimisation step.
        split_model.train()
        fusion_model.train()
            
# Train for the configured number of epochs, then report wall-clock time.
for epoch in range(epochs):
    model_train(epoch, batch_size, split_train_set_size)

elapsed_minutes = (time.time()-start_time)/60
print('running time: %.2f mins'%elapsed_minutes)

init lookup embedding matrix size:  torch.Size([44380, 100])
init lookup embedding matrix size:  torch.Size([44380, 100])
--------split model training sampling display--------
 1---->  in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in in
 2---->  beginning in 1979 , bjork - <low_freq> valves with . <split> the <low_freq> - concave bjork - <low_freq> valve had a tendency to develop fractures in the outflow strut which could result in catastrophic valve failure and possibly sudden cardiac death .


 1---->  the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
 2---->  beginning in 1980 with the iowa department of education , 5,000 school districts around the world purchase

 1---->  <padding> in in general <low_freq> school school a a a a a a a a . . , , . forms . <split> this is , , , , forms forms forms forms forms
 2---->  in general <low_freq> school is a very impressive , with a small majority of <low_freq> and a high majority of <low_freq> . <split> it also has very skilled children , in many forms .


 1---->  <padding> in in <low_freq> <low_freq> 19 , , , , , , . , . . . <split> this , , , , , 's of ladder of ladder .
 2---->  in genesis <low_freq> - 19 , abraham 's grandson jacob left beersheba and went toward <low_freq> . <split> along the way he had his dream of jacob 's ladder .


 1---->  in in in geneva , his ministry both attracted other protestant protestant protestant and and and . his . . the the . the . . <split> this is the the the the of the his his his of his his . his his his .
 2---->  in geneva , his ministry both attracted other protestant refugees and over time made that city a major force in the spread of reformed theology . <s

 1---->  jacques washer , antiquarian who died in <low_freq> flight 316 and <split> . <split> the <split> of the <split> of the age of the six of six of six of six of six of six of six six six six six six six six six six six six six six six six six six six six six six six six six
 2---->  jacques washer , antiquarian who died in the <low_freq> flight 316 crash and edouard . <split> philippe started playing tennis at the age of six .


 1---->  jacques <low_freq> , was principal ballet dancer with new york city . <split> the <split> , the the the the starlight '' of the the the the film '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
 2---->  jacques <low_freq> , was a principal ballet dancer with the new york city ballet . <split> he danced the role of the '' starlight carnival boy '' in the film '' carousel '' .


 1---->  <low_freq> <low_freq> jean frederica eliot , countess of germans ( 29 ) ) ) ) ) ) of wedderburn . <split> the <lo

--------split model training sampling display--------
 1---->  between 1971 and 1975 the organisation became carrier group 4 affiliated affiliated with striking fleet . atlantic . and . 2004 . <split> he 2004 , became carrier strike group 4 .
 2---->  between 1971 and 1975 the organisation became carrier group 4 , affiliated with nato striking fleet atlantic , part of <low_freq> . <split> in 2004 it became carrier strike group 4 .


 1---->  between 1971 and 1982 , roy <low_freq> served as deputy premier of saskatchewan . <split> from 1987 premier premier he , the to led . he 2001 . 2001 saskatchewan new democratic party .
 2---->  between 1971 and 1982 , roy <low_freq> served as deputy premier of saskatchewan . <split> on november 7 , 1987 mr. <low_freq> replaced allan <low_freq> as leader of the saskatchewan new democratic party .


 1---->  between 1971 and 1991 , mortality from heart disease dropped 41 percent , strokes decreased by 59 percent . <split> today today more than 70 per

 1---->  eastern parkway was a station on the demolished bmt fulton street <low_freq> had 2 tracks and 1 island platform was served served by trains of the bmt fulton street line .
 2---->  eastern parkway was a station on the demolished bmt fulton street <low_freq> had 2 tracks and 1 island platform and was served by trains of the bmt fulton street line .


 1---->  eastern slavonia is a mostly flat area , with the best type of soil where agriculture is highly developed , particularly on wheat fields . it has also has several forests as well as vineyards .
 2---->  eastern slavonia is a mostly flat area , with the best type of soil where agriculture is highly developed , particularly on wheat fields , and it also has several forests as well as vineyards .


 1---->  eastern twin ( <low_freq> ) and cinematic volunteer composed and produced by martin tillman and tom <low_freq> and the by by michael <low_freq> , cello solos performed by michael tillman , tim performed <low_freq> , perfor

 1---->  an unexpected thing happened : people kept adding to them until they were no longer basic . <split> they were renamed to '' outline of '' outline of '' .
 2---->  an unexpected thing happened : people kept adding to them until they were no longer basic . <split> so they were renamed to '' outline of '' .


 1---->  an unidentified <low_freq> user applied the concept of the <low_freq> to a 2007 post relating to the video game relating to the video game '' grand theft auto iv '' . <split> in the trailer , but in reality directed users the music video for rick astley 's 1987 song '' never gon na give you the music video for rick
 2---->  an unidentified <low_freq> user applied the concept of the <low_freq> to a 2007 post relating to the video game '' grand theft auto iv '' . <split> the user posted a youtube link that purportedly led to the trailer , but in reality directed users the music video for rick astley 's 1987 song '' never gon na give you


 1---->  an unidentified sans

--------split model training sampling display--------
 1---->  it was their 8th video , and was released and june 21 , <low_freq> . <split> it <low_freq> premiered 100 june 21 , <low_freq> , june of in the u.s. , 34 of them in california alone .
 2---->  it was their 8th video , and was released on june 21 , 2007 . <split> the video premiered on june 21 , 2007 at 100 locations in the u.s. , 34 of them in california alone .


 1---->  it was their final us single and reached reached to . the . <split> the was number 21 on the '' billboard '' hot 100 in february 1980 .
 2---->  it was their final us single before they formally disbanded in 1980 . <split> it reached number 21 on the '' billboard '' hot 100 in february 1980 .


 1---->  it was then he that he decided to actively dedicate his time teaching the '' bel canto '' vocal technique . has . actively . the success . . <split> he has had great success giving the he on classes which he holds on a regular basis .
 2---->  it was then ,

 1---->  hang hau station is located in the hang hau on the hong kong mtr <low_freq> kwan o line . <split> the station is in the vicinity of many housing complexes , including east point city , maritime bay , and on ning garden .
 2---->  hang hau station is located in the hang hau on the hong kong mtr <low_freq> kwan o line . <split> the station is in the vicinity of many housing complexes , including east point city , maritime bay , nan fung plaza , and on ning garden .


 1---->  hangar 18 is located at wright - patterson air force base in ohio . <split> an alien aircraft was speculated that an alien aircraft was brought there from roswell in 1947 .
 2---->  hangar 18 is located at wright - patterson air force base in ohio . <split> it is speculated that an alien aircraft was brought there from roswell in 1947 .


 1---->  <low_freq> - man is a hacked clone of the pac - man arcade game . <split> it appeared sometime around 1981 , at the height of the pac - man craze .
 2---->  <low_

 1---->  after mating , new colonies can be founded by one or more queens , and a colony with two queens reduces to a single queen when the nest is mature , forming colonies that are termed <low_freq> .
 2---->  after mating , new colonies can be founded by one or more queens , though a colony with two queens reduces to a single queen when the nest is mature , forming colonies that are termed <low_freq> .


 1---->  after mating , the female searches for a place to lay her eggs , emerge emerge and process begins and skins skins and the over all the , all the their all the their skins , over over and the process begins all over again .
 2---->  after mating , the female searches for a place to lay her eggs then after hatching the nymphs drop to the ground and burrow there for three years then dig up and shed their skins and begin mating and the process begins all over again .


 1---->  after medical treatment in chichester and recuperation carey to posted back 43 month month again for 

--------split model training sampling display--------
 1---->  in a preface on the dvd 's liner notes , ronny p. <low_freq> of jive ! collection ronny wrote that viewers should view the film with open hearts , as the conflicts within reflected <low_freq> in society .
 2---->  in a preface on the dvd 's liner notes , ronny p. <low_freq> of jive ! <split> collection wrote that viewers should view the film with open hearts , as the conflicts within reflected <low_freq> in society .


 1---->  in a press briefing on the same day , national security advisor h. r. mcmaster strongly denied '' washington the '' '' report . <split> saying saying , '' report no time , at no time , at intelligence sources or methods discussed .
 2---->  in a press briefing on the same day , national security advisor h. r. mcmaster strongly denied the '' washington post '' report . <split> he said , '' at no time , at no time , were intelligence sources or methods discussed .


 1---->  in a press conference marc 

 1---->  dario , who is behind bars , is anticipating the new life that will be transported to him , then shows up at the prison and shoots him dead .
 2---->  dario , who is behind bars , is anticipating the new life that will be transported to him but <low_freq> shows up at the prison and shoots him dead .


 1---->  dario david hunter ( born 1983 ) is an american - born lawyer , rabbi , the , is the the , the first muslim - the first muslim - born person to be ordained as a rabbi .
 2---->  dario david hunter ( born 1983 ) is an american - israeli lawyer , rabbi and academic who has been described in the israeli media as the first muslim - born person to be ordained as a rabbi .


 1---->  <low_freq> <low_freq> ( born 1937 is an indian social worker and <low_freq> village of <low_freq> state of telangana .
 2---->  <low_freq> <low_freq> ( 70 ) is an indian social worker from <low_freq> district in the state of telangana .


 1---->  darius yuen lai yan ( <low_freq> ) ( born 1969 ) i

--------split model training sampling display--------
 1---->  you 're not alone '' was originally <low_freq> '' by by tim . <split> the <low_freq> was originally written by tim <low_freq> and performed taylor - firth and performed by olive in 1997 .
 2---->  you 're not alone '' is a cd single released by <low_freq> . <split> the song was originally written by tim <low_freq> and robin taylor - firth and performed by olive in 1997 .


 1---->  you 're putting me on ! '' was a short - lived bob stewart game show in he in the . the years . <split> he game celebrities the celebrities tried tried communicate the identities of famous people through odd and interesting clues .
 2---->  you 're putting me on ! '' was a short - lived bob stewart game show that ran on nbc for several months . <split> the object of the game was to communicate the identities of famous people through odd and interesting clues .


 1---->  you are so beautiful '' is a song written by billy preston and bruce fisher 

 1---->  in june 2006 , chinese assistance was promised to build a railway extension from <low_freq> to juba , and is still south of the sudan railways southernmost station at <low_freq> .
 2---->  in june 2006 , chinese assistance was promised to build a railway extension from <low_freq> to juba , which is about south of the sudan railways southernmost station at <low_freq> .


 1---->  in june 2006 , <low_freq> acquired geneva based <low_freq> , in october 2006 , the <low_freq> opened its second largest office in shanghai .
 2---->  in june 2006 , <low_freq> acquired geneva based <low_freq> and in october 2006 , the company opened its second largest office in shanghai .


 1---->  in june 2006 , <low_freq> released a for your love '' , in in july 2008 , <low_freq> i do n't know why '' .
 2---->  in june 2006 , <low_freq> released '' for your love '' , and in july 2008 , '' i do n't know why '' .


 1---->  in june 2006 , <low_freq> was named one of the 100 most influential lawyers in

--------split model training sampling display--------
 1---->  allen walker , the main character of the series , is also based on the previous series ' female protagonist . <split> but , hoshino changed some characteristics to make allen look more masculine .
 2---->  allen walker , the main character of the series , is also based on the previous series ' female protagonist . <split> however , hoshino changed some characteristics to make allen look more masculine .


 1---->  allen argued many native american tribes were '' <low_freq> '' , with women making the principal decisions . <split> others tribes were in absolute balance between male and female , with neither side gaining dominance .
 2---->  allen argued many native american tribes were '' <low_freq> '' , with women making the principal decisions . <split> other tribes believed in absolute balance between male and female , with neither side gaining dominance .


 1---->  allen confirmed during an interview to the bbc at glasto

 1---->  han tells dre that he crashed the same car years ago , his his his wife and ten - year - old son were with him and died in the car crash .
 2---->  han tells dre that he crashed the same car years ago , and that his wife and ten - year - old son were with him and died in the car crash .


 1---->  hana <low_freq> gaddafi was allegedly the adopted daughter of muammar gaddafi , was purportedly killed during the u.s. bombing raids in 1986 .
 2---->  hana <low_freq> gaddafi was allegedly the adopted daughter of muammar gaddafi who was purportedly killed during the u.s. bombing raids in 1986 .


 1---->  hana ryu studied at the korea national university of arts in seoul , and came to the united states in 2012 on a student visa to pursue a master of music degree at binghamton university .
 2---->  hana ryu studied at the korea national university of arts in seoul , then came to the united states in 2012 on a student visa to pursue a master of music degree at binghamton university .


--------split model training sampling display--------
 1---->  april 20 , 1992 - the freddie mercury tribute concert was organized at wembley freddie stadium of the 's <split> the was homage to the stadium , london , to pay homage to recently mercury phoenix , and was , to raise money for the mercury phoenix trust , a fund for victims of aids .
 2---->  april 20 , 1992 - the freddie mercury tribute concert was organized by the <low_freq> members of queen . <split> it took place at wembley stadium , london , to pay homage to their deceased lead singer freddie mercury and to raise money for the mercury phoenix trust , a fund for victims of aids .


 1---->  april 2008 saw <low_freq> europe announcing a facelift for its c2 model . <split> this facelift had which a 2006 , which a minor update in 2006 .
 2---->  april 2008 saw <low_freq> europe announcing a facelift for its c2 model . <split> the car , launched in 2003 , received a minor update in 2006 .


 1---->  april jeanette mendez ( b

 1---->  it is the first song on the film score of '' toy story 3 '' . <split> the entire album was composed and conducted by randy newman .
 2---->  it is the first song on the film score of '' toy story 3 '' . <split> the entire album was composed and conducted by randy newman .


 1---->  it is the first time the train has stopped there in four years . <split> the little town has very few inhabitants and appears to be dying .
 2---->  it is the first time the train has stopped there in four years . <split> the little town has very few inhabitants and appears to be dying .


 1---->  it is the fourteenth major book by coelho . <split> <split> the book touches on the theme of spirituality .
 2---->  it is the fourteenth major book by coelho . <split> the book touches on the theme of spirituality .


 1---->  it is the fourth installment in the '' alien '' film series . <split> the final installment in the original series . <split> it was filmed at the 20th century fox studios in los a

 1---->  it filed a land claims suit in 1974 , seeking to gain title to 3,000 acres of lands lost to the state and town , as required had not gained federal approval through <low_freq> of wampanoag title , as required under the 1790 <low_freq> act .
 2---->  it filed a land claims suit in 1974 , seeking to gain title to 3,000 acres of lands lost to the state and town , as they did not gain federal approval for <low_freq> of wampanoag title , as required under the 1790 <low_freq> act .


 1---->  it filmed under the working title '' bluff '' , on a $ 56 million budget , received $ <low_freq> million under the new york film & tv tax credit program .
 2---->  it filmed under the working title '' bluff '' , on a $ 56 million budget and received $ <low_freq> million under the new york film & tv tax credit program .


 1---->  it finally replaced '' hey ya ! '' at number one on february 7 , 2004 , where it stayed for one week , that was the sixth time a recording act has replaced itself on t

 1---->  he has taught and had residencies at over forty universities and law schools . <split> he has lectured on morality and ethics erosion at most of the nation 's police departments as well as the fbi academy in quantico , virginia .
 2---->  he has taught and had residencies at over forty universities and law schools . <split> in addition he has lectured on morality and ethics erosion at most of the nation 's police departments as well as the fbi academy in quantico , virginia .


 1---->  he has taught at columbia university , new school for social research , wellesley college , asu . <split> he was chair between 1990 and 1992 - and ucla film school .
 2---->  he has taught at columbia university , new school for social research , wellesley college , asu - where he was chair between 1990 and 1992 - and ucla film school . <split> levy currently teaches in the department of cinema studies at new york university .


 1---->  he has taught at wichita state university , university of

 1---->  a popular application is found in <low_freq> <low_freq> , the video possible of <low_freq> , deploys greater technical sophistication and improved fidelity of both sight and sound greater in traditional <low_freq> .
 2---->  a popular application is found in <low_freq> <low_freq> , a higher level of <low_freq> which deploys greater technical sophistication and improved fidelity of both video and audio than in traditional <low_freq> .


 1---->  a popular but unofficial flag is '' newfoundland 's native flag '' , which is a tricolour flag , is commonly known as the '' pink , white and green '' .
 2---->  a popular but unofficial flag is '' newfoundland 's native flag '' , which is a tricolour flag that is sometimes known as the '' pink , white and green '' .


 1---->  a popular choice of coloured liquids for <low_freq> was flo - master ink , a product developed for use in permanent marker pens , while this ink was very vivid it also had the problem of staining the operators ha

--------split model training sampling display--------
 1---->  instead she finds a strange but welcoming woman , the seer madame serena ( rubinstein ) . <split> she is stunned to realize that louise is a reincarnated witch and an old friend of serena 's .
 2---->  instead she finds a strange but welcoming woman , the seer madame serena ( rubinstein ) . <split> serena is stunned to realize that louise is a reincarnated witch and an old friend of serena 's .


 1---->  instead the game usually continues <low_freq> <low_freq> ( 3 ... c5 allows <low_freq> because black no longer has the bishop check ) . now white 's <low_freq> level . not . try . and now . <split> <low_freq> white 's <low_freq> try for an advantage is to <low_freq> his king bishop with <low_freq> and by <low_freq> and <low_freq> .
 2---->  instead the game usually continues <low_freq> <low_freq> ( 4 ... c5 allows <low_freq> because black no longer has the bishop check ) and now white 's primary options are <low_freq> , <lo

 1---->  <low_freq> was among the early founders of the italian - american pentecostal church in chicago , and chicago he from to the italian colonies in the united states planting churches mostly in the northeast .
 2---->  <low_freq> was among the early founders of the italian - american pentecostal church in chicago , from where evangelists went to the italian colonies in the united states planting churches mostly in the northeast .


 1---->  <low_freq> tells peter she will leave him unless he can change his father 's mind before her train departs , and is 's until threaten by family with by getting drunk and otherwise <low_freq> until his father relents , even if it costs him his job .
 2---->  <low_freq> tells peter she will leave him unless he can change his father 's mind before her train departs , and peter decides to disgrace the family name by getting drunk and otherwise <low_freq> until his father relents , even if it costs him his job .


 1---->  <low_freq> and josie firs

 1---->  it consisted of the american , british , and french occupation sectors established in 1945 . <split> the occupation sectors established in 1945 .
 2---->  it consisted of the american , british , and french occupation sectors established in 1945 . <split> it shared economic , political , legal , and sporting systems with west germany , but was not '' de jure '' a part of it .


 1---->  it consisted of the american , british , british , and french occupation sectors established in 1945 . <split> however , however , a '' de facto '' part of west germany .
 2---->  it consisted of the american , british , and french occupation sectors established in 1945 . <split> it was , however , a '' de facto '' part of west germany .


 1---->  it consisted of the liberal democratic party , the german forum party and the german forum party . <split> it eventually merged with the west german free democratic party .
 2---->  it consisted of the liberal democratic party , the german forum part

--------split model training sampling display--------
 1---->  <low_freq> and <low_freq> was very important since they provided infrastructure . <split> they were most likely the very first to be in the .
 2---->  <low_freq> and <low_freq> were very important since they provided infrastructure . <split> they were most likely the very first to be in the .


 1---->  dmitri <low_freq> ( ; born 6 july 1976 ) <low_freq> ) is a russian former competitive ice dancer . <split> he competed with partner olga <low_freq> .
 2---->  dmitri <low_freq> ( ; born 6 july 1976 in <low_freq> ) is a russian former competitive ice dancer . <split> he competed with partner olga <low_freq> .


 1---->  dmitri had attempted to become the new dracula like graham before him so he may understand his power , but lost control of dracula dracula . dracula . was consumed by it . <split> he was control called a <low_freq> creature called menace , which soma <low_freq> .
 2---->  dmitri had attempted to become the new

 1---->  anderson water systems , inc. is a canadian manufacturer and service provider of industrial water systems , in 1952 in dundas , ontario , canada and is now located in <low_freq> , ontario .
 2---->  anderson water systems , inc. is a canadian manufacturer and service provider of industrial water systems established in 1952 in dundas , ontario , canada and is now located in <low_freq> , ontario .


 1---->  anderson also served as both host and designer for shows such as e home 's hot trends in outdoor entertaining , and turner south 's southern home by design .
 2---->  anderson also served as both host and designer for shows such as , <low_freq> 's hot trends in outdoor entertaining , and turner south 's southern home by design .


 1---->  anderson arrived in illinois following the new england elections and had a lead in the state polls , but his illinois campaign struggled despite endorsements from the state 's two largest newspapers .
 2---->  anderson arrived in illinois 

 1---->  additionally , more and more people live in cities . <split> they tend to consume less staple foods and more meat and more meat and more meat and dairy products .
 2---->  additionally , more and more people live in cities . <split> urban populations feed themselves differently than inhabitants of rural areas ; they tend to consume less staple foods and more meat and dairy products .


 1---->  additionally , one of the mirrors has to be very tiny and fixed on an isolated <low_freq> - oscillator . <split> this allows it to move when the photon is reflected on it , so that it may become <low_freq> with the photon .
 2---->  additionally , one of the mirrors has to be very tiny and fixed on an isolated <low_freq> - oscillator . <split> this allows it to move when the photon is reflected on it , so that it may become <low_freq> with the photon .


 1---->  additionally , other sports teams at the school have won multiple sectional and regional championships . <split> the school '

--------split model training sampling display--------
 1---->  it connected to the southern pacific via a 51 - mile - long ( 82 km ) railroad spur . as the southern - spur . <split> it is on from the mine to the northeast shore of the <low_freq> sea , just north of the riverside / imperial county line .
 2---->  it connected to the southern pacific via a 51 - mile - long ( 82 km ) railroad branch known as the eagle mountain railroad . <split> it ran southwest from the mine to the northeast shore of the <low_freq> sea , just north of the riverside / imperial county line .


 1---->  it connects <low_freq> street and <low_freq> street . <split> it stands bydgoszcz cathedral , following along a north - south path .
 2---->  it connects <low_freq> street and <low_freq> street . <split> where stands bydgoszcz cathedral , following along a north - south path .


 1---->  it connects sr 290 in bloomington springs with sr 135 near dodson branch . <split> it provides access to cummins falls sta

 1---->  both lev and daniel were convicted of '' collaboration with foreign secret services '' . <split> they were executed by firing squad september 25 , 1938 .
 2---->  both lev and daniel were convicted of '' collaboration with foreign secret services '' . <split> they were executed by firing squad september 25 , 1938 .


 1---->  both line 1 and line 5 's stations have two side platforms and line 5 's stations . <split> <split> have two side platforms and two tracks .
 2---->  both line 1 and line 5 's stations have two side platforms and two tracks . <split> there is a wall separating the line 1 and the line 5 platforms .


 1---->  both <low_freq> and <low_freq> ( the roman name of <low_freq> ) were major early christian centres . <split> it is believed that saint andrew founded the local christian church and his disciple <low_freq> served as bishop at <low_freq> .
 2---->  both <low_freq> and <low_freq> ( the roman name of <low_freq> ) were major early christian centres . <spli

info=[pre-trained_split_model-10per]-loss=0.574934304-bleu=0.6795-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.6795217604511479
info=[pre-trained_fusion_model-10per]-loss=0.453640640-bleu=0.7717-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.7717444321283641
--------split model training sampling display--------
 1---->  his parents gave him the nickname ' dave ' . <split> he plays acoustic and <low_freq> since about 6 years .
 2---->  his parents gave him the nickname ' dave ' . <split> he plays acoustic and <low_freq> since about 6 years .


 1---->  his parents immediately enrolled him in swimming lessons . <low_freq> <low_freq> lacy and the state . . . <split> he came third at the state age championships in the 50 m breaststroke just six weeks later .
 2---->  his parents immediately enrolled him in swimming lessons with fred de lacy at the <low_freq> swim club . <split> brooks came third at the state ag

 1---->  although the werewolves did completely recover from wounds made by non series , the series never explored whether or not werewolves could regenerate missing limbs .
 2---->  although the werewolves did completely recover from wounds made by <low_freq> weapons , the series never explored whether or not werewolves could regenerate missing limbs .


 1---->  although the western allies were aware of the panther and had access to technical details through the soviets , there panther was not seen by the western allies until early 1944 at anzio in italy , where panthers were employed in the <low_freq> .
 2---->  although the western allies were aware of the panther and had access to technical details through the soviets , the panther was not employed against the western allies until early 1944 at anzio in italy , where panthers were employed in small numbers .


 1---->  although the wooden type was more durable under the mechanical <low_freq> of handling , repeated printing wore th

--------split model training sampling display--------
 1---->  harper 's government was highly concerned to maintain canada 's competitive position of international trade which is equivalent to more than 60 percent of canada 's annual gross domestic product . <split> one one out of five jobs were created by the global market presence .
 2---->  harper 's government was highly concerned to maintain canada 's competitive position of international trade which is equivalent to more than 60 percent of canada 's annual gross domestic product . <split> and one out of five jobs were created by the global market presence .


 1---->  harper - mercer lived with his mother in torrance , california , from 2011 to 2013 . <split> he was a graduate of <low_freq> learning center , a school for teenagers with learning disabilities or emotional issues .
 2---->  harper - mercer lived with his mother in torrance , california , from 2011 to 2013 . <split> he was a graduate of <low_freq> learning center , 

 1---->  for most of his life khalil was known by the name khalil bey , but word word , bey '' is not person title , by a son of a pasha ( lord ) .
 2---->  for most of his life khalil was known by the name khalil bey , the ottoman title '' bey '' being a courtesy title used by the sons of a pasha ( lord ) .


 1---->  for most of his life he worked for the family company , gonzález <low_freq> , and managed to increase its exports to a worldwide level .
 2---->  for most of his life he worked for the family company , gonzález <low_freq> , and managed to increase its exports to a worldwide level .


 1---->  for most of its existence before amalgamation with hamilton in 2001 , <low_freq> was better fact a trio of sparsely settled townships .
 2---->  for most of its existence before amalgamation with hamilton in 2001 , <low_freq> was in fact a trio of sparsely settled townships .


 1---->  for most of its existence <low_freq> , large losses and was subsidised by the taxpayer , and from

--------split model training sampling display--------
 1---->  in november 2014 , her solo the song song , the first kiss '' . <split> it song was released as a part of musician hwang sung je 's '' project super hero '' series .
 2---->  in november 2014 , she recorded a solo song '' the first kiss '' . <split> the track was released as a part of musician hwang sung je 's '' project super hero '' series .


 1---->  in november 2014 george opened california california california mixed well of <low_freq> . in as the <low_freq> of martial arts training covering <low_freq> ) . <low_freq> , california . <split> he california arts academy covering training in several disciplines for both children and adults .
 2---->  in november 2014 chad opened a gym , as owner and operator , known as the california mixed martial arts academy ( <low_freq> ) in <low_freq> , california . <split> the mma training academy covers training in several disciplines for both children and adults .


 1---->  in nove

 1---->  in july 2009 it was confirmed that dixon would take on the role of judge on '' strictly come dancing '' , because to <low_freq> show that the show is broadcast live on saturday night , had to <low_freq> three shows in lincoln , southampton and brighton .
 2---->  in july 2009 it was confirmed that dixon would take on the role of judge on '' strictly come dancing '' , due to the fact that the show is broadcast live on saturday night dixon had to <low_freq> three shows in lincoln , southampton and brighton .


 1---->  in july 2010 , stevens was rushed to hospital after he collapsed collapsed at his home in windsor , resulting on exhaustion brought on by the stress of working on a new album .
 2---->  in july 2010 , stevens was rushed to hospital after he had collapsed at his home in windsor , reportedly from exhaustion brought on by the stress of working on a new album .


 1---->  in july 2010 , director manish jha announced a film , '' <low_freq> '' , based on the life of <lo

--------split model training sampling display--------
 1---->  he attended west tennessee christian college ( henderson , where henderson he graduated in 1895 with a bachelor of arts degree . <split> later received a received master of science degree from the same college .
 2---->  he attended west tennessee christian college in henderson , from which he graduated in 1895 with a bachelor of arts degree . <split> he received thereafter a master of science degree from the same college .


 1---->  he attended westchester college and graduated with an '' advanced civil technology '' degree . <split> he later attending <low_freq> university , pace university , and mercy college where where he worked on a minor in computer science .
 2---->  he attended westchester college and graduated with an '' advanced civil technology '' degree . <split> <low_freq> later attended <low_freq> university , pace university , and mercy college , where he worked on a minor in computer science .


 1---->  h

 1---->  john crichton - stuart , 5th marquess of bute ( 4 august 1907 -- 14 august 1956 ) was the son of john crichton - stuart , 4th marquess of bute and augusta bellingham .
 2---->  john crichton - stuart , 5th marquess of bute ( 4 august 1907 -- 14 august 1956 ) was the son of john crichton - stuart , 4th marquess of bute and augusta bellingham .


 1---->  john <low_freq> , edward roberts and clyde wells , however he continued to face internal strife and announced his resignation as leader and his retirement from politics in 1969 .
 2---->  john <low_freq> , edward roberts and clyde wells , but he continued to face internal strife and announced his resignation as leader and his retirement from politics in 1969 .


 1---->  john d. robinson ( born ) is a professor of psychiatry and surgery at howard university college of medicine and currently as director of behavioral medicine for the howard university center for wellness and weight loss surgery .
 2---->  john d. robinson ( born

 1---->  she gets revenge '' was watched by <low_freq> million people during its original broadcast , and gained a 0.9 ratings share among adults aged 18 -- 49 . <split> it also ranked second in the nielsen social ratings , with <low_freq> tweets seen by over <low_freq> thousand people , making it the lowest rated episode in the shows history since the
 2---->  she gets revenge '' was watched by <low_freq> million people during its original broadcast , and gained a 0.9 ratings share among adults aged 18 -- 49 . <split> it also ranked second in the nielsen social ratings , with <low_freq> tweets seen by over <low_freq> thousand people , making it the lowest rated episode in the shows history since


 1---->  she needs someone to hold her ( when she cries ) '' is a 1972 single by conway <low_freq> . <split> the song was <low_freq> 's ninth number one on the u.s. country chart as a solo artist .
 2---->  she needs someone to hold her ( when she cries ) '' is a 1972 single by conway <low_f

 1---->  from the age of 14 , she took mathematics coursework at harvard university . <split> at the age of 16 graduated she graduated valedictorian of the class of 1991 at boston latin school , an examination school and the oldest high school in the united states .
 2---->  from the age of 14 , carlo took mathematics coursework at harvard university . <split> at the age of 16 , she graduated valedictorian of the class of 1991 at boston latin school , an examination school and the oldest high school in the united states .


 1---->  from the age of 3 to 29 he never game <low_freq> . <split> he age he recieve a full sponsor ship from honda .
 2---->  from the age of 3 to 29 he never game up . <split> at 17 he receive a full sponsor ship from honda .


 1---->  from the age of 5 , he was involved in competitive sports : <split> he basketball , he , boxing , and achieving a scholarship playing football .
 2---->  from the age of 5 , he was involved in competitive sports . <split> from soc

 1---->  fun , fun , fun '' is a song written by brian wilson and mike love for american rock band the beach boys . <split> it was released in 1964 as a single backed with '' why do fools fall in love '' , both later appearing on the band 's album '' shut down volume 2 '' .
 2---->  fun , fun , fun '' is a song written by brian wilson and mike love for american rock band the beach boys . <split> it was released in 1964 as a single backed with '' why do fools fall in love '' , both later appearing on the band 's album '' shut down volume 2 '' .


 1---->  <low_freq> '' was chosen as the third single from '' <low_freq> '' . <split> it will officially impact top <low_freq> radio in italy on march 28 , 2014 and in the u.s. on april 8 , 2014 .
 2---->  <low_freq> '' was chosen as the third single from '' <low_freq> '' . <split> it will officially impact top <low_freq> radio in italy on march 28 , 2014 and in the u.s. on april 8 , 2014 .


 1---->  galway girl '' is a song written by steve e

--------split model training sampling display--------
 1---->  case has stated that although he has the deepest respect for <low_freq> , hawaii is in a time of transition with regard to the state 's representation in congress . the . the next . <split> he requires for that the phases in the next generation to provide continuity in that service .
 2---->  case has stated that although he has the deepest respect for <low_freq> , hawaii is in a time of transition with regard to the state 's representation in congress and especially in the senate . <split> this transition requires that hawaii phases in the next generation to provide continuity in that service .


 1---->  <low_freq> <low_freq> 's novel '' ethiopia unbound '' is one of the first novels in english by an african . <split> it has been cited as the earliest pan-african fiction .
 2---->  <low_freq> <low_freq> 's novel '' ethiopia unbound '' is one of the first novels in english by an african . <split> it has been cited as the e

 1---->  its approach is fully automated , more affordable and gives quicker results , yielding large amounts of data , with it development with directed with pharmaceutical research , with longer - term uses in pharmaceutical screening and <low_freq> diagnostics .
 2---->  its approach is fully automated , more affordable and gives quicker results , yielding large amounts of data , making the technology especially suitable for biomedical research , with longer - term uses in pharmaceutical screening and <low_freq> diagnostics .


 1---->  its architecture is very original : it is a merger of european art with old polish art building tradition , in elevation and palace interiors antique antique in the symbols , especially family <low_freq> family , especially the triumphs of the king .
 2---->  its architecture is very original -- it is a merger of european art with old - polish building tradition and its elevation and palace interiors that , using antique symbols , <low_freq> the <low

 1---->  based on the 1985 film of the same name the musical is expected to make its world première in london 's west end . <split> the show will feature original music alongside hits from the film , including '' the power of love '' and '' johnny b. goode '' .
 2---->  based on the 1985 film of the same name the musical is expected to make its world première in london 's west end in 2016 . <split> the show will feature original music alongside hits from the film , including '' the power of love '' and '' johnny b. goode '' .


 1---->  based on the 2007 census of population , oriental mindoro has a population of <low_freq> which makes it the most populous province in the region . <split> this is higher by <low_freq> from the year 2000 , resulting in an annual population growth rate of <low_freq> % during the seven - year period .
 2---->  based on the 2007 census of population , oriental mindoro has a population of <low_freq> which makes it the most populous province in the region . <

info=[pre-trained_split_model-10per]-loss=0.491277695-bleu=0.6897-hidden_dim=256-input_dim=100-epoch=3-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.6896877010794782
info=[pre-trained_fusion_model-10per]-loss=0.382489473-bleu=0.7493-hidden_dim=256-input_dim=100-epoch=3-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.7493001992612933
--------split model training sampling display--------
 1---->  it is the location of the historical stoughton army barracks ( now the the , the early . the <low_freq> keep ) . <split> the , the , the wooden bridge pub where both the rolling stones and eric clapton performed concerts at the early stages of their respective careers .
 2---->  it is the location of the former stoughton army barracks ( redeveloped for housing in the 1990s and renamed <low_freq> keep ) . <split> also in stoughton is the wooden bridge pub where both the rolling stones and eric clapton performed concerts at the early stages of their respective careers .


 1---->  it is the lo

 1---->  in 1988 , moore produced the now famous one - recording '' trinity by by sessions '' by was released in early 1988 on latent records in canada , and re-released worldwide in 1989 by rca new york .
 2---->  in 1988 , moore produced the now famous one microphone recording cowboy <low_freq> '' trinity sessions '' which was released in early 1988 on latent records in canada , and re-released worldwide in 1989 by rca new york .


 1---->  in 1988 , peterson co-starred in the roger corman - produced post-apocalyptic film '' the lawless land '' , directed in jon drama in the 1989 teen drama '' listen to me '' .
 2---->  in 1988 , peterson co-starred in the roger corman - produced post-apocalyptic film '' the lawless land '' , followed by a role in the 1989 teen drama '' listen to me '' .


 1---->  in 1988 , rabbi <low_freq> laid the cornerstone for a larger renovation project , is yet to be completed .
 2---->  in 1988 , rabbi <low_freq> laid the cornerstone for a larger renovation 

--------split model training sampling display--------
 1---->  in 2004 , sarah fox made her north american debut as <low_freq> ( '' giovanni '' and the cincinnati opera . <split> she will sing a concert performance of '' <low_freq> '' at the alice tully hall in may 2009 .
 2---->  in 2004 , sarah fox made her north american debut as <low_freq> ( don giovanni ) at the cincinnati opera . <split> she will sing a concert performance of '' <low_freq> '' at the alice tully hall in may 2009 .


 1---->  in 2004 , soriano meet with philippine president gloria macapagal - arroyo at <low_freq> . <split> he finally retired on march 1 , 2005 .
 2---->  in 2004 , soriano meet with philippine president gloria macapagal - arroyo at <low_freq> . <split> he finally retired on march 1 , 2005 .


 1---->  in 2004 , texas a&m joined a consortium of universities and countries to build the giant magellan telescope in chile . <split> the , build the largest optical telescope ever constructed , the facility h

 1---->  at the end of the 1996 - 97 season he was a competitive process . <split> he dropped from the premier list .
 2---->  at the end of the 1996 - 97 season he was dropped from the premier list . <split> <low_freq> reverted to football league level .


 1---->  at the end of the 19th century , several railways were completed . <split> several railways were completed .
 2---->  at the end of the 19th century , several railways were completed . <split> this prompted the arrival of streetcars .


 1---->  at the end of the 2001 - 2002 season , butterfield , who was a first team regular left the club . <split> butterfield was a first team regular left the club due to the poor finances brought on by the collapse of itv digital .
 2---->  at the end of the 2001 - 2002 season , butterfield , who by now was a first team regular left the club due to the poor finances brought on by the collapse of itv digital . <split> from between 1997 and 2002 , danny featured 124 times in the league , sc

--------split model training sampling display--------
 1---->  designed to operate in a no - wake zone , the <low_freq> rises after a few meters and reaching a speed of , on four <low_freq> . <split> the reduces water drag by 40 % and increases efficiency , allowing speeds of up to a potential .
 2---->  designed to operate in a no - wake zone , the <low_freq> rises after a few meters and reaches a speed of , on four <low_freq> . <split> this reduces water drag by 40 % and increases efficiency , allowing speeds of up to a potential .


 1---->  designed under frank halford , the engine was very advanced and proved to be difficult to adapt to assembly line efforts efforts <split> while , the engine was ready in production , it was n't until 1944 production production versions were considered reliable .
 2---->  designed under frank halford , the engine was very advanced and proved to be difficult to adapt to assembly line production . <split> therefore although the engine was ready by 1

 1---->  in 2006 , the school opened a higher secondary level , in 2009 , it began a higher - education program leading to a bachelor 's degree .
 2---->  in 2006 , the school opened a higher secondary level ; in 2009 , it began a higher - education program leading to a bachelor 's degree .


 1---->  in 2006 , the song appeared as a bonus track on the japanese version of jack 's mannequin 's debut album '' everything in transit '' , and well as the the and bean 's '' super christmas '' .
 2---->  in 2006 , the song appeared as a bonus track on the japanese version of jack 's mannequin 's debut album '' everything in transit '' , as well as on kevin and bean 's '' super christmas '' .


 1---->  in 2006 , the suicide rate in the united states army increased by 37 % , and in 2009 , there were 344 completed suicides by military personnel ( 211 of whom were members of the army ) .
 2---->  in 2006 , the suicide rate in the united states army increased by 37 % , while in 2009 , there were 

 1---->  <low_freq> high school is part of the international baccalaureate organization ( ib ) . <split> it is the middle years program and ib diploma .
 2---->  <low_freq> high school is part of the international baccalaureate organization ( ib ) including the middle years program and ib diploma . <split> the school offers the avid program as well .


 1---->  <low_freq> pass gets snow occasionally . <split> <split> sometimes there is enough snow to close the pass temporarily .
 2---->  <low_freq> pass gets snow occasionally . <split> sometimes there is enough snow to close the pass temporarily .


 1---->  cake ( stylized cake ) is an american alternative rock band from sacramento , california . <split> it consists of singer john mccrea , trumpeter vince <low_freq> , guitarist <low_freq> <low_freq> , bassist casey <low_freq> and drummer todd roper .
 2---->  cake ( stylized cake ) is an american alternative rock band from sacramento , california . <split> consisting of singer john mc

 1---->  he played high school basketball at <low_freq> high school ( now <low_freq> high magnet school ) on tucson 's west side . <split> after graduating in 1985 , he remained in tucson to play college basketball at the university of arizona .
 2---->  he played high school basketball at <low_freq> high school ( now <low_freq> high magnet school ) on tucson 's west side . <split> after graduating in 1985 , he remained in tucson to play college basketball at the university of arizona .


 1---->  he played high school basketball at la salle academy and <split> he at basketball at st. john 's university from 1997 to 1999 .
 2---->  he played high school basketball at la salle academy . <split> played college basketball at st. john 's university from 1997 to 1999 .


 1---->  he played his college basketball at new york university in 1944 ) . 1944 -- 48 . <split> he earned an engineering degree , was an all - american in basketball and won the haggerty award in his final year .
 2----> 

 1---->  it was formed in november 2001 with the bringing together of three separate companies under one name , thus currently operates twelve routes ( four electric , eight diesel ) with its main operating base being the capital city , riga .
 2---->  it was formed in november 2001 with the bringing together of three separate companies under one name , and currently operates twelve routes ( four electric , eight diesel ) with its main operating base being the capital city , riga .


 1---->  it was formed in world war ii , in mid- 1943 , and was its number was it commanded only the second of two airborne divisions raised by the british army during the second world war , the other being the 1st airborne division .
 2---->  it was formed in world war ii , in mid- 1943 , and despite its name , was actually only the second of two airborne divisions raised by the british army during the second world war with the other being the 1st airborne division .


 1---->  it was formed in the late 1

 1---->  at a later date , <low_freq> 170 was re-designated as the joint intelligence group . <split> he was assigned as a subordinate element of joint task force guantanamo , <split> the other subordinate elements of <low_freq> <low_freq> are the joint detention group and the joint medical group .
 2---->  at a later date , <low_freq> 170 was re-designated as the joint intelligence group and was assigned as a subordinate element of joint task force guantanamo . <split> the other subordinate elements of <low_freq> <low_freq> are the joint detention group and the joint medical group .


 1---->  at a later date , the abbot of the region who knew <low_freq> to the bishop of <low_freq> . <split> the bishop of <low_freq> made him a priest in 512 .
 2---->  at a later date , the abbot of the region who knew <low_freq> recommended him to the bishop of <low_freq> . <split> the bishop of <low_freq> made him a priest in 512 .


 1---->  at a later date , the fort was converted into a prison . <

--------split model training sampling display--------
 1---->  being more involved in western european integration , <low_freq> delegated much of this work to his deputy , wilhelm <low_freq> . <split> it it is in this area , particularly , that german foreign policy is associated with the name <low_freq> .
 2---->  being more involved in western european integration , <low_freq> delegated much of this work to his deputy , wilhelm <low_freq> . <split> but it is in this area , particularly , that german foreign policy is associated with the name <low_freq> .


 1---->  being mute is often associated with deafness as people who have been unable to hear from birth may not be able to articulate words correctly ( see deaf - mute ) . <split> but <low_freq> describes people who can hear but can not talk .
 2---->  being mute is often associated with deafness as people who have been unable to hear from birth may not be able to articulate words correctly ( see deaf - mute ) . <split> but <low_fr

 1---->  chief <low_freq> <low_freq> has been recognized and conferred with many traditional titles , including <low_freq> <low_freq> , <low_freq> of <low_freq> and <low_freq> of <low_freq> south .
 2---->  chief <low_freq> <low_freq> has been recognized and conferred with many traditional titles , including <low_freq> <low_freq> , <low_freq> of <low_freq> and <low_freq> of <low_freq> south .


 1---->  chief <low_freq> - to - <low_freq> ( or <low_freq> or <low_freq> or <low_freq> or was the of the principal clan leaders over a band in the when southern utah valley , along with chief <low_freq> and <low_freq> .
 2---->  chief <low_freq> - to - <low_freq> ( or <low_freq> or <low_freq> or <low_freq> ) was one of the principal clan leaders of a band of <low_freq> in southern utah valley , along with chief <low_freq> and <low_freq> .


 1---->  chief turkey <low_freq> was a prominent figure in the early and <low_freq> century cultural revitalization movements not only among the piscataway 

 1---->  he is a strong aggressive striker who currently plays for football league championship team reading . <split> he set a new record during his spell at coventry city when 21 days , he made his debut and became the youngest player ever to play for the coventry first team .
 2---->  he is a strong aggressive striker who currently plays for football league championship team reading . <split> he set a new record during his spell at coventry city when , aged 16 years and 21 days , he made his debut and became the youngest player ever to play for the coventry first team .


 1---->  he is a strong digimon with metal wings and multiple cannons . <split> he specialized in close - combat fighting .
 2---->  he is a strong digimon with metal wings and multiple cannons . <split> specialized in close - combat fighting .


 1---->  he is a three - time icelandic chess champion . <split> he was world <low_freq> chess champion in 1987 .
 2---->  he is a three - time icelandic chess champion . 

 1---->  <low_freq> <low_freq> afrika '' was inspired by the african national congress 's use of '' <low_freq> <low_freq> ' <low_freq> as its party song after anthem led to <low_freq> <low_freq> afrika being selected as the national anthem of tanzania .
 2---->  <low_freq> <low_freq> afrika '' was inspired by the african national congress 's use of '' <low_freq> <low_freq> ' <low_freq> as its party song , which led to <low_freq> <low_freq> afrika being selected as the national anthem of tanzania .


 1---->  music '' is a song by by american singer madonna and her eighth studio album '' '' - from 2000 ) .
 2---->  music '' is a song recorded by american singer madonna for her eighth studio album of same name ( 2000 ) .


 1---->  my blue heaven '' is a popular song , by walter donaldson , the by george a. whiting .
 2---->  my blue heaven '' is a popular song written by walter donaldson with lyrics by george a. whiting .


 1---->  my blue heaven '' is a song by alternative rock band t

 1---->  frederick i of denmark and norway ( 7 october 1471 -- 10 april 1533 ) was a king of denmark and norway . <split> frederick was the son of the first oldenburg king christian i of denmark , norway and sweden ( 1426 -- 1481 ) and of dorothea of brandenburg ( 1430 -- 1495 ) .
 2---->  frederick i of denmark and norway ( 7 october 1471 -- 10 april 1533 ) was the king of denmark and norway . <split> frederick was the son of the first oldenburg king christian i of denmark , norway and sweden ( 1426 -- 1481 ) and of dorothea of brandenburg ( 1430 -- 1495 ) .


 1---->  frederick james <low_freq> ( february 8 , 1886 -- july 16 , 1950 ) , nicknamed '' fritz , '' was an american baseball player . <split> he played college baseball for the university of michigan in 1909 and was a right - handed pitcher in major league baseball for the cleveland naps from 1910 to 1914 .
 2---->  frederick james <low_freq> ( february 8 , 1886 -- july 16 , 1950 ) , nicknamed '' fritz , '' was an american bas

--------split model training sampling display--------
 1---->  he was named australian of the year in 1999 . <split> he was awarded the australian sports medal on 23 june 2000 for his services to australian sport .
 2---->  he was named australian of the year in 1999 . <split> taylor was awarded the australian sports medal on 23 june 2000 for his services to australian sport .


 1---->  he was named head of the refugee department . <split> he polish he to the under study for the polish ministry . foreign affairs .
 2---->  he was named head of the refugee department . <split> in 1943 moved to london to work for the polish ministry of foreign affairs .


 1---->  he was named mvp after leading the warriors to a franchise - record and nba - best 67 wins on the season . <split> he season year he he best the warriors to their first nba championship since 1975 .
 2---->  he was named mvp after leading the warriors to a franchise - record and nba - best 67 wins on the season . <split> that 

 1---->  a walnut is the nut of any tree of the genus '' <low_freq> '' ( family <low_freq> ) , particularly the persian or english walnut , '' <low_freq> regia '' . <split> it is used for food after being processed while green for pickled <low_freq> or after full ripening for its <low_freq> .
 2---->  a walnut is the nut of any tree of the genus '' <low_freq> '' ( family <low_freq> ) , particularly the persian or english walnut , '' <low_freq> regia '' . <split> it is used for food after being processed while green for pickled <low_freq> or after full ripening for its <low_freq> .


 1---->  a warm core ring is a type of <low_freq> eddy which breaks off from an ocean current , e.g. the gulf stream or the <low_freq> current . <split> the ring is an independent circulatory system of warm water which can persist for several months .
 2---->  a warm core ring is a type of <low_freq> eddy which breaks off from an ocean current , e.g. the gulf stream or the <low_freq> current . <split> the r

 1---->  jules auguste , <low_freq> ( salomon ) ( 11 11 11 , <low_freq> july 1896 ) 10 10 , 10 october 1896 ) was <split> he french violinist , conductor and composer of the 19th century .
 2---->  jules auguste , <low_freq> ( salomon ) - ( b. bourges , 11 july 1830 ; d paris , 10 oct 1896 ) . <split> illustrious french violinist , conductor and composer of the 19th century .


 1---->  jules jordan formerly directed for evil angel , you . make the own . . <split> he producing series including '' ass worship '' .
 2---->  jules jordan formerly directed for evil angel before leaving to form his own company . <split> he produced series including '' ass worship '' .


 1---->  jules mark shear is born pittsburgh ) is a american singer , songwriter , and guitarist . <split> guitarist , born in pittsburgh in 1952 .
 2---->  jules mark shear ( born 1952 ) is an american singer , songwriter , and guitarist . <split> shear was born in pittsburgh in 1952 .


 1---->  jules <low_freq> ( august 2

 1---->  following a holiday to poland , he visited an ex-boyfriend in manila in manila in manila in the philippines . <split> he found manila to be a '' culture shock '' from the inland revenue , but briefly worked as a barman and waiter at a brothel known as <low_freq> 's bar known as <low_freq> 's bar known as <low_freq> 's
 2---->  following a holiday to poland , he visited an ex-boyfriend in manila in the philippines . <split> affording the fare due to a '' sizeable tax <low_freq> '' from the inland revenue , o'grady found manila to be a '' culture shock '' , but briefly worked as a barman and waiter at a brothel known as <low_freq> 's bar


 1---->  following a joint decision between the assyrian associations in sweden , the idea that had existed since the assyrian associations in sweden . <split> the idea was to create a new assyrian media platform that could reach out to all groups within the assyrian population .
 2---->  following a joint decision between the assyrian associa

--------split model training sampling display--------
 1---->  david <low_freq> ( born february 21 , 1951 ) is an american football executive . <split> he was the general general manager of the carolina panthers of the national football league ( nfl ) .
 2---->  david <low_freq> ( born february 21 , 1951 ) is an american football executive . <split> he is the former general manager of the carolina panthers of the national football league ( nfl ) .


 1---->  david <low_freq> ( born february 21 , 1951 ) is an former lobbyist football executive . <split> he is recently served as the general manage of the new panthers of the national football league .
 2---->  david <low_freq> ( born february 21 , 1951 ) is a former american football executive . <split> he most recently served as the general manager of the carolina panthers of the national football league .


 1---->  david gil <low_freq> is an american historian and author . <split> he co-author of several books on jewish history .
 2---

 1---->  according to the u.s. army , there is no record of macbeth being a ranger , or serving in a combat unit , he was discharged from the service after having been declared unfit or unsuitable for the army , or both , before he could complete basic training .
 2---->  according to the u.s. army , there is no record of macbeth being a ranger , or serving in a combat unit : he was discharged from the service after having been declared unfit or unsuitable for the army , or both , before he could complete basic training .


 1---->  according to the 2000 census bureau , clarksville district 5 had median population of <low_freq> , with median estimated average income in $ <low_freq> , with median income of $ <low_freq> .
 2---->  according to the u.s. census bureau , clarksville district 5 has a population of <low_freq> , with an estimated average income of $ <low_freq> , with median income of $ <low_freq> .


 1---->  according to the u.s. census bureau , the county has a total area of

info=[pre-trained_split_model-10per]-loss=0.379748613-bleu=0.6633-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[501-[of]-989]-lr=0.0050 0.6632911416157141
info=[pre-trained_fusion_model-10per]-loss=0.263998926-bleu=0.7486-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[501-[of]-989]-lr=0.0050 0.7485770432686898
--------split model training sampling display--------
 1---->  his carrer as a professional boxer is going to be longer and maybe <low_freq> . <split> a he , he is 9 victories with 0 , and and is considered by some expert as a future heavyweight world champion .
 2---->  his carrer as a professional boxer is going to be longer and maybe better . <split> so far , he has 9 victories with none defeats , and is considered by some experts as a future heavyweight world champion .


 1---->  his cartoons are often found on humorous greeting cards . <split> he has a regular cartoon cartoon strip in '' private eye '' entitled '' <low_freq> '' , skinheads , w

 1---->  <low_freq> <low_freq> studied with <low_freq> at the <low_freq> wilhelm - <low_freq> at münster and his work was a continuation of his mentor 's , spurred on by the frequent claims in the popular literature that '' the hopi have no concept of time '' .
 2---->  <low_freq> <low_freq> studied with <low_freq> at the <low_freq> wilhelm - <low_freq> at münster and his work was a continuation of his mentor 's , spurred on by the frequent claims in the popular literature that '' the hopi have no concept of time '' .


 1---->  <low_freq> then consults the girl 's father , richard malkin , who claims that the girl and her mother are simply pretending that there was a miracle because they resent the fact that he is a fraudulent psychic .
 2---->  <low_freq> then consults the girl 's father , richard malkin , who claims that the girl and her mother are simply pretending that there was a miracle because they resent the fact that he is a fraudulent psychic .


 1---->  <low_freq> <low_fre

 1---->  grand slam was born in chippewa falls , wisconsin . <split> <split> <split> he graduated at the top of his class from special weapons school .
 2---->  grand slam was born in chippewa falls , wisconsin . <split> he graduated at the top of his class from special weapons school .


 1---->  grand daughter of rai bahadur <low_freq> prasad <split> born into a family of mostly doctors and lawyers . <split> she excelled in academics with first class first honors in sociology .
 2---->  grand daughter of rai bahadur <low_freq> prasad . <split> born into a family of mostly doctors and lawyers , she excelled in academics with first class first honors in sociology .


 1---->  grand emperor <low_freq> <low_freq> <low_freq> and emperor <low_freq> <low_freq> <low_freq> <low_freq> <low_freq> . <split> <low_freq> went to tam <low_freq> by means of a small boat .
 2---->  grand emperor <low_freq> <low_freq> <low_freq> and emperor <low_freq> <low_freq> <low_freq> went to tam <low_freq> by mea

--------split model training sampling display--------
 1---->  <low_freq> <low_freq> and elena <low_freq> are pioneers in the field of researching their own artistic partnership through a includes the includes the other of <low_freq> machine artists . the process resource . equally contributes to the concept of <split> this process described described with the concept of a hybrid neural network ( <low_freq> ) .
 2---->  <low_freq> <low_freq> and elena <low_freq> are pioneers in the field of researching their own artistic partnership , which in fact includes three members - the two artists and a machine resource that equally contributes to the process . <split> this partnership is described through the concept of a hybrid neural network ( <low_freq> ) .


 1---->  <low_freq> made his debut as a wrestler in 1938 at the age of fifteen under the name <low_freq> valentino . <split> he would also wrestle as <low_freq> <low_freq> before taking up the name that made him famous .
 2---->  <low_

 1---->  at one point in the war , in 1997 , the top foreign policy officials in the clinton administration flew to northern afghanistan to the opportunity to make - - to take advantage of an opportunity top to make crucial crucial to the move .
 2---->  at one point in the war , in 1997 , two top foreign policy officials in the clinton administration flew to northern afghanistan in an attempt to convince <low_freq> not to take advantage of a strategic opportunity to make crucial gains against the taliban .


 1---->  at one point in time , braddock road 's intersection with u.s. 29 was cut and the remaining southern portion of the spur was renamed spindle ct .
 2---->  at one point in time , braddock road 's intersection with u.s. 29 was cut , the remaining southern portion of the spur was renamed spindle ct .


 1---->  at one point in time , an ancestor of mina donated a dna sample to the corner family , later was later on by alejandro to create the trinity siblings .
 2---->  at on

 1---->  despite the duplicate service , metro considers the redundant bus service justified because both bus routes run frequently from downtown los angeles . <split> they run along the '' entire '' wilshire corridor , west to beverly hills , westwood and santa monica .
 2---->  despite the duplicate service , metro considers the redundant bus service justified because both bus routes run frequently from downtown los angeles . <split> unlike the purple line , they run along the '' entire '' wilshire corridor , west to beverly hills , westwood and santa monica .


 1---->  despite the end of support for windows xp , microsoft has released two emergency security patches for the operating system to address major security exploits . <split> microsoft has released two emergency security patches for the operating system to address major security exploits .
 2---->  despite the end of support for windows xp , microsoft has released two emergency security patches for the operating system to a

--------split model training sampling display--------
 1---->  at this time the yard <low_freq> of 30 miles of track and could accommodate 10,500 wagons . <split> the engine shed was adjacent to the site .
 2---->  at this time the yard <low_freq> of 30 miles of track and could accommodate 10,500 wagons . <split> the engine shed was adjacent to the site .


 1---->  at this time there are more then 100,000 nuclear weapons stock piled on the planet earth . <split> as priests a result of the u. s / soviet cold war .
 2---->  at this time there are more then 100,000 nuclear weapons stock piled on the planet earth . <split> these were the result of the u. s / soviet cold war .


 1---->  at this time they also released their self - titled five - song ep . <split> a april 2005 their long awaited full - length '' between the heart and the synapse '' was released to the world .
 2---->  at this time they also released their self - titled five - song ep . <split> in april 2005 their long await

 1---->  according to louis duchesne , he should be identified with <low_freq> <low_freq> whom the '' <low_freq> christiana '' places later later the list of bishops , he was patron saint of st - <low_freq> , the episcopal residence of the bishops of <low_freq> .
 2---->  according to louis duchesne , he should be identified with saint <low_freq> whom the '' <low_freq> christiana '' places lower in the list of bishops ; he was patron saint of st - <low_freq> , the episcopal residence of the bishops of <low_freq> .


 1---->  according to <low_freq> , friedman says that '' for a woman to wait to have children is wrong because she is violating herself '' while they are <low_freq> get married at the age of fourteen .
 2---->  according to <low_freq> , friedman says that '' for a woman to wait to have children is wrong because she is violating herself '' and women should <low_freq> get married at the age of fourteen .


 1---->  according to <low_freq> , the fortress was poorly built and d

 1---->  a concrete floor that has been hardened and polished will have an extremely long life . <split> a polished concrete floor that has been properly maintained may give 100 + years of service .
 2---->  a concrete floor that has been hardened and polished will have an extremely long life expectancy compared to other flooring . <split> for example , tile that may only last 10 - 20 years , but a polished concrete floor that has been properly maintained may give 100 + years of service .


 1---->  a concubine was a free woman , was often <low_freq> for marriage . <split> her children were legitimate and lawful heirs .
 2---->  a concubine was a free woman , was often <low_freq> for marriage , and her children were legitimate . <split> in all these cases , the children were legitimate and lawful heirs .


 1---->  a condition known as drunken tree syndrome is being caused by this melting . <split> <split> ground water and river <low_freq> are being negatively impacted as well .
 2----

--------split model training sampling display--------
 1---->  dawn equipment company is an american corporation based corporation based in sycamore . and is in sycamore , illinois . and it company 's planters is is center is <low_freq> <split> it is a manufacturer of tools for row crop planters , as well as fertilizer <low_freq> and strip - till systems .
 2---->  dawn equipment company is an u.s. - american corporation founded in 1992 , and based in sycamore , illinois , where the company 's factory and administrative center is . <split> dawn is a manufacturer of tools for row crop planters , as well as fertilizer <low_freq> and strip - till systems .


 1---->  dawn <low_freq> mbe is a british entrepreneur who started a a be a . <split> she started in the 1982 of 1982 cheshire and industrial flooring company <low_freq> in 1982 in south cheshire , england , with with peter late father peter peter <low_freq> .
 2---->  dawn <low_freq> mbe is a british entrepreneur who is not to be tru

 1---->  after the commercial success and popularity of the album version of the song , several remixes , several remixes were which spores which were a in dance clubs and radio stations with a dance format .
 2---->  after the commercial success and popularity of the album version of the song , the group released several remixes , some of which become hits in dance clubs and radio stations with a dance format .


 1---->  after the communist revolution of 1917 , <low_freq> 's textile mills were nationalized , and '' as '' <low_freq> - <low_freq> <low_freq> - <low_freq> <low_freq> - ( <low_freq> <low_freq> - <low_freq> cotton center ) .
 2---->  after the communist revolution of 1917 , <low_freq> 's textile mills were nationalized , becoming known as '' <low_freq> - <low_freq> <low_freq> - <low_freq> <low_freq> '' ( the <low_freq> - <low_freq> cotton center ) .


 1---->  after the company closed in 1951 , the play fell into obscurity , but stewart revived for the 1982 <low_freq> seaso

--------split model training sampling display--------
 1---->  he firstly began his union career as an organiser with the now defunct municipal employees union of victoria . <split> he was appointed was appointed national secretary with the australian services union before entering politics .
 2---->  he firstly began his union career as an organiser with the now defunct municipal employees union of victoria . <split> and then he was assistant national secretary with the australian services union before entering politics .


 1---->  he fled vietnam from the new communist regime in november 1977 , and returned by the . utah italy . <split> he <low_freq> by boat to darwin as a refugee in 1977 with his wife lan and about 40 other people .
 2---->  he fled vietnam from the new communist regime in november 1977 , and arrived in darwin , northern territory . <split> le arrived by boat to darwin as a refugee in 1977 with his wife lan and about 40 other people .


 1---->  he fled into exile 

 1---->  in addition to its arts academy , einstein is home to the visual art center ( <low_freq> ) , which more intensive program devoted to the fine arts , including courses in history , contemporary artists , artwork .
 2---->  in addition to its arts academy , einstein is home to the visual art center ( <low_freq> ) , a more intensive program devoted to the fine arts , including courses in history , contemporary artists and artwork .


 1---->  in addition to free free credit scores , reports , credit karma also offers financial system account monitoring through account aggregation service <low_freq> , which allows users to track their banking card , and transactions and balances in credit karma 's interface .
 2---->  in addition to its free credit reports and tools , credit karma also offers a my spending tool through account aggregation service <low_freq> , which allows users to track their credit card , loan transactions and balances in credit karma 's interface .


 1---->  in

 1---->  in 1981 it was sampled as part of the stars on 45 medley . <split> in 1986 , the british girl group <low_freq> returned the song to number one in seven countries .
 2---->  in 1981 it was sampled as part of the stars on 45 medley . <split> in 1986 , the british girl group <low_freq> returned the song to number one in seven countries .


 1---->  in 1981 she returned to new south wales , to join the staff of the composition school at the sydney conservatorium of music . <split> she was for four years head of composition there , before taking early retirement in 1996 .
 2---->  in 1981 she returned to new south wales , to join the staff of the composition school at the sydney conservatorium of music . <split> she was for four years head of composition there , before taking early retirement in 1996 .


 1---->  in 1981 the western stand at the mcg was renamed the wh <low_freq> stand in his honour . <split> at the completion of the redevelopment in 2005 , a statue of <low_freq> wa

--------split model training sampling display--------
 1---->  his first trip away from mexico alone was to germany . where he earned some college contacts linking it to europe when <split> he earned influenced by <low_freq> <low_freq> 's style of life .
 2---->  his first trip away from mexico alone was to germany , where he earned some college contacts linking it to europe . <split> hi was influenced by <low_freq> <low_freq> 's style of life .


 1---->  his first two terms as chief minister came to be when he replaced <low_freq> , courts courts of who he replaced forced to resign by the courts . <split> his third and current term began upon chief minister <low_freq> 's death .
 2---->  his first two terms as chief minister came to be when he replaced <low_freq> in the role , after she was forced to resign by the courts . <split> his third and current term began following chief minister <low_freq> 's death .


 1---->  his first victory came in 1842 when he piloted <low_freq> . <spli

 1---->  <low_freq> wells high school is the the 2nd best performing state secondary school in powys after <low_freq> high school .
 2---->  <low_freq> wells high school is also the 2nd best performing state secondary school in powys after <low_freq> high school .


 1---->  <low_freq> power station is a <low_freq> hydroelectric power project in in <low_freq> victoria of jinja falls , about north of jinja immediately the former location of <low_freq> falls .
 2---->  <low_freq> power station is a <low_freq> hydroelectric power project located at the headwaters of river nile , about north of jinja at the former location of <low_freq> falls .


 1---->  <low_freq> seeks the help of a <low_freq> ( ajay ) , reveals that the house is indeed haunted by souls .
 2---->  <low_freq> seeks the help of a <low_freq> ( ajay ) who reveals that the house is indeed haunted by souls .


 1---->  bukit <low_freq> secondary school is a government secondary school in singapore <low_freq> in located singap

 1---->  graeme paul knowles cvo ( born 25 september 1951 ) is an anglican bishop . <split> he was installed as dean of st paul 's cathedral , london on 1 october 2007 after letters patent were issued on 20 september 2007 .
 2---->  graeme paul knowles cvo ( born 25 september 1951 ) is an anglican bishop . <split> he was installed as dean of st paul 's cathedral , london on 1 october 2007 after letters patent were issued on 20 september 2007 .


 1---->  <low_freq> berger , dr. <low_freq> 's colleague , brought the concept to federation following a trip with andrew mckinley to interlochen in 1951 . <split> the next year , dr. berger saw the possibility of a comprehensive not - for - profit arts camp for children living in the new york metropolitan area .
 2---->  <low_freq> berger , dr. <low_freq> 's colleague , brought the concept to federation following a trip with andrew mckinley to interlochen in 1951 . <split> here dr. berger saw the possibility of a comprehensive not - for - prof

--------split model training sampling display--------
 1---->  having recovered his identity but not his memories , jake returns to the life he was living . until him to a divorce so . was marry . <low_freq> <split> sam , sam exposes that elizabeth knew of his identity all along and lied to him .
 2---->  having recovered his identity but not his memories , jason returns to the life he was living , asking sam for a divorce so he can marry elizabeth . <split> however , sam exposes that elizabeth knew of his identity all along and lied to him .


 1---->  having redeemed himself , vader asks luke to remove his mask . <split> he says he had good left in him after all , dies peacefully .
 2---->  having redeemed himself , vader asks luke to remove his mask . <split> vader says he had good left in him after all and dies peacefully .


 1---->  having refused to fire upon civilians routine , the the the were police commander , was team of <low_freq> was sent ambushed hiding operational hidin

 1---->  a five - mile - long trail loops around the open space property , and the is , to hikers , horseback riders , <low_freq> , and mountain bikers .
 2---->  a five - mile - long trail loops around the open space property , and it 's open to hikers , horseback riders , <low_freq> , and mountain bikers .


 1---->  a five - part series published between august and december 1995 , the marks the first appearance of she - venom , the symbiote bonds with brock 's wife ann to save her from gunshot wounds .
 2---->  a five - part series published between august and december 1995 , it marks the first appearance of she - venom after the symbiote bonds with brock 's wife ann to save her from gunshot wounds .


 1---->  a five - part television version of '' mildred pierce '' premiered on hbo in march 2011 , starring kate winslet as mildred , guy pearce as <low_freq> , evan rachel wood as veda and mare mare <low_freq> as ida .
 2---->  a five - part television miniseries of '' mildred pierce

--------split model training sampling display--------
 1---->  however , she is delighted when leanne confides in her about she is delighted ex-husband ex-husband with ex-husband , nick , and encourages to to consider leaving her peter for nick . <split> when peter is seriously injured in the tram crash , leanne chooses to marry peter .
 2---->  however , she is delighted when leanne confides in her that she is having an affair with ex-husband , nick , and encourages her to think about leaving peter for nick . <split> when peter is seriously injured in the tram crash , leanne chooses to marry peter .


 1---->  however , she is shown to change sides later on . she she defending her younger sister from the robots . <split> afterwards then defended her own <low_freq> ultraviolet ultraviolet ultraviolet energies blasts , <low_freq> ultraviolet '' ) with <low_freq> 's bright green ones and and they destroyed the remaining robots with ease .
 2---->  however , she is shown to change sides l

 1---->  arthur <low_freq> ( , born in 1964 in new jersey , u.s. ) , is an israeli who served as the fourth ambassador of the state of israel to the republic of azerbaijan from august 2005 - july 2009 .
 2---->  arthur <low_freq> ( , born in 1964 in new jersey , u.s. ) , is an israeli who served as the fourth ambassador of the state of israel to the republic of azerbaijan from august 2005 - july 2009 .


 1---->  arthur lillie ( 24 february 1831 -- 28 november 1911 ) , was as george arthur howard , was the youngest son of sir john scott lillie and his wife louisa , born at north end , fulham .
 2---->  arthur lillie ( 24 february 1831 -- 28 november 1911 ) , christened as george arthur howard , was the youngest son of sir john scott lillie and his wife louisa , born at north end , fulham .


 1---->  arthur lloyd ( 1839 - 1904 ) was a scottish singer , songwriter , comedian and stage in , who for music hall in the united kingdom .
 2---->  arthur lloyd ( 1839 - 1904 ) was a scottish si

 1---->  for centuries <low_freq> was an important stopping place on the coaching road between london and holyhead ( now the <low_freq> road ) . <split> at one point 40 minutes every day would stop at <low_freq> to allow travellers to refresh themselves .
 2---->  for centuries <low_freq> was an important stopping place on the coaching road between london and holyhead ( now the <low_freq> road ) . <split> at one point 40 <low_freq> every day would stop at <low_freq> to allow travellers to refresh themselves .


 1---->  for centuries its graduates dominated massachusetts ' clerical and civil ranks . <split> <split> beginning in the 19th century it gained <low_freq> <low_freq> as a dozen graduate and professional schools formed alongside the nucleus undergraduate college .
 2---->  for centuries its graduates dominated massachusetts ' clerical and civil ranks . <split> beginning in the 19th century it gained <low_freq> <low_freq> as a dozen graduate and professional schools formed along

--------split model training sampling display--------
 1---->  in 2010 , the percentage of workers belonging to a union union '' or total labor union '' density '' ) was <low_freq> % . germany labor states . <split> the to <low_freq> % in germany , <low_freq> % in canada , and 70 % in finland .
 2---->  in 2010 , the percentage of workers belonging to a labor union ( or total labor union '' density '' ) was <low_freq> % in the united states . <split> compared to <low_freq> % in germany , <low_freq> % in canada , and 70 % in finland .


 1---->  in 2010 , the website <low_freq> was developed in find in find investors for talented , emerging designers in want to find investors career career . the fashion world . <split> the platform functions are order to finance the design process , promotion , production , and retail of selected designers ' first collections .
 2---->  in 2010 , the website <low_freq> was launched to continue to pair investors with talented , emerging designers who wan

 1---->  he is quickly dispatched by arthur , the the his golden mask .
 2---->  he is quickly dispatched by arthur , leaving only his golden mask .


 1---->  he is quite naïve , hyper , <low_freq> , clumsy , and <low_freq> , while while this tends to get the girls and himself in trouble , he really cares about them and their safety .
 2---->  he is quite naïve , hyper , <low_freq> , clumsy , and <low_freq> , but while this tends to get the girls and himself in trouble , he really cares about them and their safety .


 1---->  he is rather perverted , with a love of women with big breasts , and his coach instructs him to knock opponents away by diving into their chests , because '' young men love breasts '' .
 2---->  he is rather perverted , with a love of women with big breasts , and his coach instructs him to knock opponents away by diving into their chests , because '' young men love breasts '' .


 1---->  he is rather quiet , only speaking when provoked , however , he disturbed 

--------split model training sampling display--------
 1---->  after her death , her husband helped establish the kamehameha schools in 1887 . was founded in honolulu museum . <split> in marriage was charles reed , , founded the bishop museum in honolulu in 1889 as a memorial to <low_freq> .
 2---->  after her death , her husband helped establish the kamehameha schools in 1887 which was written in her will . <split> her husband , charles reed bishop , founded the bishop museum in honolulu in 1889 as a memorial to <low_freq> .


 1---->  after her death , the cult of saint teresa was also known in spain during the <low_freq> due to the religious claim and debate of national patronage versus saint viejo matamoros . <split> teresa was work brother , rodrigo <low_freq> y <low_freq> , became his <low_freq> among of the immaculate conception of '' el viejo '' now widely venerated among
 2---->  after her death , the cult of saint teresa was also known in spain during the <low_freq> due to th

 1---->  according to the international spa association ( <low_freq> ) , <low_freq> has the generic term for water therapies using jets , underwater massage and mineral baths ( e.g. <low_freq> , iodine - <low_freq> therapy , scotch treatments , scotch hose , swiss shower , <low_freq> ) and others .
 2---->  according to the international spa association ( <low_freq> ) , <low_freq> is the generic term for water therapies using jets , underwater massage and mineral baths ( e.g. <low_freq> , iodine - <low_freq> therapy , <low_freq> treatments , scotch hose , swiss shower , <low_freq> ) and others .


 1---->  according to the jerusalem post , <low_freq> has not denounced iranian acquisition of nuclear weapons because because , '' i am afraid of real risk rather than of potential risk .
 2---->  according to the jerusalem post , <low_freq> has not denounced iranian acquisition of nuclear weapons , explaining , '' i am afraid of real risk rather than of potential risk .


 1---->  according

 1---->  further troubles came from the depression of the 1930s . <split> another drop in recreational use due to world war ii , point walter fell into a state of disrepair .
 2---->  further troubles came from the depression of the 1930s in which another drop in recreational use was experienced . <split> after yet another drop in recreational use due to world war ii , point walter fell into a state of disrepair .


 1---->  further types of component analog video signals do not use r , g , and b components , rather a colorless component , termed <low_freq> , which provides brightness information ( as in black - and - white video ) . <split> this provides brightness information ( as in black - and - white video ) .
 2---->  further types of component analog video signals do not use r , g , and b components but rather a colorless component , termed <low_freq> , which provides brightness information ( as in black - and - white video ) . <split> this combines with one or more color - carr

--------split model training sampling display--------
 1---->  elisa hendrik '' <low_freq> '' <low_freq> ( 16 april 1909 -- 7 july 1982 ) was a dutch football player and manager . <split> he scored 28 goals in 23 games for the dutch national side .
 2---->  elisa hendrik '' <low_freq> '' <low_freq> ( 16 april 1909 -- 7 july 1982 ) was a dutch football player and manager . <split> <low_freq> scored 28 goals in 23 games for the dutch national side .


 1---->  elisa lam , from vancouver , british columbia , was was on weeks a weeks before a roof hotel in los angeles . <split> after was found found dead in a large water tank on the roof of the cecil , after guests complained about the taste of the water .
 2---->  elisa lam , from vancouver , british columbia , canada went missing for several weeks at the cecil hotel in los angeles . <split> she was eventually found dead in a large water tank on the roof of the hotel , after guests complained about the taste of the water .


 1---->  elis

 1---->  another version similar to the chinese version and '' <low_freq> merah , <low_freq> <low_freq> '' also exists in the philippines . <split> it is known as '' <low_freq> <low_freq> '' ( '' mary the crab '' ) .
 2---->  another version similar to the chinese version and '' <low_freq> merah , <low_freq> <low_freq> '' also exists in the philippines . <split> the story is known as '' <low_freq> <low_freq> '' ( '' mary the crab '' ) .


 1---->  another victory followed on the 21 april . <split> <split> flying over tripoli , orr and sub-lieutenant graham hogg forced a dornier do <low_freq> flying boat down with its engine on fire .
 2---->  another victory followed on the 21 april . <split> flying over tripoli , orr and sub-lieutenant graham hogg forced a dornier do <low_freq> flying boat down with its engine on fire .


 1---->  another was shot in the crossfire . <split> the only surviving attacker was <low_freq> <low_freq> .
 2---->  another was shot in the crossfire . <split> the

info=[pre-trained_split_model-10per]-loss=0.334912747-bleu=0.6872-hidden_dim=256-input_dim=100-epoch=6-batch_size=100-batch_id=[501-[of]-989]-lr=0.0050 0.6872462352143377
info=[pre-trained_fusion_model-10per]-loss=0.220021978-bleu=0.7492-hidden_dim=256-input_dim=100-epoch=6-batch_size=100-batch_id=[501-[of]-989]-lr=0.0050 0.7491855023102368
--------split model training sampling display--------
 1---->  diamond valley lake is their third and newest reservoir with with a capacity of <low_freq> af of water . <split> it of november 11 , 2008 it it is is % full .
 2---->  diamond valley lake is their third and newest reservoir , with a capacity of <low_freq> af of water . <split> as of november 11 , 2008 , it <low_freq> 55 % full .


 1---->  diamond is known for wearing colorful <low_freq> shirts in concert . <split> it is originally that it is originally out out of necessity so diamond everyone in the audience could see him without the aid of binoculars .
 2---->  diamond is known for wea

 1---->  atwater kent radios were of high quality and many examples of working models exist today ; they are highly prized by collectors and <low_freq> .
 2---->  atwater kent radios were of high quality and many examples of working models exist today ; they are highly prized by collectors and <low_freq> .


 1---->  atwood appeared in many music videos and stage stage productions and sang backup for <low_freq> .
 2---->  atwood appeared in many music videos , and stage productions and sang backup for <low_freq> .


 1---->  atwood serves as consulting producer , feedback on some of the areas where the series expands or <low_freq> the book , and and also had a small cameo role in the first episode .
 2---->  atwood serves as consulting producer giving feedback on some of the areas where the series expands or <low_freq> the book , , and also had a small cameo role in the first episode .


 1---->  atwood was the school 's chief instructor from the time it opened until he quit on june 10

--------split model training sampling display--------
 1---->  an earlier strip , '' herman '' , created by clyde lamb , published from 1950 through 1966 . <split> it had no relation to unger 's strip .
 2---->  an earlier strip , '' herman '' , created by clyde lamb , ran from 1950 through 1966 . <split> it had no relation to unger 's strip .


 1---->  an earlier trip in 2004 ran over the beacon line from danbury , <split> using beacon - was fl - 9 locomotives owned by <low_freq> .
 2---->  an earlier trip in 2004 ran over the beacon line from danbury . <split> the 2004 trip used fl - 9 locomotives owned by <low_freq> .


 1---->  an early 1940s style known as '' jumping the blues '' or jump blues used small combos , uptempo music and uptempo blues chord <low_freq> . <split> he blues used on the - <low_freq> , the 1930s .
 2---->  an early 1940s style known as '' jumping the blues '' or jump blues used small combos , uptempo music , and blues chord <low_freq> . <split> jump blues dre

 1---->  <low_freq> is <low_freq> taken to valdemar , where he recovers from his injuries . <split> he is informed by king 's own <low_freq> that he is informed by king 's own <low_freq> that he is now a herald trainee .
 2---->  <low_freq> is <low_freq> taken to valdemar , where he recovers from his injuries . <split> when he recovers , he is informed by king 's own <low_freq> that he is now a herald trainee .


 1---->  <low_freq> <low_freq> was born into a noble <low_freq> on january 14 , 1552 . <split> his birthplace was the town of san <low_freq> in the region marche in italy .
 2---->  <low_freq> <low_freq> was born into a noble <low_freq> on january 14 , 1552 . <split> his birthplace was the town of san <low_freq> in the region marche in italy .


 1---->  <low_freq> left a large quantity of manuscripts . <split> the <low_freq> of the political testament , published in his name at lausanne in 1753 , has been questioned .
 2---->  <low_freq> left a large quantity of manuscripts .

--------split model training sampling display--------
 1---->  it was adapted from his own novel and directed by emmanuel <low_freq> . <split> it is currently distributed in koch - <low_freq> films .
 2---->  it was adapted from his own novel and directed by emmanuel <low_freq> . <split> it is currently distributed by koch - <low_freq> films .


 1---->  it was added to the national register of historic places in 1973 . <low_freq> <low_freq> <low_freq> of . <split> it shares its <low_freq> romanesque architectural style with the colgate administration building .
 2---->  it was added to the national register of historic places in 1973 as the old biology hall . <split> it shares its <low_freq> romanesque architectural style with the colgate administration building .


 1---->  it was adopted ( along with irving berlin 's '' white christmas '' ) by american troops in europe in world war ii as a symbol of the united states . <split> garland even performed the song for american of troops a

 1---->  for <low_freq> and <low_freq> '' ( 1991 ) was a personal documentary about the japanese american internment , and won the distinguished achievement award from the international documentary association .
 2---->  for <low_freq> and <low_freq> '' ( 1992 ) was a personal documentary about the japanese american internment , and won the distinguished achievement award from the international documentary association .


 1---->  for apertura 2007 he returns to mexico for cf atlas with played the <low_freq> 2008 with cf atlas and qualified to copa libertadores 2008 .
 2---->  for apertura 2007 he returns to mexico for cf <low_freq> already played an <low_freq> tournament with cf atlas and qualified to copa libertadores 2008 .


 1---->  for arthur schopenhauer destiny was just a manifestation of the will to live , therefore the be at the same time living fate and choice of <low_freq> the fate same , by means of the art , of the morality and of the <low_freq> .
 2---->  for arthur scho

 1---->  beni - <low_freq> ( berber : <low_freq> <low_freq> , ) is the capital of the <low_freq> <low_freq> - <low_freq> region . <split> it has a population of <low_freq> ( 2010 census ) .
 2---->  beni - <low_freq> ( berber : <low_freq> <low_freq> , ) is a moroccan city . <split> it is the capital of the <low_freq> <low_freq> - <low_freq> region and has a population of <low_freq> ( 2010 census ) .


 1---->  <low_freq> of tokyo owned approximately 51 % of <low_freq> . <split> <low_freq> <low_freq> was having financial difficulties and its directors approved an issuance of $ 20 million of preferred stock .
 2---->  <low_freq> of tokyo owned approximately 51 % of <low_freq> . <split> <low_freq> was having financial difficulties and its directors approved an issuance of $ 20 million of preferred stock .


 1---->  benin city is home to some of nigeria 's institutions of higher learning . <split> the university of benin located at <low_freq> and <low_freq> , ambrose ali university .
 2--

info=[pre-trained_split_model-10per]-loss=0.337236971-bleu=0.6797-hidden_dim=256-input_dim=100-epoch=7-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.6797348114163994
info=[pre-trained_fusion_model-10per]-loss=0.243150190-bleu=0.7307-hidden_dim=256-input_dim=100-epoch=7-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.7306741782080973
--------split model training sampling display--------
 1---->  his participation at the montreal games sparked a heated debate . <split> in supposedly said he he hated french canadians ( montréal - <low_freq> newspaper , july 29 , 1976 , pages 5 and 8 ) .
 2---->  his participation at the montreal games sparked a heated debate . <split> he allegedly said that he hated french canadians ( montréal - <low_freq> newspaper , july 29 , 1976 , pages 5 and 8 ) .


 1---->  his participation in the volta a catalunya yielded more success , the and won second <low_freq> <low_freq> <low_freq> won <split> he did the sixth stage in the volta in a bike , the volta fin

 1---->  a stepfather or <low_freq> is the man of one 's biological mother , he he one 's biological father .
 2---->  a stepfather or <low_freq> is the husband of one 's biological mother , and not one 's biological father .


 1---->  a stepmother - in - law is the stepmother of one 's spouse and the wife of the 's <low_freq> - in - law and <low_freq> <low_freq> person say <low_freq> - in - law .
 2---->  a stepmother - in - law is the stepmother of one 's spouse and the wife of one 's father - in - law , and not one 's mother - in - law .


 1---->  a steward and member member of hick lane wesleyan chapel , he was also a lay preacher in the batley area , well being a liberal councillor for nine years helped his fellow cricketers as chairman of the cricketers benevolent fund .
 2---->  a steward and loyal member of hick lane wesleyan chapel , he was also a lay preacher in the batley area as well being a liberal councillor for nine <low_freq> helped his fellow cricketers as chairman o

 1---->  in 2009 , he was the assistant coach of team canada of team canada 's world junior gold medal team . <split> he was the assistant coach of team canada 's world junior gold medal team .
 2---->  in 2009 , he was the assistant coach of team canada 's world junior gold medal team . <split> under head coach pat quinn and alongside other assistant coaches guy boucher and dave cameron .


 1---->  in 2009 , his lawyer requested that more than 100 charges pending against <low_freq> be dropped against <low_freq> be dropped . <split> however , <low_freq> 's brother , ganga prasad <low_freq> , subsequently claimed to have cremated him in 1996 .
 2---->  in 2009 , his lawyer requested that more than 100 charges pending against <low_freq> be dropped claiming that <low_freq> died on saturday july 25 , 2009 . <split> however , <low_freq> 's brother , ganga prasad <low_freq> , subsequently claimed to have cremated him in 1996 .


 1---->  in 2009 , invited by the president of the european ce

--------split model training sampling display--------
 1---->  a progressively larger portion of the farmland was devoted to fodder production , and milk adjoining area increased until 2013 1920s removed <split> milk - and diary - related income was the most important source income for swedish agricultural business around the turn of the century .
 2---->  a progressively larger portion of the farmland was devoted to fodder production , and the farmed area increased until the 1920s . <split> milk - and diary - related income was the most important source income for swedish agricultural business around the turn of the century .


 1---->  a prohibited licence may only issued to qualifying businesses , and under rarely being qualifying businesses individuals . . <split> under , latter can only obtain the prohibited handguns .
 2---->  a prohibited licence can be issued to qualifying businesses , and very rarely to individuals with extraordinary circumstances . <split> however the latter 

 1---->  behind it , the rectangular cross-section fuselage was plywood - covered and flat - sided apart from curved <low_freq> with with two open cockpits , equipped with dual control , in tandem over the wing .
 2---->  behind it , the rectangular cross-section fuselage was plywood - covered and flat - sided apart from curved <low_freq> , with two open cockpits , equipped with dual control , in tandem over the wing .


 1---->  behind the blinds aka filmmaking 101 is an american comedy web series , by charles <low_freq> and produced by charles <low_freq> .
 2---->  behind the blinds aka filmmaking 101 is a us comedy web series written by charles <low_freq> and produced by anna <low_freq> .


 1---->  behind the indian textile tradition '' , published in 2007 , and has also several articles in indian journals .
 2---->  behind the indian textile tradition '' , published in 2007 , and has written several articles in indian journals .


 1---->  behind the line '' was an australian docu

 1---->  along with that he was able to stage several plays by danish authors . <split> working with an uppsala university student theatre group he was deeply involved in the translation and production of several plays by <low_freq> brecht .
 2---->  along with that he was able to stage several plays by danish authors . <split> working with an uppsala university student theatre group he was deeply involved in the translation and production of several plays by <low_freq> brecht .


 1---->  along with the '' <low_freq> '' , <low_freq> may have brought back from cologne the first manuscript of the '' cambridge songs '' to enter england . <split> this work was a collection of latin <low_freq> songs , and became famous in the middle ages .
 2---->  along with the '' <low_freq> '' , <low_freq> may have brought back from cologne the first manuscript of the '' cambridge songs '' to enter england . <split> this work was a collection of latin <low_freq> songs , and became famous in the middle a

--------split model training sampling display--------
 1---->  kyle l lawson ( born july 6 , 1986 in tucson . al . <split> he is the current mayor for the town of gene autry , oklahoma .
 2---->  kyle l lawson was born july 6 , 1986 in tucson , arizona . <split> kyle is the current mayor for the town of gene autry , oklahoma .


 1---->  kyle larson scored the pole for the race with a time of <low_freq> and a speed of after <split> he success of disqualified disqualified after nascar officials discovered an <low_freq> rear deck fin on his car .
 2---->  kyle larson scored the pole for the race with a time of <low_freq> and a speed of . <split> his time was later disqualified after nascar officials discovered an <low_freq> rear deck fin on his car .


 1---->  kyle <low_freq> is an australian professional mixed martial artist who has fights with the <low_freq> fighting . . <split> he has fought fought for <low_freq> and the australian fighting promotion spartan reality fight .
 2---->  

 1---->  he made the album '' end beginnings '' with <low_freq> <low_freq> in 1993 . <split> he worked on <low_freq> , an electro - dub project with <low_freq> members greg hunter and kris weston in 1998 .
 2---->  he made the album '' end beginnings '' with <low_freq> <low_freq> in 1993 , which led to a series of concerts in brazil . <split> in 1998 he worked on <low_freq> , an electro - dub project with <low_freq> members greg hunter and kris weston in 1998 .


 1---->  he made three guest appearances on '' perry mason '' <split> in 1962 . <split> he appeared as danny pierce in '' the case of the lonely <low_freq> '' .
 2---->  he made three guest appearances on '' perry mason '' . <split> in 1962 , he appeared as danny pierce in '' the case of the lonely <low_freq> '' .


 1---->  he made twenty starts in the 24 hours of le mans race , finishing in 8th place at the first attempt in 1962 . <split> he has raced in the renowned 24 hours of le mans twenty times , with a pole position an

--------split model training sampling display--------
 1---->  in the 15th century significant alterations were made to the church and major reconstruction commenced , with the eastern wing renovated . <split> larger windows were installed at this time to provide more light to the aisles .
 2---->  in the 15th century significant alterations were made to the church and major reconstruction commenced , with the eastern wing renovated . <split> larger windows were installed at this time to provide more light to the aisles .


 1---->  in the 15th century the <low_freq> state was at war with the aztecs . <split> <low_freq> of speakers were were relocated fleeing outside in the the speakers the fleeing of speakers relocated outside of the <low_freq> frontiers , whereas speakers of <low_freq> fleeing the aztec expansion <low_freq> on the border between the two <low_freq> .
 2---->  in the 15th century the <low_freq> state was at war with the aztecs . <split> many <low_freq> speakers who had

 1---->  cain and <low_freq> then fought and and a bear hug so tight that external force could affect them , cain forced <low_freq> into the god machine , caused the eight <low_freq> to be hurled off to different sections of the earth ending the struggle .
 2---->  cain and <low_freq> then fought locked in a bear hug so tight no external force could affect them and cain forced <low_freq> into the god machine which caused the eight <low_freq> to be hurled off to different sections of the earth ending the struggle .


 1---->  cain emigrated to the usa in 1985 where she worked at universal studios hollywood cashier before to a part with later became a director of operations studios theme parks , she was in charge of 200 - 300 people .
 2---->  cain emigrated to the usa in 1985 where she worked for universal studios , cain began as a cashier but later became a director of universal studios theme parks where she was in charge of 200 - 300 people .


 1---->  cain started his career playing

 1---->  he was with the band from its inception in 1969 until 1980 . <split> <split> played on their classic album '' argus '' and also contributed vocals to it .
 2---->  he was with the band from its inception in 1969 until 1980 . <split> he played on their classic album '' argus '' and also contributed vocals to it .


 1---->  he was with the band until 1978 , at the age of 18 years of age . <split> he left to pursue a career as a solo artist .
 2---->  he was with the band until 1978 , at which time he left to pursue a career as a solo artist . <split> he released a series of solo albums which saw modest success , primarily in germany and japan .


 1---->  he was working at nebraska book co. of lincoln , nebraska , when he met jack cole , the co-owner of a toronto book of the series . <split> he met jack cole , the co-owner of a toronto book business , coles .
 2---->  he was working at nebraska book co. of lincoln , nebraska , when he met jack cole , the co-owner of a toronto b

--------split model training sampling display--------
 1---->  brand agreed and wrote her first one act play . <split> a also to write a about comedy about the learning curves of first aid treatments . titled aid was first aid '' .
 2---->  brand agreed and wrote her first one act play . <split> she decided to make it a comedy about the learning curves of first aid treatments and entitled it '' first aid '' .


 1---->  brand arrived to mediate on 20 march and the a kruger and later to drop drop recently . <split> the was committed ; republic ; ; the british committed to formally restore the republic within six months .
 2---->  brand arrived to mediate on 20 march and persuaded both kruger and wood to drop certain points . <split> agreement was reached the following day ; the british committed to formally restore the republic within six months .


 1---->  <low_freq> wright ( born october 5 , in in nashville , tennessee ) is an american basketball basketball player . the north 's sacr

 1---->  for that article , editor warren <low_freq> paired thompson with illustrator ralph steadman , who drew to expressionist expressionist illustrations with lipstick and <low_freq> .
 2---->  for that article , editor warren <low_freq> paired thompson with illustrator ralph steadman , who claimed to drew expressionist illustrations with lipstick and <low_freq> .


 1---->  for that reason , a is often used in conjunction with computational fluid dynamics tools , are better able to account for geometric complexity , at the expense of being more limited in their treatment of the underlying chemistry of the reactive process being studied .
 2---->  for that reason , <low_freq> is often used in conjunction with computational fluid dynamics tools that are better able to account for geometric complexity , at the expense of being more limited in their treatment of the underlying chemistry of the reactive process being studied .


 1---->  for that reason , game data such as levels , grap

 1---->  his father worked as a petition writer in district <low_freq> , <low_freq> . <split> he died in 1899 when master ji was only 12 years old .
 2---->  his father worked as a petition writer in district <low_freq> , <low_freq> . <split> he died in 1899 when master ji was only 12 years old .


 1---->  his father worked for a long time trying to turn him back to normal practice . <split> after much experimenting , he was finally freed thanks to static , gear and <low_freq> .
 2---->  his father worked for a long time trying to turn him back to normal . <split> after much experimenting , he was finally freed thanks to static , gear and <low_freq> .


 1---->  his father worked irregularly as a day laborer and indulged in gambling . <split> <split> john gotti came to resent his father for being unable to provide for his family .
 2---->  his father worked irregularly as a day laborer and indulged in gambling . <split> as an adult , john gotti came to resent his father for being unab

--------split model training sampling display--------
 1---->  in five years , global sales of highway legal light - duty plug - in electric vehicles increased increased more than <low_freq> , totaling more than <low_freq> units in 2015 . <split> itself - in electric were 2015 - more <low_freq> % increase 2014 , driven mainly by china and europe .
 2---->  in five years , global sales of highway legal light - duty plug - in electric vehicles have increased more than <low_freq> , totaling more than <low_freq> units in 2015 . <split> plug - in sales in 2015 increased about 80 % from 2014 , driven mainly by china and europe .


 1---->  in fixed - wing aircraft driven by one or more jet engines , the performance of the jet engine is important to the operation of the jet . <split> the includes the jet engine includes measurement of thrust , fuel consumption , noise and engine emissions .
 2---->  in fixed - wing aircraft driven by one or more jet engines , the performance of the jet engine

info=[pre-trained_fusion_model-10per]-loss=0.235383555-bleu=0.7049-hidden_dim=256-input_dim=100-epoch=8-batch_size=100-batch_id=[1-[of]-989]-lr=0.0050 0.704863605498269
--------split model training sampling display--------
 1---->  in 1984 newham council voted to demolish ronan point . <split> all nine blocks on the estate , comprising 990 flats , were demolished and the and the area was redeveloped with 20 two - storey houses with gardens .
 2---->  in 1984 newham council voted to demolish ronan point . <split> all nine blocks on the estate , comprising 990 flats , were demolished in 1986 and the area was redeveloped with 20 two - storey houses with gardens .


 1---->  in 1984 an eight room permanent a a - pak was added to the school to ease some of the overcrowding . <split> through the 1980 's several new schools were built in the area to further some of the school 's enrollment pressures .
 2---->  in 1984 an eight room port - a - pak was added to the school to ease some of the ov

 1---->  he was deputy inspector - general of police , <low_freq> range , joint commissioner of police , chennai city . <split> following matriculation , chennai was dig of police , tiruchirapalli before becoming chief vigilance officer , <low_freq> , at <low_freq> .
 2---->  he was deputy inspector - general of police , <low_freq> range , joint commissioner of police , chennai city . <split> later , he was dig of police , tiruchirapalli before becoming chief vigilance officer , <low_freq> , at <low_freq> .


 1---->  he was editor - in - chief of the <low_freq> <low_freq> editorial board . <split> he also has a member of several editorial boards .
 2---->  he was editor - in - chief of the <low_freq> <low_freq> editorial board and also member of several editorial boards . <split> he is currently at the national universities commission .


 1---->  he was executive secretary of the irish catholic bishops ' conference from 1991 to 1997 . <split> he published a book on '' the irish <low_

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible part of this
# notebook, so evaluating it presumably raises NameError on purpose, halting
# "Run All" before the model-loading cell below -- confirm this is intended.
stop

In [None]:
# ---- Pre-trained language model -------------------------------------------
# The dimensions below have to agree with the checkpoint that is loaded; the
# active checkpoint's file name records hidden_dim=512 and input_dim=300.
use_cuda = 1
lm_input_dim = 300
lm_hidden_dim = 512

# Active checkpoint (hidden_dim=512).
model_path = './models_language_model/time-[2019-02-26-13-18-56]-info=[language_model]-loss=4.003012180-bleu=-1.0000-hidden_dim=512-input_dim=300-epoch=24-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'
# Alternative checkpoints (lm_hidden_dim must be changed to match):
#2048
# model_path = './models_language_model/time-[2019-02-28-07-04-08]-info=[language_model]-loss=3.475848675-bleu=-1.0000-hidden_dim=2048-input_dim=300-epoch=4-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'
#1024
# model_path = './models_language_model/time-[2019-02-27-21-58-23]-info=[language_model]-loss=4.111208439-bleu=-1.0000-hidden_dim=1024-input_dim=300-epoch=6-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'

language_model = LanguageModel(vocab = vocab,
                               input_dim = lm_input_dim,
                               hidden_dim = lm_hidden_dim,
                               use_cuda = use_cuda)

# Deserialize the weights onto the CPU first; the module is moved to the GPU
# afterwards only when use_cuda is set.
pre_train = torch.load(model_path, map_location='cpu')
language_model.load_state_dict(pre_train)

language_model = language_model.cuda() if use_cuda else language_model

# Put the language model in evaluation mode.
language_model.eval()

print('finish loading pre-train weight for language model.')



# ---- Split / fusion Seq2Seq models -----------------------------------------
# Hyperparameters; they have to agree with the checkpoints loaded below (the
# split checkpoint's file name records hidden_dim=256 and input_dim=100).
use_cuda = 1
hidden_dim = 256
input_dim = 100
lr=0.005

# Both models share the vocabulary and dimensions but are built with different
# max_length values (61 for the split model, 51 for the fusion model).
split_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 61)

fusion_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 51)

# Checkpoint selection.
# Earlier checkpoint pairs used to be assigned here and then immediately
# overwritten, so only the last pair was ever loaded. They are kept below as a
# commented history; to resume from one of them, uncomment that pair and
# comment out the active pair.
#pre train para
# split_model_path = './models_saved/time-[2019-03-10-01-36-10]-info=[pre-trained_split_model-20per]-loss=0.515495539-bleu=0.6774-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-01-36-12]-info=[pre-trained_fusion_model-20per]-loss=0.365494132-bleu=0.7406-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'

# split_model_path = './models_saved/time-[2019-03-10-05-52-44]-info=[pre-trained_split_model-20per]-loss=0.467645884-bleu=0.7270-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-05-52-48]-info=[pre-trained_fusion_model-20per]-loss=0.327692717-bleu=0.7558-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'

# split_model_path = './models_saved/time-[2019-03-10-13-23-10]-info=[pre-trained_split_model-20per]-loss=0.454687029-bleu=0.7130-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# Active pair: semi-supervised checkpoints; the split file name records
# epoch=0 and batch_id=1201.
split_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[split_model-semi]-total_loss=-0.002986051-rec_loss=0.017885875-lm_rewards=0.0102-bleu=0.7734-bleu_bs=0.6282-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1201-[of]-46585]-lr=0.0050-loss_ratio=0.4500'
fusion_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[fusion_model-semi]'

# Deserialize onto the CPU first; the modules are moved to the GPU afterwards
# only when use_cuda is set. `pre_train` is reused and ends up holding the
# fusion model's state dict.
pre_train = torch.load(split_model_path, map_location='cpu')
split_model.load_state_dict(pre_train)
pre_train = torch.load(fusion_model_path, map_location='cpu')
fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
    fusion_model = fusion_model.cuda()

# One Adam optimizer per model, built over the parameters that require
# gradients at this point.
split_optimizer = optim.Adam(filter(lambda p: p.requires_grad, split_model.parameters()), lr=lr)
fusion_optimizer = optim.Adam(filter(lambda p: p.requires_grad, fusion_model.parameters()), lr=lr)

# set_model_grad(fusion_model, False)

In [None]:
# ---- Semi-supervised training configuration --------------------------------
epochs = 10000

# Batch sizes: `batch_size` slices the unlabeled split training set,
# `sup_bsize` slices the supervised sets (see model_train).
batch_size = 17
sup_bsize = 35

# `topk` and `loss_ratio` are forwarded to train_using_reward; `topk` is also
# the beam width used for validation decoding.
# Pairings noted by the author: batch_size=35, topk=3  or  batch_size=17, topk=6
topk = 6
loss_ratio = 0.45

# BLEU trackers, initialised to the sentinel -1.
train_bleu_mean = train_bleu_max = -1

# The whole unlabeled set is used (the original divided its length by 1).
split_train_set_size = len(split_train_set_inputs)
# How many times the supervised set fits into the unlabeled one; not
# referenced elsewhere in the visible part of the notebook.
dataset_times = split_train_set_size // len(split_train_set_inputs_supervised)

# Wall-clock reference for the 'running time' report printed after training.
start_time = time.time()

def model_train(epoch: int, batch_size: int, train_set_size: int) -> None:
    """Run one pass over the unlabeled split training data, alternating model updates.

    Batches are counted by ``batch_id``:

    * even ``batch_id``: ``split_model`` gets one supervised step on a slice of
      the supervised set, then one step on ``total_loss`` returned by
      ``split_model.train_using_reward`` for the current unlabeled batch;
    * odd ``batch_id``: ``fusion_model`` gets one supervised step, then one
      step on the reconstruction loss returned by
      ``split_model.train_using_reward``, scaled by ``loss_ratio``.

    After each batch ``batch_id`` is incremented. Whenever the new value
    satisfies ``batch_id % 20 == 1`` five random training samples are decoded
    and printed next to their pseudo labels; when it also satisfies
    ``batch_id % 40 == 1`` validation BLEU is computed (teacher-forced and
    beam-search) and both models' state dicts are written to
    ``./models_saved/``.

    The function works entirely through notebook-level globals: the models
    (``split_model``, ``fusion_model``, ``language_model``), their optimizers,
    the training/validation lists, ``vocab``, and the settings ``sup_bsize``,
    ``topk``, ``loss_ratio``, ``hidden_dim``, ``input_dim`` and ``lr``.

    Args:
        epoch: Zero-based epoch index; used for the epoch-0 skip below and in
            the log/checkpoint name.
        batch_size: Number of unlabeled examples per reward-based step; also
            the size of the validation slice.
        train_set_size: Number of leading unlabeled examples to iterate over;
            a trailing partial batch is dropped by the range bounds.

    Returns:
        None.
    """
    batch_id = 0
    valid_bleu = 0
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        # Skip the first 1200 batches (ids 0..1199) of epoch 0 without training.
        # NOTE(review): presumably this resumes a run from a checkpoint saved at
        # batch 1201 -- confirm, and keep the constant in sync with the
        # checkpoint that is actually loaded.
        if batch_id<=1199 and epoch==0:
            batch_id+=1
            continue
#         now = int(round(time.time()*1000))
#         time_stamp = time.strftime(' --->  starting time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
#         print(time_stamp)

        #supervised learning
        # Even batches: supervised step for the split model only.
        if batch_id%2==0:
            set_model_grad(split_model, True)
            set_model_grad(fusion_model, False)
            split_optimizer.zero_grad()# clear accumulated gradients
            # Start of the supervised slice; the modulus keeps
            # sup_idx+sup_bsize inside the supervised set.
            sup_idx = (batch_id*sup_bsize)%(len(split_train_set_inputs_supervised)-1-sup_bsize)
            split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         torch.LongTensor(split_train_set_input_lens_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         labels=torch.LongTensor(split_train_set_labels_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         is_train=1, teaching_rate=1)
            split_loss=torch.mean(split_loss)
            split_loss.backward()#retain_graph=True)
            split_optimizer.step()

        # Odd batches: supervised step for the fusion model only.
        if batch_id%2==1:
            set_model_grad(fusion_model, True)
            set_model_grad(split_model, False)
            fusion_optimizer.zero_grad()# clear accumulated gradients
            # NOTE(review): the modulus uses the *split* supervised set's length
            # while the slices below index the fusion supervised lists;
            # presumably the two sets have equal length -- confirm.
            sup_idx = (batch_id*sup_bsize)%(len(split_train_set_inputs_supervised)-1-sup_bsize)
            fusion_loss, predicts = fusion_model.forward(torch.LongTensor(fusion_train_set_inputs_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         torch.LongTensor(fusion_train_set_input_lens_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         labels=torch.LongTensor(fusion_train_set_labels_supervised[sup_idx:sup_idx+sup_bsize]), 
                                         is_train=1, teaching_rate=1)
            fusion_loss = torch.mean(fusion_loss)
            fusion_loss.backward()#retain_graph=True)
            fusion_optimizer.step()


        #unsupervised learning
        # Even batches: update the split model on total_loss. The grad flags set
        # in the supervised step above are still in effect (split on, fusion off).
        if batch_id%2==0:
#             a=time.time()
            end_idx = start_idx + batch_size
            split_optimizer.zero_grad()# clear accumulated gradients
            total_loss, reconstruct_loss, rm_rewards, lm_rewards=split_model.train_using_reward(inputs=torch.LongTensor(split_train_set_inputs[start_idx:end_idx]), 
                                   input_lens=torch.LongTensor(split_train_set_input_lens[start_idx:end_idx]), 
                                   reconstruct_labels=torch.LongTensor(duplicate_reconstruct_labels(fusion_pseudo_train_set_labels[start_idx:end_idx],topk)), 
                                   reconstruct_model=fusion_model, 
                                   language_model=language_model, 
                                   topk=topk, loss_ratio=loss_ratio)
            # Reduced only for logging; the optimizer step uses total_loss.
            reconstruct_loss = torch.mean(reconstruct_loss)
            total_loss.backward()#retain_graph=True)
            split_optimizer.step()
#             print('split: all time: ', time.time()-a)
        # Odd batches: update the fusion model. The call still goes through
        # split_model.train_using_reward (with fusion_model passed as
        # reconstruct_model); only the scaled reconstruction loss is
        # back-propagated and only fusion_optimizer steps.
        if batch_id%2==1: 
#             a=time.time()
            end_idx = start_idx + batch_size
            fusion_optimizer.zero_grad()# clear accumulated gradients
            total_loss, reconstruct_loss, rm_rewards, lm_rewards=split_model.train_using_reward(inputs=torch.LongTensor(split_train_set_inputs[start_idx:end_idx]), 
                                   input_lens=torch.LongTensor(split_train_set_input_lens[start_idx:end_idx]), 
                                   reconstruct_labels=torch.LongTensor(duplicate_reconstruct_labels(fusion_pseudo_train_set_labels[start_idx:end_idx],topk)), 
                                   reconstruct_model=fusion_model, 
                                   language_model=language_model, 
                                   topk=topk, loss_ratio=loss_ratio)
            reconstruct_loss = loss_ratio*torch.mean(reconstruct_loss)
            reconstruct_loss.backward()#retain_graph=True)
            fusion_optimizer.step()
#             print('fusion: all time: ', time.time()-a)
        #update batch_id
        batch_id+=1
        #timestamp
#         now = int(round(time.time()*1000))
#         time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
#         print(time_stamp)

        torch.cuda.empty_cache()
        # Periodic sampling / logging. batch_id was just incremented, so this
        # fires right after an even-numbered (split-model) batch; total_loss,
        # reconstruct_loss and lm_rewards logged below therefore come from the
        # split branch.
        if batch_id%20==1:
            split_model.eval()
            fusion_model.eval()
            set_model_grad(split_model, False)
            set_model_grad(fusion_model, False)
            sample_num = 5
            rand_idx = random.randint(0, train_set_size-sample_num-1)

            print('--------split model training sampling display--------')
            #teacher forcing (teaching_rate=1) on a random training slice
            loss_, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[rand_idx:rand_idx+sample_num]), 
                                             torch.LongTensor(split_train_set_input_lens[rand_idx:rand_idx+sample_num]), 
                                             labels=torch.LongTensor(split_pseudo_train_set_labels[rand_idx:rand_idx+sample_num]), 
                                             is_train=1, teaching_rate=1)
            del loss_

            # Token ids -> words -> sentences, truncated at the first <eos>.
            predicts = batch_tokens_remove_eos(predicts, vocab)
            labels = batch_tokens_remove_eos(split_pseudo_train_set_labels[rand_idx:rand_idx+sample_num], vocab)

            predicts = batch_tokens2words(predicts, vocab)
            labels = batch_tokens2words(labels, vocab)

            predicts_sents = batch_words2sentence(predicts)
            labels_sents = batch_words2sentence(labels)

            # ' 1----> ' is the model prediction, ' 2----> ' the pseudo label.
            for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
                print(' 1----> ', predict_sent)
                print(' 2----> ', label_sent)
                print('\n')

            now = int(round(time.time()*1000))
            time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
            # NOTE(review): `.data[0]` on a scalar loss only works on old PyTorch
            # releases; 0.4+ deprecates it in favour of `.item()` -- confirm the
            # PyTorch version this notebook targets before upgrading.
            info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}'.format(
                              'split_model', total_loss.data[0], reconstruct_loss.data[0], lm_rewards, 
                            hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr)
            print(time_stamp, info_stamp)

            # Every second logging round: validation BLEU and checkpointing.
            if batch_id%40==1:
                #ground truth (teacher-forced) pass on the validation set
#                 rand_idx=random.randint(0, len(split_valid_set_inputs)-batch_size-1-1)
                # Fixed slice start instead of the random one above.
                # NOTE(review): presumably fixed so BLEU is comparable across
                # checkpoints -- confirm.
                rand_idx=2333
                loss_, predicts = split_model.forward(torch.LongTensor(split_valid_set_inputs[rand_idx:rand_idx+batch_size]), 
                                                 torch.LongTensor(split_valid_set_input_lens[rand_idx:rand_idx+batch_size]), 
                                                 labels=torch.LongTensor(split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size]), 
                                                 is_train=1, teaching_rate=1)
                del loss_
#                 predicts = batch_tokens_remove_eos(predicts, vocab)
#                 labels = batch_tokens_remove_eos(split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size], vocab)

#                 bleu_scores = batch_tokens_bleu(references=labels, candidates=predicts, smooth_epsilon=0.001)
                #split version
                bleu_scores = batch_tokens_bleu_split_version(references=split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size], 
                                                              candidates=predicts, smooth_epsilon=0.001, vocab=vocab)

                # Mean teacher-forced BLEU over the validation slice.
                valid_bleu = 0
                for x in bleu_scores:
                    valid_bleu+=x
                valid_bleu/=len(bleu_scores)

                #beam search
                dec_seqs, log_probs = split_model.dec.decode_topk_seqs(split_model.enc, 
                                                                       inputs=torch.LongTensor(split_valid_set_inputs[rand_idx:rand_idx+batch_size]), 
                                                                         input_lens=torch.LongTensor(split_valid_set_input_lens[rand_idx:rand_idx+batch_size]),
                                                                         topk=topk)
                # Keep every topk-th decoded sequence (indices 0, topk, 2*topk, ...).
                # NOTE(review): presumably the first of each group of topk is the
                # best hypothesis for one input -- confirm against decode_topk_seqs.
                predicts = []
                for ii in range(len(dec_seqs)):
                    if ii%topk==0:
                        predicts.append(dec_seqs[ii])

                bleu_scores = batch_tokens_bleu_split_version(references = split_pseudo_valid_set_labels[rand_idx:rand_idx+batch_size],
                                                             candidates = predicts,
                                                             smooth_epsilon=0.001,
                                                             vocab=vocab)
                # Mean beam-search BLEU over the same validation slice.
                valid_bleu_beam_search=0
                for x in bleu_scores:
                    valid_bleu_beam_search+=x
                valid_bleu_beam_search/=len(bleu_scores)


                # This string is both printed and used as the split checkpoint's
                # file name.
                info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-bleu={:1.4f}-bleu_bs={:1.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}-loss_ratio={:1.4f}'.format(
                              'split_model-semi', total_loss.data[0], reconstruct_loss.data[0], lm_rewards, valid_bleu, valid_bleu_beam_search, 
                            hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr, loss_ratio)

                print(info_stamp, valid_bleu, valid_bleu_beam_search)

                # Both checkpoints share one timestamp so the split/fusion pair
                # can be matched by file name later.
                now = int(round(time.time()*1000))
                time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))
                torch.save(split_model.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))
                torch.save(fusion_model.state_dict(), ''.join(['./models_saved/', time_stamp, 'info=[fusion_model-semi]']))
            # Restore training state; the next batch's supervised step re-selects
            # which model is trainable.
            set_model_grad(split_model, True)
            set_model_grad(fusion_model, True)
            split_model.train()
            fusion_model.train()
            torch.cuda.empty_cache()
# Drive the whole schedule: one model_train pass per epoch over the unlabeled
# split training set.
for epoch in range(epochs):
    model_train(epoch=epoch,
                batch_size=batch_size,
                train_set_size=split_train_set_size)

# Report the wall-clock duration, measured from start_time, in minutes.
print('running time: %.2f mins' % ((time.time() - start_time) / 60))

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible part of this
# notebook, so evaluating it presumably raises NameError on purpose, halting
# "Run All" after training and before the inspection cells -- confirm.
stop

In [None]:
# ---- Inspect beam-search output on the first training examples -------------
# NOTE: this rebinds the notebook-level `topk`, which model_train also reads;
# re-run the training configuration cell before training again.
sample_num = 2
topk = 20

# Decode `topk` candidate sequences for each of the first `sample_num` inputs.
predicts, log_probs = split_model.dec.decode_topk_seqs(
    split_model.enc,
    inputs=torch.LongTensor(split_train_set_inputs[:sample_num]),
    input_lens=torch.LongTensor(split_train_set_input_lens[:sample_num]),
    topk=topk)

# Predictions: token ids -> (cut at <eos>) -> words -> sentences.
predicts = batch_tokens_remove_eos(predicts, vocab)
predicts = batch_tokens2words(predicts, vocab)
predicts_sents = batch_words2sentence(predicts)

# Pseudo labels go through the same conversion.
labels = batch_tokens_remove_eos(split_pseudo_train_set_labels[:sample_num], vocab)
labels = batch_tokens2words(labels, vocab)
labels_sents = batch_words2sentence(labels)

# Print every candidate (' 1----> '); after each run of `topk` candidates print
# the matching label (' 2----> '). This assumes the decoder returns the
# candidates for one input consecutively -- TODO confirm.
for idx, sent in enumerate(predicts_sents):
    print(' 1----> ', sent)
    if (idx + 1) % topk == 0:
        print(' 2----> ', labels_sents[idx // topk])
        print('\n')

In [None]:
# copy_thres=1.0
# split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[0:sample_num]), 
#                                      torch.LongTensor(split_train_set_input_lens[0:sample_num]), 
#                                      labels=torch.LongTensor(split_pseudo_train_set_labels[0:sample_num]), 
#                                      is_train=1, teaching_rate=1)

# predicts = batch_tokens_remove_eos(predicts, vocab)
# labels = batch_tokens_remove_eos(split_pseudo_train_set_labels[0:sample_num], vocab)

# predicts = batch_tokens2words(predicts, vocab)
# labels = batch_tokens2words(labels, vocab)

# predicts_sents = batch_words2sentence(predicts)
# labels_sents = batch_words2sentence(labels)

# for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
#     print(' 1----> ', predict_sent)
#     print(' 2----> ', label_sent)
#     print('\n')

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible part of this
# notebook, so evaluating it presumably raises NameError on purpose, halting
# "Run All" at this point -- confirm this is intended.
stop