In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
import time
import copy

from Vocab import Vocab
from LanguageModel import LanguageModel

import torch
# Select CUDA device index 1 as the default GPU for this process.
# NOTE(review): the index is hard-coded; presumably this fails on hosts with
# fewer than two GPUs -- confirm before running elsewhere.
torch.cuda.set_device(1)

print('import over')

# Threshold compared against a fresh random.random() draw at every decoding
# step of Decoder.forward: when the draw is below it, that step uses the copy
# mechanism. random.random() is always < 1, so the value 1 enables the copy
# mechanism on every step.
copy_thres=1

import over


In [2]:
def batch_words2sentence(words_list):
    """Join every word list in `words_list` into one space-separated string.

    Returns a list with one sentence string per input word list, in order.
    """
    sentences = []
    for words in words_list:
        sentences.append(' '.join(words))
    return sentences
def batch_tokens2words(tokens_list, vocab):
    """Map every token id back to its word.

    Args:
        tokens_list: list of token-id lists.
        vocab: object whose `token2word` is indexable by token id.

    Returns:
        A list of word lists with the same nesting as `tokens_list`.
    """
    words_list = []
    for tokens in tokens_list:
        words_list.append([vocab.token2word[token] for token in tokens])
    return words_list

def batch_tokens_remove_eos(tokens_list, vocab):
    """Cut every token sequence at its first '<eos>' token.

    Args:
        tokens_list: list of token-id lists.
        vocab: object whose `word2token` maps '<eos>' to its token id.

    Returns:
        A new list of lists; each holds the tokens that precede the first
        '<eos>' (the '<eos>' itself and everything after it are dropped).
        Sequences without '<eos>' are copied unchanged.
    """
    def _until_eos(tokens):
        kept = []
        for token in tokens:
            if token == vocab.word2token['<eos>']:
                return kept
            kept.append(token)
        return kept

    return [_until_eos(tokens) for tokens in tokens_list]

def batch_tokens_bleu(references, candidates, smooth_epsilon=0.001):
    """Compute one sentence-level BLEU score per (reference, candidate) pair.

    Args:
        references: list of reference token lists.
        candidates: list of candidate token lists, paired with `references`
            by position.
        smooth_epsilon: epsilon handed to `SmoothingFunction(...).method1`.

    Returns:
        A list with one score per pair. A pair in which either side has fewer
        than 4 tokens scores 0 without calling `sentence_bleu`.
    """
    def _pair_bleu(ref, candidate):
        if len(ref) < 4 or len(candidate) < 4:
            return 0
        smoother = SmoothingFunction(epsilon=smooth_epsilon).method1
        return sentence_bleu([ref], candidate, smoothing_function=smoother)

    return [_pair_bleu(ref, candidate) for ref, candidate in zip(references, candidates)]

# Load the pickled vocabulary object into the notebook-level name `vocab`.
# The helpers in this cell read its `word2token` and `token2word` lookups.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('data_set/vocab.pk', 'rb') as f:
    vocab=pickle.load(f)

    
def seqs_split(seqs, vocab):
    """Split every token sequence into the parts before and after '<split>'.

    Each sequence is first cut at its first '<eos>' (via
    `batch_tokens_remove_eos`). Tokens before the first '<split>' go to the
    first sentence; every later token goes to the second one. '<split>' tokens
    themselves are dropped, including any repeated ones.

    Returns:
        (simple_sent1s, simple_sent2s): two lists of token lists, aligned with
        `seqs`. The second list holds an empty list when no '<split>' occurs.
    """
    simple_sent1s, simple_sent2s = [], []
    for seq in batch_tokens_remove_eos(seqs, vocab):
        halves = ([], [])
        target = 0  # index into `halves`; switches to 1 once '<split>' is seen
        for token in seq:
            if token == vocab.word2token['<split>']:
                target = 1
            else:
                halves[target].append(token)
        simple_sent1s.append(halves[0])
        simple_sent2s.append(halves[1])

    return simple_sent1s, simple_sent2s

def simple_sents_concat(simple_sent1s, simple_sent2s, vocab, max_length):
    """Join sentence pairs as `sent1 + ['<split>'] + sent2`, fixed to `max_length`.

    Unlike the previous implementation, the input lists are NOT modified: the
    old code aliased `simple_sent1s` and appended to / replaced the caller's
    own lists in place.

    Args:
        simple_sent1s: list of token lists (first sentences).
        simple_sent2s: list of token lists (second sentences), paired with
            `simple_sent1s` by position.
        vocab: object whose `word2token` maps '<split>' and '<padding>' to ids.
        max_length: length of every returned sequence.

    Returns:
        (simple_sents, simple_sent_lens): the joined sequences, truncated to
        `max_length` and right-padded with '<padding>', and the length of each
        sequence before padding (after truncation).

    Raises:
        ValueError: if the two input lists differ in length.
    """
    if len(simple_sent1s) != len(simple_sent2s):
        raise ValueError('simple_sent1s and simple_sent2s must have the same length, got %d and %d'
                         % (len(simple_sent1s), len(simple_sent2s)))

    split_id = vocab.word2token['<split>']
    pad_id = vocab.word2token['<padding>']

    simple_sents = []
    simple_sent_lens = []
    for first, second in zip(simple_sent1s, simple_sent2s):
        # The joined pair can exceed max_length, so it is truncated first and
        # the reported length is the truncated one.
        joined = (list(first) + [split_id] + list(second))[:max_length]
        simple_sent_lens.append(len(joined))
        joined.extend([pad_id] * (max_length - len(joined)))
        simple_sents.append(joined)

    return simple_sents, simple_sent_lens


def get_lm_inputs_and_labels(sents, vocab, max_length):
    """Build fixed-length language-model inputs and labels from token lists.

    For every sentence the input is `['<sos>'] + tokens` and the label is
    `tokens + ['<eos>']`, both right-padded with '<padding>' to `max_length`.
    Sentences with `max_length` or more tokens are first truncated to
    `max_length - 1` tokens so the special token still fits.

    The previous implementation truncated with `sent = sent[:max_length-1]`,
    which only rebound the loop variable: over-long sentences were returned
    untruncated and without '<sos>' / '<eos>', while the reported length
    claimed otherwise. Every returned row now has exactly `max_length` items.

    Args:
        sents: list of token-id lists; not modified.
        vocab: object whose `word2token` maps '<sos>', '<eos>' and '<padding>'.
        max_length: length of every returned row (expected to be >= 1).

    Returns:
        (lm_inputs, lm_input_lens, lm_labels), where `lm_input_lens[i]` is the
        unpadded length of `lm_inputs[i]` including '<sos>'.
    """
    sos_id = vocab.word2token['<sos>']
    eos_id = vocab.word2token['<eos>']
    pad_id = vocab.word2token['<padding>']

    lm_inputs = []
    lm_input_lens = []
    lm_labels = []
    for sent in sents:
        tokens = list(sent)
        if len(tokens) >= max_length:
            # leave one slot for '<sos>' (inputs) / '<eos>' (labels)
            tokens = tokens[:max_length - 1]
        padding = [pad_id] * (max_length - len(tokens) - 1)

        lm_inputs.append([sos_id] + tokens + padding)
        lm_input_lens.append(len(tokens) + 1)
        lm_labels.append(tokens + [eos_id] + padding)

    return lm_inputs, lm_input_lens, lm_labels


def duplicate_reconstruct_labels(sents, topk):
    """Repeat every element of `sents` `topk` times, keeping the order.

    Copies of one element are adjacent and refer to the same object (nothing
    is deep-copied), e.g. `[a, b]` with `topk=2` gives `[a, a, b, b]`.
    """
    repeated = []
    for sent in sents:
        repeated.extend([sent] * topk)
    return repeated


def batch_tokens_bleu_split_version(references, candidates, vocab, smooth_epsilon=0.001):
    """Score each sample as the mean BLEU of its two '<split>'-separated halves.

    References and candidates are both passed through `seqs_split`, which cuts
    each sequence at its first '<eos>' and splits it at '<split>', so callers
    do not need to strip '<eos>' first (unlike `batch_tokens_bleu`).

    `smooth_epsilon` is now forwarded to `batch_tokens_bleu`; it used to be
    accepted but ignored, so non-default values had no effect.

    Args:
        references: list of reference token lists.
        candidates: list of candidate token lists, paired by position.
        vocab: vocabulary object passed on to `seqs_split`.
        smooth_epsilon: smoothing epsilon used for both halves.

    Returns:
        A list with, per sample, the average of the BLEU score of the first
        halves and the BLEU score of the second halves.
    """
    ref1, ref2 = seqs_split(references, vocab)
    cand1, cand2 = seqs_split(candidates, vocab)
    bleu_simple_sent1s = batch_tokens_bleu(ref1, cand1, smooth_epsilon=smooth_epsilon)
    bleu_simple_sent2s = batch_tokens_bleu(ref2, cand2, smooth_epsilon=smooth_epsilon)
    return [(first + second) / 2
            for first, second in zip(bleu_simple_sent1s, bleu_simple_sent2s)]


def set_model_grad(model, is_grad):
    """Set `requires_grad` to `is_grad` on every parameter of `model`.

    Used to freeze (`is_grad` false) or unfreeze (`is_grad` true) a whole model.
    """
    for parameter in model.parameters():
        parameter.requires_grad = is_grad

In [3]:
# Smoke test of the helper functions on a few hand-written token sequences.
seqs=[[8,9,90,5,3,2,1], [5,8,9,90,5,3,2,1], [8,2,9,40,5,3,2,2,1], [8,9,90,5,3,2,1], [8,9,90]]
# a: tokens before '<split>', b: tokens after it (each sequence is cut at '<eos>' first)
a,b = seqs_split(seqs, vocab)

print(a)
print(b)

# Language-model inputs / lengths / labels for the first halves ...
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(a,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)
# ... and for the second halves.
lm_in, lm_in_lens, lm_labels=get_lm_inputs_and_labels(b,vocab, max_length=6)
print(lm_in)
print(lm_in_lens)
print(lm_labels)

# Re-join the halves with '<split>', truncated / padded to 3 tokens.
c,d=simple_sents_concat(a,b,vocab, 3)
print(c)
print(d)


# Last expression of the cell: its value is shown as the cell output.
batch_tokens_bleu([[1,2,3,4,5,6]], [[2,3,1,4,5]])

[[8, 9, 90], [], [8], [8, 9, 90], [8, 9, 90]]
[[3], [8, 9, 90, 3], [], [3], []]
[[0, 8, 9, 90, 1, 1], [0, 1, 1, 1, 1, 1], [0, 8, 1, 1, 1, 1], [0, 8, 9, 90, 1, 1], [0, 8, 9, 90, 1, 1]]
[4, 1, 2, 4, 4]
[[8, 9, 90, 2, 1, 1], [2, 1, 1, 1, 1, 1], [8, 2, 1, 1, 1, 1], [8, 9, 90, 2, 1, 1], [8, 9, 90, 2, 1, 1]]
[[0, 3, 1, 1, 1, 1], [0, 8, 9, 90, 3, 1], [0, 1, 1, 1, 1, 1], [0, 3, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1]]
[2, 5, 1, 2, 1]
[[3, 2, 1, 1, 1, 1], [8, 9, 90, 3, 2, 1], [2, 1, 1, 1, 1, 1], [3, 2, 1, 1, 1, 1], [2, 1, 1, 1, 1, 1]]
[[8, 9, 90], [5, 8, 9], [8, 5, 1], [8, 9, 90], [8, 9, 90]]
[3, 3, 2, 3, 3]


[0.013910597740964967]

In [4]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`.

    NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


#fusion data set

fusion_pseudo_train_set_inputs = _load_pickle('./data_set2/fusion_data_set/train_pseudo_simple_sents.pk')
fusion_pseudo_train_set_input_lens = _load_pickle('./data_set2/fusion_data_set/train_pseudo_simple_sent_lens.pk')
fusion_pseudo_train_set_labels = _load_pickle('./data_set2/fusion_data_set/train_pseudo_labels.pk')
fusion_train_set_inputs_supervised = _load_pickle('./data_set2/fusion_data_set/train_simple_sents_supervised.pk')
fusion_train_set_input_lens_supervised = _load_pickle('./data_set2/fusion_data_set/train_simple_sent_lens_supervised.pk')
fusion_train_set_labels_supervised = _load_pickle('./data_set2/fusion_data_set/train_labels_supervised.pk')

fusion_pseudo_valid_set_inputs = _load_pickle('./data_set2/fusion_data_set/validation_simple_sents.pk')
fusion_pseudo_valid_set_input_lens = _load_pickle('./data_set2/fusion_data_set/validation_simple_sent_lens.pk')
fusion_pseudo_valid_set_labels = _load_pickle('./data_set2/fusion_data_set/validation_labels.pk')


#split data set

split_train_set_inputs = _load_pickle('./data_set2/split_data_set/train_complex_sents.pk')
split_train_set_input_lens = _load_pickle('./data_set2/split_data_set/train_complex_sent_lens.pk')
split_pseudo_train_set_labels = _load_pickle('./data_set2/split_data_set/train_pseudo_labels.pk')
split_train_set_inputs_supervised = _load_pickle('./data_set2/split_data_set/train_complex_sents_supervised.pk')
split_train_set_input_lens_supervised = _load_pickle('./data_set2/split_data_set/train_complex_sent_lens_supervised.pk')
split_train_set_labels_supervised = _load_pickle('./data_set2/split_data_set/train_labels_supervised.pk')

split_valid_set_inputs = _load_pickle('./data_set2/split_data_set/validation_complex_sents.pk')
split_valid_set_input_lens = _load_pickle('./data_set2/split_data_set/validation_complex_sent_lens.pk')
split_pseudo_valid_set_labels = _load_pickle('./data_set2/split_data_set/validation_labels.pk')


In [5]:
# Sanity check: inputs, input lengths and labels of each data set are printed
# side by side so mismatching sample counts are easy to spot.
for _group in (
    (split_train_set_inputs, split_train_set_input_lens, split_pseudo_train_set_labels),
    (fusion_pseudo_train_set_inputs, fusion_pseudo_train_set_input_lens, fusion_pseudo_train_set_labels),
    (split_train_set_inputs_supervised, split_train_set_input_lens_supervised, split_train_set_labels_supervised),
    (fusion_train_set_inputs_supervised, fusion_train_set_input_lens_supervised, fusion_train_set_labels_supervised),
):
    print(*map(len, _group))

791956 791956 791956
791956 791956 791956
197988 197988 197988
197988 197988 197988


In [6]:
class Encoder(nn.Module):
    """Single-layer bidirectional LSTM encoder over pre-trained token embeddings.

    The embedding table is initialised from
    'data_set/pre_trained_token_embedding.pk' and stays trainable.
    """

    def __init__(self, use_cuda, hidden_dim, input_dim, vocab):
        """
        Args:
            use_cuda: when true, inputs and index tensors are moved to the GPU.
            hidden_dim: hidden size of each LSTM direction.
            input_dim: embedding size (LSTM input size).
            vocab: object whose `word2token` length gives the vocabulary size.
        """
        super(Encoder, self).__init__()

        self.use_cuda = use_cuda
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab

        self.lstm = torch.nn.LSTM(input_size=self.input_dim,
                                  hidden_size=self.hidden_dim,
                                  bidirectional=True,
                                  batch_first=True)

        # Embedding table, overwritten with the pre-trained vectors.
        self.embed = nn.Embedding(len(self.vocab.word2token), input_dim)
        with open('data_set/pre_trained_token_embedding.pk', 'rb') as f:
            pre_train_word_embedding = pickle.load(f)
        self.embed.weight.data.copy_(torch.FloatTensor(pre_train_word_embedding))

    def _index_var(self, ids):
        """Wrap an index tensor in a Variable, on the GPU when `use_cuda` is set."""
        index = Variable(ids)
        return index.cuda() if self.use_cuda else index

    def order(self, inputs, inputs_len):
        """Sort the batch rows by descending length.

        Args:
            inputs: batch of token-id rows (first dimension is the batch).
            inputs_len: 1-D tensor with the length of every row.

        Returns:
            (sorted inputs, sorted lengths, indices that restore the original
            row order when used with `index_select(0, ...)`).
        """
        sorted_lens, perm = torch.sort(inputs_len, dim=0, descending=True)
        sorted_inputs = inputs.index_select(0, self._index_var(perm))
        # argsort of the permutation is its inverse
        _, restore_ids = torch.sort(perm, dim=0, descending=False)
        return sorted_inputs, sorted_lens, restore_ids

    def forward(self, inputs, inputs_len):
        """Encode a padded batch.

        Returns:
            (outputs, (hn, cn)) in the caller's original row order. `outputs`
            is batch-first; `hn` / `cn` are the final states of the two
            directions concatenated along dim 1.
        """
        inputs = Variable(inputs)
        if self.use_cuda:
            inputs = inputs.cuda()

        # pack_padded_sequence is fed rows sorted by decreasing length
        sorted_inputs, sorted_lens, restore_ids = self.order(inputs, inputs_len)
        packed = rnn_utils.pack_padded_sequence(input=self.embed(sorted_inputs),
                                                lengths=sorted_lens.tolist(),
                                                batch_first=True)

        packed_outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = rnn_utils.pad_packed_sequence(packed_outputs, batch_first=True)

        # concatenate the final states of the two directions
        hn = torch.cat((hn[0], hn[1]), dim=1)
        cn = torch.cat((cn[0], cn[1]), dim=1)

        # undo the length sort so rows line up with the caller's batch again
        restore = self._index_var(restore_ids)
        outputs = outputs.index_select(0, restore)
        hn = hn.index_select(0, restore)
        cn = cn.index_select(0, restore)

        return outputs, (hn, cn)

In [7]:
def _inflate(tensor, times, dim):
    """
    Examples::
        >> a = torch.LongTensor([[1, 2], [3, 4]])
        >> a
        1   2
        3   4
        [torch.LongTensor of size 2x2]
        >> b = ._inflate(a, 2, dim=1)
        >> b
        1   2   1   2
        3   4   3   4
        [torch.LongTensor of size 2x4]
    """
    repeat_dims = [1] * tensor.dim()
    repeat_dims[dim] = times
    return tensor.repeat(*repeat_dims)

class Decoder(nn.Module):
    def __init__(self, use_cuda, encoder, hidden_dim, max_length=25):
        super(Decoder, self).__init__()
        
        self.use_cuda = use_cuda
        self.hidden_dim=hidden_dim
        self.input_dim = encoder.input_dim
        self.max_length = max_length
        self.vocab = encoder.vocab
        self.weight = [1]*len(self.vocab.word2token)
        self.weight[self.vocab.word2token['<padding>']]=0
        #self.weight[self.vocab.word2token['<eos>']]=1.01
        #self.weight[self.vocab.word2token['<split>']]=1.01
        
        self.hidden_size = self.hidden_dim
        self.V = len(self.vocab.word2token)
        self.SOS = self.vocab.word2token['<sos>']
        self.EOS = self.vocab.word2token['<eos>']
        self.log_softmax = nn.LogSoftmax(dim=1)
        
        self.lstmcell = torch.nn.LSTMCell(input_size=self.input_dim, hidden_size=self.hidden_dim*2, bias=True)
        
        #embedding
        self.embed=encoder.embed# reference share
        #fcnn: projection for crossentroy loss
        self.fcnn = nn.Linear(in_features = self.hidden_dim*2, out_features = len(self.vocab.word2token))
        
        self.softmax = nn.Softmax(dim=1)
        self.cost_func = nn.CrossEntropyLoss(weight=torch.Tensor(self.weight), reduce=False)
        self.nll_loss = nn.NLLLoss(weight=torch.Tensor(self.weight), reduce=False)

        print('init lookup embedding matrix size: ', self.embed.weight.data.size())
        
        #copy
        out_features_dim=self.hidden_dim
        self.attent_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 0)
        self.attent_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = out_features_dim, bias = 1)
        self.tanh = nn.Tanh()
        self.attent_vt = nn.Linear(in_features = out_features_dim, out_features = 1, bias=0)
        
        self.prob_wh = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_ws = nn.Linear(in_features = self.hidden_dim*2, out_features = 1, bias=0)
        self.prob_wx = nn.Linear(in_features = self.input_dim, out_features = 1, bias=1)
        self.sigmoid = nn.Sigmoid()
        
    def copy_mechanism(self, enc_outputs, this_timestep_input, dec_state, inputs_one_hot):
        batch_size = enc_outputs.size(dim = 0)
        
        wh = self.attent_wh(enc_outputs)
        ws = self.attent_ws(dec_state).unsqueeze(dim=1)
#         print('wh, ws size: ', wh.size(), ws.size())
        ws = ws.expand(ws.size(0), wh.size(1), ws.size(2))
#         print('ws size: ', ws.size())
        weight = self.attent_vt(self.tanh(wh+ws))
#         print('weight size: ', weight.size())
        weight = self.softmax(weight.squeeze(dim=2))
#         print('weight size: ', weight.size())
        context_v = torch.bmm(weight.unsqueeze(dim=1), enc_outputs)
#         print('context_v size: ', context_v.size())
        context_v = context_v.squeeze(dim=1)
        
        p_wh = self.prob_wh(context_v)
        p_ws = self.prob_ws(dec_state)
        p_wx = self.prob_wx(this_timestep_input)
        if_copy = self.sigmoid(p_wh+p_ws+p_wx)
#         if_copy = 0.3*if_copy
#         if_copy = self._tocuda(Variable(torch.ones(batch_size, 1), requires_grad=0))
#         print('if_copy size: ', if_copy.size())
        
        prob_copy = torch.bmm(inputs_one_hot, weight.unsqueeze(dim=2))
        prob_copy = prob_copy.squeeze(dim=2)
#         prob_copy = self._tocuda(Variable(torch.rand(batch_size, len(self.vocab.word2token)), requires_grad=0))
#         prob_copy = self.softmax(prob_copy)

#         print('prob_copy size: ', prob_copy.size())
#         print(torch.sum(prob_copy, dim=1))
#         print(torch.mean(if_copy))
        
#         if random.random()<0.005:
#             print('if_copy mean: ', torch.mean(if_copy))
#             _, max_ids = torch.max(prob_copy, dim=1)
#             print(self.vocab.token2word[max_ids.data[0]], self.vocab.token2word[max_ids.data[1]], self.vocab.token2word[max_ids.data[2]])
            
            
        return if_copy, prob_copy

    def forward(self, enc_outputs, sent_lens, h0_and_c0, labels, inputs, teaching_rate=0.6, is_train=1):
        labels = Variable(labels)
        if self.use_cuda:
            labels = labels.cuda()

        all_loss = 0
        predicts = []
        max_probs=[]
        batch_size = enc_outputs.size(dim = 0)
        final_hidden_states = h0_and_c0[0]
#         print('enc_outputs size:', enc_outputs.size())

        sents_len = enc_outputs.size(1)
        inputs = inputs[:,:sents_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, sents_len, len(self.vocab.word2token)).zero_()
        one_hot.scatter_(2, inputs, 1)
        one_hot = one_hot.transpose(1,2)
        one_hot = self._tocuda(Variable(one_hot, requires_grad = 0))
#         print('one_hot size: ', one_hot.size())
        
        for ii in range(self.max_length):
            if ii==0:
                zero_timestep_input = Variable(torch.LongTensor([self.vocab.word2token['<sos>']]*batch_size))
                if self.use_cuda:
                    zero_timestep_input = zero_timestep_input.cuda()
                    
                zero_timestep_input = self.embed(zero_timestep_input)#size: batch_size * self.input_dim

                last_timestep_hidden_state,cx = self.lstmcell(zero_timestep_input, h0_and_c0)
                #print('last_timestep_hidden_state: ', last_timestep_hidden_state.size(), cx.size())
                
                
                logits = self.fcnn(last_timestep_hidden_state)
                
                #copy or not
                copy_control=random.random()
                if copy_control<copy_thres:
                    if_copy, prob_copy = self.copy_mechanism(enc_outputs=enc_outputs, this_timestep_input=zero_timestep_input, 
                                                            dec_state = last_timestep_hidden_state, inputs_one_hot = one_hot)
                    score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
                    score = torch.clamp(score, min=10**(-30), max=1)
                
                #for saving time: no training, no loss calculating
                if is_train:
                    if copy_control<copy_thres:
                        loss = self.nll_loss(torch.log(score), labels[:,0])
                    else:
                        loss = self.cost_func(logits, labels[:,0])
                    all_loss+=loss
                
                #get predicts
                if copy_control<copy_thres:
                    _, max_idxs = torch.max(score, dim=1)
                else:
                    _, max_idxs = torch.max(logits, dim=1)
                predicts.append(torch.unsqueeze(max_idxs, dim=0))
                
                
            else:
                if is_train:
                    rand = random.random()
                    if rand<teaching_rate:
                        this_timestep_input = self.embed(labels[:,ii-1])#label teaching, lookup embedding
                    else:
                        this_timestep_input = self.embed(max_idxs)#last_timestep output, and then look up word embedding
                else:
                    this_timestep_input = self.embed(max_idxs)#last_timestep output, and then look up word embedding
                    
                last_timestep_hidden_state ,cx = self.lstmcell(this_timestep_input, (last_timestep_hidden_state,cx))
                
                
                logits = self.fcnn(last_timestep_hidden_state)
                
                #copy or not
                copy_control=random.random()
                if copy_control<copy_thres:
                    if_copy, prob_copy = self.copy_mechanism(enc_outputs=enc_outputs, this_timestep_input=this_timestep_input, 
                                                            dec_state = last_timestep_hidden_state, inputs_one_hot = one_hot)
                    score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
                    score = torch.clamp(score, min=10**(-30), max=1)
                
                #for saving time: no training, no loss calculating
                if is_train:
                    if copy_control<copy_thres:
                        loss = self.nll_loss(torch.log(score), labels[:,ii])
                    else:
                        loss = self.cost_func(logits, labels[:,ii])
                    all_loss+=loss
                
                #get predicts
                if copy_control<copy_thres:
                    _, max_idxs = torch.max(score, dim=1)
                else:
                    _, max_idxs = torch.max(logits, dim=1)
                predicts.append(torch.unsqueeze(max_idxs, dim=0))
                
        predicts = torch.cat(predicts, dim=0)
        predicts = torch.transpose(predicts, 0, 1)
    
        if is_train:  #training
#             all_loss = torch.cat(all_loss, dim=1)
#             all_loss = torch.mean(all_loss, dim=1)
#             loss = torch.mean(all_loss)
            loss = all_loss/self.max_length
    
            #print('loss size: ', loss.size())
            #torch.cuda.empty_cache()
            if self.use_cuda:
                return loss, predicts.data.cpu().tolist()
            else:
                return loss, predicts.data.tolist()
        else:   #testing
            if self.use_cuda:
                return predicts.data.cpu().tolist()
            else:
                return predicts.data.tolist()
#         if is_train:  #training
#             if self.use_cuda:
#                 return all_loss/(self.max_length+1), predicts.data.cpu().numpy()
#             else:
#                 return all_loss/(self.max_length+1), predicts.data.numpy()
#         else:   #testing
#             if self.use_cuda:
#                 return predicts.data.cpu().numpy()
#             else:
#                 return predicts.data.numpy()
    
    
    def decode_topk_seqs(self, encoder, inputs, input_lens, topk=3):
        enc_outputs, (enc_hn, enc_cn) = encoder(inputs, input_lens)
        batch_size = enc_outputs.size(dim = 0)
        
        #one hot of inputs
        sents_len = enc_outputs.size(1)
        inputs = inputs[:,:sents_len].unsqueeze(dim=2)
        one_hot = torch.FloatTensor(batch_size, sents_len, len(self.vocab.word2token)).zero_()
        one_hot.scatter_(2, inputs, 1)
        one_hot = one_hot.transpose(1,2)
        one_hot = self._tocuda(Variable(one_hot, requires_grad = 0))
        
        metadata = self.decode_by_beamsearch(encoder_hidden=(enc_hn, enc_cn), encoder_outputs=enc_outputs, inputs_one_hot=one_hot,topk = topk)
        results = metadata['topk_sequence']
        results =torch.cat(results, dim = 2)
        results=results.view(batch_size*topk, -1)
        if self.use_cuda:
            results = results.data.cpu().tolist()
        else:
            results = results.data.tolist()
#         results=batch_tokens_remove_eos(results, self.vocab)

#         labels = [x for x in labels for ii in range(topk)]
#         labels = batch_tokens_remove_eos(labels, self.vocab)
#         bleu_scores = batch_tokens_bleu(references=labels, candidates=results, smooth_epsilon=0.01)
        
#         bleu_scores = torch.FloatTensor(bleu_scores).view(batch_size, topk)
#         bleu_max, _ = torch.max(bleu_scores, dim=1)
        
#         bleu_mean = torch.mean(bleu_scores, dim=1).unsqueeze(dim=1)
#         bleu_scores = bleu_scores-bleu_mean
#         bleu_scores = bleu_scores.view(-1)
        
#         bleu_scores = self._tocuda(Variable(bleu_scores, requires_grad = 0))
#         log_probs = metadata['score']
#         log_probs = log_probs.view(batch_size*topk)
#         loss = -torch.dot(log_probs, bleu_scores)/batch_size/topk
#         return loss, results, torch.mean(bleu_mean.squeeze()), torch.mean(bleu_max)

        log_probs = metadata['score']
        log_probs = log_probs.view(batch_size*topk)
        
        return results, log_probs
        
        
        
    def _tocuda(self, var):
        if self.use_cuda:
            return var.cuda()
        else:
            return var
    def decode_by_beamsearch(self, encoder_hidden=None, encoder_outputs=None, inputs_one_hot=None, topk = 10):
        self.k = topk
        batch_size = encoder_outputs.size(dim=0)
        
        self.pos_index = self._tocuda(Variable(torch.LongTensor(range(batch_size)) * self.k).view(-1, 1))

        hidden = tuple([_inflate(h, self.k, 1).view(batch_size*self.k, -1) for h in encoder_hidden])
        #print('hidden0 size: (%s, %s)'%(hidden[0].size(), hidden[1].size()))

        encoder_outputs = _inflate(encoder_outputs, self.k, 1).view(batch_size*self.k, encoder_outputs.size(1), encoder_outputs.size(2))
        inputs_one_hot = _inflate(inputs_one_hot, self.k, 1).view(batch_size*self.k, inputs_one_hot.size(1), inputs_one_hot.size(2))
        
        # Initialize the scores; for the first step,
        # ignore the inflated copies to avoid duplicate entries in the top k
        sequence_scores = torch.Tensor(batch_size * self.k, 1)
        sequence_scores.fill_(-float('Inf'))
        sequence_scores.index_fill_(0, torch.LongTensor([i * self.k for i in range(0, batch_size)]), 0.0)
        sequence_scores = self._tocuda(Variable(sequence_scores))

        # Initialize the input vector
        input_var = self._tocuda(Variable(torch.LongTensor([self.SOS] * batch_size * self.k)))

        # Store decisions for backtracking
        stored_outputs = list()
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()
        stored_hidden = list()

        for ii in range(0, self.max_length):
            # Run the RNN one step forward
            #print('setp: %s'%ii)
            input_vec = self.embed(input_var)
            #print('input_var and input_vec size: ', input_var.size(), input_vec.size())
            hidden = self.lstmcell(input_vec, hidden)
            #print('hidden size: (%s, %s)'%(hidden[0].size(), hidden[1].size()))
            
            #log_softmax_output = self.log_softmax(self.fcnn(hidden[0]))
            
            logits = self.fcnn(hidden[0])
#             print('logits size', logits.size())
#             print(encoder_outputs.size())
#             print(input_vec.size())
#             print(hidden[0].size())
#             print(inputs_one_hot.size())
            if_copy, prob_copy = self.copy_mechanism(enc_outputs=encoder_outputs, this_timestep_input=input_vec.squeeze(dim=1), 
                                                            dec_state = hidden[0], inputs_one_hot = inputs_one_hot)
#             print('if_copy size', if_copy.size(), 'prob_copy size', prob_copy.size())
            
            score = (1-if_copy)*self.softmax(logits)+if_copy*prob_copy
            score = torch.clamp(score, min=10**(-30), max=1)
#             print('score size: ', score.size())

            # To get the full sequence scores for the new candidates, add the local scores for t_i to the predecessor scores for t_(i-1)
            sequence_scores = _inflate(sequence_scores, self.V, 1)
            sequence_scores += torch.log(score).squeeze(1)
            scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            input_var = (candidates % self.V).view(batch_size * self.k, 1)
            sequence_scores = scores.view(batch_size * self.k, 1)

            # Update fields for next timestep
            predecessors = (candidates / self.V + self.pos_index.expand_as(candidates)).view(batch_size * self.k, 1)
            if isinstance(hidden, tuple):
                hidden = tuple([h.index_select(0, predecessors.squeeze()) for h in hidden])
            else:
                hidden = hidden.index_select(0, predecessors.squeeze())

            # Update sequence scores and erase scores for end-of-sentence symbol so that they aren't expanded
            stored_scores.append(sequence_scores.clone())
            eos_indices = input_var.data.eq(self.EOS)
            if eos_indices.nonzero().dim() > 0:
                sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            stored_emitted_symbols.append(input_var)
#             stored_hidden.append(hidden)

        # Do backtracking to return the optimal values
        output, h_t, h_n, s, l, p = self._backtrack(hidden,
                                                    stored_predecessors, stored_emitted_symbols,
                                                    stored_scores, batch_size, self.hidden_size)

        metadata = {}

        metadata['score'] = s
        metadata['topk_length'] = l
        metadata['topk_sequence'] = p
        metadata['length'] = [seq_len[0] for seq_len in l]
        metadata['sequence'] = [seq[0] for seq in p]
        
#         torch.cuda.empty_cache()
        
        return metadata

    def _backtrack(self, hidden, predecessors, symbols, scores, b, hidden_size):
        """Backtracks over batch to generate optimal k-sequences.

        Args:
            nw_output [(batch*k, vocab_size)] * sequence_length: A Tensor of outputs from network
            nw_hidden [(num_layers, batch*k, hidden_size)] * sequence_length: A Tensor of hidden states from network
            predecessors [(batch*k)] * sequence_length: A Tensor of predecessors
            symbols [(batch*k)] * sequence_length: A Tensor of predicted tokens
            scores [(batch*k)] * sequence_length: A Tensor containing sequence scores for every token t = [0, ... , seq_len - 1]
            b: Size of the batch
            hidden_size: Size of the hidden state

        Returns:
            output [(batch, k, vocab_size)] * sequence_length: A list of the output probabilities (p_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_t [(batch, k, hidden_size)] * sequence_length: A list containing the output features (h_n)
            from the last layer of the RNN, for every n = [0, ... , seq_len - 1]

            h_n(batch, k, hidden_size): A Tensor containing the last hidden state for all top-k sequences.

            score [batch, k]: A list containing the final scores for all top-k sequences

            length [batch, k]: A list specifying the length of each sequence in the top-k candidates

            p (batch, k, sequence_len): A Tensor containing predicted sequence
        """

        lstm = isinstance(hidden, tuple)

        # initialize return variables given different types
        output = list()
        h_t = list()
        p = list()
        # Placeholder for last hidden state of top-k sequences.
        # If a (top-k) sequence ends early in decoding, `h_n` contains
        # its hidden state when it sees EOS.  Otherwise, `h_n` contains
        # the last hidden state of decoding.
        if lstm:
            state_size = hidden[0].size()
            h_n = tuple([torch.zeros(state_size), torch.zeros(state_size)])
        else:
            h_n = torch.zeros(nw_hidden[0].size())
        l = [[self.max_length] * self.k for _ in range(b)]  # Placeholder for lengths of top-k sequences
                                                                # Similar to `h_n`

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * b   # the number of EOS found
                                    # in the backward loop below for each batch

        t = self.max_length - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add self.pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = symbols[t].index_select(0, t_predecessors)
            # Re-order the back pointer of the previous step with the back pointer of
            # the current step
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze()

            # This tricky block handles dropped sequences that see EOS earlier.
            # The basic idea is summarized below:
            #
            #   Terms:
            #       Ended sequences = sequences that see EOS early and dropped
            #       Survived sequences = sequences in the last step of the beams
            #
            #       Although the ended sequences are dropped during decoding,
            #   their generated symbols and complete backtracking information are still
            #   in the backtracking variables.
            #   For each batch, everytime we see an EOS in the backtracking process,
            #       1. If there is survived sequences in the return variables, replace
            #       the one with the lowest survived sequence score with the new ended
            #       sequences
            #       2. Otherwise, replace the ended sequence with the lowest sequence
            #       score with the new ended sequence
            #
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            if eos_indices.dim() > 0:
                for i in range(eos_indices.size(0)-1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / self.k)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * self.k + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = predecessors[t][idx[0]]

                    current_symbol[res_idx, :] = symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = scores[t][idx[0]]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(self.k)
        for b_idx in range(b):
            l[b_idx] = [l[b_idx][k_idx.data[0]] for k_idx in re_sorted_idx[b_idx,:]]

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
#         output = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(output)]
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(p)]
        #    --- fake output ---
        output = None
        #    --- fake ---
        return output, h_t, h_n, s, l, p

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
            score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)

In [8]:
class Seq2Seq(nn.Module):
    """Encoder/decoder pair with a supervised forward pass and a reward-weighted loss.

    The same class is instantiated for both the "split" and the "fusion"
    model of this notebook; `train_using_reward` uses a second `Seq2Seq`
    (the reconstruct model) and a language model to score decoded candidates.
    """

    def __init__(self, use_cuda, input_dim, hidden_dim, vocab, max_length = 25):
        """Build the encoder and decoder and move them to the GPU when `use_cuda` is truthy."""
        super(Seq2Seq, self).__init__()

        self.use_cuda = use_cuda
        self.enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
        self.dec = Decoder(use_cuda=use_cuda, encoder=self.enc, hidden_dim=hidden_dim, max_length=max_length)
        if use_cuda:
            self.enc = self.enc.cuda()
            self.dec = self.dec.cuda()

    def forward(self, inputs, input_lens, labels, is_train=1, teaching_rate=1):
        """Encode `inputs` and run the decoder against `labels`.

        Args:
            inputs: token ids, anything accepted by `torch.LongTensor`.
            input_lens: per-sample lengths, anything accepted by `torch.LongTensor`.
            labels: target token ids, anything accepted by `torch.LongTensor`.
            is_train: truthy -> the decoder result is unpacked as `(loss, predicts)`;
                falsy -> the decoder result is returned as-is.
            teaching_rate: forwarded to the decoder. (Previously this argument
                was accepted but a hard-coded 1 was passed on; every call in
                this notebook passes 1, so results there are unchanged.)

        Returns:
            `(loss, predicts)` when `is_train` is truthy, otherwise the decoder output.
        """
        enc_outputs, (enc_hn, enc_cn) = self.enc(torch.LongTensor(inputs), torch.LongTensor(input_lens))
        dec_result = self.dec(enc_outputs = enc_outputs,
                              h0_and_c0=(enc_hn, enc_cn),
                              sent_lens=input_lens,
                              labels=torch.LongTensor(labels),
                              is_train=1 if is_train else 0,
                              teaching_rate = teaching_rate,
                              inputs = inputs
                              )
        if is_train:
            loss, predicts = dec_result
            return loss, predicts
        return dec_result

    def tocuda(self, x):
        """Return `x.cuda()` when this model was built with `use_cuda`, else `x` unchanged."""
        if self.use_cuda:
            return x.cuda()
        return x

    def train_using_reward(self, inputs, input_lens, reconstruct_labels, reconstruct_model, language_model, topk=3, loss_ratio=0.5):
        """Compute a reward-weighted loss over `topk` decoded candidates per input.

        Steps, as implemented below:
          1. Decode candidates with `self.dec.decode_topk_seqs`, which also
             returns `log_probs` for them.
          2. Split each candidate into two sentences with `seqs_split` and
             score both with `language_model.get_sentences_ppl`.
          3. Concatenate the two sentences again and feed them to
             `reconstruct_model` with `reconstruct_labels`.
          4. Rewards: negative reconstruct loss (detached through `.data`) and
             the mean inverse perplexity of the two sentences. Each reward is
             centred by subtracting the mean over the `topk` candidates of
             the same input.
          5. The loss is `-dot(log_probs, rewards) / len(log_probs)`.

        Args:
            inputs, input_lens: passed unchanged to `decode_topk_seqs`.
            reconstruct_labels: labels for `reconstruct_model`; the caller
                supplies one row per decoded candidate.
            reconstruct_model: model whose `forward` returns `(loss, predicts)`.
            language_model: object exposing `get_sentences_ppl`.
            topk: number of candidates per input; used to group rewards.
            loss_ratio: weight of the reconstruct reward; the language-model
                reward gets `1 - loss_ratio`.

        Returns:
            `(loss, reconstruct_loss, mean reconstruct reward, mean LM reward)`.
            `reconstruct_loss` is returned as produced by `reconstruct_model`
            (not detached), so the caller may back-propagate through it.
        """
        dec_seqs, log_probs = self.dec.decode_topk_seqs(self.enc, inputs, input_lens, topk=topk)
        simple_sent1s, simple_sent2s = seqs_split(dec_seqs, self.enc.vocab)

        def sentences_ppl(sentences):
            # Language-model perplexity of one half of every split candidate.
            lm_inputs, lm_input_lens, lm_labels = get_lm_inputs_and_labels(sentences, self.enc.vocab, self.dec.max_length)
            return language_model.get_sentences_ppl(torch.LongTensor(lm_inputs),
                                                    torch.LongTensor(lm_input_lens),
                                                    torch.LongTensor(lm_labels)
                                                    )

        simple_sent1s_ppl = sentences_ppl(simple_sent1s)
        simple_sent2s_ppl = sentences_ppl(simple_sent2s)

        simple_inputs, simple_input_lens = simple_sents_concat(simple_sent1s, simple_sent2s, self.enc.vocab, self.dec.max_length)
        # Reconstruct the labels from the concatenated simple sentences.
        reconstruct_loss, predicts = reconstruct_model.forward(torch.LongTensor(simple_inputs),
                                     torch.LongTensor(simple_input_lens),
                                     labels=reconstruct_labels,
                                     is_train=1, teaching_rate=1)

        # rm_rewards: reconstruct model rewards (detached from the graph)
        # lm_rewards: language model rewards (mean inverse perplexity)
        rm_rewards=-reconstruct_loss.data
        lm_rewards=(1/self.tocuda(torch.Tensor(simple_sent1s_ppl))+1/self.tocuda(torch.Tensor(simple_sent2s_ppl)))/2

        # Centre each reward on the mean of its group of `topk` candidates.
        rm_rewards_mean = torch.mean(rm_rewards.view(-1, topk), dim=1)
        lm_rewards_mean = torch.mean(lm_rewards.view(-1, topk), dim=1)
        rm_rewards = rm_rewards.view(-1, topk) - rm_rewards_mean.unsqueeze(dim=1)
        lm_rewards = lm_rewards.view(-1, topk) - lm_rewards_mean.unsqueeze(dim=1)

        rm_rewards = rm_rewards.view(-1)
        lm_rewards = lm_rewards.view(-1)

        # Weighted sum of both rewards; no gradient flows through the rewards.
        rewards = loss_ratio*rm_rewards+(1-loss_ratio)*lm_rewards
        rewards = Variable(rewards, requires_grad=False)

        # Rewards act as per-candidate weights on the log-probabilities.
        loss = -torch.dot(log_probs, rewards)/log_probs.size(0)

        return loss, reconstruct_loss, torch.mean(rm_rewards_mean), torch.mean(lm_rewards_mean)
    
    


In [None]:
# ---- language model (frozen scorer used for the fluency reward) ----
lm_hidden_dim=512
lm_input_dim=300
use_cuda=1

language_model = LanguageModel(use_cuda = use_cuda, input_dim = lm_input_dim, hidden_dim = lm_hidden_dim, vocab = vocab)
#512
model_path = './models_language_model/time-[2019-02-26-13-18-56]-info=[language_model]-loss=4.003012180-bleu=-1.0000-hidden_dim=512-input_dim=300-epoch=24-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'
#2048
# model_path = './models_language_model/time-[2019-02-28-07-04-08]-info=[language_model]-loss=3.475848675-bleu=-1.0000-hidden_dim=2048-input_dim=300-epoch=4-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'
# #1024
# model_path = './models_language_model/time-[2019-02-27-21-58-23]-info=[language_model]-loss=4.111208439-bleu=-1.0000-hidden_dim=1024-input_dim=300-epoch=6-batch_size=100-batch_id=[1-[of]-9899]-lr=0.0050'

# Weights are loaded on the CPU first and moved to the GPU afterwards.
pre_train = torch.load(model_path, map_location='cpu')
language_model.load_state_dict(pre_train)

if use_cuda:
    language_model = language_model.cuda()

language_model.eval()

print('finish loading pre-train weight for language model.')


# ---- split / fusion models ----
hidden_dim = 256
input_dim = 100
lr=0.005

split_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim,
                          vocab = vocab, max_length = 61)

fusion_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim,
                          vocab = vocab, max_length = 51)

# Earlier checkpoints. These used to be live assignments that were immediately
# overwritten by the pair below; they are kept as comments for reference.
# split_model_path = './models_saved/time-[2019-03-10-01-36-10]-info=[pre-trained_split_model-20per]-loss=0.515495539-bleu=0.6774-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-01-36-12]-info=[pre-trained_fusion_model-20per]-loss=0.365494132-bleu=0.7406-hidden_dim=256-input_dim=100-epoch=1-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
#
# split_model_path = './models_saved/time-[2019-03-10-05-52-44]-info=[pre-trained_split_model-20per]-loss=0.467645884-bleu=0.7270-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-05-52-48]-info=[pre-trained_fusion_model-20per]-loss=0.327692717-bleu=0.7558-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[501-[of]-1979]-lr=0.0050'
#
# split_model_path = './models_saved/time-[2019-03-10-13-23-10]-info=[pre-trained_split_model-20per]-loss=0.454687029-bleu=0.7130-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# Checkpoints actually loaded: the semi-supervised run saved at batch 1201 of epoch 0.
split_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[split_model-semi]-total_loss=-0.002986051-rec_loss=0.017885875-lm_rewards=0.0102-bleu=0.7734-bleu_bs=0.6282-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1201-[of]-46585]-lr=0.0050-loss_ratio=0.4500'
fusion_model_path = './models_saved/time-[2019-03-16-23-39-39]-info=[fusion_model-semi]'

pre_train = torch.load(split_model_path, map_location='cpu')
split_model.load_state_dict(pre_train)
pre_train = torch.load(fusion_model_path, map_location='cpu')
fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
    fusion_model = fusion_model.cuda()

# Optimizers are created after the models are on their final device and only
# track parameters that currently require gradients.
split_optimizer = optim.Adam(filter(lambda p: p.requires_grad, split_model.parameters()), lr=lr)
fusion_optimizer = optim.Adam(filter(lambda p: p.requires_grad, fusion_model.parameters()), lr=lr)

# set_model_grad(fusion_model, False)

finish loading pre-train weight for language model.
init lookup embedding matrix size:  torch.Size([44380, 100])
init lookup embedding matrix size:  torch.Size([44380, 100])


In [None]:
# ---- semi-supervised training configuration ----
# Alternative settings noted by the author: batch_size=35, topk=3  or  batch_size=17, topk=6
batch_size = 17      # unsupervised (reward) batch size
topk = 6             # decoded candidates per unsupervised input
loss_ratio = 0.45    # weight of the reconstruct reward in Seq2Seq.train_using_reward
sup_bsize = 35       # supervised batch size
epochs = 10000

train_bleu_mean = -1
train_bleu_max = -1

split_train_set_size = int(len(split_train_set_inputs) / 1)
dataset_times = int(split_train_set_size / len(split_train_set_inputs_supervised))

# Wall-clock reference for the running-time report printed after training.
start_time = time.time()

def _scalar_value(value):
    """Return a one-element loss/reward as a plain Python number.

    Accepts Python numbers, tensors exposing `.item()` and older
    Variables/tensors that only support `.data[0]`.
    """
    if isinstance(value, (int, float)):
        return value
    if hasattr(value, 'item'):
        return value.item()
    return value.data[0]


def _timestamp():
    """Return the current local time formatted as the checkpoint-name prefix."""
    now = int(round(time.time()*1000))
    return time.strftime('time-[%Y-%m-%d-%H-%M-%S]-',time.localtime(now/1000))


def _supervised_step(model, frozen_model, optimizer, inputs, input_lens, labels, sup_idx):
    """Run one teacher-forced update of `model` on the supervised slice starting at `sup_idx`."""
    set_model_grad(model, True)
    set_model_grad(frozen_model, False)
    optimizer.zero_grad()
    window = slice(sup_idx, sup_idx+sup_bsize)
    loss, _ = model.forward(torch.LongTensor(inputs[window]),
                            torch.LongTensor(input_lens[window]),
                            labels=torch.LongTensor(labels[window]),
                            is_train=1, teaching_rate=1)
    loss = torch.mean(loss)
    loss.backward()
    optimizer.step()


def _reward_outputs(start_idx, end_idx):
    """Call `split_model.train_using_reward` on the unsupervised slice `[start_idx, end_idx)`."""
    return split_model.train_using_reward(
        inputs=torch.LongTensor(split_train_set_inputs[start_idx:end_idx]),
        input_lens=torch.LongTensor(split_train_set_input_lens[start_idx:end_idx]),
        reconstruct_labels=torch.LongTensor(duplicate_reconstruct_labels(fusion_pseudo_train_set_labels[start_idx:end_idx],topk)),
        reconstruct_model=fusion_model,
        language_model=language_model,
        topk=topk, loss_ratio=loss_ratio)


def _report_progress(epoch, batch_size, train_set_size, batch_id, total_loss, reconstruct_loss, lm_rewards):
    """Print sample predictions and, every 40 batches, validate and save checkpoints."""
    split_model.eval()
    fusion_model.eval()
    set_model_grad(split_model, False)
    set_model_grad(fusion_model, False)

    total_loss_value = _scalar_value(total_loss)
    reconstruct_loss_value = _scalar_value(reconstruct_loss)
    lm_rewards_value = _scalar_value(lm_rewards)

    sample_num = 5
    rand_idx = random.randint(0, train_set_size-sample_num-1)
    sample = slice(rand_idx, rand_idx+sample_num)

    print('--------split model training sampling display--------')
    # Teacher-forced predictions on a random training slice, shown next to their labels.
    loss_, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[sample]),
                                     torch.LongTensor(split_train_set_input_lens[sample]),
                                     labels=torch.LongTensor(split_pseudo_train_set_labels[sample]),
                                     is_train=1, teaching_rate=1)
    del loss_

    predicts_sents = batch_words2sentence(
        batch_tokens2words(batch_tokens_remove_eos(predicts, vocab), vocab))
    labels_sents = batch_words2sentence(
        batch_tokens2words(batch_tokens_remove_eos(split_pseudo_train_set_labels[sample], vocab), vocab))

    for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
        print(' 1----> ', predict_sent)
        print(' 2----> ', label_sent)
        print('\n')

    time_stamp = _timestamp()
    info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}'.format(
                      'split_model', total_loss_value, reconstruct_loss_value, lm_rewards_value,
                    hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr)
    print(time_stamp, info_stamp)

    if batch_id%40==1:
        # Fixed validation slice so BLEU is comparable between checkpoints.
        rand_idx=2333
        valid = slice(rand_idx, rand_idx+batch_size)

        # Teacher-forced BLEU against the pseudo labels.
        loss_, predicts = split_model.forward(torch.LongTensor(split_valid_set_inputs[valid]),
                                         torch.LongTensor(split_valid_set_input_lens[valid]),
                                         labels=torch.LongTensor(split_pseudo_valid_set_labels[valid]),
                                         is_train=1, teaching_rate=1)
        del loss_
        bleu_scores = batch_tokens_bleu_split_version(references=split_pseudo_valid_set_labels[valid],
                                                      candidates=predicts, smooth_epsilon=0.001, vocab=vocab)
        valid_bleu = sum(bleu_scores)/len(bleu_scores)

        # Beam-search BLEU: keep the first of every `topk` decoded candidates.
        dec_seqs, log_probs = split_model.dec.decode_topk_seqs(split_model.enc,
                                                               inputs=torch.LongTensor(split_valid_set_inputs[valid]),
                                                               input_lens=torch.LongTensor(split_valid_set_input_lens[valid]),
                                                               topk=topk)
        predicts = [dec_seqs[ii] for ii in range(len(dec_seqs)) if ii%topk==0]

        bleu_scores = batch_tokens_bleu_split_version(references = split_pseudo_valid_set_labels[valid],
                                                     candidates = predicts,
                                                     smooth_epsilon=0.001,
                                                     vocab=vocab)
        valid_bleu_beam_search = sum(bleu_scores)/len(bleu_scores)

        info_stamp = 'info=[{:s}]-total_loss={:2.9f}-rec_loss={:2.9f}-lm_rewards={:5.4f}-bleu={:1.4f}-bleu_bs={:1.4f}-hidden_dim={:n}-input_dim={:n}-epoch={:n}-batch_size={:n}-batch_id=[{:n}-[of]-{:n}]-lr={:1.4f}-loss_ratio={:1.4f}'.format(
                      'split_model-semi', total_loss_value, reconstruct_loss_value, lm_rewards_value, valid_bleu, valid_bleu_beam_search,
                    hidden_dim, input_dim, epoch, batch_size, batch_id, int(train_set_size/batch_size), lr, loss_ratio)

        print(info_stamp, valid_bleu, valid_bleu_beam_search)

        time_stamp = _timestamp()
        torch.save(split_model.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))
        torch.save(fusion_model.state_dict(), ''.join(['./models_saved/', time_stamp, 'info=[fusion_model-semi]']))

    # Back to training mode with gradients enabled on both models.
    set_model_grad(split_model, True)
    set_model_grad(fusion_model, True)
    split_model.train()
    fusion_model.train()
    torch.cuda.empty_cache()


def model_train(epoch, batch_size, train_set_size, skip_batches=1200):
    """Run one epoch of alternating semi-supervised training.

    Even batches update the split model, odd batches update the fusion model.
    Each batch does a supervised teacher-forced step on the labelled data,
    followed by a reward-based step on the unlabelled slice
    `[start_idx, start_idx + batch_size)`. After every batch whose new
    `batch_id` satisfies `batch_id % 20 == 1`, progress is printed; when
    `batch_id % 40 == 1` validation BLEU is computed and both models are saved.

    Relies on module-level state of this notebook: the two models and their
    optimizers, `language_model`, the training/validation data lists, `topk`,
    `loss_ratio`, `sup_bsize`, `hidden_dim`, `input_dim`, `lr`, `vocab`.

    Args:
        epoch: index of the current epoch (used for resuming and for logging).
        batch_size: number of unlabelled samples per reward step.
        train_set_size: number of unlabelled samples to iterate over.
        skip_batches: number of leading batches skipped in epoch 0 only, to
            resume from a checkpoint. The default reproduces the previously
            hard-coded `batch_id<=1199` test.
    """
    batch_id = 0
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        if epoch==0 and batch_id<skip_batches:
            batch_id+=1
            continue

        end_idx = start_idx + batch_size
        # The supervised window wraps around the labelled set.
        sup_idx = (batch_id*sup_bsize)%(len(split_train_set_inputs_supervised)-1-sup_bsize)

        if batch_id%2==0:
            # supervised learning: split model
            _supervised_step(split_model, fusion_model, split_optimizer,
                             split_train_set_inputs_supervised,
                             split_train_set_input_lens_supervised,
                             split_train_set_labels_supervised,
                             sup_idx)

            # unsupervised learning: reward-weighted loss updates the split model
            split_optimizer.zero_grad()
            total_loss, reconstruct_loss, rm_rewards, lm_rewards = _reward_outputs(start_idx, end_idx)
            reconstruct_loss = torch.mean(reconstruct_loss)
            total_loss.backward()
            split_optimizer.step()
        else:
            # supervised learning: fusion model
            _supervised_step(fusion_model, split_model, fusion_optimizer,
                             fusion_train_set_inputs_supervised,
                             fusion_train_set_input_lens_supervised,
                             fusion_train_set_labels_supervised,
                             sup_idx)

            # unsupervised learning: reconstruct loss updates the fusion model
            fusion_optimizer.zero_grad()
            total_loss, reconstruct_loss, rm_rewards, lm_rewards = _reward_outputs(start_idx, end_idx)
            reconstruct_loss = loss_ratio*torch.mean(reconstruct_loss)
            reconstruct_loss.backward()
            fusion_optimizer.step()

        #update batch_id
        batch_id+=1

        torch.cuda.empty_cache()

        # Reporting always follows an even (split-model) batch, so the logged
        # losses come from the split-model reward step.
        if batch_id%20==1:
            _report_progress(epoch, batch_size, train_set_size, batch_id,
                             total_loss, reconstruct_loss, lm_rewards)
# Run the configured number of passes over the unsupervised training set.
for epoch in range(epochs):
    model_train(epoch, batch_size, split_train_set_size)

# Report wall-clock time since `start_time` was taken, in minutes.
elapsed_minutes = (time.time()-start_time)/60
print('running time: %.2f mins'%elapsed_minutes)

--------split model training sampling display--------
 1---->  he is most famous for his star atlas '' <low_freq> '' , published in 1603 <split> she , he was the first atlas to cover the entire celestial sphere .
 2---->  he is most famous for his star atlas '' <low_freq> '' , published in . <split> 1603 , which was the first atlas to cover the entire celestial sphere .


 1---->  he is most famous for his tireless campaigns to have the death penalty abolished . and having represented some <split> he 300 death row clients , he has only failed to have sentences overturned on 4 occasions .
 2---->  he is most famous for his tireless campaigns to have the death penalty abolished , and having represented . <split> some 300 death row clients , he has only failed to have sentences overturned on 4 occasions .


 1---->  he is most famous for his win as a brown belt against <low_freq> gracie by triangle choke . the 2003 abu dhabi <split> he ali wrestling championships and his particular style 

--------split model training sampling display--------
 1---->  on november , 2 , 2010 , <low_freq> percent of voters supported the madison college smart community plan , a $ <low_freq> <split> a plan for new facilities , renovations , and upgrades to meet the increasing demand for education and job training .
 2---->  on november , 2 , 2010 , <low_freq> percent of voters supported the madison college smart community plan , a $ . <split> <low_freq> plan for new facilities , renovations , and upgrades to meet the increasing demand for education and job training .


 1---->  on november 1 , 1842 , a public hanging took place at the walker house . <split> a a man named william <low_freq> was executed for shooting another man in an argument .
 2---->  on november 1 , 1842 , a public hanging took place at the walker house . <split> and a man named william <low_freq> was executed for shooting another man in an argument .


 1---->  on november 1 , 1842 , a public hanging took place at the wal

--------split model training sampling display--------
 1---->  the book was a western equivalent of the soviet personality cult and <low_freq> led a violent press campaign against <split> the his former friend <low_freq> <low_freq> - a romanian writer who had expressed criticism of the soviet state .
 2---->  the book was a western equivalent of the soviet personality cult and <low_freq> led a violent press campaign . <split> against his former friend <low_freq> <low_freq> - a romanian writer who had expressed criticism of the soviet state .


 1---->  the book was a best - seller in iraq when it was published <split> it in 2000 and was even made into a musical in baghdad .
 2---->  the book was a best - seller in iraq when it was . <split> published in 2000 and was even made into a musical in baghdad .


 1---->  the book was a commercial success , selling in the tens of thousands , and became by <split> it the the most well - known of von <low_freq> 's many works on military tactics 

--------split model training sampling display--------
 1---->  his son , jerry m. fowler , succeeded fowler as elections commissioner , and one of fowler 's two brothers , hendrix marion '' mutt <split> he '' fowler , i , went into local politics and ended his public career as a member of the louisiana house of representatives .
 2---->  his son , jerry m. fowler , succeeded fowler as elections commissioner , and one of fowler 's two brothers , hendrix marion '' . <split> mutt '' fowler , i , went into local politics and ended his public career as a member of the louisiana house of representatives .


 1---->  his son , <low_freq> <low_freq> , is an actor who sings <split> he and writes for a group called the first second .
 2---->  his son , <low_freq> <low_freq> , is an actor who . <split> sings and writes for a group called the first second .


 1---->  his son , joey , served two tours of duty in iraq and one in afghanistan as a marine corps medic , after which , <split> he the for

--------split model training sampling display--------
 1---->  working together with chloe , they aid jack in obtaining the evidence <split> they is president logan , but the evidence is later destroyed .
 2---->  working together with chloe , they aid jack in obtaining the . <split> evidence <low_freq> president logan , but the evidence is later destroyed .


 1---->  working under hester maclean , and alongside amelia bagley , <low_freq> travelled the country , including <split> he to remote rural areas , inspecting and advising private hospitals , nurses and midwives .
 2---->  working under hester maclean , and alongside amelia bagley , <low_freq> travelled the country , . <split> including to remote rural areas , inspecting and advising private hospitals , nurses and midwives .


 1---->  working under a tight budget , korda completed filming in only five weeks . <split> he korda from an original screenplay by walter <low_freq> and r.c. <low_freq> .
 2---->  working under a tight 

--------split model training sampling display--------
 1---->  florin <low_freq> ( born 18 june 1992 alba <low_freq> ) is an romanian <split> he footballer currently under contract with liga i side cs <low_freq> <low_freq> .
 2---->  florin <low_freq> ( born 18 june 1992 alba <low_freq> ) is an . <split> romanian footballer currently under contract with liga i side cs concordia <low_freq> .


 1---->  florin <low_freq> ( born august 30 , 1974 ) is a romanian <low_freq> canoer , who won two olympic <split> he medals in the c - 2 event at the 2000 summer olympics with his teammate <low_freq> <low_freq> .
 2---->  florin <low_freq> ( born august 30 , 1974 ) is a romanian <low_freq> canoer , who won two . <split> olympic medals in the c - 2 event at the 2000 summer olympics with his teammate <low_freq> <low_freq> .


 1---->  <low_freq> ( , known also by several alternative names names is a town and municipality in mountainous <split> it northwestern macedonia , greece and its motto is , '

--------split model training sampling display--------
 1---->  guerrilla news network , inc. ( <low_freq> ) is a privately - owned news organization , registered as <split> the a corporation with the united states patent and trademark office on march 1 , 2005 .
 2---->  guerrilla news network , inc. ( <low_freq> ) is a privately - owned news organization , registered . <split> as a corporation with the united states patent and trademark office on march 1 , 2005 .


 1---->  guerrilla zoo is a contemporary arts organisation formed in 2004 by founder and creative director '' james <low_freq> '' , who produce a variety of creative <split> the director include experiential environments , live concerts , festivals , immersive theatre , art exhibitions , arts awards , parties and masquerade balls .
 2---->  guerrilla zoo is a contemporary arts organisation formed in 2004 by founder and creative director '' james <low_freq> '' , who produce a variety of . <split> creative events from experien

info=[split_model-semi]-total_loss=-0.000403814-rec_loss=0.040371709-lm_rewards=0.0064-bleu=0.7529-bleu_bs=0.2021-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1721-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7528819983870924 0.20213919307976067
--------split model training sampling display--------
 1---->  traffic on the bridge drives on the left , as it does in thailand , while traffic in laos <split> traffic drives on the right : the <low_freq> is controlled by traffic lights at the lao end .
 2---->  traffic on the bridge drives on the left , as it does in thailand , while traffic in . <split> laos drives on the right : the <low_freq> is controlled by traffic lights at the lao end .


 1---->  traffic on the line included timber , livestock , milk and dairy products <split> the , early <low_freq> including regular goods services specifically for transporting milk .
 2---->  traffic on the line included timber , livestock , milk and dairy . <split> products , early <low

info=[split_model-semi]-total_loss=0.001122621-rec_loss=0.026285615-lm_rewards=0.0063-bleu=0.7516-bleu_bs=0.2689-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1801-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7516193869161982 0.26886376720516125
--------split model training sampling display--------
 1---->  battle tactics '' is the second of two expansion packs for the popular rts computer <split> it game is annihilation , released on june 30 1998 in the united states .
 2---->  battle tactics '' is the second of two expansion packs for the popular rts . <split> computer game total annihilation , released on june 30 1998 in the united states .


 1---->  battle wings coach jon norris named carter the starting quarterback for their june 16 game against the corpus christi sharks <split> he passed passed for a franchise - record eight touchdowns in the battle wings ' 81 - 35 win .
 2---->  battle wings coach jon norris named carter the starting quarterback for their june 16 ga

info=[split_model-semi]-total_loss=-0.001638036-rec_loss=0.033933792-lm_rewards=0.0072-bleu=0.7728-bleu_bs=0.6214-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1881-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7727886374251557 0.6213853923144791
--------split model training sampling display--------
 1---->  una merkel was born in covington , kentucky , but in her early childhood . but lived in <split> she her of the southern united states due to her father 's job as a traveling salesman .
 2---->  una merkel was born in covington , kentucky , but in her early childhood , she lived . <split> in many of the southern united states due to her father 's job as a traveling salesman .


 1---->  una <low_freq> ran in the winnipeg division of broadway and received 262 votes . while <split> she lyle <low_freq> ran in neighbouring point douglas , and received 105 votes .
 2---->  una <low_freq> ran in the winnipeg division of broadway and received 262 votes , . <split> while lyle <lo

info=[split_model-semi]-total_loss=-0.000249157-rec_loss=0.014862752-lm_rewards=0.0187-bleu=0.7751-bleu_bs=0.6745-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[1961-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7751131790128916 0.6745335488766473
--------split model training sampling display--------
 1---->  it is upstream of the a bridge designed by edwin lutyens who designed an ornamental park gate house . the reach <split> it the has been widened and carries the m25 and <low_freq> road across the river in a single span .
 2---->  it is upstream of the a bridge designed by edwin lutyens who designed an ornamental park gate house along the . <split> reach which has been widened and carries the m25 and <low_freq> road across the river in a single span .


 1---->  it is used , for example , in the rare earth hypothesis to state that a planet must neither be too far away from , <split> it it too close to the sun to support life , while either extreme would result in a planet 

info=[split_model-semi]-total_loss=-0.007904371-rec_loss=0.032035142-lm_rewards=0.0135-bleu=0.7733-bleu_bs=0.7264-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[2041-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7732566197872679 0.7263507902905996
--------split model training sampling display--------
 1---->  originally planned to be shown in single lens <low_freq> with reserved seat roadshow engagements , columbia pulled the plug on that idea . <split> the the the film was drastically cut down from nearly three hours ( plus an <low_freq> ) to just over two hours (
 2---->  originally planned to be shown in single lens <low_freq> with reserved seat roadshow engagements , columbia pulled the plug on that idea . <split> , and the film was drastically cut down from nearly three hours ( plus an <low_freq> ) to just over two hours .


 1---->  originally planned to support the increasing need for managers in the <low_freq> group <split> the , the university is <low_freq> and accep

info=[split_model-semi]-total_loss=-0.001108925-rec_loss=0.018901534-lm_rewards=0.0083-bleu=0.7427-bleu_bs=0.7001-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[2121-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7426719435151813 0.7000775547907475
--------split model training sampling display--------
 1---->  <low_freq> was born to a british jewish family , and was educated at highgate school . and the university of exeter , where he read for <split> he his bachelor of arts ( ba ) in politics and a master of arts ( ma ) in russian and east european politics .
 2---->  <low_freq> was born to a british jewish family , and was educated at highgate school , and the university of exeter , where he read . <split> for a bachelor of arts ( ba ) in politics and a master of arts ( ma ) in russian and east european politics .


 1---->  halfway through the campaign , bishop was replaced as battalion commander by lieutenant colonel keith <low_freq> <split> he subsequently and he subseque

info=[split_model-semi]-total_loss=-0.001611655-rec_loss=0.039628983-lm_rewards=0.0080-bleu=0.7650-bleu_bs=0.7196-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[2201-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7649974717973499 0.719563853812939
--------split model training sampling display--------
 1---->  she returned to mexico in november 1907 , and consecrated as very first vaudeville star by the public of barcelona <split> she , havana appearing appearing again in the principal with her ultimate creation : '' la <low_freq> blanca '' .
 2---->  she returned to mexico in november 1907 , and consecrated as very first vaudeville star by the public of . <split> barcelona and havana , appearing again in the principal with her ultimate creation : '' la <low_freq> blanca '' .


 1---->  she returned to midway to refit on 25 july . on 15 august she sailed <split> on on her second war patrol , bound for a station north of <low_freq> .
 2---->  she returned to midway to refit on 2

info=[split_model-semi]-total_loss=-0.002321504-rec_loss=0.078036703-lm_rewards=0.0079-bleu=0.7609-bleu_bs=0.6583-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[2281-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7608667877155024 0.6582628653993944
--------split model training sampling display--------
 1---->  seeing billie holiday perform in 1951 began his jazz odyssey ; he studied in la before touring the <split> he band coast sat he he sat in with sarah vaughan , and down to mexico .
 2---->  seeing billie holiday perform in 1951 began his jazz odyssey ; he studied in la before touring . <split> the west coast , where he sat in with sarah vaughan , and down to mexico .


 1---->  seeing carson in the ant - man suit while torturing o'grady , stark defeated him . he <split> he was arrested on the spot for attempted murder and stealing the ant - man suit .
 2---->  seeing carson in the ant - man suit while torturing o'grady , stark defeated him and . <split> he was arrested on

info=[split_model-semi]-total_loss=0.004472691-rec_loss=0.099546880-lm_rewards=0.0059-bleu=0.7600-bleu_bs=0.7011-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[2361-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.759966166392596 0.7010906880658262
--------split model training sampling display--------
 1---->  <low_freq> as a player was a <low_freq> right - footed midfielder with a powerful shot and ability to score <split> he and assist goals , he was deployed as a central midfielder , attacking midfielder and winger .
 2---->  <low_freq> as a player was a <low_freq> right - footed midfielder with a powerful shot and ability to . <split> score and assist goals , he was deployed as a central midfielder , attacking midfielder and winger .


 1---->  <low_freq> died on 13 september 1860 in rome . his <split> his funeral was held on 17 september 1860 .
 2---->  <low_freq> died on 13 september 1860 in rome and . <split> his funeral was held on 17 september 1860 .


 1---->  <low_fre

 1---->  at the given specific site on the dna , the hydroxyl group of the tyrosine attacks a phosphate group . the <split> the hydroxyl attacks a direct <low_freq> mechanism linking the <low_freq> protein to the dna via a <low_freq> - tyrosine linkage .
 2---->  at the given specific site on the dna , the hydroxyl group of the tyrosine attacks a phosphate group in . <split> the dna using a direct <low_freq> mechanism linking the <low_freq> protein to the dna via a <low_freq> - tyrosine linkage .


 1---->  at the glimpse of the new city built at the is the city lina drew n ember with tall buildings and her great great <split> after granddaughter lives there after lina never got to see her imaginative city with her own eyes she just heard alot about it .
 2---->  at the glimpse of the new city built at the is the city lina drew n ember with tall buildings and her great . <split> great granddaughter lives there after lina never got to see her imaginative city with her own eyes she just 

--------split model training sampling display--------
 1---->  it later appeared in the collections '' the night face & other stories '' ( 1979 ) , '' the <split> <low_freq> long night '' and '' winners '' , and is about a <low_freq> culture on a distant world .
 2---->  it later appeared in the collections '' the night face & other stories '' ( 1979 ) , '' . <split> the long night '' and '' winners '' , and is about a <low_freq> culture on a distant world .


 1---->  it later became a $ <low_freq> film starring <low_freq> scott - for which <low_freq> was nominated for <split> the example douglas <low_freq> award for best debut director at the british independent film awards .
 2---->  it later became a $ <low_freq> film starring <low_freq> scott - for which <low_freq> was nominated . <split> for the douglas <low_freq> award for best debut director at the british independent film awards .


 1---->  it later became a 501 ( c ) ( 4 ) after it established a headquarters in <split> the n

--------split model training sampling display--------
 1---->  coleman is married to frankie coleman , a former state employee , who in june 2007 <split> the , accused of receiving pay for work when she was absent without permission .
 2---->  coleman is married to frankie coleman , a former state employee , who in june . <split> 2007 was accused of receiving pay for work when she was absent without permission .


 1---->  coleman remembered , however , that an incoming passenger train from saint john , new brunswick was due to arrive at the rail <split> the returned within minutes , and he returned to his post to send out urgent telegraph messages to stop the train .
 2---->  coleman remembered , however , that an incoming passenger train from saint john , new brunswick was due to arrive at the . <split> rail yard within minutes , and he returned to his post to send out urgent telegraph messages to stop the train .


 1---->  coleman served as a line judge for the first six seasons be

--------split model training sampling display--------
 1---->  ludlow later suffers a stroke ; he does not <split> he speak for years and the ranch <low_freq> .
 2---->  ludlow later suffers a stroke ; he does . <split> not speak for years and the ranch <low_freq> .


 1---->  ludlow then received a danforth scholarship and studied at harvard university and brandeis university <split> he received where he received his ph.d. in near eastern and judaic studies .
 2---->  ludlow then received a danforth scholarship and studied at harvard university and brandeis . <split> university , where he received his ph.d. in near eastern and judaic studies .


 1---->  ludlow then seeks out the help of detective i paul '' disco '' <low_freq> ( chris evans ) , who has <split> he been assigned to the robbery / murder case of washington , to join him in his own personal investigation .
 2---->  ludlow then seeks out the help of detective i paul '' disco '' <low_freq> ( chris evans ) , who . <split> has

--------split model training sampling display--------
 1---->  after walt disney 's death in 1966 , roy postponed his retirement to oversee construction of what was known known <split> he as disney world , and later renamed it walt disney world as a tribute to his brother .
 2---->  after walt disney 's death in 1966 , roy postponed his retirement to oversee construction of what was then . <split> known as disney world , and later renamed it walt disney world as a tribute to his brother .


 1---->  after walt goes to bed , the sound of a loudly <low_freq> watch is heard while heard close - up of <split> the the tag - counter of the tag <low_freq> monaco reveals the single hand of the counter moving <low_freq> .
 2---->  after walt goes to bed , the sound of a loudly <low_freq> watch is heard while a close - up . <split> of the seconds - counter of the tag <low_freq> monaco reveals the single hand of the counter moving <low_freq> .


 1---->  after warlord 's ships are destroyed by jen

--------split model training sampling display--------
 1---->  this is a list of notable hurlers who have played for ' the <low_freq> ' . including players <split> it have have had success with the club , or played for the cork senior hurling team .
 2---->  this is a list of notable hurlers who have played for ' the <low_freq> ' , including . <split> players who have had success with the club , or played for the cork senior hurling team .


 1---->  this is a list of notable people who have died in prison ( in alphabetical <split> it order , , not those who were executed as punishment of their crimes .
 2---->  this is a list of notable people who have died in prison ( in . <split> alphabetical order ) , not those who were executed as punishment of their crimes .


 1---->  this is a list of notable solo cello pieces <split> it are also includes arrangements and transcriptions .
 2---->  this is a list of notable solo cello . <split> pieces which also includes arrangements and transcr

--------split model training sampling display--------
 1---->  she appears on the weather channel , headline <split> she news , and bloomberg television .
 2---->  she appears on the weather channel , . <split> headline news , and bloomberg television .


 1---->  she appears on the show on wednesdays and has been presenting since the first episode on 14 february to co-hosted 30th september to the 2nd <split> she september <low_freq> co-hosted the show while jenny frost was <low_freq> also has her own agony aunt style problem page on ok magazine .
 2---->  she appears on the show on wednesdays and has been presenting since the first episode on 14 february <low_freq> the 30th august to the . <split> 2nd september <low_freq> co-hosted the show while jenny frost was <low_freq> also has her own agony aunt style problem page on ok magazine .


 1---->  she appears only in the first act , except in the 1954 musical in which she sees the darling children fly off with <split> when peter tries 

--------split model training sampling display--------
 1---->  after attending a boarding school in maryland , she pursued a study in western <split> she john at st. john 's college in santa fe , new mexico .
 2---->  after attending a boarding school in maryland , she pursued a study in . <split> western philosophy at st. john 's college in santa fe , new mexico .


 1---->  after attending a memorial for the <low_freq> airlines flight 180 victims , tod accidentally <split> tod hangs himself within his <low_freq> and is deemed as a suicide .
 2---->  after attending a memorial for the <low_freq> airlines flight 180 victims , tod . <split> accidentally hangs himself within his <low_freq> and is deemed as a suicide .


 1---->  after attending a private school at a local plantation and then the area 's first public school , he graduated from roanoke college <split> he graduated 1889 with a bachelor 's of arts degree and from virginia theological seminary in 1893 with a divinity degree .

--------split model training sampling display--------
 1---->  as with all <low_freq> drugs , <low_freq> can cause the ( sometimes ) irreversible movement disorder <low_freq> <split> this can , and the rare , but life - threatening , <low_freq> malignant syndrome .
 2---->  as with all <low_freq> drugs , <low_freq> can cause the ( sometimes ) irreversible movement disorder . <split> <low_freq> <low_freq> , and the rare , but life - threatening , <low_freq> malignant syndrome .


 1---->  as with all of the '' power comics '' , '' smash ! '' included reprints from america 's marvel comics ; but <split> the the last of these , the '' fantastic four '' , ended in issue 162 in the spring of 1969 .
 2---->  as with all of the '' power comics '' , '' smash ! '' included reprints from america 's marvel comics ; . <split> but the last of these , the '' fantastic four '' , ended in issue 162 in the spring of 1969 .


 1---->  as with all of the <low_freq> islands , the ownership of the atoll is

--------split model training sampling display--------
 1---->  in the end , what they choose and achieve are not as planned , <split> their their their experiences in dubai lead to new beginning in their beginning .
 2---->  in the end , what they choose and achieve are not as planned . <split> , but their experiences in dubai lead to new beginning in their lives .


 1---->  in the end corn fed , mindy and francisco all have 4 votes each whilst the guys and <split> only <low_freq> only have 3 votes putting francisco , mindy and corn fed in the box .
 2---->  in the end corn fed , mindy and francisco all have 4 votes each whilst the guys . <split> and <low_freq> only have 3 votes putting francisco , mindy and corn fed in the box .


 1---->  in the end judge taylor finds mccoy has a case against <low_freq> strong enough to proceed to trial , but in a private <split> in in with mccoy , robertson and lewin the next day , the judge doubts a jury day find the witnesses convincing .
 2---->

--------split model training sampling display--------
 1---->  they followed the views of samuel george morton , josiah c. nott , george <low_freq> , and maintained agassiz . and <split> maintained maintained that adam was the progenitor of the caucasian race , while the other races descended from <low_freq> ancestry .
 2---->  they followed the views of samuel george morton , josiah c. nott , george <low_freq> , and louis agassiz ; . <split> and maintained that adam was the progenitor of the caucasian race , while the other races descended from <low_freq> ancestry .


 1---->  they followed this with the <low_freq> 7 '' and the <low_freq> cd <split> they ep which featured their signature song ' furniture ' .
 2---->  they followed this with the <low_freq> 7 '' and the <low_freq> . <split> cd ep which featured their signature song ' furniture ' .


 1---->  they followed up with their second album , '' in love and death '' , in september 2004 <split> their they their third album , '' l

--------split model training sampling display--------
 1---->  after <low_freq> sees alan interact with <low_freq> and tells her mother , <split> the marie goes insane and attempts to kill her children .
 2---->  after <low_freq> sees alan interact with <low_freq> and tells her mother . <split> , marie goes insane and attempts to kill her children .


 1---->  after <low_freq> is caught in the volcano 's eruption , the other <low_freq> <split> the <low_freq> samson and the others back to the central park zoo .
 2---->  after <low_freq> is caught in the volcano 's eruption , the other . <split> <low_freq> accompany samson and the others back to the central park zoo .


 1---->  after keith stands her up when they make plans to get high together aura quits her restaurant job and tells a <split> the bewildered frankie that she no longer can move in with her as her mother needs her too much .
 2---->  after keith stands her up when they make plans to get high together aura quits her restau

--------split model training sampling display--------
 1---->  john barbour , in his description of douglas ' last battle , describes douglas and his contingent pursuing the fleeing enemy closely until <split> having , having <low_freq> most of his men , sir james finds himself out in front with only ten or so followers .
 2---->  john barbour , in his description of douglas ' last battle , describes douglas and his contingent pursuing the fleeing enemy closely . <split> until , having <low_freq> most of his men , sir james finds himself out in front with only ten or so followers .


 1---->  john barclay ( 1734 -- 1798 ) , scottish divine , <split> he scottish born in perthshire and died in edinburgh .
 2---->  john barclay ( 1734 -- 1798 ) , scottish divine . <split> , was born in perthshire and died in edinburgh .


 1---->  john <low_freq> ( died 1991 ) , american physicist , co-inventor of the transistor <split> he of only physicist to receive the nobel prize in physics twice .
 2

 1---->  the team will make use of splendid cars like rolls royce , hummer etc. to make this road thriller more exciting and to make the computer <split> they cars perfect , they are joining hands with hollywood <low_freq> team is trying to make a <low_freq> in hollywood effect in the movie .
 2---->  the team will make use of splendid cars like rolls royce , hummer etc. to make this road thriller more exciting and to make the . <split> computer graphics perfect , they are joining hands with hollywood <low_freq> team is trying to make a <low_freq> in hollywood effect in the movie .


 1---->  the team will play at wells fargo arena in des moines , iowa <split> the state as the ahl affiliate of the nhl 's minnesota wild .
 2---->  the team will play at wells fargo arena in des moines , . <split> iowa , as the ahl affiliate of the nhl 's minnesota wild .


 1---->  the team will replace the carolina <low_freq> in the southern league , who will relocate the team to <split> the the , retai

info=[split_model-semi]-total_loss=-0.016182536-rec_loss=0.139571100-lm_rewards=0.0102-bleu=0.7548-bleu_bs=0.3052-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3521-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7548150080966001 0.30518093215395947
--------split model training sampling display--------
 1---->  sean volunteers to referee henry 's soccer team and ends up making an unethical decision in the team 's <split> sean favor of which in <low_freq> was not a great idea , when henry gets an inflated ego .
 2---->  sean volunteers to referee henry 's soccer team and ends up making an unethical decision in the team . <split> 's favor , which in <low_freq> was not a great idea , when henry gets an inflated ego .


 1---->  <low_freq> described '' chubby cherub 's '' graphics a ugly and also said that , '' the cartridge <split> the is a waste of plastic , and could be used in many other things '' .
 2---->  <low_freq> described '' chubby cherub 's '' graphics a ugly and also 

info=[split_model-semi]-total_loss=-0.005217016-rec_loss=0.101402499-lm_rewards=0.0416-bleu=0.7574-bleu_bs=0.3976-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3601-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7573635024854086 0.39757040062641363
--------split model training sampling display--------
 1---->  dublin entered the competition as defending leinster champions <split> leinster ended ended it the same way .
 2---->  dublin entered the competition as defending leinster . <split> champions and ended it the same way .


 1---->  dublin is in the province of leinster on ireland 's east coast , at the mouth <split> at of the river liffey and bordered to the south by the wicklow mountains .
 2---->  dublin is in the province of leinster on ireland 's east coast , at the . <split> mouth of the river liffey and bordered to the south by the wicklow mountains .


 1---->  dublin is not surprisingly described in more detail , with sketches of dublin <split> the 's , the two c

info=[split_model-semi]-total_loss=-0.026527537-rec_loss=0.051919915-lm_rewards=0.0109-bleu=0.7578-bleu_bs=0.2335-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3681-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7577785861471269 0.23353423692049352
--------split model training sampling display--------
 1---->  in response to the events , morsi was given a 48 - hour ultimatum by the military to meet their demands and to <split> the solve political differences , or else they would intervene by '' implementing their own road map '' for the country .
 2---->  in response to the events , morsi was given a 48 - hour ultimatum by the military to meet their demands and . <split> to solve political differences , or else they would intervene by '' implementing their own road map '' for the country .


 1---->  in response to the feedback , <low_freq> convinced harrison to officially join the show 's as its lead dance <split> the choreographer while helping him relocate to germany once 

info=[split_model-semi]-total_loss=-0.014666208-rec_loss=0.032755718-lm_rewards=0.0183-bleu=0.7521-bleu_bs=0.2438-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3761-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.752092889832043 0.24379143395434627
--------split model training sampling display--------
 1---->  the foundation has a network of partners and ngo 's and has local , regional and international activities to improve global cultural <split> the society , community development , women 's empowerment and rapid aid for natural disasters including saudi arabia and lebanon .
 2---->  the foundation has a network of partners and ngo 's and has local , regional and international activities to improve global . <split> cultural understanding , community development , women 's empowerment and rapid aid for natural disasters including saudi arabia and lebanon .


 1---->  the foundation has contributed more than $ 8 million to the medical <split> the lab. and more than $ 143 mill

info=[split_model-semi]-total_loss=-0.006899886-rec_loss=0.034882613-lm_rewards=0.0177-bleu=0.7633-bleu_bs=0.2597-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3841-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7633039840200171 0.25969398862700777
--------split model training sampling display--------
 1---->  he was a critic of the government , most of them were <split> he forced and forced to stop publications after few years .
 2---->  he was a critic of the government , most of them . <split> were banned and forced to stop publications after few years .


 1---->  he was a crowd favourite at olympic par . as <split> he a good front rower / five eighth .
 2---->  he was a crowd favourite at olympic par , . <split> as a good front rower / five eighth .


 1---->  he was a crucial part of the team that won the 2003 six nations grand <split> he slam and world cup , he jonny wilkinson for the winning drop goal .
 2---->  he was a crucial part of the team that won the 2003 six 

info=[split_model-semi]-total_loss=-0.007875221-rec_loss=0.032011524-lm_rewards=0.0071-bleu=0.7543-bleu_bs=0.5387-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[3921-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.754281941238093 0.5387039092256268
--------split model training sampling display--------
 1---->  in his early childhood , he lived at <low_freq> . later the family <split> later moved to <low_freq> and then the area close to <low_freq> stadion .
 2---->  in his early childhood , he lived at <low_freq> , later the . <split> family moved to <low_freq> and then the area close to <low_freq> stadion .


 1---->  in his early days , he received frequent and generally approving scrutiny from major literary figures , and became a protégé of henry james , <split> he he influence is discernible in '' the duchess of <low_freq> '' ( 1914 ) and '' the green mirror '' ( 1917 ) .
 2---->  in his early days , he received frequent and generally approving scrutiny from major literary 

info=[split_model-semi]-total_loss=-0.001951185-rec_loss=0.017066550-lm_rewards=0.0072-bleu=0.7517-bleu_bs=0.5953-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[4001-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7517259451264823 0.595267992679251
--------split model training sampling display--------
 1---->  upon becoming a tropical storm , the low - level circulation had become better defined , though forecasts predicted the wind shear would increase <split> the , preventing significant strengthening of the storm ; one <low_freq> at the national hurricane center remarked the system resembled a subtropical cyclone .
 2---->  upon becoming a tropical storm , the low - level circulation had become better defined , though forecasts predicted the wind shear would . <split> increase , preventing significant strengthening of the storm ; one <low_freq> at the national hurricane center remarked the system resembled a subtropical cyclone .


 1---->  upon becoming an actor , he change

 1---->  the mascot of the 2010 <low_freq> games is a <low_freq> named <low_freq> . which <split> it was named after the warrior , hang <low_freq> of the malacca sultanate .
 2---->  the mascot of the 2010 <low_freq> games is a <low_freq> named <low_freq> , . <split> which was named after the warrior , hang <low_freq> of the malacca sultanate .


 1---->  the mascot of the schaumburg boomers is coop , a <split> a person dressed in a prairie chicken costume .
 2---->  the mascot of the schaumburg boomers is coop , . <split> a person dressed in a prairie chicken costume .


 1---->  the mascot of the school is the mustang and <split> the the school colors are scarlet and gold .
 2---->  the mascot of the school is the mustang . <split> and the school colors are scarlet and gold .


 1---->  the mascot which fronts the children in need appeal is called '' pudsey bear '' , created and named <split> it in 1985 by bbc graphic designer joanna lane , who worked in the bbc 's design department 

--------split model training sampling display--------
 1---->  north carolina highway 97 ( nc 97 ) is a primary state highway in the state of north <split> it of , connecting the cities of <low_freq> and rocky mount with the vast rural coastal plains .
 2---->  north carolina highway 97 ( nc 97 ) is a primary state highway in the state of . <split> north carolina , connecting the cities of <low_freq> and rocky mount with the vast rural coastal plains .


 1---->  north carolina state highway 905 ( also called nc 905 ) is a twenty - mile north carolina <split> it runs highway that is from <low_freq> , north carolina to the south carolina state line .
 2---->  north carolina state highway 905 ( also called nc 905 ) is a twenty - mile north . <split> carolina state highway that runs from <low_freq> , north carolina to the south carolina state line .


 1---->  north carolina at duke '' on march 4 , 2006 , on the one year anniversary of <low_freq> . <split> on on game was the north carolin

--------split model training sampling display--------
 1---->  property owners may fit their home to stop water entering by blocking doors and air <split> they vents , <low_freq> important areas and <low_freq> the edges of the building .
 2---->  property owners may fit their home to stop water entering by blocking doors and . <split> air vents , <low_freq> important areas and <low_freq> the edges of the building .


 1---->  property prices are above the national average . hence council tax rates <split> council council quite high ; falling into band c and above .
 2---->  property prices are above the national average , hence council tax . <split> rates are quite high ; falling into band c and above .


 1---->  property rights are a theoretical construct in economics for determining how a resource is used . <split> it who owns owns that resource -- government , collective bodies , or individuals .
 2---->  property rights are a theoretical construct in economics for determining how 

--------split model training sampling display--------
 1---->  the dublin council advised essex to attack o'neill 's confederates in the province of leinster adjacent to dublin , an <split> the area in which the rebels in arms were reckoned at 3,000 plus 800 mercenaries sent from ulster .
 2---->  the dublin council advised essex to attack o'neill 's confederates in the province of leinster adjacent to dublin , . <split> an area in which the rebels in arms were reckoned at 3,000 plus 800 mercenaries sent from ulster .


 1---->  the dublin plant formula 's use of sugar made it popular among soda fans . <split> the resulted in clashes with other <low_freq> and the parent company of dr pepper .
 2---->  the dublin plant formula 's use of sugar made it popular among soda fans . <split> and resulted in clashes with other <low_freq> and the parent company of dr pepper .


 1---->  the <low_freq> team successfully repeated the experiment in 2012 , matching their previous <split> the results 

--------split model training sampling display--------
 1---->  he went on to earn his b.s. from florida state university in 1984 . three <split> three years later he graduated from the university of florida college of law .
 2---->  he went on to earn his b.s. from florida state university in 1984 and . <split> three years later he graduated from the university of florida college of law .


 1---->  he went on to earn his economics and law degrees from the university of pennsylvania <split> he served he served in the u.s. air force at an illinois airbase .
 2---->  he went on to earn his economics and law degrees from the university of . <split> pennsylvania , he served in the u.s. air force at an illinois airbase .


 1---->  he went on to earn his master 's degree in urban planning from the university of illinois at <split> he urbana - champaign in 2005 , he plans to attend cornell law school in the future .
 2---->  he went on to earn his master 's degree in urban planning from the 

--------split model training sampling display--------
 1---->  an optional 16 - inch 7 - spoke alloy wheel was available that is very <split> it similar to the <low_freq> 15 - inch <low_freq> <low_freq> type r alloy wheels .
 2---->  an optional 16 - inch 7 - spoke alloy wheel was available that is . <split> very similar to the <low_freq> 15 - inch <low_freq> <low_freq> type r alloy wheels .


 1---->  an optional <low_freq> cover <low_freq> attaches to the device and wakes it upon opening , and it also <split> this , a small flap covering the rear camera that automatically starts the camera application when lifted .
 2---->  an optional <low_freq> cover <low_freq> attaches to the device and wakes it upon opening , and it . <split> also has a small flap covering the rear camera that automatically starts the camera application when lifted .


 1---->  an optional companion bluetooth <low_freq> device , the pokémon is plus , is planned <split> it for future release and will alert users w

--------split model training sampling display--------
 1---->  in many nations , power is given to the head of state or sovereign ; in other cases , something short of a full declaration of <split> the marque of a as a letter of marque or a covert operation , may <low_freq> war - like acts by privateers or mercenaries .
 2---->  in many nations , power is given to the head of state or sovereign ; in other cases , something short of a full declaration . <split> of war , such as a letter of marque or a covert operation , may <low_freq> war - like acts by privateers or mercenaries .


 1---->  in many nests some brood could not be assigned to existing adult females and the high adult mortality rates result <split> these , direct advantages to helpers in the form of colony inheritance , and indirect advantages via life insurance .
 2---->  in many nests some brood could not be assigned to existing adult females and the high adult mortality rates . <split> result in direct advantages to hel

--------split model training sampling display--------
 1---->  in an effort to capitalize on his wife 's good looks , he had several <low_freq> scenes worked into the <split> he film showcased which showcased the then unknown actress in revealing outfits as some as some brief nudity .
 2---->  in an effort to capitalize on his wife 's good looks , he had several <low_freq> scenes worked into . <split> the film , which showcased the then unknown actress in revealing outfits as well as some brief nudity .


 1---->  in an effort to <low_freq> creation 's tears , british author 's radio dj and journalist malcolm dome has classified the band <split> the also '' brit goth '' , alongside various music publications which also slotted the band into the gothic metal genre .
 2---->  in an effort to <low_freq> creation 's tears , british author , radio dj and journalist malcolm dome has classified the . <split> band as '' brit goth '' , alongside various music publications which also slotted the

--------split model training sampling display--------
 1---->  he then slowly <low_freq> his arm successfully , wrapping the stump to <split> he prevent <low_freq> and taking a picture of the boulder .
 2---->  he then slowly <low_freq> his arm successfully , wrapping the stump . <split> to prevent <low_freq> and taking a picture of the boulder .


 1---->  he then <low_freq> and looks into the rear view mirror of the car in which his reflection is not visible and anjali seems <split> he to be leaning on nothing , thus conveying that he did die in the swamp and is now a ghost .
 2---->  he then <low_freq> and looks into the rear view mirror of the car in which his reflection is not visible and anjali . <split> seems to be leaning on nothing , thus conveying that he did die in the swamp and is now a ghost .


 1---->  he then speaks of the holy spirit 's presence as being the <split> he presence is <low_freq> all of our life and service .
 2---->  he then speaks of the holy spirit 's pr

info=[split_model-semi]-total_loss=-0.000677119-rec_loss=0.010699677-lm_rewards=0.0081-bleu=0.7523-bleu_bs=0.2004-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[4761-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7523034446897131 0.20043034518038083
--------split model training sampling display--------
 1---->  her worst fears are confirmed - alex and ryan were swapped accidentally just after <split> making birth , making alex michelle 's biological son , not ryan .
 2---->  her worst fears are confirmed - alex and ryan were swapped accidentally just . <split> after birth , making alex michelle 's biological son , not ryan .


 1---->  her worst noteworthy injury was when she broke her arm in 1995 after falling off the <split> after isometric bars were she had two plates and twelve bolts inserted in the injury .
 2---->  her worst noteworthy injury was when she broke her arm in 1995 after falling off . <split> the isometric bars , she had two plates and twelve bolts inserted i

info=[split_model-semi]-total_loss=-0.003414409-rec_loss=0.011960542-lm_rewards=0.0213-bleu=0.7467-bleu_bs=0.1991-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[4841-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7467219363045291 0.1990780550742093
--------split model training sampling display--------
 1---->  natural born <low_freq> '' is a single from the reunited duo of dr. dre and <split> originally originally cube originally intended for the scrapped album ' <low_freq> <low_freq> ' .
 2---->  natural born <low_freq> '' is a single from the reunited duo of dr. dre . <split> and ice cube originally intended for the scrapped album ' <low_freq> <low_freq> ' .


 1---->  natural born <low_freq> '' is the season finale of '' the simpsons '' ' ninth <split> the season which originally aired on the fox network on may 17 , 1998 .
 2---->  natural born <low_freq> '' is the season finale of '' the simpsons '' ' . <split> ninth season which originally aired on the fox network on may 

info=[split_model-semi]-total_loss=-0.052102033-rec_loss=0.009914954-lm_rewards=0.0070-bleu=0.7261-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[4921-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.726059911294011 0.19919451040348235
--------split model training sampling display--------
 1---->  upon the death of <low_freq> ii , <low_freq> attempted to claim the <low_freq> throne for himself , but this was met <split> this with an outcry , led by lysander , who was at the height of his influence in sparta .
 2---->  upon the death of <low_freq> ii , <low_freq> attempted to claim the <low_freq> throne for himself , but this was . <split> met with an outcry , led by lysander , who was at the height of his influence in sparta .


 1---->  upon the death of capt. davy 's wife elizabeth he emigrated with his sons to new plymouth , new zealand ( 1841 ) where he <split> this established a farm called '' <low_freq> '' and built retail stores , a brewery and public house

info=[split_model-semi]-total_loss=-0.013237283-rec_loss=0.036329765-lm_rewards=0.0263-bleu=0.7445-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5001-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7445069094678969 0.19919451040348235
--------split model training sampling display--------
 1---->  all of george 's suggestions as to replace to replace him as leading lady are rejected as being too short , too old or <split> then too old , then he realises he has a leading lady in his presence all the time and replaces george with bob .
 2---->  all of george 's suggestions as to who to replace him as leading lady are rejected as being too short , too old . <split> or too dead ; then he realises he has a leading lady in his presence all the time and replaces george with bob .


 1---->  all of i - 78 's auxiliary routes serve new york city ; however , none of these routes actually <split> they intersects i - 78 , following the route 's <low_freq> to the east end of 

info=[split_model-semi]-total_loss=-0.006365131-rec_loss=0.055006016-lm_rewards=0.0119-bleu=0.7548-bleu_bs=0.1990-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5081-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7548259048520526 0.199009272597782
--------split model training sampling display--------
 1---->  the yarn market in <low_freq> , somerset , england was built around 1600 . <split> it has been designated as a grade i listed building and scheduled monument .
 2---->  the yarn market in <low_freq> , somerset , england was built around 1600 . <split> and has been designated as a grade i listed building and scheduled monument .


 1---->  the yarra valley tourist railway originally leased the entire line , however they discontinued their lease on the section <split> they from <low_freq> to yarra glen , because of the bad condition of the bridges in that section .
 2---->  the yarra valley tourist railway originally leased the entire line , however they discontinued their 

info=[split_model-semi]-total_loss=-0.034989767-rec_loss=0.032008655-lm_rewards=0.0086-bleu=0.7550-bleu_bs=0.2448-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5161-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7549872195562748 0.24476534292804586
--------split model training sampling display--------
 1---->  <low_freq> - also <low_freq> ( berber : <low_freq> ) - was a roman colony in berber africa <split> the was with a fort ( '' <low_freq> '' ) , on the limes <low_freq> .
 2---->  <low_freq> - also <low_freq> ( berber : <low_freq> ) - was a roman colony in berber . <split> africa , with a fort ( '' <low_freq> '' ) , on the limes <low_freq> .


 1---->  <low_freq> <low_freq> - 3 was discovered during an <low_freq> 150 sounding rocket flight on april 25 , 1965 , <split> the they equatorial coordinates for the year 1950 ( <low_freq> ) right ascension ra <low_freq> ( dec ) .
 2---->  <low_freq> <low_freq> - 3 was discovered during an <low_freq> 150 sounding rocket flight on ap

info=[split_model-semi]-total_loss=-0.000793999-rec_loss=0.036306158-lm_rewards=0.0089-bleu=0.7524-bleu_bs=0.2981-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5241-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7523570990378937 0.2980991071486141
--------split model training sampling display--------
 1---->  nalanda was the oldest university - system of education in the world in the world <split> it sense of <low_freq> all subjects were taught in <low_freq> - <low_freq> language .
 2---->  nalanda was the oldest university - system of education in the world in the . <split> modern sense of <low_freq> all subjects were taught in <low_freq> - <low_freq> language .


 1---->  <low_freq> runs the only bar in the canyon , in which is served a strange drink made by himself <split> it it it is unknown what the recipe is but according to him it will get you drunk .
 2---->  <low_freq> runs the only bar in the canyon , in which is served a strange drink made by . <split> himself , it

info=[split_model-semi]-total_loss=-0.010044014-rec_loss=0.055325743-lm_rewards=0.0188-bleu=0.7429-bleu_bs=0.2735-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5321-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7429212842715495 0.2734544117302476
--------split model training sampling display--------
 1---->  this is continued untill the edge of the hole is <split> the breached , then the dish is mixed .
 2---->  this is continued untill the edge of the hole . <split> is breached , then the dish is mixed .


 1---->  this is contrary to the data analyzed in a study published by author and illustrator gregory s. paul , a study <split> a has which the <low_freq> and conclusions has been criticized by more recent articles published in the same journal .
 2---->  this is contrary to the data analyzed in a study published by author and illustrator gregory s. paul , a . <split> study in which the <low_freq> and conclusions has been criticized by more recent articles published in th

info=[split_model-semi]-total_loss=-0.006339691-rec_loss=0.030813700-lm_rewards=0.0238-bleu=0.7472-bleu_bs=0.2684-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5401-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.747169782409104 0.2683990855769778
--------split model training sampling display--------
 1---->  barbara <low_freq> <low_freq> , is a danish celebrity who is professionally known as barbara <low_freq> , was born november 7 , 1980 in <low_freq> , denmark <split> <low_freq> to parents miriam <low_freq> ( of the slovenian / italian descent ) and <low_freq> h. jørgensen ( of danish / swedish descent ) .
 2---->  barbara <low_freq> <low_freq> , is a danish celebrity who is professionally known as barbara <low_freq> , was born november 7 , 1980 in <low_freq> , . <split> denmark to parents miriam <low_freq> ( of the slovenian / italian descent ) and <low_freq> h. jørgensen ( of danish / swedish descent ) .


 1---->  barbara davidson , a staff photographer at the los angele

info=[split_model-semi]-total_loss=-0.007522795-rec_loss=0.021648820-lm_rewards=0.0070-bleu=0.7575-bleu_bs=0.2517-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5481-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7575040094897016 0.251748760113489
--------split model training sampling display--------
 1---->  very young larvae feed only on the upper surfaces of <split> later leaves ; later , they consume whole leaves .
 2---->  very young larvae feed only on the upper surfaces . <split> of leaves ; later , they consume whole leaves .


 1---->  <low_freq> is a juice beverage brand currently owned by sunny delight , , the brand dates <split> the back to 1865 ( under the name '' new england beverage '' ) .
 2---->  <low_freq> is a juice beverage brand currently owned by sunny delight beverages , the brand . <split> dates back to 1865 ( under the name '' new england beverage '' ) .


 1---->  <low_freq> dam is a 220 - meter high dam <split> it in switzerland constructed from 1960

info=[split_model-semi]-total_loss=-0.004330560-rec_loss=0.020978471-lm_rewards=0.0076-bleu=0.7564-bleu_bs=0.2448-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5561-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7563633724832155 0.24476534292804586
--------split model training sampling display--------
 1---->  during the offseason of 2010 he opted out of his current contract with <low_freq> <low_freq> , seeking a new nhl <split> after 's , after failing to do so , he decided to stay with <low_freq> for another season .
 2---->  during the offseason of 2010 he opted out of his current contract with <low_freq> <low_freq> , seeking a new . <split> nhl contract but after failing to do so , he decided to stay with <low_freq> for another season .


 1---->  during the one - day occupation of the town , the british took guns , ammunition , wagons , horses , livestock <split> the and other <low_freq> , and in addition were reported to have participated in looting , vandalism , raping

info=[split_model-semi]-total_loss=-0.011566340-rec_loss=0.022185206-lm_rewards=0.0103-bleu=0.7571-bleu_bs=0.2934-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5641-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7570589863173823 0.2933686667215542
--------split model training sampling display--------
 1---->  he also formed a friendship with noted author ernest hemingway and became an accomplished <low_freq> was paroled into <split> eventually alpha flight and , eventually joining alpha flight , he adopted the codename ' puck .
 2---->  he also formed a friendship with noted author ernest hemingway and became an accomplished <low_freq> was paroled . <split> into beta flight and , eventually joining alpha flight , he adopted the codename ' puck .


 1---->  he also formed an intelligence network that and used the information collected to help organize more escapes <split> he more to guide the runaway slaves into the liberated territory , known as settlement .
 2---->  he also 

info=[split_model-semi]-total_loss=-0.011663782-rec_loss=0.022104984-lm_rewards=0.0062-bleu=0.7525-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5721-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7525149993671253 0.19919451040348235
--------split model training sampling display--------
 1---->  in 1846 , he married charlotte augusta <low_freq> ( c. 1825 - 1887 ) of south carolina and in 1859 he <split> in built a home at 350 fifth avenue , which is today the street address of the empire address building .
 2---->  in 1846 , he married charlotte augusta <low_freq> ( c. 1825 - 1887 ) of south carolina and in 1859 . <split> he built a home at 350 fifth avenue , which is today the street address of the empire state building .


 1---->  in 1846 , he officially changed his name to david levy <low_freq> ( adding his father 's ancestral sephardic surname ) and married <low_freq> <split> they married <low_freq> , the daughter of charles a. <low_freq> , former governor

info=[split_model-semi]-total_loss=-0.013067899-rec_loss=0.017379345-lm_rewards=0.0200-bleu=0.7456-bleu_bs=0.1984-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5801-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7455912710915564 0.19842473761617535
--------split model training sampling display--------
 1---->  as the daughter of a lawyer , cher has developed extensive skills in debating and discussion , such as the ability <split> she to talk her teachers into improving her grades ; she also <low_freq> at mr. hall 's debate class .
 2---->  as the daughter of a lawyer , cher has developed extensive skills in debating and discussion , such as the . <split> ability to talk her teachers into improving her grades ; she also <low_freq> at mr. hall 's debate class .


 1---->  as the daughter of an officer in the russian imperial army , <low_freq> suffered greatly during the revolution with <split> she her three brothers ( who were army officers like their three 's being murdered b

info=[split_model-semi]-total_loss=-0.001429304-rec_loss=0.004294062-lm_rewards=0.0153-bleu=0.7663-bleu_bs=0.4578-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5881-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7663235884438083 0.4577797053831403
--------split model training sampling display--------
 1---->  the carnivore house received several new cages in 1896 . several other bigger cats were <split> all featured , until all moved into a newly built carnivore house in 1904 .
 2---->  the carnivore house received several new cages in 1896 where several other bigger cats . <split> were featured , until all moved into a newly built carnivore house in 1904 .


 1---->  the <low_freq> vocabulary is traditionally part of carnival cant , a secret language . and is an ever form <split> it changing form of communication , in large part designed to be impossible to be by an outsider .
 2---->  the <low_freq> vocabulary is traditionally part of carnival cant , a secret language , and

info=[split_model-semi]-total_loss=-0.006664576-rec_loss=0.017440403-lm_rewards=0.0047-bleu=0.7584-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[5961-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7583612006605653 0.19919451040348235
--------split model training sampling display--------
 1---->  the saint - <low_freq> cloister , also french , dates to the late 12th -- early to 13th centuries <split> it , it was originally built for the monastery of saint - <low_freq> - le - <low_freq> .
 2---->  the saint - <low_freq> cloister , also french , dates to the late 12th -- early to 13th . <split> centuries , and was originally built for the monastery of saint - <low_freq> - le - <low_freq> .


 1---->  the saint - thomas church ( , ) is the main protestant church of strasbourg since strasbourg <split> the 's cathedral became catholic again after the annexation of the town by france in 1681 .
 2---->  the saint - thomas church ( , ) is the main protestant church of s

info=[split_model-semi]-total_loss=-0.008425760-rec_loss=0.071821228-lm_rewards=0.0092-bleu=0.7612-bleu_bs=0.3291-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6041-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.761154800072331 0.3291098797051803
--------split model training sampling display--------
 1---->  chronicling the adventures of a bengali boy in the forests of africa , it is considered <split> it to be one of the most important adventure novels written in the bengali language .
 2---->  chronicling the adventures of a bengali boy in the forests of africa , it is . <split> considered to be one of the most important adventure novels written in the bengali language .


 1---->  chrono 4 , the emblem collection for eberhard , has a new offering <split> it in chrono 4 <low_freq> , large <low_freq> full of character .
 2---->  chrono 4 , the emblem collection for eberhard , has a new . <split> offering in chrono 4 <low_freq> , large <low_freq> full of character .


 1---->

info=[split_model-semi]-total_loss=-0.020678373-rec_loss=0.045527764-lm_rewards=0.0228-bleu=0.7579-bleu_bs=0.2864-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6121-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7578779703115202 0.28640721353588516
--------split model training sampling display--------
 1---->  the cobalt <low_freq> valkyrie is an experimental single - engine four to five - seat canard pusher configuration <split> it light aircraft intended for private ownership being developed in the united states by cobalt aircraft .
 2---->  the cobalt <low_freq> valkyrie is an experimental single - engine four to five - seat canard pusher . <split> configuration light aircraft intended for private ownership being developed in the united states by cobalt aircraft .


 1---->  the cobalt railway station is a former train station located in the town of cobalt in <split> it ontario , canada was a stop for ontario northland 's '' <low_freq> '' trains .
 2---->  the cobalt railw

info=[split_model-semi]-total_loss=-0.007565717-rec_loss=0.019614374-lm_rewards=0.0110-bleu=0.7519-bleu_bs=0.2312-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6201-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7519433217488929 0.2311807310550032
--------split model training sampling display--------
 1---->  it broadcasts a high definition digital signal on uhf channel 35 <split> it from a transmitter on dutton hill in south gray .
 2---->  it broadcasts a high definition digital signal on uhf channel . <split> 35 from a transmitter on dutton hill in south gray .


 1---->  it broadcasts a high definition digital signal on uhf channel 43 ( or virtual channel 7.1 via <split> it psip ) from a transmitter on potts chapel road in unincorporated eastern madison county .
 2---->  it broadcasts a high definition digital signal on uhf channel 43 ( or virtual channel 7.1 . <split> via psip ) from a transmitter on potts chapel road in unincorporated eastern madison county .


 1----> 

info=[split_model-semi]-total_loss=-0.032299642-rec_loss=0.120871320-lm_rewards=0.0144-bleu=0.7552-bleu_bs=0.3827-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6281-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7551551282453459 0.38267464162431575
--------split model training sampling display--------
 1---->  it serves routes to mexican cities , mainly mexico city and monterrey . and <split> the also has international services , with daily flights to houston , texas .
 2---->  it serves routes to mexican cities , mainly mexico city and monterrey , . <split> and also has international services , with daily flights to houston , texas .


 1---->  it serves students from <low_freq> and ridgeway . consists of four schools : two <split> two elementary schools are one middle school , and one high school .
 2---->  it serves students from <low_freq> and ridgeway and consists of four schools : . <split> two elementary schools , one middle school , and one high school .


 1---->  it

info=[split_model-semi]-total_loss=0.009110982-rec_loss=0.076618239-lm_rewards=0.0066-bleu=0.7663-bleu_bs=0.3579-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6361-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7663300448895839 0.35785019965799675
--------split model training sampling display--------
 1---->  <low_freq> is the first cirque du soleil production that deviates from the company 's usual format -- the production <split> the presents a more straightforward story , unlike the more abstract visuals presented by other cirque productions .
 2---->  <low_freq> is the first cirque du soleil production that deviates from the company 's usual format -- the . <split> production presents a more straightforward story , unlike the more abstract visuals presented by other cirque productions .


 1---->  <low_freq> <low_freq> and miloš <low_freq> began their collaboration in the early 1960s , as filmmakers , then decided to work on a cycle of graphic novels <split> the decided t

info=[split_model-semi]-total_loss=-0.002482653-rec_loss=0.036419369-lm_rewards=0.0063-bleu=0.7511-bleu_bs=0.3234-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6441-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7511498998117151 0.32340529654166383
--------split model training sampling display--------
 1---->  the aft - crossing trajectory is a flight path for a rocket , which would be slowed by a <split> it small parachute attached to its tail , and then ignite once the carrier aircraft has passed it .
 2---->  the aft - crossing trajectory is a flight path for a rocket , which would be slowed by . <split> a small parachute attached to its tail , and then ignite once the carrier aircraft has passed it .


 1---->  the aftermath of wwii showed that the <low_freq> as a race , must strive to increase its <split> the population for the second time and the race for a new <low_freq> baby was on .
 2---->  the aftermath of wwii showed that the <low_freq> as a race , must strive to 

info=[split_model-semi]-total_loss=-0.019732041-rec_loss=0.066035248-lm_rewards=0.0068-bleu=0.7487-bleu_bs=0.4033-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6521-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7486659470272969 0.40332283361640725
--------split model training sampling display--------
 1---->  <low_freq> is prepared by first condensing <low_freq> with <low_freq> conditions acidic and <low_freq> conditions , with four of the resulting <low_freq> groups <split> four removed by <low_freq> using palladium on carbon and hydrogen and the resulting secondary amine groups are <low_freq> in acetic anhydride .
 2---->  <low_freq> is prepared by first condensing <low_freq> with <low_freq> under acidic and <low_freq> conditions , with four of the resulting <low_freq> . <split> groups removed by <low_freq> using palladium on carbon and hydrogen and the resulting secondary amine groups are <low_freq> in acetic anhydride .


 1---->  <low_freq> rijeka are the city 's main f

info=[split_model-semi]-total_loss=0.032928266-rec_loss=0.111049056-lm_rewards=0.0071-bleu=0.7522-bleu_bs=0.2710-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6601-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7522121296396054 0.27101140038113297
--------split model training sampling display--------
 1---->  she has an rn and <low_freq> from cornell university in new york city and an <split> she mph at the school of public health of university of california , berkeley .
 2---->  she has an rn and <low_freq> from cornell university in new york , and . <split> an mph at the school of public health of university of california , berkeley .


 1---->  she has an active social media presence and also supports multiple charities <split> she concerns causes including women 's empowerment and animal rights .
 2---->  she has an active social media presence and also supports multiple . <split> charities and causes including women 's empowerment and animal rights .


 1---->  she has a

info=[split_model-semi]-total_loss=-0.005809459-rec_loss=0.034829032-lm_rewards=0.0102-bleu=0.7602-bleu_bs=0.3100-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6681-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7602025171071006 0.30995282816055464
--------split model training sampling display--------
 1---->  it thus generated <low_freq> , as well as <low_freq> , <split> it commoners ; commoners could lay charges against scholars .
 2---->  it thus generated <low_freq> , as well as <low_freq> . <split> , warfare ; commoners could lay charges against scholars .


 1---->  it tied for the most since <low_freq> are hot and <split> it humid with a july daily average temperature of .
 2---->  it tied for the most since <low_freq> are hot . <split> and humid with a july daily average temperature of .


 1---->  it <low_freq> acid soils but favours neutral to alkaline soils , and <split> in in some conditions it may be short - lived .
 2---->  it <low_freq> acid soils but favours n

info=[split_model-semi]-total_loss=-0.094167307-rec_loss=0.028825073-lm_rewards=0.0056-bleu=0.7531-bleu_bs=0.3139-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6761-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7531459910743087 0.3138941290789757
--------split model training sampling display--------
 1---->  she made her first appearance on tv in 1984 in '' man of letters '' , but is <split> she best known for playing ruth wilkinson in the soap opera neighbours from 1996 until 1999 .
 2---->  she made her first appearance on tv in 1984 in '' man of letters '' , but . <split> is best known for playing ruth wilkinson in the soap opera neighbours from 1996 until 1999 .


 1---->  she made her first appearance on the london stage in 1957 <split> she , the title role of '' <low_freq> <low_freq> '' .
 2---->  she made her first appearance on the london stage in . <split> 1957 in the title role of '' <low_freq> <low_freq> '' .


 1---->  she made her first appearance on the yearly 

info=[split_model-semi]-total_loss=-0.039427690-rec_loss=0.071488753-lm_rewards=0.0088-bleu=0.7557-bleu_bs=0.3123-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6841-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7557197908695665 0.312298891402516
--------split model training sampling display--------
 1---->  later the next morning , japanese soldiers attack , and during the intense fighting , <split> during anderson is killed and whitehorse is about to be captured by the japanese .
 2---->  later the next morning , japanese soldiers attack , and during the intense fighting . <split> , anderson is killed and whitehorse is about to be captured by the japanese .


 1---->  later the owner of the club was the <low_freq> <low_freq> company ( based in <low_freq> ) . now <split> now the club is owned by the government of <low_freq> and an investment group , <low_freq> <low_freq> .
 2---->  later the owner of the club was the <low_freq> <low_freq> company ( based in <low_freq> ) , . 

info=[split_model-semi]-total_loss=-0.017689476-rec_loss=0.053222142-lm_rewards=0.0210-bleu=0.7480-bleu_bs=0.2285-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[6921-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7479997451326665 0.22848226782026626
--------split model training sampling display--------
 1---->  a spill vase was usually kept on the <low_freq> and was filled with spills used to transfer <split> this , from the fireplace to candles , lamps , a pipe or a cigar .
 2---->  a spill vase was usually kept on the <low_freq> and was filled with spills used to . <split> transfer fire from the fireplace to candles , lamps , a pipe or a cigar .


 1---->  a spin - off , entitled '' bumblebee '' , is scheduled for release on december 21 , 2018 <split> the , an an untitled sixth film is set to be released on june 28 , 2019 .
 2---->  a spin - off , entitled '' bumblebee '' , is scheduled for release on december 21 , . <split> 2018 , and an untitled sixth film is set to be rel

info=[split_model-semi]-total_loss=-0.004000721-rec_loss=0.015039876-lm_rewards=0.0087-bleu=0.7460-bleu_bs=0.2256-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7001-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7460372104495803 0.22555784320144584
--------split model training sampling display--------
 1---->  <low_freq> <low_freq> ( january 24 , 1892 , <low_freq> ( now vojvodina ) - <low_freq> 1945 , , a former officer in <split> a october austro - hungarian army , was arrested in october 1931 and charged with arranging the derailment of several trains .
 2---->  <low_freq> <low_freq> ( january 24 , 1892 , <low_freq> ( now vojvodina ) - <low_freq> 1945 ) , a former officer . <split> in the austro - hungarian army , was arrested in october 1931 and charged with arranging the derailment of several trains .


 1---->  <low_freq> <low_freq> <low_freq> ( born february 21 , 1911 , <low_freq> county , north , today <low_freq> bara , north banat - <split> he died october 3 , 1953 , 

info=[split_model-semi]-total_loss=-0.033883251-rec_loss=0.053185187-lm_rewards=0.0149-bleu=0.7586-bleu_bs=0.2286-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7081-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7585530462636422 0.2286062751093647
--------split model training sampling display--------
 1---->  in 2000 , he was elected to the lebanese parliament for <low_freq> - <low_freq> <split> he running , running on the list of druze leader walid <low_freq> .
 2---->  in 2000 , he was elected to the lebanese parliament for <low_freq> - . <split> <low_freq> constituency , running on the list of druze leader walid <low_freq> .


 1---->  in 2000 , he was named ' asian of the year ' . and in november <split> in 2009 , he was given an honorary doctorate from the university of hertfordshire .
 2---->  in 2000 , he was named ' asian of the year ' , and in . <split> november 2009 , he was given an honorary doctorate from the university of hertfordshire .


 1---->  in 2000 , he w

info=[split_model-semi]-total_loss=-0.016418498-rec_loss=0.076220542-lm_rewards=0.0162-bleu=0.7544-bleu_bs=0.2653-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7161-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7543648712269931 0.26531140798675756
--------split model training sampling display--------
 1---->  <low_freq> afc play in the 1st division of the west riding county amateur <split> the football league at the gregory playing fields on <low_freq> road .
 2---->  <low_freq> afc play in the 1st division of the west riding county . <split> amateur football league at the gregory playing fields on <low_freq> road .


 1---->  kirkby head was plotted from air photographs taken from an australian national antarctic research expeditions expeditions <low_freq> ) aircraft in 1956 <split> and , and was first visited by an <low_freq> party led by sydney l. kirkby in november , 1960 .
 2---->  kirkby head was plotted from air photographs taken from an australian national antarctic 

info=[split_model-semi]-total_loss=-0.061690032-rec_loss=0.055505261-lm_rewards=0.0103-bleu=0.7419-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7241-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7418521313816818 0.19919451040348235
--------split model training sampling display--------
 1---->  the lyric theater served as a symbol of black economic influence . as well as a social gathering <split> he place a free of discrimination - and a source of pride and culture within <low_freq> .
 2---->  the lyric theater served as a symbol of black economic influence , as well as a social . <split> gathering place - free of discrimination - and a source of pride and culture within <low_freq> .


 1---->  the lyric theatre and cultural arts center is a historic performance venue in lexington , kentucky rooted in <split> he lexington - american history , focuses on community development and rebirth in lexington 's east end .
 2---->  the lyric theatre and cultural arts c

info=[split_model-semi]-total_loss=-0.310129464-rec_loss=0.083038174-lm_rewards=0.0159-bleu=0.7472-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7321-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7472310947047562 0.19919451040348235
--------split model training sampling display--------
 1---->  <low_freq> di ser <low_freq> ( documented as active <low_freq> -- 1363 ) , an italian painter and <low_freq> <low_freq> , generally has been identified <split> <low_freq> as <low_freq> di ser <low_freq> di francesco <low_freq> , <low_freq> recent research points instead to a <low_freq> di ser <low_freq> di stefano .
 2---->  <low_freq> di ser <low_freq> ( documented as active <low_freq> -- 1363 ) , an italian painter and <low_freq> <low_freq> , generally has been . <split> identified as <low_freq> di ser <low_freq> di francesco <low_freq> , but recent research points instead to a <low_freq> di ser <low_freq> di stefano .


 1---->  niccolò <low_freq> ( 1310 -- 8 novembe

info=[split_model-semi]-total_loss=-0.237811118-rec_loss=0.028177997-lm_rewards=0.0108-bleu=0.7393-bleu_bs=0.2158-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7401-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7392806106015525 0.21582658720732759
--------split model training sampling display--------
 1---->  declan <low_freq> returned to directed the first two episodes , and followed by mark <low_freq> for <split> declan episodes 3 and 4 , declan nigel cole for episodes 3 and 6 .
 2---->  declan <low_freq> returned to directed the first two episodes , and followed by mark <low_freq> . <split> for episodes 3 and 4 , and nigel cole for episodes 5 and 6 .


 1---->  declan <low_freq> is a fictional character in the british soap opera <split> he '' emmerdale '' , introduced on 15 april 2010 .
 2---->  declan <low_freq> is a fictional character in the british soap . <split> opera '' emmerdale '' , introduced on 15 april 2010 .


 1---->  declan quinn ( ; born 1957 ) is an irish 

info=[split_model-semi]-total_loss=-0.004331584-rec_loss=0.039549626-lm_rewards=0.0060-bleu=0.7452-bleu_bs=0.2158-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7481-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7451801780650245 0.21582658720732759
--------split model training sampling display--------
 1---->  infantry support using direct - fire was its intended role , and later there <split> later was also a strong emphasis on destroying enemy armour whenever encountered .
 2---->  infantry support using direct - fire was its intended role , and later . <split> there was also a strong emphasis on destroying enemy armour whenever encountered .


 1---->  infantry training took place during the winter of 1940 <split> the , preparations made for the anticipated invasion .
 2---->  infantry training took place during the winter of . <split> 1940 and preparations made for the anticipated invasion .


 1---->  infantry was using russian made and english made rifles <split> englis

info=[split_model-semi]-total_loss=0.046383981-rec_loss=0.016392848-lm_rewards=0.0167-bleu=0.7162-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7561-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7161644315207445 0.19919451040348235
--------split model training sampling display--------
 1---->  a tv translation should take place at <low_freq> . which <split> the attracted many passengers to the main deck .
 2---->  a tv translation should take place at <low_freq> , . <split> which attracted many passengers to the main deck .


 1---->  a tv version of '' <low_freq> '' ran on mtv from 1996 to 2000 . it followed the <split> it radio general format expanded the radio program but featured a live audience and a female co-host .
 2---->  a tv version of '' <low_freq> '' ran on mtv from 1996 to 2000 ; it followed . <split> the same general format as the radio program but featured a live audience and a female co-host .


 1---->  a tablet is normally without without a 

info=[split_model-semi]-total_loss=-0.066264741-rec_loss=0.120805837-lm_rewards=0.0192-bleu=0.7415-bleu_bs=0.2519-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7641-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7414809258664196 0.25193345263724437
--------split model training sampling display--------
 1---->  central community college is housed in buildings of the former naval ammunition <low_freq> , st. michael <split> st. 's elementary school , built in 1912 , is now the police headquarters .
 2---->  central community college is housed in buildings of the former naval ammunition <low_freq> , st. . <split> michael 's elementary school , built in 1912 , is now the police headquarters .


 1---->  central connecticut state university 's commencement exercises are usually held either in hartford at the xl center <split> the statistics formerly the hartford civic center ) or on campus at herbert d. <low_freq> hall .
 2---->  central connecticut state university 's commencement

info=[split_model-semi]-total_loss=-0.007109151-rec_loss=0.039131060-lm_rewards=0.0167-bleu=0.7513-bleu_bs=0.2022-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7721-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7512733566143398 0.20224549559644944
--------split model training sampling display--------
 1---->  social <low_freq> is service oriented organizations , helps businesses to increase brand revenue helps to reach its <split> the targeted audiences by offering marketing services via facebook , and other social networking websites .
 2---->  social <low_freq> is service oriented organizations , helps businesses to increase brand revenue helps to reach . <split> its targeted audiences by offering marketing services via facebook , and other social networking websites .


 1---->  social credit - national unity was the label used by harry watson arnold when he <split> the ran in the 1940 canadian election in the 1940 of saskatoon city .
 2---->  social credit - national uni

info=[split_model-semi]-total_loss=-0.009397268-rec_loss=0.027193589-lm_rewards=0.0122-bleu=0.7429-bleu_bs=0.1992-hidden_dim=256-input_dim=100-epoch=0-batch_size=17-batch_id=[7801-[of]-46585]-lr=0.0050-loss_ratio=0.4500 0.7429293381220866 0.19919451040348235
--------split model training sampling display--------
 1---->  the <low_freq> frequency festival , also frequency festival or just frequency festival formerly vienna city festival , <split> the vienna a music festival that until 2008 place near salzburg austria , usually every august .
 2---->  the <low_freq> frequency festival , also frequency festival or just frequency , formerly vienna city festival . <split> , is a music festival that until 2008 place near salzburg austria , usually every august .


 1---->  the <low_freq> concept , which had been used with success in the mediterranean theatre , was not part of british doctrine , <split> the it held that a corps was a purely operational formation , and therefore not part of the

 1---->  she has hosted several television shows in the 1990s and written several books , including '' the art of war '' about war <split> she featured featured featured her son , peter <low_freq> who was australia 's official war artist in the war on terrorism .
 2---->  she has hosted several television shows in the 1990s and written several books , including '' the art of war '' about . <split> war artists which featured her son , peter <low_freq> who was australia 's official war artist in the war on terrorism .


 1---->  she has hosted the prestigious gsm awards ceremony in cannes as well , and the pan european ir awards <split> it are zurich in october 2006 and was a speaker at the danish media festival in november 2006 .
 2---->  she has hosted the prestigious gsm awards ceremony in cannes as well , and the pan european ir . <split> awards in zurich in october 2006 and was a speaker at the danish media festival in november 2006 .


 1---->  she has inspired at least two songs :

--------split model training sampling display--------
 1---->  aside from yachting , burnham also boasts well - supported association football , rugby union , cricket and <split> burnham lawn bowls clubs , all of which compete at local , county and regional level .
 2---->  aside from yachting , burnham also boasts well - supported association football , rugby union , cricket . <split> and lawn bowls clubs , all of which compete at local , county and regional level .


 1---->  <low_freq> from his software projects , he is also well known for speaking and writing novel opinion pieces on technology , <split> the he , and technical communities , in which he is frequently referred to as simply as ' zed ' .
 2---->  <low_freq> from his software projects , he is also well known for speaking and writing novel opinion pieces on technology . <split> , business , and technical communities , in which he is frequently referred to as simply as ' zed ' .


 1---->  asif ali <low_freq> , the preside

--------split model training sampling display--------
 1---->  <low_freq> is an american toy company that sells a line <split> the <low_freq> balls that sells when you throw them .
 2---->  <low_freq> is an american toy company that sells a . <split> line of balls that swear when you throw them .


 1---->  swearing revenge on cleveland , robert left with the <low_freq> ' pet <low_freq> , joan , although <split> since since the pilot , his intentions for revenge seemed to have mysteriously been forgotten .
 2---->  swearing revenge on cleveland , robert left with the <low_freq> ' pet <low_freq> , joan , . <split> although since the pilot , his intentions for revenge seemed to have mysteriously been forgotten .


 1---->  <low_freq> died 1908 in his native newark , 60 years old , and <split> he is interred at evergreen cemetery , hillside , new jersey .
 2---->  <low_freq> died 1908 in his native newark , 60 years old , . <split> and is interred at evergreen cemetery , hillside , new je

--------split model training sampling display--------
 1---->  in order to maintain the scale and perspective of the building , the windows are by necessity long and <split> this narrow ; this creates an illusion of greater height - illusion being a typical baroque feature .
 2---->  in order to maintain the scale and perspective of the building , the windows are by necessity long . <split> and narrow ; this creates an illusion of greater height - illusion being a typical baroque feature .


 1---->  in order to maintain the scale and perspective of the building , the windows are by necessity long <split> narrow this narrow this creates an illusion of greater height - illusion being a typical baroque feature .
 2---->  in order to maintain the scale and perspective of the building , the windows are by necessity . <split> long and narrow this creates an illusion of greater height - illusion being a typical baroque feature .


 1---->  in order to maintain the scale and perspective of th

--------split model training sampling display--------
 1---->  however , it is expected that charges will be filed in the next weeks to months <split> the , the the charges could carry penalties as high as the death penalty .
 2---->  however , it is expected that charges will be filed in the next weeks to . <split> months , and the charges could carry penalties as high as the death penalty .


 1---->  however , it is feared that cross <low_freq> is rife and <split> the dundee city council plan to carry out mass <low_freq> .
 2---->  however , it is feared that cross <low_freq> is rife . <split> and dundee city council plan to carry out mass <low_freq> .


 1---->  however , it is generally accepted that '' <low_freq> '' ( <low_freq> ) refers to the <low_freq> river and '' <low_freq> '' <split> <low_freq> to the <low_freq> river , and that man 's territory was bordered on the north by the han empire .
 2---->  however , it is generally accepted that '' <low_freq> '' ( <low_freq> ) ref

--------split model training sampling display--------
 1---->  on september 20 , 2015 , david ortiz hit his <low_freq> career home run off matt moore in tropicana field becoming the 27th player in <split> in november history to achieve that prestigious milestone ; in november 2015 , ortiz announced that the 2016 season would be his last .
 2---->  on september 20 , 2015 , david ortiz hit his <low_freq> career home run off matt moore in tropicana field becoming the 27th player . <split> in mlb history to achieve that prestigious milestone ; in november 2015 , ortiz announced that the 2016 season would be his last .


 1---->  on september 20 , 2016 , <low_freq> signed with the atlanta hawks <split> <low_freq> was <low_freq> was waived on october 21 , 2016 .
 2---->  on september 20 , 2016 , <low_freq> signed with the atlanta . <split> hawks , but was waived on october 21 , 2016 .


 1---->  on september 20 , 2016 , <low_freq> signed with the toronto raptors . but <split> he was waived o

--------split model training sampling display--------
 1---->  another band known as black bob 's initially refused to be removed from kansas . <split> but were were ultimately forced to do so and joined the other tribes .
 2---->  another band known as black bob 's initially refused to be removed from kansas . <split> , but were ultimately forced to do so and joined the other tribes .


 1---->  another band still in operation is brisbane 's <low_freq> bush band -- founded in <split> <low_freq> 1977 , <low_freq> has performed continually until the present ( 2009 ) .
 2---->  another band still in operation is brisbane 's <low_freq> bush band -- founded . <split> in 1977 , <low_freq> has performed continually until the present ( 2009 ) .


 1---->  another barren spell ensured but by the 1990s , the club <split> the grew and established its juvenile section in 1991 .
 2---->  another barren spell ensured but by the 1990s , the . <split> club grew and established its juvenile section in

--------split model training sampling display--------
 1---->  while serving as an artillery officer in east prussia , <low_freq> witnessed war crimes against the civilian german population by soviet liberators <split> the , the elderly were robbed of their meager possessions and women and girls were gang - raped to death .
 2---->  while serving as an artillery officer in east prussia , <low_freq> witnessed war crimes against the civilian german population by soviet . <split> liberators as the elderly were robbed of their meager possessions and women and girls were gang - raped to death .


 1---->  while serving as an officer in <low_freq> military he revolted on the fateful night of 25th march 1971 & the punishment for sedition <split> he on have meant he would face a firing squad or the gallows if he was ever captured by the occupying forces .
 2---->  while serving as an officer in <low_freq> military he revolted on the fateful night of 25th march 1971 & the punishment for . <spli

--------split model training sampling display--------
 1---->  while '' ultraviolet '' was not released as a single , it was used in a scene in the 2006 adam sandler <split> it was was click '' , and was featured in the 2007 film '' click diving bell and the butterfly '' .
 2---->  while '' ultraviolet '' was not released as a single , it was used in a scene in the 2006 adam . <split> sandler film '' click '' , and was featured in the 2007 film '' the diving bell and the butterfly '' .


 1---->  while '' voyager 1 '' is commonly spoken of as having left the solar system simultaneously with having <split> it crossed the <low_freq> , it remains well within the sphere of the sun 's gravitational dominion .
 2---->  while '' voyager 1 '' is commonly spoken of as having left the solar system simultaneously with . <split> having crossed the <low_freq> , it remains well within the sphere of the sun 's gravitational dominion .


 1---->  while '' western religion '' encompasses more than abra

--------split model training sampling display--------
 1---->  in 1893 , the palazzo was damaged by fire during the blaze an important archive of <split> the these negatives <low_freq> by the eminent architectural writer john henry parker were destroyed .
 2---->  in 1893 , the palazzo was damaged by fire during the blaze an important archive . <split> of <low_freq> negatives <low_freq> by the eminent architectural writer john henry parker were destroyed .


 1---->  in 1893 , the school moved into a new building on second street and , in 1917 , moved to its <split> in current location in the building ( now called <low_freq> hall ) which sits on a hill overlooking locust avenue .
 2---->  in 1893 , the school moved into a new building on second street and , in 1917 , moved to . <split> its current location in the building ( now called <low_freq> hall ) which sits on a hill overlooking locust avenue .


 1---->  in 1893 , they were invited to join the scottish football league proper , a

--------split model training sampling display--------
 1---->  after the 2011 season , <low_freq> was interviewed for the head coach job by the miami dolphins <split> he , <low_freq> signed a two - year extension with the bears on january 17 .
 2---->  after the 2011 season , <low_freq> was interviewed for the head coach job by the miami . <split> dolphins , but signed a two - year extension with the bears on january 17 .


 1---->  after the 2011 state election , the spd and the cdu formed a grand coalition decided under the coalition agreement to <split> the develop plans to extend the s coalition bahn line '' from <low_freq> station to develop west to <low_freq> '' .
 2---->  after the 2011 state election , the spd and the cdu formed a grand coalition decided under the coalition agreement . <split> to develop plans to extend the s - bahn line '' from <low_freq> station to the west to <low_freq> '' .


 1---->  after the 2012 redistricting , the ward 5 portions moved from anc - <low_

--------split model training sampling display--------
 1---->  these techniques allow most of the functionality of json - ld to be achieved without having to <split> the change the contents of existing documents and provide a smooth upgrade path for existing infrastructure .
 2---->  these techniques allow most of the functionality of json - ld to be achieved without having . <split> to change the contents of existing documents and provide a smooth upgrade path for existing infrastructure .


 1---->  these techniques are yamazaki 's special because he <split> he has integrated karate and muay thai .
 2---->  these techniques are yamazaki 's special because . <split> he has integrated karate and muay thai .


 1---->  these techniques are subtle in nature , but powerful in execution , not unlike the ultraviolet rays of the <split> one ultraviolet one does not notice their influence on the body until after the damage has been done .
 2---->  these techniques are subtle in nature , but p

In [None]:
# Manual halt marker: unless `stop` is defined in a cell not visible here,
# evaluating this bare name raises NameError, which stops a "Run All" before
# the cells below are executed.
# NOTE(review): presumably intentional — confirm before removing.
stop

In [None]:
# Decode top-k candidate sequences for the first `sample_num` training inputs
# and print them next to the matching pseudo label.
sample_num = 2
topk = 20

predicts, log_probs = split_model.dec.decode_topk_seqs(
    split_model.enc,
    inputs=torch.LongTensor(split_train_set_inputs[:sample_num]),
    input_lens=torch.LongTensor(split_train_set_input_lens[:sample_num]),
    topk=topk,
)

# Cut every sequence at its first <eos>, then map token ids to words.
predicts = batch_tokens2words(batch_tokens_remove_eos(predicts, vocab), vocab)
labels = batch_tokens2words(
    batch_tokens_remove_eos(split_pseudo_train_set_labels[:sample_num], vocab),
    vocab,
)

predicts_sents = batch_words2sentence(predicts)
labels_sents = batch_words2sentence(labels)

# NOTE(review): the loop assumes the decoder returns `topk` candidates per
# sample, stored contiguously (sample 0 first) — confirm against
# decode_topk_seqs.
for idx, sent in enumerate(predicts_sents):
    print(' 1----> ', sent)
    group, rank = divmod(idx, topk)
    if rank == topk - 1:
        # Last candidate of this group: show the reference label for the sample.
        print(' 2----> ', labels_sents[group])
        print('\n')

In [None]:
# copy_thres=1.0
# split_loss, predicts = split_model.forward(torch.LongTensor(split_train_set_inputs[0:sample_num]), 
#                                      torch.LongTensor(split_train_set_input_lens[0:sample_num]), 
#                                      labels=torch.LongTensor(split_pseudo_train_set_labels[0:sample_num]), 
#                                      is_train=1, teaching_rate=1)

# predicts = batch_tokens_remove_eos(predicts, vocab)
# labels = batch_tokens_remove_eos(split_pseudo_train_set_labels[0:sample_num], vocab)

# predicts = batch_tokens2words(predicts, vocab)
# labels = batch_tokens2words(labels, vocab)

# predicts_sents = batch_words2sentence(predicts)
# labels_sents = batch_words2sentence(labels)

# for (predict_sent, label_sent) in zip(predicts_sents, labels_sents):
#     print(' 1----> ', predict_sent)
#     print(' 2----> ', label_sent)
#     print('\n')

In [None]:
# Manual halt marker: unless `stop` is defined in a cell not visible here,
# evaluating this bare name raises NameError and ends a "Run All" at this point.
# NOTE(review): presumably intentional — confirm before removing.
stop