In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
import time

from Vocab import Vocab

import torch
torch.cuda.set_device(0)

print('import over')

import over


In [2]:
def batch_words2sentence(words_list):
    """Concatenate each word list of a batch into one string (no separator).

    Args:
        words_list: iterable of word sequences (list[list[str]]).

    Returns:
        list of sentences, one per entry of ``words_list``.
    """
    sentences = []
    for words in words_list:
        sentences.append(''.join(words))
    return sentences
def batch_tokens2words(tokens_list, vocab):
    """Translate a batch of token-id sequences back into words.

    Args:
        tokens_list: list of token-id lists (list[list]).
        vocab: object whose ``token2word`` is indexable by token id.

    Returns:
        list of word lists, parallel to ``tokens_list``.
    """
    words_list = []
    for tokens in tokens_list:
        words_list.append([vocab.token2word[token] for token in tokens])
    return words_list

def batch_tokens_remove_eos(tokens_list, vocab):
    """Truncate every token sequence at its first ``<eos>`` token.

    The ``<eos>`` token itself and everything after it are dropped; sequences
    that contain no ``<eos>`` are copied unchanged.

    Args:
        tokens_list: list of token-id lists (list[list]).
        vocab: object whose ``word2token`` maps ``'<eos>'`` to its token id.

    Returns:
        list of new token lists, parallel to ``tokens_list``.
    """
    # The <eos> id is loop-invariant: look it up once instead of once per token.
    eos_token = vocab.word2token['<eos>']

    result = []
    for tokens in tokens_list:
        tokens_filtered = []
        for token in tokens:
            if token == eos_token:
                break
            tokens_filtered.append(token)
        result.append(tokens_filtered)
    return result

def batch_tokens_bleu(references, candidates, smooth_epsilon=0.001):
    """Compute a smoothed sentence-level BLEU score for every sample of a batch.

    Args:
        references: list of reference token sequences (list[list]).
        candidates: list of candidate token sequences, paired with ``references``.
        smooth_epsilon: epsilon handed to nltk's ``SmoothingFunction`` (method1).

    Returns:
        list with one score per (reference, candidate) pair; a pair whose
        shorter sequence has fewer than 4 tokens scores 0 without calling nltk.
    """
    def _pair_score(reference, hypothesis):
        shortest = min(len(reference), len(hypothesis))
        if shortest < 4:
            return 0
        smoother = SmoothingFunction(epsilon=smooth_epsilon).method1
        return sentence_bleu([reference], hypothesis, smoothing_function=smoother)

    return [_pair_score(ref, candidate) for ref, candidate in zip(references, candidates)]

# Load the pickled vocabulary object; the helpers above read its
# `word2token` / `token2word` mappings.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('vocab.pk', 'rb') as f:
    vocab=pickle.load(f)
    
# Smoke test of the BLEU helper on one hand-made reference/candidate pair
# (the value is shown as the cell output).
batch_tokens_bleu([[1,2,3,4,5,6]], [[2,3,1,4,5]])

[0.021744100219015735]

In [3]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Small ("10w") train / validation splits: inputs, input lengths and labels.
train_set_inputs = _load_pickle('./small_data_set/train_set_inputs_10w.pk')
train_set_input_lens = _load_pickle('./small_data_set/train_set_input_lens_10w.pk')
train_set_labels = _load_pickle('./small_data_set/train_set_labels_10w.pk')
valid_set_inputs = _load_pickle('./small_data_set/valid_set_inputs_10w.pk')
valid_set_input_lens = _load_pickle('./small_data_set/valid_set_input_lens_10w.pk')
valid_set_labels = _load_pickle('./small_data_set/valid_set_labels_10w.pk')
    
    
    
# with open('./data_set/train_set_inputs.pk', 'rb') as f:
#     train_set_inputs = pickle.load(f)
# with open('./data_set/train_set_input_lens.pk', 'rb') as f:
#     train_set_input_lens = pickle.load(f)
# with open('./data_set/train_set_labels.pk', 'rb') as f:
#     train_set_labels = pickle.load(f)
# with open('./data_set/valid_set_inputs.pk', 'rb') as f:
#     valid_set_inputs = pickle.load(f)
# with open('./data_set/valid_set_input_lens.pk', 'rb') as f:
#     valid_set_input_lens = pickle.load(f)
# with open('./data_set/valid_set_labels.pk', 'rb') as f:
#     valid_set_labels = pickle.load(f)

In [4]:
# Print the size of each of the six loaded collections on one line.
print(*(len(split) for split in (train_set_inputs, train_set_input_lens, train_set_labels,
                                 valid_set_input_lens, valid_set_inputs, valid_set_labels)))

# Emit one 'why' per validation sentence whose recorded length is <= 2.
for sent_len in valid_set_input_lens:
    if sent_len<=2: print('why')

100000 100000 100000 100000 100000 100000


In [5]:
class Encoder(nn.Module):
    """Bidirectional LSTM sentence encoder with its own embedding table.

    Args:
        use_cuda: truthy to move inputs and index tensors to the GPU.
        hidden_dim: hidden size of each LSTM direction.
        input_dim: embedding size, which is also the LSTM input size.
        vocab: object whose ``word2token`` mapping fixes the vocabulary size.
        pretrained_embedding_path: pickle file holding the initial embedding
            matrix. Pass ``None`` to keep the random initialisation, e.g. when
            the weights are restored from a checkpoint right afterwards.
    """

    def __init__(self, use_cuda, hidden_dim, input_dim, vocab,
                 pretrained_embedding_path='pre_train_word_embedding.pk'):
        super(Encoder, self).__init__()

        self.use_cuda = use_cuda
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab

        self.lstm = torch.nn.LSTM(input_size=self.input_dim,
                                  hidden_size=self.hidden_dim,
                                  bidirectional=True,
                                  batch_first=True)

        # Embedding table, optionally initialised from pre-trained vectors.
        self.embed = nn.Embedding(len(self.vocab.word2token), input_dim)
        if pretrained_embedding_path is not None:
            # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
            with open(pretrained_embedding_path, 'rb') as f:
                pre_train_word_embedding = pickle.load(f)
            self.embed.weight.data.copy_(torch.FloatTensor(pre_train_word_embedding))

    def _index(self, ids):
        """Wrap an index tensor for ``index_select``, on the GPU when ``use_cuda`` is set."""
        ids = Variable(ids)
        return ids.cuda() if self.use_cuda else ids

    def order(self, inputs, inputs_len):
        """Sort a batch by decreasing length, as required for sequence packing.

        Args:
            inputs: batch tensor whose first dimension indexes the samples.
            inputs_len: 1-D tensor of sequence lengths.

        Returns:
            ``(sorted_inputs, sorted_lengths, true_order_ids)`` where
            ``index_select(0, true_order_ids)`` on any sorted tensor restores
            the caller's original sample order.
        """
        inputs_len, sort_ids = torch.sort(inputs_len, dim=0, descending=True)
        inputs = inputs.index_select(0, self._index(sort_ids))
        # Sorting the permutation itself yields its inverse.
        _, true_order_ids = torch.sort(sort_ids, dim=0, descending=False)
        return inputs, inputs_len, true_order_ids

    def forward(self, inputs, inputs_len):
        """Encode a padded batch of token ids.

        Args:
            inputs: LongTensor of token ids, one row per sample (batch first).
            inputs_len: 1-D LongTensor with the true length of every row.

        Returns:
            ``(outputs, (hn, cn))`` in the caller's sample order. ``outputs`` is
            batch-first with the two directions concatenated on the last
            dimension; ``hn`` / ``cn`` are the final forward and backward
            states concatenated along dim 1.
        """
        inputs = Variable(inputs)
        if self.use_cuda:
            inputs = inputs.cuda()

        inputs, sort_len, true_order_ids = self.order(inputs, inputs_len)
        in_vecs = self.embed(inputs)

        # tolist() hands plain Python ints to the packer.
        packed = rnn_utils.pack_padded_sequence(in_vecs, sort_len.tolist(), batch_first=True)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = rnn_utils.pad_packed_sequence(outputs, batch_first=True)

        # Everything below was computed on the length-sorted batch: undo the sort.
        restore_ids = self._index(true_order_ids)
        outputs = outputs.index_select(0, restore_ids)
        hn = torch.cat((hn[0], hn[1]), dim=1).index_select(0, restore_ids)
        cn = torch.cat((cn[0], cn[1]), dim=1).index_select(0, restore_ids)

        return outputs, (hn, cn)

In [6]:
def _inflate(tensor, times, dim):
    """
    Examples::
        >> a = torch.LongTensor([[1, 2], [3, 4]])
        >> a
        1   2
        3   4
        [torch.LongTensor of size 2x2]
        >> b = ._inflate(a, 2, dim=1)
        >> b
        1   2   1   2
        3   4   3   4
        [torch.LongTensor of size 2x4]
    """
    repeat_dims = [1] * tensor.dim()
    repeat_dims[dim] = times
    return tensor.repeat(*repeat_dims)

class Decoder(nn.Module):
    """LSTMCell decoder that shares the encoder's embedding table.

    Two decoding modes are provided: ``forward`` (greedy, one step at a time,
    optionally teacher-forced and returning a cross-entropy loss) and
    ``decode_by_beamsearch`` (top-k beam search followed by backtracking).

    Args:
        use_cuda: truthy to place tensors created here on the GPU.
        encoder: provides ``input_dim``, ``vocab`` and the shared ``embed`` layer.
        hidden_dim: the LSTM cell and the output projection use ``2 * hidden_dim``.
        max_length: number of decoding steps performed for every sequence.
    """

    def __init__(self, use_cuda, encoder, hidden_dim, max_length=25):
        super(Decoder, self).__init__()

        self.use_cuda = use_cuda
        self.hidden_dim = hidden_dim
        self.input_dim = encoder.input_dim
        self.max_length = max_length
        self.vocab = encoder.vocab
        # Per-class loss weights: '<padding>' targets contribute nothing.
        self.weight = [1]*len(self.vocab.word2token)
        self.weight[self.vocab.word2token['<padding>']] = 0

        self.hidden_size = self.hidden_dim
        self.V = len(self.vocab.word2token)
        self.SOS = self.vocab.word2token['<sos>']
        self.EOS = self.vocab.word2token['<eos>']
        self.log_softmax = nn.LogSoftmax(dim=1)

        self.lstmcell = torch.nn.LSTMCell(input_size=self.input_dim, hidden_size=self.hidden_dim*2, bias=True)

        # Embedding layer is shared by reference with the encoder.
        self.embed = encoder.embed
        # Projection from the cell state to vocabulary logits.
        self.fcnn = nn.Linear(in_features=self.hidden_dim*2, out_features=len(self.vocab.word2token))

        self.softmax = nn.Softmax()
        self.cost_func = nn.CrossEntropyLoss(torch.Tensor(self.weight))

        print('init lookup embedding matrix size: ', self.embed.weight.data.size())

    def forward(self, enc_outputs, sent_lens, h0_and_c0, labels, teaching_rate=0.6, is_train=1):
        """Greedy step-by-step decoding for ``max_length`` steps.

        Args:
            enc_outputs: encoder outputs; only its first dimension (batch size) is read.
            sent_lens: accepted for interface compatibility; not used here.
            h0_and_c0: ``(h0, c0)`` initial state of the LSTM cell.
            labels: LongTensor of target token ids, indexed ``labels[:, step]``.
            teaching_rate: in training, probability (drawn once per step for the
                whole batch) of feeding the previous gold token instead of the
                previous prediction.
            is_train: truthy to accumulate the loss and enable teacher forcing.

        Returns:
            ``(mean_step_loss, predictions)`` when ``is_train`` is truthy, else
            ``predictions``; predictions are a numpy array with one row per
            sample and one column per step.
        """
        labels = Variable(labels)
        if self.use_cuda:
            labels = labels.cuda()

        all_loss = 0
        predicts = []
        batch_size = enc_outputs.size(dim = 0)
        state = h0_and_c0
        max_idxs = None

        for ii in range(self.max_length):
            if ii == 0:
                # First step is always fed '<sos>'; no random draw is consumed.
                step_tokens = self._tocuda(Variable(torch.LongTensor([self.SOS]*batch_size)))
            elif is_train and random.random() < teaching_rate:
                step_tokens = labels[:, ii-1]   # teacher forcing with the gold token
            else:
                step_tokens = max_idxs          # feed back the previous prediction

            state = self.lstmcell(self.embed(step_tokens), state)
            step_logits = self.fcnn(state[0])

            if is_train:
                all_loss += self.cost_func(step_logits, labels[:, ii])

            _, max_idxs = torch.max(step_logits, dim=1)
            predicts.append(torch.unsqueeze(max_idxs, dim=0))

        # (steps, batch) -> (batch, steps)
        predicts = torch.transpose(torch.cat(predicts, dim=0), 0, 1)
        if self.use_cuda:
            predicts = predicts.data.cpu().numpy()
        else:
            predicts = predicts.data.numpy()

        if is_train:
            return all_loss/(self.max_length), predicts
        return predicts

    def _tocuda(self, var):
        """Return ``var`` moved to the GPU when ``use_cuda`` is set, else unchanged."""
        if self.use_cuda:
            return var.cuda()
        else:
            return var

    def decode_by_beamsearch(self, encoder_hidden=None, encoder_outputs=None, topk = 10):
        """Decode with beam search of width ``topk`` for ``max_length`` steps.

        Args:
            encoder_hidden: ``(h, c)`` tuple used as the initial cell state.
            encoder_outputs: only its first dimension (batch size) is read.
            topk: beam width; stored on ``self.k``.

        Returns:
            dict with keys ``'score'``, ``'topk_length'``, ``'topk_sequence'``
            (all top-k beams, as returned by ``_backtrack``) and ``'length'`` /
            ``'sequence'`` (the first beam of every sample).
        """
        self.k = topk
        batch_size = encoder_outputs.size(dim=0)

        # Offset of each sample's first beam inside the flattened (batch*k) dimension.
        self.pos_index = self._tocuda(Variable(torch.LongTensor(range(batch_size)) * self.k).view(-1, 1))

        # Replicate the initial state k times per sample -> (batch*k, state_size).
        hidden = tuple([_inflate(h, self.k, 1).view(batch_size*self.k, -1) for h in encoder_hidden])

        # Initialize the scores; for the first step,
        # ignore the inflated copies to avoid duplicate entries in the top k
        sequence_scores = torch.Tensor(batch_size * self.k, 1)
        sequence_scores.fill_(-float('Inf'))
        sequence_scores.index_fill_(0, torch.LongTensor([i * self.k for i in range(0, batch_size)]), 0.0)
        sequence_scores = self._tocuda(Variable(sequence_scores))

        # Initialize the input vector
        input_var = self._tocuda(Variable(torch.LongTensor([self.SOS] * batch_size * self.k)))

        # Store decisions for backtracking
        stored_scores = list()
        stored_predecessors = list()
        stored_emitted_symbols = list()

        for ii in range(0, self.max_length):
            # Run the RNN one step forward.
            # NOTE(review): from the second step on `input_var` has shape
            # (batch*k, 1), so `embed` yields a 3-D tensor -- confirm the
            # targeted PyTorch release accepts that as LSTMCell input.
            input_vec = self.embed(input_var)
            hidden = self.lstmcell(input_vec, hidden)

            log_softmax_output = self.log_softmax(self.fcnn(hidden[0]))

            # To get the full sequence scores for the new candidates, add the local
            # scores for t_i to the predecessor scores for t_(i-1)
            sequence_scores = _inflate(sequence_scores, self.V, 1)
            sequence_scores += log_softmax_output.squeeze(1)
            scores, candidates = sequence_scores.view(batch_size, -1).topk(self.k, dim=1)

            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
            input_var = (candidates % self.V).view(batch_size * self.k, 1)
            sequence_scores = scores.view(batch_size * self.k, 1)

            # Update fields for next timestep.
            # NOTE(review): `candidates / self.V` must be an integer division for
            # the result to be usable as an index; that holds only where `/` on
            # LongTensors truncates -- confirm before changing PyTorch version.
            predecessors = (candidates / self.V + self.pos_index.expand_as(candidates)).view(batch_size * self.k, 1)
            if isinstance(hidden, tuple):
                hidden = tuple([h.index_select(0, predecessors.squeeze()) for h in hidden])
            else:
                hidden = hidden.index_select(0, predecessors.squeeze())

            # Update sequence scores and erase scores for end-of-sentence symbol so that they aren't expanded
            stored_scores.append(sequence_scores.clone())
            eos_indices = input_var.data.eq(self.EOS)
            if eos_indices.nonzero().dim() > 0:
                sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

            # Cache results for backtracking
            stored_predecessors.append(predecessors)
            stored_emitted_symbols.append(input_var)

        # Do backtracking to return the optimal values
        output, h_t, h_n, s, l, p = self._backtrack(hidden,
                                                    stored_predecessors, stored_emitted_symbols,
                                                    stored_scores, batch_size, self.hidden_size)

        metadata = {}

        metadata['score'] = s
        metadata['topk_length'] = l
        metadata['topk_sequence'] = p
        metadata['length'] = [seq_len[0] for seq_len in l]
        metadata['sequence'] = [seq[0] for seq in p]

        # Only touch the CUDA allocator when this decoder actually runs on the GPU.
        if self.use_cuda:
            torch.cuda.empty_cache()

        return metadata

    def _backtrack(self, hidden, predecessors, symbols, scores, b, hidden_size):
        """Backtracks over batch to generate optimal k-sequences.

        Args:
            hidden: final decoder state; an ``(h, c)`` tuple for the LSTM cell.
                Only its size is read, to shape the ``h_n`` placeholder.
            predecessors [(batch*k, 1)] * sequence_length: back pointers per step
            symbols [(batch*k, 1)] * sequence_length: emitted tokens per step
            scores [(batch*k, 1)] * sequence_length: sequence scores for every
                step t = [0, ... , seq_len - 1]
            b: Size of the batch
            hidden_size: Size of the hidden state (not used in the body)

        Returns:
            output: always ``None`` (step outputs are not stored by the caller).

            h_t: always an empty list (hidden states are not stored).

            h_n: zero-filled placeholder shaped like ``hidden``.

            score [batch, k]: final scores for all top-k sequences

            length [batch][k]: nested list with the length of each top-k
            sequence; an ended sequence gets ``t + 1`` for the step t of its EOS

            p [(batch, k, 1)] * sequence_length: predicted tokens per step
        """

        lstm = isinstance(hidden, tuple)

        # initialize return variables given different types
        h_t = list()
        p = list()
        # Placeholder for last hidden state of top-k sequences.
        if lstm:
            state_size = hidden[0].size()
            h_n = tuple([torch.zeros(state_size), torch.zeros(state_size)])
        else:
            # Was `nw_hidden[0].size()`: that name is not defined in this method.
            h_n = torch.zeros(hidden.size())
        l = [[self.max_length] * self.k for _ in range(b)]  # Placeholder for lengths of top-k sequences

        # the last step output of the beams are not sorted
        # thus they are sorted here
        sorted_score, sorted_idx = scores[-1].view(b, self.k).topk(self.k)
        # initialize the sequence scores with the sorted last step beam scores
        s = sorted_score.clone()

        batch_eos_found = [0] * b   # the number of EOS found
                                    # in the backward loop below for each batch

        t = self.max_length - 1
        # initialize the back pointer with the sorted order of the last step beams.
        # add self.pos_index for indexing variable with b*k as the first dimension.
        t_predecessors = (sorted_idx + self.pos_index.expand_as(sorted_idx)).view(b * self.k)
        while t >= 0:
            # Re-order the variables with the back pointer
            current_symbol = symbols[t].index_select(0, t_predecessors)
            # Re-order the back pointer of the previous step with the back pointer of
            # the current step
            t_predecessors = predecessors[t].index_select(0, t_predecessors).squeeze()

            # This tricky block handles dropped sequences that see EOS earlier.
            # The basic idea is summarized below:
            #
            #   Terms:
            #       Ended sequences = sequences that see EOS early and dropped
            #       Survived sequences = sequences in the last step of the beams
            #
            #       Although the ended sequences are dropped during decoding,
            #   their generated symbols and complete backtracking information are still
            #   in the backtracking variables.
            #   For each batch, everytime we see an EOS in the backtracking process,
            #       1. If there is survived sequences in the return variables, replace
            #       the one with the lowest survived sequence score with the new ended
            #       sequences
            #       2. Otherwise, replace the ended sequence with the lowest sequence
            #       score with the new ended sequence
            #
            eos_indices = symbols[t].data.squeeze(1).eq(self.EOS).nonzero()
            if eos_indices.dim() > 0:
                for i in range(eos_indices.size(0)-1, -1, -1):
                    # Indices of the EOS symbol for both variables
                    # with b*k as the first dimension, and b, k for
                    # the first two dimensions
                    idx = eos_indices[i]
                    b_idx = int(idx[0] / self.k)
                    # The indices of the replacing position
                    # according to the replacement strategy noted above
                    res_k_idx = self.k - (batch_eos_found[b_idx] % self.k) - 1
                    batch_eos_found[b_idx] += 1
                    res_idx = b_idx * self.k + res_k_idx

                    # Replace the old information in return variables
                    # with the new ended sequence information
                    t_predecessors[res_idx] = predecessors[t][idx[0]]

                    current_symbol[res_idx, :] = symbols[t][idx[0]]
                    s[b_idx, res_k_idx] = scores[t][idx[0]]
                    l[b_idx][res_k_idx] = t + 1

            # record the back tracked results
            p.append(current_symbol)
            t -= 1

        # Sort and re-order again as the added ended sequences may change
        # the order (very unlikely)
        s, re_sorted_idx = s.topk(self.k)
        for b_idx in range(b):
            l[b_idx] = [l[b_idx][k_idx.data[0]] for k_idx in re_sorted_idx[b_idx,:]]

        re_sorted_idx = (re_sorted_idx + self.pos_index.expand_as(re_sorted_idx)).view(b * self.k)

        # Reverse the sequences and re-order at the same time
        # It is reversed because the backtracking happens in reverse time order
        p = [step.index_select(0, re_sorted_idx).view(b, self.k, -1) for step in reversed(p)]
        # Step outputs are never stored by the caller, so there is nothing to return.
        output = None
        return output, h_t, h_n, s, l, p

    def _mask_symbol_scores(self, score, idx, masking_score=-float('inf')):
        """Overwrite ``score[idx]`` in place with ``masking_score``."""
        score[idx] = masking_score

    def _mask(self, tensor, idx, dim=0, masking_score=-float('inf')):
        """Fill ``tensor`` in place along ``dim`` at the indices in column 0 of ``idx``."""
        if len(idx.size()) > 0:
            indices = idx[:, 0]
            tensor.index_fill_(dim, indices, masking_score)

In [7]:
class AutoEncoder(nn.Module):
    """Sequence auto-encoder: an ``Encoder`` followed by a ``Decoder`` sharing its embedding.

    Args:
        use_cuda: truthy to run both sub-modules on the GPU.
        input_dim: embedding size.
        hidden_dim: per-direction hidden size of the encoder LSTM.
        vocab: vocabulary object passed on to the encoder.
        max_length: number of decoding steps.
    """

    def __init__(self, use_cuda, input_dim, hidden_dim, vocab, max_length = 25):
        super(AutoEncoder, self).__init__()

        self.use_cuda = use_cuda
        self.enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
        self.dec = Decoder(use_cuda=use_cuda, encoder=self.enc, hidden_dim=hidden_dim, max_length=max_length)
        if use_cuda:
            self.enc = self.enc.cuda()
            self.dec = self.dec.cuda()

    def forward(self, inputs, input_lens, labels, is_train=1, teaching_rate=1):
        """Encode ``inputs`` and decode them again.

        Args:
            inputs: nested list of token ids (converted to a LongTensor).
            input_lens: list with the true length of every input row.
            labels: nested list of target token ids (converted to a LongTensor).
            is_train: truthy for training mode (loss is computed).
            teaching_rate: teacher-forcing probability forwarded to the decoder.
                It used to be silently replaced by 1; the default of 1 keeps
                existing calls unchanged.

        Returns:
            ``(loss, predicts)`` when ``is_train`` is truthy, else ``predicts``.
        """
        enc_outputs, (enc_hn, enc_cn) = self.enc(torch.LongTensor(inputs), torch.LongTensor(input_lens))
        # The decoder already returns (loss, predicts) in training mode and
        # predicts alone otherwise, so one call serves both modes.
        return self.dec(enc_outputs=enc_outputs,
                        h0_and_c0=(enc_hn, enc_cn),
                        sent_lens=input_lens,
                        labels=torch.LongTensor(labels),
                        is_train=1 if is_train else 0,
                        teaching_rate=teaching_rate)

In [8]:
# Beam-search demo: restore a trained auto-encoder and decode the first
# `sample_num` training sentences with beam width `topk`.
use_cuda = 1
hidden_dim = 256
input_dim = 300
autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, vocab = vocab, max_length = 26)
# Checkpoint is deserialized onto the CPU first; the model is moved to the GPU below.
pre_train = torch.load('./models_better/time-[2019-01-08-06-27-50]-loss-0.925657809-bleu-0.6640-hidden_dim-256-input_dim-300-epoch-2-batch_size-200-batch_id-[17001-[of]-21743]-lr-0.0050', map_location = 'cpu')

autoencoder.load_state_dict(pre_train)
if use_cuda:
    autoencoder = autoencoder.cuda()
autoencoder.eval()

train_set_size = len(train_set_inputs)
sample_num=20
topk=10
batch_id=0
bleu_sum=0

# dec_beamsearch = TopKDecoder(decoder_rnn=autoencoder.dec, k=topk)

# Encode the first `sample_num` training sentences.
enc_outputs, (enc_hn, enc_cn) = autoencoder.enc(torch.LongTensor(train_set_inputs[:sample_num]), 
                                        torch.LongTensor(train_set_input_lens[:sample_num]))
#print('enc result size: ', enc_outputs.size(), enc_hn.size(), enc_cn.size())
metadata = autoencoder.dec.decode_by_beamsearch(encoder_hidden = (enc_hn, enc_cn), encoder_outputs = enc_outputs, topk=topk)
# metadata = dec_beamsearch(encoder_hidden = (enc_hn, enc_cn), encoder_outputs = enc_outputs)

# 'topk_sequence' is a per-step list; concatenating on dim 2 and flattening
# gives one row of tokens per (sample, beam).
results = metadata['topk_sequence']
results =torch.cat(results, dim = 2)
results=results.view(sample_num*topk, -1)
# Rows 0, topk, 2*topk, ... are the first beam of each sample.
# NOTE(review): `.cuda()` is unconditional here, so this cell needs a GPU
# regardless of `use_cuda`.
indices = torch.LongTensor([x*topk for x in range(sample_num)]).cuda()
results = results.data.index_select(0, indices)
results=results.cpu().tolist()
results=batch_tokens_remove_eos(results, vocab)

# Lengths of the decoded sentences after <eos> removal (inspected in a later cell).
sent_lens = []
for tokens in results:
    sent_lens.append(len(tokens))
    
results=batch_tokens2words(results, vocab)
results=batch_words2sentence(results)
for sent in results:
    print(sent)
    




torch.cuda.empty_cache()
# NOTE(review): `stop` is an undefined name; evaluating it raises NameError
# (see the cell output), which halts the cell before the code that follows.
stop


# NOTE(review): everything below comes after the bare `stop` name in this cell,
# which raises NameError, so none of it runs in a top-to-bottom execution.
#
# Intended flow: beam-search decode the training set in batches of `sample_num`
# and keep a running mean of BLEU between each input and its best beam.
start_time=time.time()
# NOTE(review): the stop bound `train_set_size-sample_num` excludes the last
# full batch when the set size is a multiple of `sample_num`.
for start_idx in range(0, train_set_size-sample_num, sample_num):
    batch_id+=1
    
    enc_outputs, (enc_hn, enc_cn) = autoencoder.enc(torch.LongTensor(train_set_inputs[start_idx:start_idx+sample_num]), 
                                        torch.LongTensor(train_set_input_lens[start_idx:start_idx+sample_num]))
    #print('enc result size: ', enc_outputs.size(), enc_hn.size(), enc_cn.size())

    # NOTE(review): `dec_beamsearch` is only assigned in a commented-out line
    # earlier in this cell, so this call would raise NameError if reached.
    metadata = dec_beamsearch(encoder_hidden = (enc_hn, enc_cn), encoder_outputs = enc_outputs)

    # Keep the first beam of every sample and cut it at <eos>.
    results = metadata['topk_sequence']
    results =torch.cat(results, dim = 2)
    results=results.view(sample_num*topk, -1)
    indices = torch.LongTensor([x*topk for x in range(sample_num)]).cuda()
    results = results.data.index_select(0, indices)
    results=results.cpu().tolist()
    results=batch_tokens_remove_eos(results, vocab)
#     results=batch_tokens2words(results, vocab)
#     results=batch_words2sentence(results)
#     print(results)
    
    # References: the input token rows, each cut at its first <padding> token.
    inputs = train_set_inputs[start_idx:start_idx+sample_num]
#     inputs = batch_tokens_remove_eos(inputs, vocab)
#     inputs = batch_tokens2words(inputs, vocab)
    inputs_=[]
    for tokens in inputs:
        x=[]
        for token in tokens:
            if token!=vocab.word2token['<padding>']:
                x.append(token)
            else:
                break
        inputs_.append(x)
#     inputs = batch_words2sentence(inputs_)
    bleu_scores = batch_tokens_bleu(references=inputs_, candidates=results)
    for score in bleu_scores:
        bleu_sum+=score
    
    # Progress line every 50 batches: batch counter, total batches, elapsed
    # minutes and the mean BLEU over all samples seen so far.
    if batch_id%50==1:
        print(batch_id, int(train_set_size/sample_num), 'time: %4.2f mins'%((time.time()-start_time)/60), 'bleu: %2.4f'%(bleu_sum/batch_id/sample_num))
    
    

# Print the first `sample_num` input sentences as text (cut at <eos>, then at <padding>).
#    print inputs
inputs = train_set_inputs[0:sample_num]
# print(inputs)
inputs = batch_tokens_remove_eos(inputs, vocab)
inputs = batch_tokens2words(inputs, vocab)
inputs_=[]
for words in inputs:
    x=[]
    for word in words:
        if word!='<padding>':
            x.append(word)
        else:
            break
    inputs_.append(x)
inputs = batch_words2sentence(inputs_)
for sent in inputs:
    print(sent)
    
# torch.cuda.empty_cache()

# NOTE(review): by this point `results` has been rebound to Python lists
# (token lists or sentence strings), which have no `.size()`; this section
# presumably expects the raw metadata['topk_sequence'] tensors -- confirm.
print('results[0] size: ', results[0].size())
a=torch.cat(results, dim = 2)
b=a.view(sample_num*topk, -1)
print(a.size(), b.size())
c=b.data.cpu().tolist()
d=batch_tokens_remove_eos(c, vocab)
e=batch_tokens2words(d, vocab)
f=batch_words2sentence(e)
for sent in f:
    print(sent)

init lookup embedding matrix size:  torch.Size([98638, 300])
开上一边！我们在仪表飞行。
他们不会放过任何一个认识你的人，<low_freq>。你当心点。
但同样的，我们的设备是支持<low_freq>培育好用的。
我们的风云人物来了。恭喜啊。
第三条矿山总统joe：为纾个时候）卖过程和一家医疗部门工作管理登记了下损害。
对。所以你是说和我共事是种惩罚
不管怎么样，事情一件接一件，而且……
如果你不合作的话，
你得保证是完美的一次！
贫穷的朋友,因为患难之交是真情。
我说了别接听！
你说完了？什么？
他拒绝了本使用的采访。
我觉得不可能警戒事发地点。
你们要点推荐的今日自选<low_freq>吗？
合作品牌网站上的广告。
因为演员家伙在他办公室好排行榜。我们把膝夹！
实际测试显示模块运作完美。
最好是两个男孩两个女孩...
快，上车，带上他


NameError: name 'stop' is not defined

In [9]:
import numpy as np
# Ask numpy to print small floats in plain decimal form instead of scientific notation.
np.set_printoptions(suppress=True)

# Lengths stored in the metadata, shown next to `sent_lens` for a side-by-side look.
meta_lens = metadata['length']
print(meta_lens, type(meta_lens))
print(sent_lens)

# Scores stored in the metadata, copied to a CPU numpy array and printed
# one first-axis entry per line.
seq_score = metadata['score']
lseq_socre = seq_score.data.cpu().numpy()
for score_row in lseq_socre:
    print(score_row)

[10, 17, 15, 10, 24, 13, 12, 7, 9, 10, 7, 7, 9, 9, 10, 8, 15, 8, 8, 7] <class 'list'>
[9, 16, 14, 9, 23, 12, 11, 6, 8, 9, 6, 6, 8, 8, 9, 7, 14, 7, 7, 6]
[-3.24204826 -4.40653467 -4.47240829 -4.8347168  -5.28275204 -5.37414074
 -5.63503647 -5.94044685 -6.03672504 -6.05442619]
[-1.97154188 -4.15584326 -4.34271812 -4.98236847 -5.01742077 -5.13382244
 -5.46063137 -5.65393639 -5.77483845 -5.95366859]
[-2.59985161 -3.37922096 -3.99874973 -4.41872597 -4.78718185 -5.04994202
 -5.17642212 -5.33260918 -5.58263397 -5.71078587]
[-0.11981249 -3.18128395 -5.4015274  -5.69001198 -5.82326365 -5.95669365
 -6.435359   -6.64287663 -6.64724827 -6.87408447]
[-18.40380478 -19.78309059 -20.26621246 -20.76094055 -20.89505768
 -21.00291443 -21.50175095 -21.6151619  -21.91933441 -22.45663071]
[-0.17099404 -2.67351103 -4.61692524 -5.25170612 -5.80711746 -6.28423071
 -6.34242487 -6.52443314 -6.83719969 -6.89309406]
[-0.56672263 -3.50312901 -3.75772285 -4.08013439 -4.53574657 -4.60605526
 -4.85199547 -5.01956892 -

In [None]:
# Evaluate a saved AutoEncoder checkpoint: decode every training batch without
# teacher forcing (is_train=0) and score the output against the input tokens
# with sentence-level BLEU, printing a running average.
use_cuda = 1
hidden_dim = 256
input_dim = 300
autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, vocab = vocab, max_length = 25)
pre_train = torch.load('./models_better/time-[2019-01-08-06-27-50]-loss-0.925657809-bleu-0.6640-hidden_dim-256-input_dim-300-epoch-2-batch_size-200-batch_id-[17001-[of]-21743]-lr-0.0050', map_location = 'cpu')

autoencoder.load_state_dict(pre_train)
if use_cuda:
    autoencoder = autoencoder.cuda()
autoencoder.eval()


def strip_padding(seqs, pad):
    """Return a copy of every sequence in `seqs`, cut at the first item equal to `pad`.

    seqs: iterable of sequences (token ids or words).
    pad:  the padding marker to stop at (same type as the items).
    """
    stripped = []
    for seq in seqs:
        kept = []
        for item in seq:
            if item == pad:
                break
            kept.append(item)
        stripped.append(kept)
    return stripped


train_set_size = len(train_set_inputs)
sample_num=200
batch_id=0
bleu_sum=0
pad_token = vocab.word2token['<padding>']
# Predictions of the first batch, kept so they can be printed next to the
# first-batch inputs after the loop.
first_batch_results = []

start_time=time.time()
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    # "+1" so the last full batch is not dropped when train_set_size is an
    # exact multiple of sample_num.
    for start_idx in range(0, train_set_size-sample_num+1, sample_num):
        batch_id+=1
        end_idx = start_idx+sample_num
        batch_inputs = train_set_inputs[start_idx:end_idx]

        # The separate autoencoder.enc(...) call that used to precede this was
        # dropped: its outputs were never used.
        # NOTE(review): labels=999 is a placeholder, presumably ignored when
        # is_train=0 (the validation code in the training cell passes []) -- confirm.
        predicts = autoencoder.forward(torch.LongTensor(batch_inputs),
                                       torch.LongTensor(train_set_input_lens[start_idx:end_idx]),
                                       labels=999,
                                       is_train=0, teaching_rate=1)
        # The training cell calls predicts.tolist() on this same call, so predicts
        # is expected to be tensor-like. Convert to plain ints so BLEU n-gram
        # counting compares token values rather than tensor objects.
        if hasattr(predicts, 'tolist'):
            predicts = predicts.tolist()

        results=batch_tokens_remove_eos(predicts, vocab)
        if batch_id == 1:
            first_batch_results = results

        # References are the inputs cut at the first padding token.
        # NOTE(review): <eos> is left in the references but removed from the
        # candidates -- confirm this asymmetry is intended.
        inputs_ = strip_padding(batch_inputs, pad_token)
        bleu_sum += sum(batch_tokens_bleu(references=inputs_, candidates=results))

        if batch_id%20==1:
            print(batch_id, int(train_set_size/sample_num), 'time: %4.2f mins'%((time.time()-start_time)/60), 'bleu: %2.4f'%(bleu_sum/batch_id/sample_num))

# Print the first batch of inputs as text ...
inputs = train_set_inputs[0:sample_num]
inputs = batch_tokens_remove_eos(inputs, vocab)
inputs = batch_tokens2words(inputs, vocab)
inputs = batch_words2sentence(strip_padding(inputs, '<padding>'))
for sent in inputs:
    print(sent)

# ... followed by the model's reconstructions of that same batch. The previous
# tail treated `results` as a list of beam-search tensors (.size(), torch.cat,
# view with an undefined `topk`) and could not run on the token lists built here.
for sent in batch_words2sentence(batch_tokens2words(first_batch_results, vocab)):
    print(sent)

In [None]:
# Smoke test: build a fresh Encoder/Decoder, run the encoder on a few sentences,
# then load a saved AutoEncoder checkpoint and print inputs next to their
# free-running (is_train=0) reconstructions.
use_cuda = 1
hidden_dim = 256
input_dim = 300

enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
if use_cuda:
    enc = enc.cuda()

sample_num = 11
print('sentences length: ', train_set_input_lens[0:sample_num])

# Encode the first `sample_num` sentences and report the sizes of the results.
enc_outputs, (enc_hn, enc_cn) = enc(
    torch.LongTensor(train_set_inputs[0:sample_num]),
    torch.LongTensor(train_set_input_lens[0:sample_num]),
)
print('enc result size: ', enc_outputs.size(), enc_hn.size(), enc_cn.size())

# The decoder is only constructed here; the direct call is kept below for reference.
dec = Decoder(use_cuda=use_cuda, encoder=enc, hidden_dim=hidden_dim, max_length=25)
if use_cuda:
    dec = dec.cuda()

# loss, predicts = dec(enc_outputs = enc_outputs, 
#                     h0_and_c0=(enc_hn, enc_cn), 
#                     sent_lens=train_set_input_lens[0:sample_num], 
#                     labels=torch.LongTensor(train_set_labels[0:sample_num]), 
#                     is_train=1, teaching_rate = 1
#                     )
# print('loss is %4.7f'%loss.data[0])

# Full model, restored from a checkpoint and switched to eval mode.
autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, vocab = vocab, max_length = 25)

pre_train = torch.load('./models_better/time-[2019-01-07-23-18-32]-loss-1.005809546-bleu-0.6937-hidden_dim-256-input_dim-300-epoch-1-batch_size-200-batch_id-[6001-[of]-21743]-lr-0.0050', map_location = 'cpu')

autoencoder.load_state_dict(pre_train)
if use_cuda:
    autoencoder = autoencoder.cuda()

autoencoder.eval()
predicts = autoencoder.forward(
    torch.LongTensor(train_set_inputs[0:sample_num]),
    torch.LongTensor(train_set_input_lens[0:sample_num]),
    labels=torch.LongTensor(train_set_labels[0:sample_num]),
    is_train=0,
    teaching_rate=1,
)

# Inputs: cut at <eos>, map ids to words, then cut at the first '<padding>' word.
inputs = batch_tokens_remove_eos(train_set_inputs[0:sample_num], vocab)
inputs = batch_tokens2words(inputs, vocab)
inputs_ = [
    words[:words.index('<padding>')] if '<padding>' in words else list(words)
    for words in inputs
]
inputs = batch_words2sentence(inputs_)

# Predictions: cut at <eos>, map ids to words, join into sentences.
results = batch_tokens_remove_eos(predicts, vocab)
results = batch_tokens2words(results, vocab)
results = batch_words2sentence(results)

# Each input sentence is printed directly above its reconstruction.
for inp, res in zip(inputs, results):
    print(inp, res, sep='\n')

In [None]:
# Fine-tune the AutoEncoder from a saved checkpoint on the first half of the
# training set, printing sample reconstructions every 50 batches and saving a
# checkpoint (named after loss / validation BLEU) every 1000 batches.
use_cuda = 1
hidden_dim = 256
input_dim = 300
lr=0.005
batch_size=200
train_set_size=len(train_set_inputs)//2
epochs=10
train_bleu = 0
autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 25)
#pre train para
#pre_train = torch.load('./models_better/loss-2.099016905-bleu-0.4078-hidden_dim-512-input_dim-300-epoch-0-batch_size-200-batch_id-[7001-[of]-21743]-lr-0.0050')
#pre_train = torch.load('./models_better/time-[2019-01-07-16-38-14]-loss-1.881381631-bleu-0.5340-hidden_dim-256-input_dim-300-epoch-0-batch_size-200-batch_id-[6001-[of]-21743]-lr-0.0050', map_location = 'cpu')
pre_train = torch.load('./models_better/time-[2019-01-07-23-18-32]-loss-1.005809546-bleu-0.6937-hidden_dim-256-input_dim-300-epoch-1-batch_size-200-batch_id-[6001-[of]-21743]-lr-0.0050', map_location = 'cpu')

autoencoder.load_state_dict(pre_train)
if use_cuda:
    autoencoder = autoencoder.cuda()

# Only parameters that require gradients are handed to the optimizer.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, autoencoder.parameters()), lr=lr)

start_time = time.time()


def _decode_sents(tokenized_sents):
    """Turn a list of token-id sentences into readable sentences with the model's vocab."""
    return [tokenized_sent2real_sent(sent, autoencoder.enc.vocab) for sent in tokenized_sents]


def _print_pairs(predicts, label_tokenized_sents):
    """Print every decoded prediction next to its decoded label sentence."""
    real_sents = _decode_sents(predicts.tolist())
    label_real_sents = _decode_sents(label_tokenized_sents)
    for (real_sent, label_real_sent) in zip(real_sents, label_real_sents):
        print(real_sent, '----<o_o>----', label_real_sent)


def model_train(epoch, batch_size, train_set_size):
    """Run one epoch of teacher-forced training over the first `train_set_size` samples.

    epoch:          epoch index, used only in log lines and checkpoint names.
    batch_size:     number of samples per optimisation step.
    train_set_size: number of leading training samples to iterate over.

    Uses the notebook-level `autoencoder`, `optimizer`, the train/valid data lists
    and the hyper-parameters `hidden_dim`, `input_dim`, `lr`. Returns nothing;
    prints progress and writes checkpoints under ./models_saved/.
    """
    batch_id = 0
    valid_bleu = 0
    total_batches = int(train_set_size/batch_size)
    # "+1" so the last full batch is not dropped when train_set_size is an
    # exact multiple of batch_size.
    for start_idx in range(0, train_set_size-batch_size+1, batch_size):
        batch_id+=1
        end_idx = start_idx + batch_size
        
        optimizer.zero_grad()#clear
        loss, predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[start_idx:end_idx]), 
                                             torch.LongTensor(train_set_input_lens[start_idx:end_idx]), 
                                             labels=torch.LongTensor(train_set_labels[start_idx:end_idx]), 
                                             is_train=1, teaching_rate=1)
        #optimize
        loss.backward()#retain_graph=True)
        optimizer.step()
        torch.cuda.empty_cache()
        
        if batch_id%50!=1:
            continue

        # .item() works for any single-element tensor; loss.data[0] raises on
        # 0-dim tensors in newer PyTorch releases.
        loss_value = loss.item()

        autoencoder.eval()
        # Monitoring only: no gradients needed, so skip building autograd graphs.
        with torch.no_grad():
            sample_num = 10
            rand_idx = random.randint(0, train_set_size-sample_num-1)
            sample = slice(rand_idx, rand_idx+sample_num)

            #teaching forcing
            _, predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[sample]), 
                                              torch.LongTensor(train_set_input_lens[sample]), 
                                              labels=torch.LongTensor(train_set_labels[sample]), 
                                              is_train=1, teaching_rate=1)
            print('train_set sample: ', rand_idx)
            _print_pairs(predicts, train_set_labels[sample])

            #no teaching forcing
            print('----no teaching forcing----')
            predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[sample]), 
                                           torch.LongTensor(train_set_input_lens[sample]), 
                                           labels=torch.LongTensor(train_set_labels[sample]), 
                                           is_train=0, teaching_rate=1)
            _print_pairs(predicts, train_set_labels[sample])

            info_stamp = 'loss-{:2.9f}-batch_size-{:n}-epoch-{:n}-batch_id-({:n}/{:n})'.format(
                              loss_value, batch_size, epoch, batch_id, total_batches)
            print(info_stamp)

            #valid_set testing
            if batch_id%1000==1:
                # max(0, ...) keeps randint valid when the validation set is not
                # larger than one batch; slicing simply yields fewer samples then.
                rand_idx=random.randint(0, max(0, len(valid_set_inputs)-batch_size))
                valid = slice(rand_idx, rand_idx+batch_size)
                predicts = autoencoder.forward(torch.LongTensor(valid_set_inputs[valid]), 
                                               torch.LongTensor(valid_set_input_lens[valid]), 
                                               labels=[],#torch.LongTensor(valid_set_labels[rand_idx:rand_idx+batch_size]), 
                                               is_train=0, teaching_rate=1)
                real_sents = _decode_sents(predicts.tolist())
                label_real_sents = _decode_sents(valid_set_labels[valid])

                bleu_score, valid_num = data_set_bleu(label_real_sents, real_sents)
                # With 10 or fewer scored sentences the previous valid_bleu is
                # kept (0 at the start of the epoch).
                if valid_num>10:
                    valid_bleu = bleu_score/valid_num
                       
                info_stamp = 'loss-{:2.9f}-bleu-{:1.4f}-hidden_dim-{:n}-input_dim-{:n}-epoch-{:n}-batch_size-{:n}-batch_id-[{:n}-[of]-{:n}]-lr-{:1.4f}'.format(
                              loss_value, valid_bleu, hidden_dim, input_dim, epoch, batch_size, batch_id, total_batches, lr)
                print(valid_num, info_stamp)
                # Checkpoint file name = local timestamp + the metrics stamp above.
                time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-', time.localtime())
                torch.save(autoencoder.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))

        # Back to training mode for the next optimisation steps.
        autoencoder.train()
            
for epoch in range(epochs):
    model_train(epoch, batch_size, train_set_size)
    
print('running time: %.2f mins'%((time.time()-start_time)/60))

In [None]:
import torch
# Make GPU 0 the current CUDA device.
torch.cuda.set_device(0)

# Read a saved checkpoint with its tensors mapped onto the CPU, then print a
# marker to show the load finished.
checkpoint_path = './models_better/loss-3.966628313-bleu-0.3201-hidden_dim-512-input_dim-300-epoch-0-batch_size-200-batch_id-[1001-[of]-21743]-lr-0.0050'
pre_train = torch.load(checkpoint_path, map_location = 'cpu')
print('a')