In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

from nltk.translate.bleu_score import sentence_bleu
import time

from Vocab import Vocab

import torch
# Make CUDA device index 1 the current device for everything that follows.
# NOTE(review): the index is hard-coded; presumably this fails on machines
# with fewer than two visible GPUs -- confirm before running elsewhere.
torch.cuda.set_device(1)

print('import over')

import over


In [2]:
def tokenized_sent2real_sent(tokenized_sent, vocab):
    """Turn a sequence of token ids into a text string.

    Each token is mapped through ``vocab.token2word`` and the pieces are
    concatenated with no separator. Decoding stops at the first token equal
    to ``vocab.word2token['<eos>']``; that token and everything after it are
    dropped.
    """
    words = []
    for token_id in tokenized_sent:
        hit_eos = token_id == vocab.word2token['<eos>']
        if hit_eos:
            break
        words.append(vocab.token2word[token_id])
    return ''.join(words)

def reverse_tokenized_sent2real_sent(tokenized_sent, vocab):
    """Turn a sequence of token ids into text with the word order reversed.

    Tokens are read up to (not including) the first ``<eos>`` token, mapped
    through ``vocab.token2word``, and then joined last-word-first. Only the
    order of the words is reversed; each word's own characters are kept
    as they are.
    """
    words = []
    for token_id in tokenized_sent:
        hit_eos = token_id == vocab.word2token['<eos>']
        if hit_eos:
            break
        words.append(vocab.token2word[token_id])
    return ''.join(reversed(words))

def data_set_bleu(sents1, sents2):
    """Accumulate sentence-level BLEU over paired sentences.

    Each ``sents1`` entry is passed to ``sentence_bleu`` as the single
    reference and the matching ``sents2`` entry as the hypothesis; both are
    split into their individual elements with ``list``. A pair is skipped
    when the shorter of the two has fewer than 4 elements.

    Returns:
        ``(bleu_score_sum, cnt)``: the sum of the scores and the number of
        pairs that were actually scored (the caller computes the mean).
    """
    scored_pairs = 0
    total_bleu = 0

    for reference, hypothesis in zip(sents1, sents2):
        shorter_len = min(len(reference), len(hypothesis))
        if shorter_len >= 4:
            scored_pairs += 1
            total_bleu = sentence_bleu([list(reference)], list(hypothesis)) + total_bleu

    return total_bleu, scored_pairs

# Load the pickled vocabulary object used by every cell below.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# a vocab.pk that comes from a trusted source.
with open('vocab.pk', 'rb') as f:
    vocab=pickle.load(f)

In [3]:
# Sanity check: decode the same token list in forward and reversed word order.
print(tokenized_sent2real_sent([7,6,2,1,1], vocab))
print(reverse_tokenized_sent2real_sent([7,6,2,1,1], vocab))

表示同情噤声
噤声表示同情


In [4]:
# Training split: inputs, per-sentence input lengths, and labels.
# NOTE(review): these are pickle files; load them only from a trusted source.
with open('./data_set/train_set_inputs.pk', 'rb') as f:
    train_set_inputs = pickle.load(f)
with open('./data_set/train_set_input_lens.pk', 'rb') as f:
    train_set_input_lens = pickle.load(f)
with open('./data_set/train_set_labels.pk', 'rb') as f:
    train_set_labels = pickle.load(f)
    
# Validation split, same three pieces.
with open('./data_set/valid_set_inputs.pk', 'rb') as f:
    valid_set_inputs = pickle.load(f)
with open('./data_set/valid_set_input_lens.pk', 'rb') as f:
    valid_set_input_lens = pickle.load(f)
with open('./data_set/valid_set_labels.pk', 'rb') as f:
    valid_set_labels = pickle.load(f)

In [5]:
# Print the size of every loaded list so mismatched splits are easy to spot.
print(len(train_set_inputs), len(train_set_input_lens), len(train_set_labels), 
      len(valid_set_input_lens), len(valid_set_inputs), len(valid_set_labels))

# Flag any validation sentence whose recorded length is 2 or less.
for sent_len in valid_set_input_lens:
    if sent_len<=2:
        print('why')

8697216 8697216 8697216 1185984 1185984 1185984


In [6]:
class Encoder(nn.Module):
    """Bidirectional LSTM sentence encoder on top of a pre-trained embedding table.

    The embedding matrix has one row per entry of ``vocab.word2token`` and is
    initialised from the pickled array in ``pre_train_word_embedding.pk``.
    """

    def __init__(self, use_cuda, hidden_dim, input_dim, vocab):
        super(Encoder, self).__init__()

        self.use_cuda = use_cuda
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab

        # Batch-first bidirectional LSTM over the embedded tokens.
        self.lstm = torch.nn.LSTM(
            input_size=self.input_dim,
            hidden_size=self.hidden_dim,
            bidirectional=True,
            batch_first=True,
        )

        # Embedding table, overwritten with the pre-trained word vectors.
        # NOTE(review): pickle.load runs arbitrary code from the file; the
        # embedding file must come from a trusted source.
        self.embed = nn.Embedding(len(self.vocab.word2token), input_dim)
        with open('pre_train_word_embedding.pk', 'rb') as f:
            pre_train_word_embedding = pickle.load(f)
        self.embed.weight.data.copy_(torch.FloatTensor(pre_train_word_embedding))

    def _select_rows(self, tensor, row_ids):
        """Pick rows of ``tensor`` along dim 0 using ``row_ids`` (moved to GPU when ``use_cuda``)."""
        index = Variable(row_ids)
        if self.use_cuda:
            index = index.cuda()
        return tensor.index_select(0, index)

    def order(self, inputs, inputs_len):
        """Sort a batch by descending length.

        Args:
            inputs: batch whose first dimension indexes sentences.
            inputs_len: 1D tensor with the length of each sentence.

        Returns:
            ``(sorted_inputs, sorted_lens, true_order_ids)`` where indexing a
            sorted batch with ``true_order_ids`` restores the original order.
        """
        inputs_len, sort_ids = torch.sort(inputs_len, dim=0, descending=True)
        inputs = self._select_rows(inputs, sort_ids)
        # The inverse permutation is the argsort of the sorting permutation.
        _, true_order_ids = torch.sort(sort_ids, dim=0, descending=False)
        return inputs, inputs_len, true_order_ids

    def forward(self, inputs, inputs_len):
        """Encode a batch of token-id sentences.

        Returns:
            ``(outputs, (hn, cn))`` in the caller's original batch order.
            ``outputs`` is batch-major; ``hn`` and ``cn`` are the final states
            of the two LSTM directions concatenated along dim 1.
        """
        inputs = Variable(inputs)
        if self.use_cuda:
            inputs = inputs.cuda()

        # Packing needs the batch sorted by descending length.
        inputs, sort_len, true_order_ids = self.order(inputs, inputs_len)
        in_vecs = self.embed(inputs)
        packed = rnn_utils.pack_padded_sequence(input=in_vecs, lengths=list(sort_len), batch_first=True)

        outputs, (hn, cn) = self.lstm(packed)
        outputs, sent_lens = rnn_utils.pad_packed_sequence(outputs)
        # pad_packed_sequence is called without batch_first, so swap the
        # first two dimensions to get a batch-major tensor back.
        outputs = outputs.transpose(0, 1)

        # Join the two directions' final states into one vector per sentence.
        hn = torch.cat((hn[0], hn[1]), dim=1)
        cn = torch.cat((cn[0], cn[1]), dim=1)

        # Everything is still in length-sorted order; undo the sort.
        outputs = self._select_rows(outputs, true_order_ids)
        hn = self._select_rows(hn, true_order_ids)
        cn = self._select_rows(cn, true_order_ids)

        return outputs, (hn, cn)

In [7]:
class Decoder(nn.Module):
    """Step-by-step LSTMCell decoder that shares the encoder's embedding table.

    The cell's hidden size is ``hidden_dim*2`` so that the concatenated
    bidirectional encoder states can be used directly as its initial state.
    """

    def __init__(self, use_cuda, encoder, hidden_dim, max_length=25):
        super(Decoder, self).__init__()

        self.use_cuda = use_cuda
        self.hidden_dim = hidden_dim
        self.input_dim = encoder.input_dim
        self.max_length = max_length
        self.vocab = encoder.vocab

        # Per-class loss weights: 1 for every token, 0 for <padding> so that
        # padded positions do not contribute to the loss.
        vocab_size = len(self.vocab.word2token)
        self.weight = [1]*vocab_size
        self.weight[self.vocab.word2token['<padding>']] = 0

        self.lstmcell = torch.nn.LSTMCell(input_size=self.input_dim, hidden_size=self.hidden_dim*2, bias=True)

        # Same module object as the encoder's embedding (shared by reference).
        self.embed = encoder.embed
        # Projects the hidden state to vocabulary logits for the cross-entropy loss.
        self.fcnn = nn.Linear(in_features=self.hidden_dim*2, out_features=vocab_size)

        self.softmax = nn.Softmax()
        self.cost_func = nn.CrossEntropyLoss(torch.Tensor(self.weight))

        print('init lookup embedding matrix size: ', self.embed.weight.data.size())

    def forward(self, enc_outputs, sent_lens, h0_and_c0, labels, teaching_rate=0.6, is_train=1):
        """Decode ``max_length+1`` tokens per sentence.

        Args:
            enc_outputs: encoder outputs; only their first dimension (the
                batch size) is read here.
            sent_lens: accepted for interface compatibility; not used.
            h0_and_c0: ``(h0, c0)`` initial state for the LSTM cell.
            labels: target token ids, indexed as ``labels[:, step]``; only
                read when ``is_train`` is truthy.
            teaching_rate: probability, per step after the first, of feeding
                the previous gold token instead of the previous prediction.
            is_train: when truthy, accumulate the loss and allow teacher forcing.

        Returns:
            ``(mean_step_loss, predicts)`` when ``is_train`` is truthy,
            otherwise just ``predicts``. ``predicts`` is a numpy array with
            one row of greedily chosen token ids per sentence.

        NOTE(review): a step at which every label is ``<padding>`` has zero
        total class weight; confirm the averaged loss stays finite then.
        """
        labels = Variable(labels)
        if self.use_cuda:
            labels = labels.cuda()

        all_loss = 0
        step_predictions = []
        batch_size = enc_outputs.size(dim=0)
        state = h0_and_c0
        max_idxs = None

        for step in range(self.max_length+1):
            if step == 0:
                # The first input is always the <sos> token.
                sos_ids = Variable(torch.LongTensor([self.vocab.word2token['<sos>']]*batch_size))
                if self.use_cuda:
                    sos_ids = sos_ids.cuda()
                step_input = self.embed(sos_ids)
            elif is_train and random.random() < teaching_rate:
                # Teacher forcing: feed the gold token of the previous step.
                step_input = self.embed(labels[:, step-1])
            else:
                # Feed back the model's own previous prediction.
                step_input = self.embed(max_idxs)

            hidden, cell = self.lstmcell(step_input, state)
            state = (hidden, cell)
            logits = self.fcnn(hidden)

            if is_train:
                all_loss += self.cost_func(logits, labels[:, step])

            _, max_idxs = torch.max(logits, dim=1)
            step_predictions.append(torch.unsqueeze(max_idxs, dim=0))

        # Steps were stacked along dim 0; transpose to one row per sentence.
        predicts = torch.transpose(torch.cat(step_predictions, dim=0), 0, 1)
        predicts = predicts.data
        if self.use_cuda:
            predicts = predicts.cpu()
        predicts = predicts.numpy()

        if is_train:
            return all_loss/(self.max_length+1), predicts
        return predicts


In [8]:
class AutoEncoder(nn.Module):
    """Sequence autoencoder: an ``Encoder`` feeding its final state to a ``Decoder``.

    The decoder is built from the encoder, so both share one embedding table.
    """

    def __init__(self, use_cuda, input_dim, hidden_dim, vocab, max_length = 25):
        super(AutoEncoder, self).__init__()

        self.use_cuda = use_cuda
        self.enc = Encoder(use_cuda=use_cuda, hidden_dim=hidden_dim, input_dim=input_dim, vocab=vocab)
        self.dec = Decoder(use_cuda=use_cuda, encoder=self.enc, hidden_dim=hidden_dim, max_length=max_length)
        if use_cuda:
            self.enc = self.enc.cuda()
            self.dec = self.dec.cuda()

    def forward(self, inputs, input_lens, labels, is_train=1, teaching_rate=1):
        """Encode ``inputs`` and decode them again.

        Args:
            inputs: token ids, one row per sentence.
            input_lens: length of each sentence in ``inputs``.
            labels: target token ids; may be empty when ``is_train`` is falsy.
            is_train: when truthy the decoder also computes the loss.
            teaching_rate: teacher-forcing probability handed to the decoder.
                Previously this argument was ignored and the decoder always
                received 1; it is now forwarded. The default is still 1, so
                callers that omit it or pass 1 behave exactly as before.

        Returns:
            ``(loss, predicts)`` when ``is_train`` is truthy, otherwise
            ``predicts`` only -- whatever the decoder returns.
        """
        enc_outputs, (enc_hn, enc_cn) = self.enc(torch.LongTensor(inputs), torch.LongTensor(input_lens))
        return self.dec(enc_outputs=enc_outputs,
                        h0_and_c0=(enc_hn, enc_cn),
                        sent_lens=input_lens,
                        labels=torch.LongTensor(labels),
                        is_train=1 if is_train else 0,
                        teaching_rate=teaching_rate)

In [9]:
# use_cuda = 1
# hidden_dim = 512
# input_dim = 300

# enc = Encoder(use_cuda=use_cuda, 
#             hidden_dim=hidden_dim, 
#             input_dim=input_dim, 
#             vocab=vocab
#            )
# if use_cuda:
#     enc = enc.cuda()
    
# sample_num = 11
# print('sentences length: ', train_set_input_lens[0:sample_num])

# enc_outputs, (enc_hn, enc_cn) = enc(torch.LongTensor(train_set_inputs[0:sample_num]), 
#                                     torch.LongTensor(train_set_input_lens[0:sample_num]))
# print('enc result size: ', enc_outputs.size(), enc_hn.size(), enc_cn.size())

# dec = Decoder(use_cuda=use_cuda, encoder=enc, hidden_dim=hidden_dim, max_length=25)
# if use_cuda:
#     dec = dec.cuda()
    
# loss, predicts = dec(enc_outputs = enc_outputs, 
#                     h0_and_c0=(enc_hn, enc_cn), 
#                     sent_lens=train_set_input_lens[0:sample_num], 
#                     labels=torch.LongTensor(train_set_labels[0:sample_num]), 
#                     is_train=1, teaching_rate = 1
#                     )
# print('loss is %4.7f'%loss.data[0])

# autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, vocab = vocab, max_length = 25)
# loss, predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[0:sample_num]), 
#                                      torch.LongTensor(train_set_input_lens[0:sample_num]), 
#                                      labels=torch.LongTensor(train_set_labels[0:sample_num]), 
#                                      is_train=1, teaching_rate=1)
# print('autocoder: loss is %4.7f'%loss.data[0])

In [10]:
# Run configuration for this training session.
use_cuda = 1
hidden_dim = 256
input_dim = 300
lr=0.005
batch_size=200
# Only the first half of the loaded training examples is used.
train_set_size=int(len(train_set_inputs)/2)
epochs=10
train_bleu = 0
autoencoder = AutoEncoder(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 25)
# Resume from a previously saved state dict.
# Earlier checkpoint path, kept for reference:
#pre_train = torch.load('./models_better/loss-2.099016905-bleu-0.4078-hidden_dim-512-input_dim-300-epoch-0-batch_size-200-batch_id-[7001-[of]-21743]-lr-0.0050')
pre_train = torch.load('./models_better/time-[2019-01-07-16-38-14]-loss-1.881381631-bleu-0.5340-hidden_dim-256-input_dim-300-epoch-0-batch_size-200-batch_id-[6001-[of]-21743]-lr-0.0050')
autoencoder.load_state_dict(pre_train)

# Hand Adam only the parameters that require gradients.
optimizer = optim.Adam(filter(lambda p: p.requires_grad, autoencoder.parameters()), lr=lr)

# Wall-clock start, read again after training to report the running time.
start_time = time.time()

def _loss_to_float(loss):
    """Return the Python number held by a one-element loss tensor.

    Uses ``item()`` when the object has it; otherwise falls back to
    ``loss.data[0]``, the form needed by old PyTorch releases in which the
    loss is a one-element ``Variable`` without ``item``.
    """
    if hasattr(loss, 'item'):
        return loss.item()
    return loss.data[0]


def _decode_rows(token_rows, vocab):
    """Decode every row of token ids to text, each cut at its first ``<eos>``."""
    return [tokenized_sent2real_sent(row, vocab) for row in token_rows]


def model_train(epoch, batch_size, train_set_size):
    """Run one pass over the first ``train_set_size`` training examples.

    Reads the module-level ``autoencoder``, ``optimizer``, data-set lists and
    hyper-parameters. Every batch is trained with teacher forcing.

    Every 50 batches (``batch_id % 50 == 1``) a random slice of 10 training
    sentences is decoded twice -- with and without teacher forcing -- and
    printed next to its labels. Every 1000 batches (``batch_id % 1000 == 1``)
    a random validation slice of ``batch_size`` sentences is decoded, its mean
    BLEU is computed, and the model's state dict is saved under
    ``./models_saved/``.

    Args:
        epoch: epoch index; only used in the log and checkpoint names.
        batch_size: number of sentences per training batch.
        train_set_size: number of leading training examples to iterate over.
    """
    total_batches = int(train_set_size/batch_size)
    valid_bleu = 0

    # The "+1" keeps the last full batch when train_set_size is an exact
    # multiple of batch_size (the previous bound silently dropped it).
    batch_starts = range(0, train_set_size-batch_size+1, batch_size)

    for batch_id, start_idx in enumerate(batch_starts, start=1):
        end_idx = start_idx + batch_size

        optimizer.zero_grad()
        loss, predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[start_idx:end_idx]),
                                             torch.LongTensor(train_set_input_lens[start_idx:end_idx]),
                                             labels=torch.LongTensor(train_set_labels[start_idx:end_idx]),
                                             is_train=1, teaching_rate=1)
        loss.backward()
        optimizer.step()

        if batch_id%50 != 1:
            continue

        # ---- periodic progress report ----
        autoencoder.eval()
        model_vocab = autoencoder.enc.vocab
        loss_value = _loss_to_float(loss)

        sample_num = 10
        rand_idx = random.randint(0, train_set_size-sample_num-1)
        sample = slice(rand_idx, rand_idx+sample_num)
        label_real_sents = _decode_rows(train_set_labels[sample], model_vocab)

        # Decode the sample with teacher forcing.
        loss_, predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[sample]),
                                              torch.LongTensor(train_set_input_lens[sample]),
                                              labels=torch.LongTensor(train_set_labels[sample]),
                                              is_train=1, teaching_rate=1)
        # The sample loss is not needed; drop the reference right away.
        del loss_
        real_sents = _decode_rows(predicts.tolist(), model_vocab)

        print('train_set sample: ', rand_idx)
        for real_sent, label_real_sent in zip(real_sents, label_real_sents):
            print(real_sent, '----<o_o>----', label_real_sent)

        # Decode the same sample again, feeding back the model's own predictions.
        print('----no teaching forcing----')
        predicts = autoencoder.forward(torch.LongTensor(train_set_inputs[sample]),
                                       torch.LongTensor(train_set_input_lens[sample]),
                                       labels=torch.LongTensor(train_set_labels[sample]),
                                       is_train=0, teaching_rate=1)
        real_sents = _decode_rows(predicts.tolist(), model_vocab)

        for real_sent, label_real_sent in zip(real_sents, label_real_sents):
            print(real_sent, '----<o_o>----', label_real_sent)

        info_stamp = 'loss-{:2.9f}-batch_size-{:n}-epoch-{:n}-batch_id-({:n}/{:n})'.format(
                          loss_value, batch_size, epoch, batch_id, total_batches)
        print(info_stamp)

        # ---- periodic validation + checkpoint ----
        if batch_id%1000 == 1:
            rand_idx = random.randint(0, len(valid_set_inputs)-batch_size-1-1)
            valid_slice = slice(rand_idx, rand_idx+batch_size)
            # Labels are not needed for pure decoding, so an empty list is passed.
            predicts = autoencoder.forward(torch.LongTensor(valid_set_inputs[valid_slice]),
                                           torch.LongTensor(valid_set_input_lens[valid_slice]),
                                           labels=[],
                                           is_train=0, teaching_rate=1)
            real_sents = _decode_rows(predicts.tolist(), model_vocab)
            label_real_sents = _decode_rows(valid_set_labels[valid_slice], model_vocab)

            bleu_score, valid_num = data_set_bleu(label_real_sents, real_sents)
            # Keep the previous value when too few pairs were long enough to score.
            if valid_num > 10:
                valid_bleu = bleu_score/valid_num

            info_stamp = 'loss-{:2.9f}-bleu-{:1.4f}-hidden_dim-{:n}-input_dim-{:n}-epoch-{:n}-batch_size-{:n}-batch_id-[{:n}-[of]-{:n}]-lr-{:1.4f}'.format(
                              loss_value, valid_bleu, hidden_dim, input_dim, epoch, batch_size, batch_id, total_batches, lr)
            print(valid_num, info_stamp)

            time_stamp = time.strftime('time-[%Y-%m-%d-%H-%M-%S]-', time.localtime())
            torch.save(autoencoder.state_dict(), ''.join(['./models_saved/', time_stamp, info_stamp]))

        autoencoder.train()
            
# Train for the configured number of epochs over the same training subset.
for epoch in range(epochs):
    model_train(epoch, batch_size, train_set_size)
    
# Report total wall-clock time since start_time, in minutes.
print('running time: %.2f mins'%((time.time()-start_time)/60))

init lookup embedding matrix size:  torch.Size([98638, 300])
train_set sample:  718657
我不能<low_freq><low_freq>那 ----<o_o>---- 我不能<low_freq><low_freq>那
我是说小狗。 ----<o_o>---- 我是说小狗。
市政将得到这一和财政持续有持续持续<low_freq>系统所运行的储备目标。 ----<o_o>---- 衰退将考验葡萄牙和爱尔兰是否有能力实现<low_freq>计划所设定的紧缩目标。
<low_freq>的收入技术的继续中国的，，<low_freq>技术的技术放缓，对，将将继续成为投资热点。 ----<o_o>---- <low_freq>的优秀企业有望扭转利润下滑趋势，某些企业的技术创新能力值得关注，数控机床将继续成为投资热点。
我可不想陪在你中间上吧才不去收拾你的好处呢。 ----<o_o>---- 我可不想夹在你们俩中间我才不去收拾你的烂摊子呢。
是啊—激烈的噢我猜。 ----<o_o>---- 是啊—短暂的罗曼蒂克我猜。
我是你的老板。你为我工作。 ----<o_o>---- 我是你的老板。你为我工作。
如果以否认自己的意愿意愿，而是它将将对经济放缓作用作用作用，因为无序式富人或地表 ----<o_o>---- 如果政府按照自己的意愿行事，那么水资源短缺将对经济转型起到一定作用，因为耗水型纺织厂或造纸厂
他们必须开启！ ----<o_o>---- 他们必须加快速度！
做错事的时候，她<low_freq><low_freq>说她一个老师，这位秘书她在的的的的1717种。 ----<o_o>---- 二十出头的时候，法兰西斯•<low_freq><low_freq>接到一个电话，对方邀请她领导宾夕法尼亚州约翰斯<low_freq>的女童军17团。
----no teaching forcing----
我不能<low_freq><low_freq>那 ----<o_o>---- 我不能<low_freq><low_freq>那
我是说小狗。 ----<o_o>---- 我是说小狗。
市政已经实施群众在持续所依赖私人目标进行巨大是否更加的货币管制。 ----<o_o>---- 衰退将考验葡萄牙和爱尔兰是否有能

KeyboardInterrupt: 