In [1]:
import json
import pickle
import random

import torch
from torch import nn, optim
from torch import autograd
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import torch.nn.utils.rnn as rnn_utils

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
import time
import copy

from Vocab import Vocab
from LanguageModel import LanguageModel
from Seq2Seq_att import Seq2Seq_att

# NOTE(review): `torch` is already imported earlier in this cell; this second import is redundant but harmless.
import torch
# Select CUDA device index 1 as the current device for this process.
# NOTE(review): the index is hard-coded — presumably this fails on a machine with fewer than two GPUs; confirm.
torch.cuda.set_device(1)

# Marker printed once the import cell has finished running.
print('import over')

import over
import over


In [2]:
def batch_words2sentence(words_list):
    """Join every list of words into one space-separated sentence.

    para: words_list is an iterable of word-string sequences
    return: list with one sentence string per entry of words_list
    """
    return list(map(' '.join, words_list))
def batch_tokens2words(tokens_list, vocab):
    """Translate token ids back to words through ``vocab.token2word``.

    para: tokens_list is list[list] type
    return: words_list corresponding to tokens (same nesting as the input)
    """
    words_list = []
    for tokens in tokens_list:
        words_list.append([vocab.token2word[token] for token in tokens])
    return words_list

def batch_tokens_remove_eos(tokens_list, vocab):
    """Cut every token sequence just before its first '<eos>' token.

    para: tokens_list is list[list] type
    return: new list of lists; each keeps the tokens that precede the first
            '<eos>' (the '<eos>' itself and everything after it are dropped).
            Sequences without '<eos>' are copied whole.
    """
    def _until_eos(tokens):
        kept = []
        for token in tokens:
            if token == vocab.word2token['<eos>']:
                return kept
            kept.append(token)
        return kept

    return [_until_eos(tokens) for tokens in tokens_list]

def batch_tokens_bleu(references, candidates, smooth_epsilon=0.001):
    """Compute one sentence-level BLEU score per (reference, candidate) pair.

    para: references and candidates are list[list] type
    para: smooth_epsilon is the epsilon handed to nltk's SmoothingFunction (method1)
    return: list of BLEU for every sample; a pair in which either side has
            fewer than 4 tokens is scored 0 without calling nltk
    """
    def _pair_score(ref, candidate):
        shortest = min(len(ref), len(candidate))
        if shortest < 4:
            return 0
        smoother = SmoothingFunction(epsilon=smooth_epsilon).method1
        return sentence_bleu([ref], candidate, smoothing_function = smoother)

    return [_pair_score(ref, candidate) for ref, candidate in zip(references, candidates)]

# Load the pickled vocabulary into the module-level `vocab` that later cells read.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
# NOTE(review): unpickling presumably needs the project's `Vocab` class to be importable — confirm.
with open('data_set/vocab.pk', 'rb') as f:
    vocab=pickle.load(f)

    
def seqs_split(seqs, vocab):
    """Split every token sequence into the parts before and after '<split>'.

    Each sequence is first cut at its first '<eos>' (via
    batch_tokens_remove_eos). Tokens up to the first '<split>' go to the first
    part; all later tokens go to the second part. '<split>' tokens themselves
    are never kept, including any repeated ones.

    return: (first_parts, second_parts), two lists aligned with seqs
    """
    first_parts = []
    second_parts = []
    for seq in batch_tokens_remove_eos(seqs, vocab):
        halves = ([], [])
        which = 0  # index of the half currently being filled
        for token in seq:
            if token == vocab.word2token['<split>']:
                which = 1
            else:
                halves[which].append(token)
        first_parts.append(halves[0])
        second_parts.append(halves[1])

    return first_parts, second_parts

def simple_sents_concat(simple_sent1s, simple_sent2s, vocab, max_length):
    """Join sentence pairs as ``sent1 + ['<split>'] + sent2`` with fixed length.

    Every joined sequence is truncated to ``max_length`` tokens if it is
    longer, and otherwise right-padded with '<padding>' up to ``max_length``.

    Unlike the previous implementation, the caller's ``simple_sent1s`` lists
    are not modified: the work is done on copies.

    para: simple_sent1s / simple_sent2s are list[list] of token ids, aligned by index
    para: vocab provides word2token with '<split>' and '<padding>'
    para: max_length is the length of every returned sequence
    return: (simple_sents, simple_sent_lens) where simple_sent_lens holds the
            length of each joined sequence after truncation and before padding
    raises: IndexError if simple_sent2s is longer than simple_sent1s
    """
    # Copy the first halves so appending below never touches the caller's data.
    simple_sents = [list(sent) for sent in simple_sent1s]
    simple_sent_lens = []
    for i, sent in enumerate(simple_sent2s):
        merged = simple_sents[i]
        merged.append(vocab.word2token['<split>'])
        merged.extend(sent)

        # The joined sequence may exceed max_length (the original author notes
        # this happens when the inputs contained no <split>), so clip it.
        if len(merged) > max_length:
            merged = merged[:max_length]

        simple_sent_lens.append(len(merged))

        missing = max_length - len(merged)
        if missing > 0:
            merged.extend([vocab.word2token['<padding>']] * missing)

        simple_sents[i] = merged

    return simple_sents, simple_sent_lens


def get_lm_inputs_and_labels(sents, vocab, max_length):
    """Build fixed-length language-model inputs and labels from token sequences.

    For each sentence the input is ``['<sos>'] + tokens`` and the label is
    ``tokens + ['<eos>']``; both are right-padded with '<padding>' to
    ``max_length``. A sentence with ``max_length`` or more tokens is first cut
    to ``max_length - 1`` tokens so the added marker still fits.

    The previous implementation truncated only a local variable, so over-long
    sentences were returned untruncated and without '<sos>'/'<eos>'. Here the
    truncated sequences are the ones actually returned.

    para: sents is list[list] of token ids (not modified)
    para: vocab provides word2token with '<sos>', '<eos>' and '<padding>'
    return: (lm_inputs, lm_input_lens, lm_labels); lm_input_lens is the input
            length including '<sos>' and excluding padding
    """
    word2token = vocab.word2token
    lm_inputs = []
    lm_input_lens = []
    lm_labels = []

    for sent in sents:
        body = list(sent)
        if len(body) >= max_length:
            # Leave room for the single <sos>/<eos> marker.
            body = body[:max_length-1]

        lm_input = [word2token['<sos>']] + body
        lm_label = body + [word2token['<eos>']]
        lm_input_lens.append(len(lm_input))

        # Input and label always have equal length, so they share one pad count.
        missing = max_length - len(lm_input)
        if missing > 0:
            padding = [word2token['<padding>']] * missing
            lm_input += padding
            lm_label += padding

        lm_inputs.append(lm_input)
        lm_labels.append(lm_label)

    return lm_inputs, lm_input_lens, lm_labels


def duplicate_reconstruct_labels(sents, topk):
    """Repeat every element of sents topk times in a row, keeping order.

    The repeated entries are the same objects, not copies.
    """
    repeated = []
    for sent in sents:
        repeated.extend([sent] * topk)
    return repeated


def batch_tokens_bleu_split_version(references, candidates, vocab, smooth_epsilon=0.001):
    """Score each sample as the mean BLEU of its two '<split>'-separated halves.

    References and candidates are cut at their first '<eos>' and split at
    '<split>' by seqs_split, so callers do not need to strip '<eos>' first.
    The first halves are scored against each other, likewise the second
    halves, and the two scores are averaged per sample.

    para: references and candidates are list[list] of token ids
    para: vocab provides word2token with '<eos>' and '<split>'
    para: smooth_epsilon is forwarded to batch_tokens_bleu (previously it was
          accepted but ignored, so the default was always used)
    return: list with one averaged BLEU value per sample
    """
    ref1, ref2 = seqs_split(references, vocab)
    cand1, cand2 = seqs_split(candidates, vocab)
    bleu_simple_sent1s = batch_tokens_bleu(ref1, cand1, smooth_epsilon=smooth_epsilon)
    bleu_simple_sent2s = batch_tokens_bleu(ref2, cand2, smooth_epsilon=smooth_epsilon)
    return [(first + second)/2
            for first, second in zip(bleu_simple_sent1s, bleu_simple_sent2s)]


def set_model_grad(model, is_grad):
    """Assign is_grad to requires_grad on every item yielded by model.parameters()."""
    for weight in model.parameters():
        setattr(weight, 'requires_grad', is_grad)

In [3]:
# Load the validation split from three pickles: the model inputs, their
# lengths, and the labels. Later cells pass these globals to the evaluation
# functions.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
# NOTE(review): presumably the three files are index-aligned; nothing here checks that.
with open('./data_set2/split_data_set/validation_complex_sents.pk', 'rb') as f:
    split_valid_set_inputs = pickle.load(f)
with open('./data_set2/split_data_set/validation_complex_sent_lens.pk', 'rb') as f:
    split_valid_set_input_lens = pickle.load(f)
with open('./data_set2/split_data_set/validation_labels.pk', 'rb') as f:
    split_pseudo_valid_set_labels = pickle.load(f)


In [4]:
def split_model_eval(model, inputs, input_lens, labels):
    """Return the mean split-BLEU of the model's own decoding over a dataset.

    The data is processed in slices of the notebook-global ``batch_size``;
    the notebook-global ``vocab`` is used for scoring. The model is called
    without labels (no teacher forcing) and its predictions are scored against
    ``labels`` with batch_tokens_bleu_split_version.

    para: model must expose forward(inputs, input_lens, labels=, is_train=, teaching_rate=)
    para: inputs, input_lens and labels are index-aligned lists
    return: sum of per-sample BLEU divided by len(inputs)
    raises: ZeroDivisionError if inputs is empty
    """
    dataset_size = len(inputs)
    print(dataset_size)
    scores_no_ground_truth=0
    # Evaluation only: disable autograd so no graph is recorded for the
    # forward passes. The decoded tokens, and therefore the score, are unchanged.
    with torch.no_grad():
        for idx in range(0, dataset_size, batch_size):

            #no teacher forcing
            predicts = model.forward(torch.LongTensor(inputs[idx:idx+batch_size]),
                                     torch.LongTensor(input_lens[idx:idx+batch_size]),
                                     labels=[],
                                     is_train=0, teaching_rate=1)
            bleu_scores = batch_tokens_bleu_split_version(references = labels[idx:idx+batch_size],
                                                         candidates = predicts,
                                                         smooth_epsilon=0.001,
                                                         vocab=vocab)
            # Accumulate sample by sample to keep the summation order stable.
            for x in bleu_scores:
                scores_no_ground_truth+=x
    return scores_no_ground_truth/dataset_size


def split_model_eval_topk(model, inputs, input_lens, labels, topk):
    """Return the mean split-BLEU using ``model.dec.decode_topk_seqs`` decoding.

    The data is processed in slices of the notebook-global ``batch_size``;
    the notebook-global ``vocab`` is used for scoring. From the decoded
    sequences only every ``topk``-th one (indices 0, topk, 2*topk, ...) is
    scored against ``labels``.
    NOTE(review): presumably decode_topk_seqs returns topk consecutive
    hypotheses per input with the best one first — confirm in the decoder.

    para: model must expose .enc and .dec.decode_topk_seqs(enc, inputs=, input_lens=, topk=)
    para: inputs, input_lens and labels are index-aligned lists
    return: sum of per-sample BLEU divided by len(inputs)
    raises: ZeroDivisionError if inputs is empty
    """
    dataset_size = len(inputs)
    print(dataset_size)
    scores_no_ground_truth=0
    # Evaluation only: disable autograd so no graph is recorded while decoding.
    with torch.no_grad():
        for idx in range(0, dataset_size, batch_size):
            # The returned log-probabilities are not needed for scoring.
            dec_seqs, _log_probs = model.dec.decode_topk_seqs(model.enc, inputs=torch.LongTensor(inputs[idx:idx+batch_size]),
                                                              input_lens=torch.LongTensor(input_lens[idx:idx+batch_size]),
                                                              topk=topk)
            # Keep the first sequence of every group of topk.
            predicts = [dec_seqs[ii] for ii in range(len(dec_seqs)) if ii%topk==0]

            bleu_scores = batch_tokens_bleu_split_version(references = labels[idx:idx+batch_size],
                                                         candidates = predicts,
                                                         smooth_epsilon=0.001,
                                                         vocab=vocab)
            # Accumulate sample by sample to keep the summation order stable.
            for x in bleu_scores:
                scores_no_ground_truth+=x

    return scores_no_ground_truth/dataset_size

In [5]:
# Configuration shared by the rest of the notebook.
use_cuda = 1
hidden_dim = 256
input_dim = 100
# NOTE(review): lr is not used by any code visible in this notebook.
lr=0.005
# Read as a global by split_model_eval / split_model_eval_topk; later cells reassign it.
batch_size=35

# NOTE(review): epochs and the train_bleu_* values are not used by any code
# visible in this notebook — presumably left over from a training script.
epochs=10000
train_bleu_mean=-1
train_bleu_max=-1
# Build the attention seq2seq model; its weights are loaded from a checkpoint below.
split_model = Seq2Seq_att(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 61)

# fusion_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
#                           vocab = vocab, max_length = 51)
#pre train para
split_model_path = './models_saved/time-[2019-03-24-21-45-10]-info=[pretrain_split-att-20per]-loss=0.359141707-bleu=0.6181-hidden_dim=256-input_dim=100-epoch=5-batch_size=180-batch_id=[501-[of]-1099]-lr=0.0050'
# fusion_model_path = './models_saved/time-[2019-03-10-13-23-11]-info=[pre-trained_fusion_model-20per]-loss=0.346116364-bleu=0.7466-hidden_dim=256-input_dim=100-epoch=4-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'

# Load the checkpoint onto the CPU first; the model is moved to the GPU afterwards.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
pre_train = torch.load(split_model_path, map_location='cpu')
split_model.load_state_dict(pre_train)
# pre_train = torch.load(fusion_model_path, map_location='cpu')
# fusion_model.load_state_dict(pre_train)

if use_cuda:
    split_model = split_model.cuda()
#     fusion_model = fusion_model.cuda()


init lookup embedding matrix size:  torch.Size([44380, 100])


In [6]:
#case
# Two example (sentence, label) pairs. The first pair is overwritten right
# away by the second, so only the second one is actually evaluated.
sent = 'greene married denny miller in 1941 who died in 1991 .'
label ='greene married denny miller in 1941 . <split> they were married until her death in 1991 .'

sent = 'the school has two campuses , with around 3,000 students at the dover campus and 2,400 at the east campus .'
label = 'the school has two campuses , dover and east . <split> there are currently around 3,000 students on dover campus and 2,400 on east campus .'


# Tokenize the input sentence on single spaces; words missing from the
# vocabulary map to the '<low_freq>' token.
tokenized_sent = []
sent=sent.split(' ')
for word in sent:
    if word in vocab.word2token:
        tokenized_sent.append(vocab.word2token[word])
    else:
        tokenized_sent.append(vocab.word2token['<low_freq>'])
# print(tokenized_sent)
# `sent` is now a list of words, so this is the word count.
sent_len = len(sent)

# Tokenize the label the same way as the input sentence.
tokenized_label=[]
label = label.split(' ')
for word in label:
    if word in vocab.word2token:
        tokenized_label.append(vocab.word2token[word])
    else:
        tokenized_label.append(vocab.word2token['<low_freq>'])

        
# model with att
# Decode without labels (no teacher forcing) for a batch of one sentence.
predicts = split_model.forward(torch.LongTensor([tokenized_sent]),
                                 torch.LongTensor([sent_len]),
                                 labels=[],
                                 is_train=0, teaching_rate=1)

bleu_scores = batch_tokens_bleu_split_version(references = [tokenized_label],
                                             candidates = predicts,
                                             smooth_epsilon=0.001,
                                             vocab=vocab)
print(bleu_scores)

# Turn the predicted token ids back into a readable sentence.
predicts = batch_tokens_remove_eos(predicts, vocab)
predicts = batch_tokens2words(predicts, vocab)
predicts_sents = batch_words2sentence(predicts)
print(predicts_sents[0])

# Repeat the same example with decode_topk_seqs, keeping only every topk-th
# decoded sequence (index 0, topk, ...).
topk=2
dec_seqs, log_probs = split_model.dec.decode_topk_seqs(split_model.enc, inputs=torch.LongTensor([tokenized_sent]),
                                                         input_lens=torch.LongTensor([sent_len]),
                                                         topk=topk)
predicts = []
for ii in range(len(dec_seqs)):
    if ii%topk==0:
        predicts.append(dec_seqs[ii])

bleu_scores = batch_tokens_bleu_split_version(references = [tokenized_label],
                                             candidates = predicts,
                                             smooth_epsilon=0.001,
                                             vocab=vocab)
print(bleu_scores)

predicts = batch_tokens_remove_eos(predicts, vocab)
predicts = batch_tokens2words(predicts, vocab)
predicts_sents = batch_words2sentence(predicts)
print(predicts_sents[0])

[0.3571125471626948]
the school has two campuses . <split> around 3,000 students at the dover campus and 2,400 at the east campus .


In [8]:
# Validation-set score of split_model with top-k decoding (topk=2).
# batch_size is the notebook global that split_model_eval_topk reads.
batch_size=35
score = split_model_eval_topk(model=split_model, 
                             inputs=split_valid_set_inputs, 
                             input_lens=split_valid_set_input_lens, 
                             labels=split_pseudo_valid_set_labels,
                             topk=2)

print(score)

5000
0.6862993168709564


In [7]:
# Validation-set score of split_model with its regular forward decoding.
# batch_size is the notebook global that split_model_eval reads.
batch_size=100
score = split_model_eval(model=split_model, 
                         inputs=split_valid_set_inputs, 
                         input_lens=split_valid_set_input_lens, 
                         labels=split_pseudo_valid_set_labels)

print(score)

5000
0.6449701481493929


In [26]:
# NOTE(review): this import sits in the middle of the notebook rather than in the first cell.
from Seq2Seq import Seq2Seq

#copy
# Second model to compare against split_model, built with the same dimensions.
split_model2 = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
                          vocab = vocab, max_length = 61)

# fusion_model = Seq2Seq(use_cuda = use_cuda, input_dim = input_dim, hidden_dim = hidden_dim, 
#                           vocab = vocab, max_length = 51)
#pre train para
# split_model_path is reassigned four times; only the last assignment takes
# effect, so the checkpoint loaded below is the '...-unsuper' one.
split_model_path = './models_saved/time-[2019-03-24-21-30-26]-info=[pre-trained_split_model-20per]-loss=0.543618917-bleu=0.6642-hidden_dim=256-input_dim=100-epoch=2-batch_size=100-batch_id=[1-[of]-1979]-lr=0.0050'
split_model_path = './models_saved/time-[2019-03-26-09-50-00]-info=[pre-trained_split_model-5per]-loss=0.243756816-bleu=0.6464-hidden_dim=256-input_dim=100-epoch=12-batch_size=100-batch_id=[1-[of]-494]-lr=0.0050'
split_model_path = './models_saved/time-[2019-03-25-13-32-25]-info=[pre-trained_split_model-5per]-loss=0.368953973-bleu=0.6889-hidden_dim=256-input_dim=100-epoch=5-batch_size=100-batch_id=[1-[of]-494]-lr=0.0050'
split_model_path = './models_saved/time-[2019-03-30-13-37-54]-info=[pre-trained_split_model-unsuper]-loss=0.000472637-bleu=0.6470-hidden_dim=256-input_dim=100-epoch=0-batch_size=100-batch_id=[6501-[of]-7919]-lr=0.0050'


# Load the checkpoint onto the CPU first; the model is moved to the GPU afterwards.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
pre_train = torch.load(split_model_path, map_location='cpu')
split_model2.load_state_dict(pre_train)

if use_cuda:
    split_model2 = split_model2.cuda()

init lookup embedding matrix size:  torch.Size([44380, 100])


In [27]:
#model with att and copy
# Same single-example check as for split_model, now on split_model2.
# Reuses tokenized_sent, sent_len and tokenized_label from the earlier example cell.
predicts = split_model2.forward(torch.LongTensor([tokenized_sent]),
                                 torch.LongTensor([sent_len]),
                                 labels=[],
                                 is_train=0, teaching_rate=1)

bleu_scores = batch_tokens_bleu_split_version(references = [tokenized_label],
                                             candidates = predicts,
                                             smooth_epsilon=0.001,
                                             vocab=vocab)
print(bleu_scores)

# Turn the predicted token ids back into a readable sentence.
predicts = batch_tokens_remove_eos(predicts, vocab)
predicts = batch_tokens2words(predicts, vocab)
predicts_sents = batch_words2sentence(predicts)
print(predicts_sents[0])



# Top-k decoding of the same example, keeping only every topk-th decoded
# sequence (index 0, topk, ...).
topk=2
dec_seqs, log_probs = split_model2.dec.decode_topk_seqs(split_model2.enc, inputs=torch.LongTensor([tokenized_sent]),
                                                         input_lens=torch.LongTensor([sent_len]),
                                                         topk=topk)
predicts = []
for ii in range(len(dec_seqs)):
    if ii%topk==0:
        predicts.append(dec_seqs[ii])

bleu_scores = batch_tokens_bleu_split_version(references = [tokenized_label],
                                             candidates = predicts,
                                             smooth_epsilon=0.001,
                                             vocab=vocab)
print(bleu_scores)

predicts = batch_tokens_remove_eos(predicts, vocab)
predicts = batch_tokens2words(predicts, vocab)
predicts_sents = batch_words2sentence(predicts)
print(predicts_sents[0])

[0.35787900668872624]
the school has two campuses , with around 3,000 students . <split> at the dover campus and 2,400 at the east campus .
[0.35787900668872624]
the school has two campuses , with around 3,000 students . <split> at the dover campus and 2,400 at the east campus .


In [29]:
# Validation-set score of split_model2 with top-k decoding (topk=2).
# batch_size is the notebook global that split_model_eval_topk reads.
batch_size=35
score = split_model_eval_topk(model=split_model2, 
                             inputs=split_valid_set_inputs, 
                             input_lens=split_valid_set_input_lens, 
                             labels=split_pseudo_valid_set_labels,
                             topk=2)

print(score)

5000
0.6578814041853408


In [28]:
# Validation-set score of split_model2 with its regular forward decoding.
# batch_size is the notebook global that split_model_eval reads.
batch_size=100
score = split_model_eval(model=split_model2, 
                         inputs=split_valid_set_inputs, 
                         input_lens=split_valid_set_input_lens, 
                         labels=split_pseudo_valid_set_labels)

print(score)

5000
0.6579575758892041


In [30]:
# Vocabulary size: the number of entries in the word -> token mapping.
print(len(vocab.word2token))

44380
