In [2]:
import numpy as np
import torch
import pickle
import os
from packages.vocab import Vocab
from packages.batch import Batch

In [62]:
# hyperparameters
hidden_dim = 256
embed_dim = 128
batch_size = 16
num_samples = 200
max_encoder_steps = 200
max_decoder_steps = 50
beam_size = 4
min_decoder_steps = 20 # min size of generated sequence
vocab_size = 500
lr = 0.15
adagrad_init_acc = 0.1 # deprecated for pytorch
rand_unif_init_mag = 0.02 # magnitude for lstm cells during random init
trunc_norm_init_std = 1e-4 # std of truncated norm initialization
max_grad_norm = 2.0 # so they do apply gradient clipping
max_oovs = 20 # maximum number of oovs allowed?
coverage_loss = 1.0 # lambda

In [None]:
# get vocabulary
vocab = Vocab(500)
vocab.w2i = np.load('word2idx.npy').item()
vocab.i2w = np.load('idx2word.npy').item()
vocab.count = len(vocab.w2i)

In [63]:
# get dataset in batches
file_dir = '/home/jatin/intern/internenv/cnn/2.stories_tokenized_100/'
file_list = os.listdir(file_dir)
batch = Batch(file_list,200,50,20)

In [64]:
batch.init_minibatch()
with open(os.path.join(file_dir,file_list[19])) as f:
    minibatch = f.read()
    minibatch = minibatch.split('\n\n')
    minibatch = [line for line in minibatch if not line.startswith(":==:")]
stories, summaries = batch.process_minibatch(minibatch,vocab)

In [65]:
idx=9

' '.join(vocab.idx_list_to_word_list(stories[idx]))

"( cnn ) -- two people were <UNK> and around <UNK> were <UNK> when a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> , the <UNK> of <UNK> <UNK> said wednesday . the <UNK> <UNK> at <UNK> <UNK> ( just before <UNK> <UNK> <UNK> ) . the <UNK> was near the city of <UNK> , about <UNK> miles south of the <UNK> , <UNK> . <UNK> 's <UNK> <UNK> <UNK> <UNK> wednesday to <UNK> the <UNK> . <UNK> <UNK> were <UNK> , <UNK> the hospital in <UNK> , but no <UNK> were <UNK> , the <UNK> said , <UNK> <UNK> of the city were <UNK> <UNK> or <UNK> . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

In [66]:
' '.join(vocab.idx_list_to_word_list(stories[idx],batch.idx2oov_list[idx]))
unked = batch.unk_minibatch(summaries[idx],vocab)
' '.join(vocab.idx_list_to_word_list(unked,batch.idx2oov_list[idx]))

'<SOS> around <UNK> were <UNK> . the <UNK> <UNK> just before 2 <UNK> ( <UNK> <UNK> <UNK> ) . its <UNK> was in <UNK> <UNK> . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [67]:
' '.join(vocab.idx_list_to_word_list(summaries[idx],batch.idx2oov_list[idx]))

'<SOS> around 100 were injured . the <UNK> happened just before 2 a.m. ( 9 p.m. et ) . its epicenter was in central serbia . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [68]:
' '.join(vocab.idx_list_to_word_list(summaries[idx]))

'<SOS> around <UNK> were <UNK> . the <UNK> <UNK> just before 2 <UNK> ( <UNK> <UNK> <UNK> ) . its <UNK> was in <UNK> <UNK> . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [69]:
batch.oov2idx_list[idx]

{'100': 501,
 '1:56': 514,
 '5.4': 504,
 '9': 516,
 'a.m.': 515,
 'affairs': 512,
 'central': 507,
 'earthquake': 505,
 'epicenter': 519,
 'et': 518,
 'happened': 513,
 'injured': 502,
 'internal': 511,
 'killed': 500,
 'magnitude': 503,
 'ministry': 510,
 'overnight': 509,
 'p.m.': 517,
 'serbia': 508,
 'struck': 506}