In [126]:
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data import Vocabulary 
from allennlp.data.dataset import Dataset
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.modules import Attention, TextFieldEmbedder, Seq2SeqEncoder
from allennlp.modules.seq2seq_encoders.intra_sentence_attention import IntraSentenceAttentionEncoder
from allennlp.modules.token_embedders import Embedding
import os

In [90]:
HOME = os.path.expanduser("~")
DATA = HOME + "/data_buffer/WebQA/"
DEMO = DATA + 'demo.tsv'
EMBED_DIM = 300

In [91]:
def config(path_to_file):
    seq2seq_reader = Seq2SeqDatasetReader(source_tokenizer=WordTokenizer(),
                                     target_tokenizer=WordTokenizer(),
                                     source_token_indexers={'source_tokens':SingleIdTokenIndexer()},
                                     target_token_indexers={'target_tokens':SingleIdTokenIndexer()})

    dataset = seq2seq_reader.read(path_to_file)
    vocabs = Vocabulary.from_dataset(dataset)

    dataset.index_instances(vocabs)

    padding_lengths = dataset.get_padding_lengths()

    #array_dict = dataset.as_array_dict(padding_lengths, verbose=False)
    
    padding_lengths = dataset.get_padding_lengths()
    max_input_size  = padding_lengths['source_tokens']['num_tokens']
    max_output_size = padding_lengths['target_tokens']['num_tokens']
    
    word_embedding = Embedding(num_embeddings=vocabs.get_vocab_size("token_ids"), 
                           embedding_dim=EMBED_DIM)
    text_field_embedder = BasicTextFieldEmbedder({"tokens": word_embedding})

    attn_encoder = IntraSentenceAttentionEncoder(max_input_size)

    seq2seq = SimpleSeq2Seq(vocab=vocabs, source_embedder=text_field_embedder,
                            encoder=attn_encoder, max_decoding_steps=max_output_size,
                            target_namespace='target_tokens', target_embedding_dim=EMBED_DIM)
    
    return vocabs, seq2seq

In [92]:
vocab, s2s_model = config(DEMO)

4it [00:00, 19.92it/s]
100%|██████████| 4/4 [00:00<00:00, 2617.76it/s]
100%|██████████| 4/4 [00:00<00:00, 1811.01it/s]


In [93]:
s2s_model

SimpleSeq2Seq (
  (_source_embedder): BasicTextFieldEmbedder (
    (token_embedder_tokens): Embedding (
    )
  )
  (_encoder): IntraSentenceAttentionEncoder (
    (_matrix_attention): MatrixAttention (
      (_similarity_function): DotProductSimilarity (
      )
    )
  )
  (_target_embedder): Embedding (
  )
  (_decoder_cell): LSTMCell(300, 246)
  (_output_projection_layer): Linear (246 -> 2)
)

In [127]:
seq2seq_reader = Seq2SeqDatasetReader()

from allennlp.data import Vocabulary 


In [128]:
dataset = seq2seq_reader.read(DEMO)

4it [00:00, 20.76it/s]


In [129]:
instances = dataset.instances

In [132]:
new = Dataset(instances)

In [134]:
v = Vocabulary.from_instances(new)
#v._token_to_index

AttributeError: type object 'Vocabulary' has no attribute 'from_instances'

In [103]:
from collections import defaultdict
namespace_token_counts = defaultdict(lambda: defaultdict(int))

In [105]:
import tqdm
for instance in tqdm.tqdm(instances):
    instance.count_vocab_items(namespace_token_counts)

100%|██████████| 4/4 [00:00<00:00, 1875.39it/s]


In [106]:
v = Vocabulary(counter=namespace_token_counts)

In [108]:
v._token_to_index

_TokenToIndexDefaultDict(None,
                         {'tokens': {'!': 23,
                           '.': 2,
                           '?': 44,
                           '@@END@@': 48,
                           '@@PADDING@@': 0,
                           '@@START@@': 43,
                           '@@UNKNOWN@@': 1,
                           'a': 17,
                           'abnormally': 115,
                           'about': 105,
                           'additional': 109,
                           'after': 160,
                           'again': 166,
                           'ago': 104,
                           'all': 128,
                           'also': 47,
                           'am': 15,
                           'and': 4,
                           'anemia': 233,
                           'antihistamines': 135,
                           'anxiety': 125,
                           'any': 163,
                           'are': 24,
                      

In [110]:
instances[0].fields

{'source_tokens': <allennlp.data.fields.text_field.TextField at 0x1a215b6160>,
 'target_tokens': <allennlp.data.fields.text_field.TextField at 0x1a216174e0>}