In [2]:
from allennlp.data.tokenizers import Token, Tokenizer, WordTokenizer
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data import Vocabulary 
from allennlp.data.dataset import Dataset
from allennlp.models.encoder_decoders.simple_seq2seq import SimpleSeq2Seq
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.modules import Attention, TextFieldEmbedder, Seq2SeqEncoder
from allennlp.modules.seq2seq_encoders.intra_sentence_attention import IntraSentenceAttentionEncoder
from allennlp.modules.token_embedders import Embedding
import os

In [3]:
HOME = os.path.expanduser("~")
DATA = HOME + "/data_buffer/WebQA/"
DEMO = DATA + 'demo.tsv'
TRAIN = HOME + "/hyak_training_package/WebQA/corpus/corpus_train.tsv"
EMBED_DIM = 300

In [4]:
def config(path_to_file):
    seq2seq_reader = Seq2SeqDatasetReader(source_tokenizer=WordTokenizer(),
                                     target_tokenizer=WordTokenizer(),
                                     source_token_indexers={'source_tokens':SingleIdTokenIndexer(namespace="source_tokens")},
                                     target_token_indexers={'target_tokens':SingleIdTokenIndexer(namespace="target_tokens")})

    dataset = seq2seq_reader.read(path_to_file)
    vocabs = Vocabulary.from_dataset(dataset)

    dataset.index_instances(vocabs)

    padding_lengths = dataset.get_padding_lengths()

    #array_dict = dataset.as_array_dict(padding_lengths, verbose=False)
    
    padding_lengths = dataset.get_padding_lengths()
    max_input_size  = padding_lengths['source_tokens']['num_tokens']
    max_output_size = padding_lengths['target_tokens']['num_tokens']
    
    word_embedding = Embedding(num_embeddings=vocabs.get_vocab_size("source_tokens"), 
                           embedding_dim=EMBED_DIM)
    text_field_embedder = BasicTextFieldEmbedder({"source_tokens": word_embedding})

    attn_encoder = IntraSentenceAttentionEncoder(max_input_size)

    seq2seq = SimpleSeq2Seq(vocab=vocabs, source_embedder=text_field_embedder,
                            encoder=attn_encoder, max_decoding_steps=max_output_size,
                            target_namespace='target_tokens', target_embedding_dim=EMBED_DIM)
    
    return vocabs, seq2seq

In [5]:
vocab, s2s_model = config(TRAIN)

584it [00:41, 13.99it/s]

KeyboardInterrupt: 

584it [01:00,  9.73it/s]

In [211]:
s2s_model

SimpleSeq2Seq (
  (_source_embedder): BasicTextFieldEmbedder (
    (token_embedder_source_tokens): Embedding (
    )
  )
  (_encoder): IntraSentenceAttentionEncoder (
    (_matrix_attention): MatrixAttention (
      (_similarity_function): DotProductSimilarity (
      )
    )
  )
  (_target_embedder): Embedding (
  )
  (_decoder_cell): LSTMCell(300, 66)
  (_output_projection_layer): Linear (66 -> 116)
)

In [128]:
dataset = seq2seq_reader.read(DEMO)

4it [00:00, 20.76it/s]


In [129]:
instances = dataset.instances

In [132]:
new = Dataset(instances)