In [1]:
from __future__ import print_function

from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Lambda, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re
import tensorflow as tf

from itertools import chain
from process_data import load_entities, save_pickle, load_pickle, load_kv_pairs, lower_list, vectorize, vectorize_kv_pairs

Using TensorFlow backend.


In [2]:
# Select the dataset: bAbI task files or the pickled movie QA data.
is_babi = False
if is_babi:
    # NOTE(review): `load_task` is not among the names imported from
    # process_data in the import cell, and the cells below rely on
    # `train_kv`, `test_kv`, `entities` and `entity_size`, which only the
    # else-branch defines. Confirm the bAbI path is still supported.
    train_data = load_task('./data/tasks_1-20_v1-2/en/qa5_three-arg-relations_train.txt', is_babi)
    test_data = load_task('./data/tasks_1-20_v1-2/en/qa5_three-arg-relations_test.txt', is_babi)
else:
    # Upper bound on the number of examples kept from each split. The value
    # is far larger than the datasets, so nothing is truncated; lower it
    # (e.g. to 49900) for a quicker run.
    N = 50000000
    train_data = load_pickle('mov_task1_qa_pipe_train.pickle')[:N]
    test_data = load_pickle('mov_task1_qa_pipe_test.pickle')[:N]
    kv_pairs = load_pickle('mov_kv_pairs.pickle')
    train_kv_indices = load_pickle('mov_train_kv_indices.pickle')[:N]
    test_kv_indices = load_pickle('mov_test_kv_indices.pickle')[:N]

    # For every example, look up its key-value entries and flatten them into
    # one token list. The lists have different lengths, so the array must be
    # created with dtype=object: without it recent NumPy versions refuse to
    # build an array from ragged nested sequences.
    train_kv = np.array(
        [list(chain.from_iterable(kv_pairs[ind] for ind in indices))
         for indices in train_kv_indices],
        dtype=object)
    test_kv = np.array(
        [list(chain.from_iterable(kv_pairs[ind] for ind in indices))
         for indices in test_kv_indices],
        dtype=object)
    print(len(train_kv), train_kv[0])

    entities = load_pickle('mov_entities.pickle')
    entity_size = len(entities)

# Gather every distinct token that occurs in a story, question or answer of
# either split, then fix an ordering by sorting.
all_examples = train_data + test_data
tokens = set()
for story, q, answer in all_examples:
    tokens.update(story, q, answer)
vocab = sorted(tokens)

# Index 0 is kept free for the padding value used by pad_sequences,
# hence the extra slot.
vocab_size = 1 + len(vocab)
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in all_examples)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in all_examples)

# Summary of the loaded data.
separator = '-'
print(separator)
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training data:', len(train_data))
print('Number of test data:', len(test_data))
print(separator)
print('Here\'s what a "data" tuple looks like (input, query, answer):')
print(train_data[0])
print(separator)
print('Vectorizing the word sequences...')

print('Number of entities', len(entities))

# Word -> integer id, starting at 1 (0 is left for padding).
w2i = {word: idx for idx, word in enumerate(vocab, start=1)}

# Turn both splits into (stories, questions, answers) arrays.
inputs_train, queries_train, answers_train = vectorize(
    train_data, w2i, story_maxlen, query_maxlen, entities)
inputs_test, queries_test, answers_test = vectorize(
    test_data, w2i, story_maxlen, query_maxlen, entities)

# Report the shape of every produced array, one group per tensor kind.
shape_report = (
    ('inputs', 'inputs: integer tensor of shape (samples, max_length)',
     inputs_train, inputs_test),
    ('queries', 'queries: integer tensor of shape (samples, max_length)',
     queries_train, queries_test),
    ('answers', 'answers: binary (1 or 0) tensor of shape (samples, vocab_size)',
     answers_train, answers_test),
)
for tensor_name, description, train_arr, test_arr in shape_report:
    print('-')
    print(description)
    print(tensor_name + '_train shape:', train_arr.shape)
    print(tensor_name + '_test shape:', test_arr.shape)


# Show the first flattened memory and how many tokens it holds.
print('train_kv[0]:', train_kv[0], ', mem_size:', len(train_kv[0]))

# Longest memory across both splits; every memory is vectorized to this length.
train_mem_maxlen = max(len(memory) for memory in train_kv)
test_mem_maxlen = max(len(memory) for memory in test_kv)
mem_maxlen = train_mem_maxlen if train_mem_maxlen >= test_mem_maxlen else test_mem_maxlen

print('mem_maxlen:', mem_maxlen)
vec_train_kv = vectorize_kv_pairs(train_kv, mem_maxlen, vocab)
vec_test_kv = vectorize_kv_pairs(test_kv, mem_maxlen, vocab)

load mov_task1_qa_pipe_train.pickle
load mov_task1_qa_pipe_test.pickle
load mov_kv_pairs.pickle
load mov_train_kv_indices.pickle
load mov_test_kv_indices.pickle
96185 ['love field', 'starred_actors', 'michelle', 'pfeiffer,', 'dennis haysbert', 'the rainmaker', 'has_genre', 'drama']
load mov_entities.pickle
-
Vocab size: 41556 unique words
Story max length: 0 words
Query max length: 21 words
Number of training data: 96185
Number of test data: 9952
-
Here's what a "data" tuple looks like (input, query, answer):
([], ['what', 'movies', 'are', 'about', 'ginger rogers'], ['the barkleys of broadway', 'kitty foyle', 'top hat'])
-
Vectorizing the word sequences...
Number of entities 72008
-
inputs: integer tensor of shape (samples, max_length)
inputs_train shape: (96185, 0)
inputs_test shape: (9952, 0)
-
queries: integer tensor of shape (samples, max_length)
queries_train shape: (96185, 21)
queries_test shape: (9952, 21)
-
answers: binary (1 or 0) tensor of shape (samples, vocab_size)
answers_

In [3]:
# Inspect the first memory, then size all memories to the longest one found
# in either split before converting them to integer arrays.
first_memory = train_kv[0]
print('train_kv[0]:', first_memory, ', mem_size:', len(first_memory))

train_mem_maxlen = max(map(len, train_kv))
test_mem_maxlen = max(map(len, test_kv))
mem_maxlen = max((train_mem_maxlen, test_mem_maxlen))
print('mem_maxlen:', mem_maxlen)

vec_train_kv, vec_test_kv = (
    vectorize_kv_pairs(train_kv, mem_maxlen, vocab),
    vectorize_kv_pairs(test_kv, mem_maxlen, vocab),
)

train_kv[0]: ['love field', 'starred_actors', 'michelle', 'pfeiffer,', 'dennis haysbert', 'the rainmaker', 'has_genre', 'drama'] , mem_size: 8
mem_maxlen: 210


In [4]:
# Put the first raw memory next to its integer encoding, then report the
# length of one answer row.
first_memory_tokens, first_memory_ids = train_kv[0], vec_train_kv[0]
print(first_memory_tokens, '\n', first_memory_ids)
first_answer_row = answers_train[0]
print('candidate answer num:', len(first_answer_row))

['love field', 'starred_actors', 'michelle', 'pfeiffer,', 'dennis haysbert', 'the rainmaker', 'has_genre', 'drama'] 
 [21931 41557  9238 37068 41559 10125     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0

In [5]:
# from net.memnn_kv import MemNNKV

def MemNNKV(mem_size, query_maxlen, vocab_size, entity_size, embd_size):
    """Build and compile a key-value memory network that scores candidates.

    The question tokens are embedded and matched against the embedded memory
    keys; the match scores weight the embedded memory values. The result is
    added to the question embedding, projected by a dense layer and compared
    with the embedded candidates. Per-token scores are summed over the
    question axis and normalised with a softmax.

    Args:
        mem_size: length of the key and value index sequences.
        query_maxlen: length of the question index sequence.
        vocab_size: input dimension of the question embedding.
        entity_size: input dimension of the key, value and candidate
            embeddings; also the length of the candidate input and therefore
            the width of the model output.
        embd_size: output dimension shared by all embeddings and the dense
            projection.

    Returns:
        A compiled Keras ``Model`` with inputs ``[key, val, question, cand]``
        and a softmax output of shape ``(None, entity_size)``.
    """
    print('mem_size:', mem_size)
    print('q_max', query_maxlen)
    print('embd_size', embd_size)
    print('vocab_size', vocab_size)
    print('entity_size', entity_size)
    print('-----------')

    # placeholders
    key = Input((mem_size,), name='Key_Input')
    val = Input((mem_size,), name='Val_Input')
    question = Input((query_maxlen,), name='Question_Input')

    # memory encoders, output: (None, mem_size, embd_size)
    key_encoder = Sequential(name='Key_Encoder')
    key_encoder.add(Embedding(input_dim=entity_size, output_dim=embd_size, input_length=mem_size))
    val_encoder = Sequential(name='Val_Encoder')
    val_encoder.add(Embedding(input_dim=entity_size, output_dim=embd_size, input_length=mem_size))

    # question encoder, output: (None, query_maxlen, embd_size)
    question_encoder = Sequential(name='Question_Encoder')
    question_encoder.add(Embedding(input_dim=vocab_size, output_dim=embd_size, input_length=query_maxlen))

    # encode the index sequences into sequences of dense vectors
    key_encoded = key_encoder(key)  # (None, mem_size, embd_size)
    val_encoded = val_encoder(val)  # (None, mem_size, embd_size)
    question_encoded = question_encoder(question)  # (None, query_maxlen, embd_size)

    # Match every question token against every memory key.
    # NOTE(review): the match scores are used unnormalised (no softmax over
    # the memory slots) -- confirm this is intended.
    ph = dot([question_encoded, key_encoded], axes=(2, 2))  # (None, query_maxlen, mem_size)
    ph = Permute((2, 1))(ph)  # (None, mem_size, query_maxlen)
    # Contract over the memory axis to read out a value vector per question token.
    o = dot([ph, val_encoded], axes=(1, 1))  # (None, query_maxlen, embd_size)
    R = Dense(embd_size, input_shape=(embd_size,), name='R_Dense')
    q2 = R(add([question_encoded, o]))  # (None, query_maxlen, embd_size)

    # The candidate input carries entity_size indices per sample, so the
    # embedding must declare that length (it previously declared 1, which
    # contradicts the input shape below).
    cand_encoder = Sequential(name='cand_encoder')
    cand_encoder.add(Embedding(input_dim=entity_size, output_dim=embd_size, input_length=entity_size))

    cand = Input((entity_size,), name='Cand_Input')
    y_encoded = cand_encoder(cand)  # (None, entity_size, embd_size)

    answer = dot([q2, y_encoded], axes=(2, 2))  # (None, query_maxlen, entity_size)
    # Sum the per-token scores so each candidate gets a single score.
    answer = Lambda(lambda x: K.sum(x, axis=1), output_shape=(entity_size, ))(answer)
    preds = Activation('softmax')(answer)

    # Build the model on the softmax output: categorical_crossentropy is
    # applied to probabilities, and the raw scores in `answer` are not.
    model = Model([key, val, question, cand], preds)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return model



In [6]:
embd_size = 64
memnn_kv = MemNNKV(mem_maxlen, query_maxlen, vocab_size, entity_size, embd_size)
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
memnn_kv.summary()

# The same vectorized memory array is fed as both the key and the value input.
# NOTE(review): answers_train is passed as the candidate input and is also the
# training target, so the network sees the labels as an input -- confirm this
# is intended.
# Validation is disabled; to enable it, pass
#   validation_data=([vec_test_kv, vec_test_kv, queries_test, answers_test], answers_test)
memnn_kv.fit([vec_train_kv, vec_train_kv, queries_train, answers_train], answers_train,
             batch_size=32,
             epochs=10)

mem_size: 210
q_max 21
embd_size 64
vocab_size 41556
entity_size 72008
-----------
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
Question_Input (InputLayer)      (None, 21)            0                                            
____________________________________________________________________________________________________
Key_Input (InputLayer)           (None, 210)           0                                            
____________________________________________________________________________________________________
Question_Encoder (Sequential)    (None, 21, 64)        2659584     Question_Input[0][0]             
____________________________________________________________________________________________________
Key_Encoder (Sequential)         (None, 210, 64)       4608512     Key_Input[0][0]                  
________

KeyboardInterrupt: 

In [None]:
# Write the trained key-value memory network to 'model_memnn_kv.h5'; the
# evaluation cell reloads it from this file with load_model.
memnn_kv.save('model_memnn_kv.h5')

In [None]:
# Reload the saved network and score it on the test split. The inputs follow
# the order [key, val, question, cand] used when the model was built.
model = load_model('model_memnn_kv.h5')
test_inputs = [vec_test_kv, vec_test_kv, queries_test, answers_test]
score = model.evaluate(test_inputs, answers_test, verbose=1)
for label, value in (('Test loss:', score[0]), ('Test accuracy:', score[1])):
    print(label, value)

In [None]:
# Rebuild the sorted vocabulary from every story, question and answer token
# in both splits, then print its size and the first few entries.
vocab = sorted({
    token
    for story, q, answer in (train_data + test_data)
    for token in story + q + answer
})
print(len(set(vocab)))
print(vocab[:10])

In [None]:
# Display the first 100 entries of the sorted vocabulary for a quick look.
vocab[:100]

In [None]:
def MemNN(story_maxlen, query_maxlen, vocab_size, embd_size):
    """Build and compile an end-to-end memory network over stories and questions.

    The story is embedded twice (once to `embd_size`, once to `query_maxlen`
    dimensions) and the question once. A softmax-normalised story/question
    match is added to the second story embedding, transposed, reduced by an
    LSTM and mapped to a softmax over `vocab_size` outputs.

    Args:
        story_maxlen: length of the story index sequence.
        query_maxlen: length of the question index sequence.
        vocab_size: input dimension of the embeddings and width of the output.
        embd_size: output dimension of the story-match and question embeddings.

    Returns:
        A compiled Keras ``Model`` taking ``[story, question]``.
    """
    def embedding_stack(output_dim, **embedding_kwargs):
        # An Embedding over the vocabulary followed by 30% dropout.
        stack = Sequential()
        stack.add(Embedding(input_dim=vocab_size,
                            output_dim=output_dim,
                            **embedding_kwargs))
        stack.add(Dropout(0.3))
        return stack

    # Symbolic inputs for the two index sequences.
    input_sequence = Input((story_maxlen,))
    question = Input((query_maxlen,))

    # Story embedding used for matching: (samples, story_maxlen, embd_size)
    story_match_encoder = embedding_stack(embd_size)
    # Story embedding added to the match: (samples, story_maxlen, query_maxlen)
    story_output_encoder = embedding_stack(query_maxlen)
    # Question embedding: (samples, query_maxlen, embd_size)
    question_encoder = embedding_stack(embd_size, input_length=query_maxlen)

    # Apply the encoders to the index sequences.
    story_match = story_match_encoder(input_sequence)
    story_output = story_output_encoder(input_sequence)
    question_encoded = question_encoder(question)

    # Similarity of every story position to every question position,
    # normalised with a softmax: (samples, story_maxlen, query_maxlen)
    match = dot([story_match, question_encoded], axes=(2, 2))
    match = Activation('softmax')(match)

    # Combine the match with the second story embedding, then swap the last
    # two axes: (samples, query_maxlen, story_maxlen)
    response = add([match, story_output])
    response = Permute((2, 1))(response)
    print('---')
    print('match', match.shape)
    print('input_c', story_output.shape)
    print('response', response.shape)

    # The question vectors are not concatenated here; the response alone is
    # passed on to the reduction step.
    answer = response
    print('---')
    print('resp.shape', response.shape)
    print('q_enc.shape', question_encoded.shape)
    print('answer.shape', answer.shape)

    # Reduce the sequence with an RNN: (samples, 32)
    answer = LSTM(32)(answer)

    # Project onto the vocabulary and turn the scores into probabilities.
    answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
    answer = Activation('softmax')(answer)

    # Assemble and compile the model.
    model = Model([input_sequence, question], answer)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

embd_size = 64
model = MemNN(story_maxlen, query_maxlen, vocab_size, embd_size)
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
model.summary()

# NOTE(review): the recorded output of the data cell reports
# "Story max length: 0" for the movie data, i.e. the story input is empty --
# confirm this model is meant to be trained on that dataset.
# train
model.fit([inputs_train, queries_train], answers_train,
          batch_size=32,
          epochs=3,
          validation_data=([inputs_test, queries_test], answers_test))