In [1]:
from __future__ import print_function

from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Lambda, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re
import tensorflow as tf
from nltk.tokenize import word_tokenize

from itertools import chain
from process_data import load_entities, save_pickle, load_pickle, load_kv_pairs, lower_list, vectorize, vectorize_kv, get_relative_kv

Using TensorFlow backend.


In [None]:
# Select the data source: bAbI task files (is_babi=True) or the pre-pickled movie QA data.
# NOTE(review): `load_task` is not among the names imported in the import cell, and the bAbI
# branch does not define `mem_maxlen`, the key/value arrays or `entities` that the code after
# this if/else reads -- the branch looks unusable as written; confirm before enabling it.
is_babi = False
if is_babi:
    train_data = load_task('./data/tasks_1-20_v1-2/en/qa5_three-arg-relations_train.txt', is_babi)
    test_data = load_task('./data/tasks_1-20_v1-2/en/qa5_three-arg-relations_test.txt', is_babi)
else:
    # N = 49900
    # N is used as a slice end below, i.e. at most N examples are kept from each pickle.
    N = 50000000
    mem_maxlen = 100 # limit on the number of key-value entries related to one episode
    # NOTE(review): these pickles are assumed to be trusted, locally generated artifacts;
    # unpickling untrusted files can execute arbitrary code.
    train_data = load_pickle('mov_task1_qa_pipe_train.pickle')[:N]
    test_data = load_pickle('mov_task1_qa_pipe_test.pickle')[:N]
    kv_pairs = load_pickle('mov_kv_pairs.pickle')
    train_kv_indices = load_pickle('mov_train_kv_indices.pickle')[:N]
    test_kv_indices = load_pickle('mov_test_kv_indices.pickle')[:N]
    # Presumably resolves each example's kv indices into its key lists and value lists
    # -- verify against get_relative_kv in process_data.
    train_k, train_v = get_relative_kv(train_kv_indices, kv_pairs)
    test_k, test_v = get_relative_kv(test_kv_indices, kv_pairs)
    # Flatten every example's nested key/value lists by one level and keep only the
    # first mem_maxlen items.
    # NOTE(review): rows can end up with different lengths; np.array then builds an object
    # array on older NumPy and may raise on recent versions -- confirm the NumPy version in use.
    train_k = np.array([list(chain(*x))[:mem_maxlen] for x in train_k])
    train_v = np.array([list(chain(*x))[:mem_maxlen] for x in train_v])
    test_k = np.array([list(chain(*x))[:mem_maxlen] for x in test_k])
    test_v = np.array([list(chain(*x))[:mem_maxlen] for x in test_v])
    entities = load_pickle('mov_entities.pickle')
    entity_size = len(entities)

# Concatenate the train and test examples once; the vocabulary scan and both
# max-length computations below all walk the same combined list.
all_data = train_data + test_data

# Vocabulary = every entity, the seven relation-name strings listed here, and every
# token that appears in a story, question or answer of either split.
vocab = set(entities + ['directed_by', 'written_by', 'starred_actors', 'release_year', 'has_genre', 'has_tags', 'has_plot'])
for story, q, answer in all_data:
    vocab.update(story, q, answer)
vocab = sorted(vocab)

# No extra slot is reserved for a padding index: vocab_size is exactly the number of
# distinct tokens (the `+ 1` of the original Keras example stays disabled).
# NOTE(review): if sequences are zero-padded downstream, id 0 then coincides with a real
# token -- confirm against the vectorize helpers.
vocab_size = len(vocab)
story_maxlen = max(len(story_tokens) for story_tokens, _, _ in all_data)
query_maxlen = max(len(query_tokens) for _, query_tokens, _ in all_data)

# --- Dataset summary ---------------------------------------------------------
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training data:', len(train_data))
print('Number of test data:', len(test_data))
print('-')
print('Here\'s what a "data" tuple looks like (input, query, answer):')
print(train_data[0])
print('-')
print('Vectorizing the word sequences...')

print('Number of entities', len(entities))
# Token -> integer id, where the id is the token's position in the sorted vocabulary
# (ids therefore start at 0).
w2i = dict((c, i) for i, c in enumerate(vocab))
# `vectorize` comes from process_data; presumably it maps tokens through w2i and sizes
# stories/queries to story_maxlen/query_maxlen -- verify against its implementation.
inputs_train, queries_train, answers_train = vectorize(train_data,
                                                               w2i,
                                                               story_maxlen,
                                                               query_maxlen)
inputs_test, queries_test, answers_test = vectorize(test_data,
                                                            w2i,
                                                            story_maxlen,
                                                            query_maxlen)

# --- Shapes of the vectorized arrays ------------------------------------------
print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)

In [None]:
# Show the first two flattened key memories: a separator line, then length and content.
for k in train_k[:2]:
    print('===========', '{} {}'.format(len(k), k), sep='\n')

In [None]:
# Convert the training key and value memories to index arrays with the shared
# word-to-index table, then report the resulting shapes.
max_memory_num = 100  # not read by any cell visible in this notebook
vec_train_k, vec_train_v = (
    vectorize_kv(memory, mem_maxlen, w2i) for memory in (train_k, train_v)
)
print('vec_k', vec_train_k.shape)
print('vec_v', vec_train_v.shape)

In [None]:
# Sanity check: the raw key tokens of the first training example next to their vectorized form.
print(train_k[0],'\n', vec_train_k[0])
# Length of the first answer row, reported as the number of candidate answers.
print('candidate answer num:', len(answers_train[0]))
# Vectorized form of the first training question.
print('question[0]', queries_train[0])

In [None]:
def MemNNKV(mem_size, query_maxlen, vocab_size, embd_size):
    """Build and compile a key-value memory network over integer token ids.

    A single embedding table is shared by the key, value and question encoders.
    Question embeddings are matched against key embeddings with a dot product;
    the raw (un-normalised) scores weight the value embeddings; the read-out is
    added to the question embeddings, passed through two dense layers, summed
    over the question positions and turned into a softmax over the vocabulary.

    Args:
        mem_size: length of the key and of the value index sequences.
        query_maxlen: length of the question index sequence.
        vocab_size: number of embedding rows and width of the output layer.
        embd_size: embedding width, also the width of the `R_Dense` layer.

    Returns:
        A keras `Model` mapping `[key, val, question]` to a `(batch, vocab_size)`
        softmax, compiled with rmsprop, categorical cross-entropy and accuracy.
    """
    # Echo the hyper-parameters so every build is traceable in the cell output.
    for label, value in (('mem_size:', mem_size),
                         ('q_max', query_maxlen),
                         ('embd_size', embd_size),
                         ('vocab_size', vocab_size)):
        print(label, value)
    print('-----------')

    # Symbolic inputs: sequences of token ids.
    key_in = Input((mem_size,), name='Key_Input')
    val_in = Input((mem_size,), name='Val_Input')
    question_in = Input((query_maxlen,), name='Question_Input')

    # One embedding layer instance, wrapped by three named encoders so that keys,
    # values and questions all use the same weights.
    shared_embedding = Embedding(input_dim=vocab_size, output_dim=embd_size)
    key_encoder = Sequential([shared_embedding], name='Key_Encoder')
    val_encoder = Sequential([shared_embedding], name='Val_Encoder')
    question_encoder = Sequential([shared_embedding], name='Question_Encoder')

    key_vecs = key_encoder(key_in)                  # (None, mem_size, embd_size)
    val_vecs = val_encoder(val_in)                  # (None, mem_size, embd_size)
    question_vecs = question_encoder(question_in)   # (None, query_maxlen, embd_size)

    # Addressing: score every question position against every memory slot.
    scores = dot([question_vecs, key_vecs], axes=(2, 2))   # (None, query_maxlen, mem_size)
    scores = Permute((2, 1))(scores)                       # (None, mem_size, query_maxlen)
    # Reading: contract the memory axis against the value embeddings.
    readout = dot([scores, val_vecs], axes=(1, 1))         # (None, query_maxlen, embd_size)

    r_dense = Dense(embd_size, input_shape=(embd_size,), name='R_Dense')
    updated_query = r_dense(add([question_vecs, readout]))  # (None, query_maxlen, embd_size)

    # Project onto the vocabulary, then collapse the question-position axis by summation.
    logits = Dense(vocab_size, name='last_Dense')(updated_query)
    logits = Lambda(lambda x: K.sum(x, axis=1), output_shape=(vocab_size, ))(logits)
    print('answer.shape', logits.shape)
    preds = Activation('softmax')(logits)

    model = Model([key_in, val_in, question_in], preds)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Shapes of the arrays that are fed to MemNNKV training. Each label names the
# variable whose shape is printed (these are the training arrays, not test ones).
print('vec_train_k.shape', vec_train_k.shape)
print('vec_train_v.shape', vec_train_v.shape)
print('queries_train.shape', queries_train.shape)
print('ans', answers_train.shape)

In [None]:
# Build the key-value memory network with 32-dimensional embeddings and train it
# on the training split only (no validation data is passed to fit).
embd_size = 32
memnn_kv = MemNNKV(mem_size=mem_maxlen,
                   query_maxlen=query_maxlen,
                   vocab_size=vocab_size,
                   embd_size=embd_size)
print(memnn_kv.summary())
memnn_kv.fit(x=[vec_train_k, vec_train_v, queries_train],
             y=answers_train,
             batch_size=32,
             epochs=10)

In [None]:
# Write the trained model to disk; the evaluation cell reloads this file with load_model.
memnn_kv.save('model_memnn_kv.h5')

In [None]:
# Vectorize the held-out key / value memories the same way as the training ones
# (same memory length, same word-to-index table) so they match the model inputs.
vec_test_k = vectorize_kv(test_k, mem_maxlen, w2i)
vec_test_v = vectorize_kv(test_v, mem_maxlen, w2i)

# Reload the saved network and score it on the test split. The model takes exactly
# three inputs (keys, values, questions); the one-hot answers are the targets only.
model = load_model('model_memnn_kv.h5')
score = model.evaluate([vec_test_k, vec_test_v, queries_test], answers_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Re-derive the vocabulary from the QA tuples alone (stories, questions, answers).
# NOTE: this rebinds the notebook-level `vocab`; the earlier cell also adds the
# entity and relation names, which are not added here.
vocab = set()
for story, q, answer in train_data + test_data:
    vocab.update(story, q, answer)
vocab = sorted(vocab)
print(len(set(vocab)))
print(vocab[:10])

In [None]:
# Display the first 100 entries of the sorted vocabulary.
vocab[:100]

In [None]:
def MemNN(story_maxlen, query_maxlen, vocab_size, embd_size):
    """Build and compile an end-to-end memory network over story and question ids.

    The story is embedded twice (once to `embd_size`, once to `query_maxlen`
    dimensions) and the question once. A softmax-normalised story/question match
    matrix is added to the second story embedding, transposed, reduced with an
    LSTM and mapped to a softmax over the vocabulary. Several intermediate
    tensor shapes are printed while the graph is built.

    Args:
        story_maxlen: length of the story index sequence.
        query_maxlen: length of the question index sequence; also the width of
            the second story embedding.
        vocab_size: number of embedding rows and width of the output layer.
        embd_size: width of the first story embedding and of the question embedding.

    Returns:
        A keras `Model` mapping `[story, question]` to a `(batch, vocab_size)`
        softmax, compiled with rmsprop, categorical cross-entropy and accuracy.
    """
    def _encoder(**embedding_kwargs):
        # Embedding over the vocabulary followed by 30% dropout.
        encoder = Sequential()
        encoder.add(Embedding(input_dim=vocab_size, **embedding_kwargs))
        encoder.add(Dropout(0.3))
        return encoder

    # Symbolic inputs: sequences of token ids.
    story_in = Input((story_maxlen,))
    question_in = Input((query_maxlen,))

    story_encoder_m = _encoder(output_dim=embd_size)      # -> (samples, story_maxlen, embd_size)
    story_encoder_c = _encoder(output_dim=query_maxlen)   # -> (samples, story_maxlen, query_maxlen)
    question_encoder = _encoder(output_dim=embd_size,
                                input_length=query_maxlen)  # -> (samples, query_maxlen, embd_size)

    story_m = story_encoder_m(story_in)
    story_c = story_encoder_c(story_in)
    question_vecs = question_encoder(question_in)

    # Story/question match over the embedding axis, normalised with a softmax.
    match = Activation('softmax')(
        dot([story_m, question_vecs], axes=(2, 2)))  # (samples, story_maxlen, query_maxlen)

    # Combine the match with the second story embedding and move the query axis first.
    response = Permute((2, 1))(add([match, story_c]))  # (samples, query_maxlen, story_maxlen)
    print('---')
    print('match', match.shape)
    print('input_c', story_c.shape)
    print('response', response.shape)

    # The response is used on its own; it is not concatenated with the question encoding.
    answer = response
    print('---')
    print('resp.shape', response.shape)
    print('q_enc.shape', question_vecs.shape)
    print('answer.shape', answer.shape)

    # Reduce the sequence with an RNN, then score every vocabulary entry.
    answer = LSTM(32)(answer)            # (samples, 32)
    answer = Dense(vocab_size)(answer)   # (samples, vocab_size)
    answer = Activation('softmax')(answer)

    model = Model([story_in, question_in], answer)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Build the end-to-end memory network with 64-dimensional embeddings and train it for
# three epochs, validating on the test split after each one.
# NOTE: rebinds the notebook-level names `embd_size` and `model`.
embd_size = 64
model = MemNN(story_maxlen=story_maxlen,
              query_maxlen=query_maxlen,
              vocab_size=vocab_size,
              embd_size=embd_size)
print(model.summary())

model.fit(x=[inputs_train, queries_train],
          y=answers_train,
          batch_size=32,
          epochs=3,
          validation_data=([inputs_test, queries_test], answers_test))

In [None]:
# Stand-alone check of the Embedding layer's input/output shapes.
# NOTE: rebinds the notebook-level name `model`.
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# The model takes an integer matrix of shape (batch, input_length); every value must
# be a valid row index of the 1000-row embedding table, i.e. at most 999.
# model.output_shape is (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
# A keras model has no `.shape` attribute; `output_shape` is what reports the
# symbolic output shape.
print(model.output_shape)
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)