## Architecture

    P -> Encoder-lstm -> match-lstm -> PTR-Net
    Q -> Encoder-lstm -^ 

In [1]:
import os
import re
import pickle

import numpy as np

from gensim.models import Word2Vec
from keras.models import Sequential , Model
from keras.layers import Input ,Lambda , Dropout
from keras.layers import Dense ,TimeDistributed ,LSTM 
from keras.layers.embeddings import Embedding
from keras.layers import merge,Dot,Multiply,Reshape,concatenate,RepeatVector
from keras.preprocessing import sequence,text

import keras.backend as K 
#from keras.backend import softmax #無法設定axis
from keras.activations import softmax #可設定axis
from keras.callbacks import History,EarlyStopping

Using TensorFlow backend.


## import data

In [2]:
## load data
#train
train =np.load("../data/train.npy")

#label
#ans start , ans end+1
ptr_train = np.load("../data/y_ptr.npy")

#test
test =np.load("../data/test.npy")


In [3]:
## load word2vec
QA_word2vec = Word2Vec.load("model_para/QA_word2vec.bin")
weight_matrix = QA_word2vec.wv.syn0
vocab = dict([(k, v.index) for k, v in QA_word2vec.wv.vocab.items()])

## word_freq , use "rb"
pk = open("model_para/word_freq.pk","rb")
word_freq,min_c,Nu = pickle.load(pk)

pk.close()


In [4]:
## train paragraph process
paragraph_idx = []
p_max_len = 0
for para in train[:,2]:
    context = []
    for letter in para:
        if word_freq[letter]<min_c:
            context.append( vocab[Nu] )
        else:
            context.append( vocab[letter] )
    paragraph_idx.append(context)
    if p_max_len < len(context):
        p_max_len = len(context)

print("max length of context :" ,p_max_len)

## question process
question=[]
q_max_len = 0
for q in train[:,4]:
    qu = []
    for letter in q:
        if word_freq[letter]<min_c:
            qu.append( vocab[Nu] )
        else:
            qu.append( vocab[letter] )
    question.append(qu)
    if q_max_len < len(context):
        q_max_len = len(context)

print("max length of question :" , q_max_len)

max length of context : 1158
max length of question : 331


In [5]:
# answer ptr
ans_ptr = []
for i , ans in enumerate(ptr_train):
    answer_set = np.zeros(shape = len(paragraph_idx[i]))
    answer_set[ans[0]]=1
    answer_set[ans[1]-1]=1
    ans_ptr.append(answer_set)

In [6]:
# answer ptr
ans_ptr_start = []
ans_ptr_end = []
for i , ans in enumerate(ptr_train):
    st = np.zeros(shape = (len(paragraph_idx[i]) ))
    end = np.zeros(shape = (len(paragraph_idx[i]) ))
    st[ans[0]]=1
    end[ans[1]-1]=1
    ans_ptr_start.append(st)
    ans_ptr_end.append(end)
    

In [7]:
# test paragraph data
test_context=[]
for para in test[:,2]:
    context = []
    for letter in para:
        if word_freq[letter]>=min_c:
            context.append(vocab[letter])
        else:
            context.append(vocab[Nu])
    test_context.append(context)

## build model

In [8]:
max_review_length = p_max_len+1
question_length = q_max_len+1

p_train = sequence.pad_sequences(paragraph_idx,padding="post", maxlen=max_review_length)
q_train = sequence.pad_sequences(question,padding="post", maxlen=question_length)
ans_train_start = sequence.pad_sequences(ans_ptr_start,padding="post", maxlen=max_review_length).reshape((-1,max_review_length,1))
ans_train_end = sequence.pad_sequences(ans_ptr_end,padding="post", maxlen=max_review_length).reshape((-1,max_review_length,1))

#p_train = sequence.pad_sequences(paragraph_idx, maxlen=max_review_length)
#q_train = sequence.pad_sequences(question, maxlen=question_length)


ans_train_start.shape,p_train.shape

((14611, 1159, 1), (14611, 1159))

In [9]:
p_train[0]

array([239, 136,  89, ...,   0,   0,   0], dtype=int32)

In [10]:
paragraph_idx[0][-1]

2

In [11]:
ans_train_start[2].argmax(),ans_train_start[2].shape

(271, (1159, 1))

In [12]:
ans_ptr_start[2].argmax(),ans_ptr_start[2].shape

(271, (506,))

In [13]:
train[0]

array(['广州', '1',
       '廣州市，簡稱穗，現有別稱五羊仙城、羊城、穗城、穗垣、仙城、花城，為中華人民共和國廣東省省會，中國超大城市及副省級城市，是繼上海、北京之後的中國第三大城市、國家中心城市、也是中國華南地區的經濟、文化、科技和教育中心及交通樞紐，是中國人民解放軍南部戰區聯合作戰指揮部所在地。廣州擁有2200年以上歷史，為中國首批歷史文化名城之一，是中國南方最大、歷史最悠久的對外通商口岸，也是全國首批對外開放的14個沿海城市之一，世界著名的港口城市之一。廣州港是中國第五大港口，世界第八大港口。廣州在2010年成功舉辦第16屆亞洲運動會。\n廣州地處華南，廣東省的東南部，珠江三角洲中北緣，西江、北江、東江三江匯合處，瀕臨南中國海，東連東莞市和惠州市博羅、龍門兩縣，西鄰佛山市的三水、南海和順德三區，北靠清遠市的市區和佛岡縣及韶關市的新豐縣，南接中山市，毗鄰香港、澳門特別行政區，地理位置優越，是「海上絲綢之路」的起點之一，被稱為中國的「南大門」。由於經濟水平發達、發展程度優秀，廣州與北京、上海、深圳並稱為中國內地四大一線城市。據聯合國《2016年中國城市可持續發展報告：衡量生態投入與人類發展》顯示，廣州人類發展指數蟬聯中國第一。',
       '4fc32aa8-5d9c-4427-a379-94a825f97c5d', '有別稱仙城的城市的歷史大約有多少年甚至更長？',
       '139', '2200'],
      dtype='<U1158')

In [16]:

def build(vocab,weight_matrix,max_review_length,question_length):
    hidden_layer_p = 128
    hidden_layer_q = 128
    
    input_p = Input((max_review_length,),name='paragraph')
    input_q = Input((question_length,),name='question')

    # input_dim is len(vocab) like the account of word. 
    embd_p = Embedding(input_dim=len(vocab),
                     output_dim=weight_matrix.shape[1],
                     input_length=max_review_length,
                     weights=[weight_matrix])(input_p)
    embd_q = Embedding(input_dim=len(vocab),
                     output_dim=weight_matrix.shape[1],
                     input_length=question_length,
                     weights=[weight_matrix])(input_q)

    encoder_p = LSTM(hidden_layer_p,return_sequences=True,name='paragraph_encoder')
    enco_p = encoder_p(embd_p)
    
    encoder_q = LSTM(hidden_layer_q,name='question_encoder')
    enco_q = encoder_q(embd_q)
    #consistent dimension as paragraph
    enco_q = RepeatVector(max_review_length)(enco_q)

    # use encoded question to attent paragraph
    attn = concatenate([enco_p,enco_q],name='QP_concat')
    attn = TimeDistributed(Dense(5,activation='tanh'))(attn)
    attn = TimeDistributed(Dense(1,activation='tanh'))(attn)
    attn = Lambda(lambda x: softmax(x,axis=-2),name='attention_by_softmax')(attn)

    ## encode for comprehensing
    comprehension_p = LSTM(hidden_layer_p,go_backwards=True,return_sequences=True,name='compre_encoder')(embd_p)

    # find start of answer
    first_comprehension_p = Multiply(name='first_attn_on_compre_vector')([attn,comprehension_p])
    first_deco_p = LSTM(hidden_layer_p,return_sequences=True,name='first_ecoder_p')(first_comprehension_p)
    first_ptr_attn = TimeDistributed(Dense(1,activation='tanh'))(first_deco_p)
    first_ptr_attn_value = Lambda(lambda x: softmax(x,axis=-2),name='first_point_layer')(first_ptr_attn)
    
    # find end of answer
    second_comprehension_p = Multiply(name='second_attn_on_compre_vector')([first_ptr_attn_value,comprehension_p])
    second_deco_p = LSTM(hidden_layer_p,return_sequences=True,name='second_decoder_p')(second_comprehension_p)
    second_ptr_attn = TimeDistributed(Dense(1,activation='tanh'))(second_deco_p)
    second_ptr_attn_value = Lambda(lambda x: softmax(x,axis=-2),name='second_point_layer')(second_ptr_attn)
    
    
    model = Model(inputs=[input_p,input_q],outputs=[first_ptr_attn_value,second_ptr_attn_value])
    model.compile(loss='mse',optimizer='adam')
    return model

In [17]:
n_batch = 128
n_epoch = 10
try:
    del model
except NameError:
    pass

model = build(vocab , weight_matrix , max_review_length , question_length)
#cosine_proximity
model.summary()
#hist = model.fit([p_train,q_train],[ans_train_start,ans_train_end],batch_size=n_batch,epochs=n_epoch)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
question (InputLayer)            (None, 332)           0                                            
____________________________________________________________________________________________________
paragraph (InputLayer)           (None, 1159)          0                                            
____________________________________________________________________________________________________
embedding_4 (Embedding)          (None, 332, 256)      1095936     question[0][0]                   
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1159, 256)     1095936     paragraph[0][0]                  
___________________________________________________________________________________________

In [18]:
from keras.utils import plot_model
plot_model(model , to_file='QASystem.png')

In [23]:
my_ans = model.predict([p_train[2:3],q_train[2:3]])

In [24]:
train[2]

array(['广州', '1',
       '廣州市，簡稱穗，現有別稱五羊仙城、羊城、穗城、穗垣、仙城、花城，為中華人民共和國廣東省省會，中國超大城市及副省級城市，是繼上海、北京之後的中國第三大城市、國家中心城市、也是中國華南地區的經濟、文化、科技和教育中心及交通樞紐，是中國人民解放軍南部戰區聯合作戰指揮部所在地。廣州擁有2200年以上歷史，為中國首批歷史文化名城之一，是中國南方最大、歷史最悠久的對外通商口岸，也是全國首批對外開放的14個沿海城市之一，世界著名的港口城市之一。廣州港是中國第五大港口，世界第八大港口。廣州在2010年成功舉辦第16屆亞洲運動會。\n廣州地處華南，廣東省的東南部，珠江三角洲中北緣，西江、北江、東江三江匯合處，瀕臨南中國海，東連東莞市和惠州市博羅、龍門兩縣，西鄰佛山市的三水、南海和順德三區，北靠清遠市的市區和佛岡縣及韶關市的新豐縣，南接中山市，毗鄰香港、澳門特別行政區，地理位置優越，是「海上絲綢之路」的起點之一，被稱為中國的「南大門」。由於經濟水平發達、發展程度優秀，廣州與北京、上海、深圳並稱為中國內地四大一線城市。據聯合國《2016年中國城市可持續發展報告：衡量生態投入與人類發展》顯示，廣州人類發展指數蟬聯中國第一。',
       '76185dc3-7ee3-4e0c-9b46-4614c50d9354',
       '2016年的人類發展指數第一的中國城市位在廣東省的哪一方？', '271', '東南'],
      dtype='<U1158')

In [28]:
index = np.squeeze(my_ans)

In [36]:
index[index.argsort()[-1]],index.argsort()[-2]

(0.0008630855, 127)

In [31]:
max(index)

0.0008630855