In [75]:
from __future__ import division, print_function
from keras.layers import Input, Merge
from keras.layers.core import Activation, Dense, Dropout, Permute
from keras.layers.embeddings import Embedding
from keras.layers.merge import add, concatenate, dot
from keras.layers.recurrent import LSTM, GRU
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
import collections
import itertools
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import h5py
import PyArabic

In [2]:
def get_data(infile):
    tree = ET.parse(infile)
    root = tree.getroot()

    stories, questions, answers = [], [], []

    for Question in root:
        QID = int(Question.get('QID'))
        Qtext = Question.find('Qtext').text

        for QApair in Question.iter('QApair'): 
            QAID = int(QApair.get('QAID'))
            QArel = QApair.get('QArel')
            QAquestion = QApair.find('QAquestion').text
            QAanswer = QApair.find('QAanswer').text

            stories.append(Qtext)
            questions.append(QAquestion)
            answers.append(QArel)
    return stories, questions, answers

In [3]:
DATA_DIR = "../TRAIN"
TRAIN_FILE = os.path.join(DATA_DIR, "SemEval2016-Task3-CQA-MD-train.xml")
TEST_FILE = os.path.join(DATA_DIR, "SemEval2017-Task3-CQA-MD-test.xml")

# get the data
data_train = get_data(TRAIN_FILE)
data_test = get_data(TEST_FILE)

In [4]:
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in [w for w in sent.split() if w not in stopwords]:
                    word = preprocessor.deNoise(word)
                    counter[word.lower()] += 1
        for question in questions:
            for word in [w for w in question.split() if w not in stopwords]:
                word = preprocessor.deNoise(word)
                counter[word.lower()]+= 1
        for answer in answers:
            for word in [w for w in answer.split() if w not in stopwords]:
                word = preprocessor.deNoise(word)
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word

In [5]:
def get_maxlens(train_data, test_data):
    story_maxlen, question_maxlen = 0, 0
    for stories, questions, _ in [train_data, test_data]:
        for story in stories:
            story_len = 0
            for sent in story:
                swords = sent.split()
                story_len += len(swords)
            if story_len > story_maxlen:
                story_maxlen = story_len
        for question in questions:
            question_len = len(question.split())
            if question_len > question_maxlen:
                question_maxlen = question_len
    return story_maxlen, question_maxlen

In [6]:
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    Xs, Xq, Y = [], [], []
    stories, questions, answers = data
    for story, question, answer in zip(stories, questions, answers):
        xs = [[word2idx[preprocessor.deNoise(w.lower())] for w in s.split() if w not in stopwords] 
                                   for s in story]
        xs = list(itertools.chain.from_iterable(xs))
        xq = [word2idx[preprocessor.deNoise(w.lower())] for w in question.split() if w not in stopwords]
        Xs.append(xs)
        Xq.append(xq)
        Y.append(0 if answer == 'I' else 1)
    return pad_sequences(Xs, maxlen=story_maxlen),\
           pad_sequences(Xq, maxlen=question_maxlen),\
           Y

In [7]:
DATA_DIR = "../TRAIN"
TRAIN_FILE = os.path.join(DATA_DIR, "SemEval2016-Task3-CQA-MD-train.xml")
TEST_FILE = os.path.join(DATA_DIR, "SemEval2017-Task3-CQA-MD-test.xml")

# get the data
data_train = get_data(TRAIN_FILE)
data_test = get_data(TEST_FILE)

print(len(data_train[0]), len(data_test[0]))

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.readlines()
        
preprocessor = PyArabic.ArabicPreprocessor()

# build vocabulary from all the data
word2idx, idx2word = build_vocab(data_train, data_test)

vocab_size = len(word2idx)
print("vocab size: {:d}".format(len(word2idx)))

# compute max sequence length for each entity
story_maxlen, question_maxlen = get_maxlens(data_train, data_test)
print("story maxlen: {:d}, question maxlen: {:d}".format(story_maxlen, question_maxlen))

# vectorize the data
Xstrain, Xqtrain, Ytrain = vectorize(data_train, word2idx, story_maxlen, question_maxlen)
Xstest, Xqtest, Ytest = vectorize(data_test, word2idx, story_maxlen, question_maxlen)

print(Xstrain.shape, Xqtrain.shape, len(Ytrain), Xstest.shape, Xqtest.shape, len(Ytest))

30411 12600
vocab size: 83416
story maxlen: 1223, question maxlen: 865
(30411, 1223) (30411, 865) 30411 (12600, 1223) (12600, 865) 12600


In [105]:
### define network
EMBEDDING_SIZE = 64
LATENT_SIZE = 32
BATCH_SIZE = 100
NUM_EPOCHS = 5

# placeholders
original_sequence = Input((story_maxlen,))
question_sequence = Input((question_maxlen,))

# encoders

# embed the original question into a sequence of vectors of size story_maxlen
original_encoder = Sequential()
original_encoder.add(Embedding(input_dim=vocab_size,
                              output_dim=64))
# output: (samples, story_maxlen, query_maxlen)

# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                              input_length=question_maxlen))

original_encoded = original_encoder(original_sequence)
question_encoded = question_encoder(question_sequence)

shared_lstm = LSTM(64)

encoded_a = shared_lstm(original_encoded)
encoded_b = shared_lstm(question_encoded)

merged_vector = concatenate([encoded_a, encoded_b], axis=1)

predictions = Dense(1, activation='sigmoid')(merged_vector)

model = Model(inputs=[original_sequence, question_sequence], outputs=predictions)
model.compile(optimizer="rmsprop", loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_119 (InputLayer)           (None, 1223)          0                                            
____________________________________________________________________________________________________
input_120 (InputLayer)           (None, 865)           0                                            
____________________________________________________________________________________________________
sequential_87 (Sequential)       multiple              5338624     input_119[0][0]                  
____________________________________________________________________________________________________
sequential_88 (Sequential)       (None, 865, 64)       5338624     input_120[0][0]                  
___________________________________________________________________________________________

In [None]:
# train model
history = model.fit([Xstrain, Xqtrain], Ytrain, batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS,
                    validation_split=0.2)
                    
# plot accuracy and loss plot
plt.subplot(211)
plt.title("Accuracy")
plt.plot(history.history["acc"], color="g", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("Loss")
plt.plot(history.history["loss"], color="g", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()


Train on 24328 samples, validate on 6083 samples
Epoch 1/5
 3200/24328 [==>...........................] - ETA: 4891s - loss: 0.6752 - acc: 0.6075

In [66]:
#save the model 
model.save('SemEval-MemNN-Model.h5')

#save the weights
model.save_weights('SemEval-MemNN-Weights.h5')

#save the architecture
model_json = model.to_json()
with open("SemEval-MemNN-Arch.json", "w") as json_file:
    json_file.write(model_json)

In [67]:
# labels
ytest = Ytest

# get predictions
Ytest_ = model.predict([Xstest, Xqtest])

NUM_DISPLAY = 30

for i in range(NUM_DISPLAY):
    story = " ".join([idx2word[x] for x in Xstest[i].tolist() if x != 0])
    question = " ".join([idx2word[x] for x in Xqtest[i].tolist()])
    label = ytest[i]
    prediction = Ytest_[i]
    print(story[-20:], question[-20:], label, prediction)

KeyboardInterrupt: 

In [100]:
test_dataset_path = '../TEST/2017/SemEval2017-Task3-CQA-MD-test-input.xml'

tree = ET.parse(test_dataset_path)
root = tree.getroot()
    
for Question in root:
    QID = int(Question.get('QID'))
    Qtext = Question.find('Qtext').text
    
    for QApair in Question.iter('QApair'): 
        QAID = int(QApair.get('QAID'))
        QArel = QApair.get('QArel')
        QAquestion = QApair.find('QAquestion').text
        QAanswer = QApair.find('QAanswer').text

        Xquestion, Xqaquestion, _ = vectorize(([Qtext], [QAquestion], [0]), word2idx, story_maxlen, question_maxlen)
        QAconf = np.asscalar(model.predict([Xquestion, Xqaquestion]))
        
        QApair.set('QArel', 'R' if QAconf >= 0.5 else 'I')
        QApair.set('QAconf', str(round(QAconf, 4)))

tree.write('../TEST/2017/SemEval2017-Task3-CQA-MD-test-input-MemNN.xml', encoding='utf-8')

In [None]:
word = u'لا'
word.lower()