In [None]:
from gensim.models import KeyedVectors

# Load the pre-trained word2vec embeddings stored in binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

# Read the embedding width from the loaded vectors instead of hard-coding 300,
# so the value cannot disagree with the file if a different one is swapped in.
w2v_dimensionality = w2v_model.vector_size

In [None]:
from lxml import etree

# Location of the lexical-sample training file, relative to the notebook.
file = 'data/semeval2007/train/lexical-sample/english-lexical-sample.train.xml'

# Parse the whole document once; later cells query `root` for <lexelt> entries.
root = etree.parse(source=file)

In [None]:
import math

import numpy as np
import pandas
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from nltk.tokenize import sent_tokenize, word_tokenize

# Number of context tokens taken on each side of the target (head) word.
window_size = 7
# Every example is a fixed-length sequence: left context + head + right context.
input_vector_length = 2 * window_size + 1

number_of_epochs = 3
batch_size = 10


def _locate_head(tokenized_sentences, head):
    """Return (sentence_index, word_index) of the first token equal to `head`, or None."""
    for s_index, sentence in enumerate(tokenized_sentences):
        for w_index, word in enumerate(sentence):
            if word == head:
                # Returning here stops BOTH loops, so the first match wins.
                return s_index, w_index
    return None


def _window_vectors(sentence, word_index, embeddings, half_width, dimensionality):
    """Return the 2 * half_width + 1 vectors centred on sentence[word_index].

    Positions that fall outside the sentence, and tokens that are not in
    `embeddings`, are represented by a zero vector. The result therefore always
    has the same length and the head word always sits at index `half_width`.
    """
    padding = np.zeros(dimensionality)
    vectors = []
    for position in range(word_index - half_width, word_index + half_width + 1):
        if 0 <= position < len(sentence) and sentence[position] in embeddings:
            vectors.append(embeddings[sentence[position]])
        else:
            vectors.append(padding)
    return vectors


for lexelt in root.findall('lexelt'):
    instance_indexer = {}  # Maps row index in X / Y -> instance id
    X = []
    Y = []
    lexelt_item = lexelt.attrib['item'].split('.')[0]
    lexelt_pos = lexelt.attrib['pos']
    for instance in lexelt.findall('instance'):
        instance_id = instance.attrib['id']
        answer_sense_id = instance.find('answer').attrib['senseid']
        context = instance.find('context')
        head = context.find('head').text.strip()
        # Collect the context text without mutating the parsed tree: stripping the
        # <head> tag in place would make a second run of this cell fail on
        # context.find('head').
        context_text = ''.join(context.itertext())
        # Tokenize into a list of sentences, each a list of word tokens.
        tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(context_text)]
        location = _locate_head(tokenized_sentences, head)
        if location is None:  # The head token was not found after tokenization
            continue
        sentence_index, word_index = location
        sentence = tokenized_sentences[sentence_index]
        # Record the instance only once it is known to be usable, so that X, Y and
        # instance_indexer always stay aligned row for row.
        instance_indexer[len(X)] = instance_id
        X.append(_window_vectors(sentence, word_index, w2v_model, window_size, w2v_dimensionality))
        Y.append(answer_sense_id)
    if not X:  # No usable instance for this lexelt: nothing to build a model for
        continue
    # One output unit per distinct sense observed for this lexelt.
    sense_ids = sorted(set(Y))
    # Create model for the lexelt. X already holds word2vec vectors, so they are fed
    # straight into the LSTM; an Embedding layer would expect integer token ids.
    model = Sequential()
    model.add(LSTM(100, input_shape=(input_vector_length, w2v_dimensionality)))
    # Softmax over the senses matches the categorical cross-entropy loss.
    model.add(Dense(len(sense_ids), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()  # summary() prints the table itself and returns None
    # TODO: before enabling training, convert X to an array, one-hot encode Y over
    # `sense_ids`, and build X_test / Y_test (they are not defined anywhere yet).
    # model.fit(X, Y, validation_data=(X_test, Y_test), epochs=number_of_epochs, batch_size=batch_size)