In [None]:
from gensim.models import KeyedVectors

# Load the word2vec vectors stored in the binary word2vec file format.
w2v_model = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
import numpy as np

# Vocabulary size of the loaded vectors. gensim >= 4 exposes the vocabulary as
# `key_to_index`; older releases expose it as `vocab`.
if hasattr(w2v_model, 'key_to_index'):
    w2v_vocab = len(w2v_model.key_to_index)
else:
    w2v_vocab = len(w2v_model.vocab)

# Read the embedding width from the loaded vectors so it always matches the
# model file instead of relying on a hard-coded number.
w2v_dim = w2v_model.vector_size

# All-zero vector with the same width as a word embedding.
w2v_zero = np.zeros(w2v_dim)

In [None]:
# Training hyper-parameters passed to fit/evaluate.
batch_size, epochs = 10, 3

# `window_size` is the number of context tokens taken on each side of the
# target word; `phrase` is the length of a full window (both sides + target).
window_size = 7
phrase = 1 + 2 * window_size

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, Flatten, LSTM, Dropout, Dense

# Layer stack that generate_model() adds, in order, to a Sequential model.
#
# NOTE(review): the training cell builds X as a float array of shape
# (instances, phrase, w2v_dim) holding word2vec vectors, while an Embedding
# layer normally expects integer word indices of shape (instances, phrase).
# One of the two sides presumably has to change -- confirm the intended input.
#
# NOTE(review): a softmax over a single output unit always yields 1.0, so
# Dense(1, activation='softmax') cannot discriminate between senses; the
# output width probably needs to equal the number of senses -- confirm.
layers = [
    Embedding(w2v_vocab, w2v_dim, input_length=phrase),
    # Flatten(),
    LSTM(phrase),
    Dropout(0.5, seed=3000),
    Dense(1, activation='softmax')
]

def generate_model():
    """Build and compile a fresh Sequential model from the `layers` template.

    Every template layer is re-created from its own config before being added,
    so each call returns a model with independent, newly initialised weights.
    Adding the template instances themselves would make every generated model
    share (and keep training) the same weights.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    for template in layers:
        # Clone the layer definition (not its weights) for this model only.
        layer = template.__class__.from_config(template.get_config())
        print(layer)
        model.add(layer)
    model.compile(optimizer='rmsprop',
                  metrics=['accuracy'],
                  loss='categorical_crossentropy')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would only emit an extra "None" line.
    model.summary()
    return model

def fit_model(model, X, Y):
    """Train `model` on inputs `X` and targets `Y`.

    Uses the module-level `batch_size` and `epochs` settings.

    Returns:
        The same `model` object that was passed in.
    """
    fit_options = {'batch_size': batch_size, 'epochs': epochs}
    model.fit(X, Y, **fit_options)
    return model

def save_model_to_file(model, file_name):
    """Save `model` to '<file_name>.h5' by calling its `save` method."""
    target_path = '{}.h5'.format(file_name)
    model.save(target_path)

def load_model_from_file(file_name):
    """Load and return the model stored in '<file_name>.h5'."""
    return load_model('{}.h5'.format(file_name))
    
def evaluate_model(model, X, Y):
    """Evaluate `model` on inputs `X` and targets `Y`.

    Uses the module-level `batch_size` setting.

    Returns:
        Whatever `model.evaluate` returns for the given data.
    """
    return model.evaluate(X, Y, batch_size=batch_size)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from lxml import etree

file = 'data/semeval2007/train/lexical-sample/english-lexical-sample.train.xml'
root = etree.parse(file)

# Items of all lexelts for which a model was trained and saved.
lexelts = set()


def _locate_head(tokenized_sentences, head):
    """Return (sentence_index, word_index) of the first token equal to `head`.

    Returns None when no token matches.
    """
    for s_index, sentence in enumerate(tokenized_sentences):
        for w_index, word in enumerate(sentence):
            if word == head:
                return s_index, w_index
    return None


def _window_vectors(sentence, word_index):
    """Return a (phrase, w2v_dim) array of word vectors around `word_index`.

    The window covers up to `window_size` tokens on each side of the target
    token, clipped to the sentence. Rows for tokens missing from `w2v_model`
    and rows beyond the clipped window stay all-zero.
    """
    lower_bound = max(0, word_index - window_size)
    # + 1 so the token at word_index + window_size is included in the slice.
    upper_bound = min(word_index + window_size + 1, len(sentence))
    vectors = np.zeros((phrase, w2v_dim))
    for offset, word in enumerate(sentence[lower_bound:upper_bound]):
        if word in w2v_model:
            vectors[offset] = w2v_model[word]
    return vectors


# Train and save one model per lexelt, using every instance whose head word
# can be located in its tokenized context.
for lexelt in root.findall('lexelt'):
    lexelt_item = lexelt.attrib['item']
    lexelt_pos = lexelt.attrib['pos']
    instance_indexer = {}  # {row index in X and Y: id of instance in data}
    features, labels = [], []

    for instance in lexelt.findall('instance'):
        instance_id = instance.attrib['id']
        answer_sense_id = instance.find('answer').attrib['senseid']

        context = instance.find('context')
        head_element = context.find('head') if context is not None else None
        if head_element is None or not head_element.text:
            continue  # No marked head word to centre the window on.
        head = head_element.text.strip()

        # Merge the <head> markup into the surrounding text before tokenizing.
        etree.strip_tags(context, 'head')
        tokenized = [word_tokenize(sentence)
                     for sentence in sent_tokenize(context.text or '')]

        position = _locate_head(tokenized, head)
        if position is None:  # Head token did not survive tokenization.
            continue
        sentence_index, word_index = position

        # Only instances that produced a window get a row, so X and Y never
        # contain placeholder rows for skipped instances.
        instance_indexer[len(features)] = instance_id
        features.append(_window_vectors(tokenized[sentence_index], word_index))
        # NOTE(review): assumes sense ids are numeric strings -- confirm.
        labels.append(float(answer_sense_id))

    if not features:
        continue  # Nothing usable to train on for this lexelt.

    X = np.stack(features)
    Y = np.array(labels)

    model = generate_model()
    fit_model(model, X, Y)
    save_model_to_file(model, lexelt_item)
    lexelts.add(lexelt_item)

In [None]:
# Reload the saved model of every trained lexelt and print its evaluation score.
# NOTE(review): evaluate_model is defined as evaluate_model(model, X, Y), but
# only `model` is passed here, so this call raises TypeError as written. The
# evaluation inputs and labels for each lexelt still need to be supplied.
for lexelt_item in lexelts:
    model = load_model_from_file(lexelt_item)
    score = evaluate_model(model)
    print(score)