In [None]:
from gensim.models import KeyedVectors

# Length of a single word vector; must agree with the vectors loaded below.
w2v_dimensionality = 300

# Pretrained embeddings stored in the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# All-zero placeholder vector, used where no real word vector is available.
w2v_vector_zero = [0] * w2v_dimensionality

In [None]:
from lxml import etree

# Location of the lexical-sample training corpus (XML).
file = 'data/semeval2007/train/lexical-sample/english-lexical-sample.train.xml'

# Parsed document; later cells query it with `root.findall('lexelt')`.
root = etree.parse(file)

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM

# Number of context words taken on each side of the target word.
window_size = 7
# Length of one input sequence: the target word plus `window_size` words per side.
number_of_vectors = 2 * window_size + 1
batch_size = 10
number_of_epochs = 3

# Layer stack used by generate_model().
# The first layer of a Sequential model has to declare its input shape,
# otherwise model.add() raises
# "ValueError: The first layer in a Sequential model must get an
# `input_shape` or `batch_input_shape` argument."
# Each sample is a sequence of `number_of_vectors` word vectors of length
# `w2v_dimensionality` (defined in the word2vec cell above).
# NOTE(review): the output layer is fixed at 10 classes - confirm that no
# lexelt has more than 10 distinct sense ids.
layers = [
    LSTM(100, input_shape=(number_of_vectors, w2v_dimensionality)),
    Dropout(0.5, seed=3000),
    Dense(10, activation='softmax')
]

def generate_model():
    """Build and compile a new Sequential classifier from the `layers` template.

    Every call re-creates each layer from its configuration instead of adding
    the module-level layer objects directly. Adding the same layer instance to
    several models would make those models share (and keep training) the same
    weights, which is not wanted when one model is trained per lexelt.

    Returns:
        The compiled model (rmsprop optimizer, categorical cross-entropy loss,
        accuracy metric). Its summary is printed as a side effect.
    """
    model = Sequential()
    for template in layers:
        # Fresh, untrained copy with the same hyper-parameters as the template.
        model.add(type(template).from_config(template.get_config()))
    model.compile(optimizer='rmsprop',
                  metrics=['accuracy'],
                  loss='categorical_crossentropy')
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

def fit_model(model, X, Y):
    """Train `model` on inputs `X` and targets `Y`, then hand the model back.

    Batch size and epoch count are read from the module-level settings
    `batch_size` and `number_of_epochs`.
    """
    training_options = {
        'batch_size': batch_size,
        'epochs': number_of_epochs,
    }
    model.fit(X, Y, **training_options)
    return model

def save_model_to_file(model, file_name):
    """Save `model` to '<file_name>.h5' via the model's own `save` method."""
    target_path = '{}.h5'.format(file_name)
    model.save(target_path)

def load_model_from_file(file_name):
    """Load and return the model stored in '<file_name>.h5'."""
    source_path = '{}.h5'.format(file_name)
    return load_model(source_path)
    
def evaluate_model(model, X, Y):
    """Return whatever `model.evaluate` reports for inputs `X` and targets `Y`.

    The module-level `batch_size` is used as the evaluation batch size.
    """
    return model.evaluate(X, Y, batch_size=batch_size)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

import numpy as np

# Items of the lexelts for which a model has been trained and saved.
lexelts = set()

for lexelt in root.findall('lexelt'):
    instance_indexer = {}  # Row index in X / Y -> instance id
    X = []
    sense_ids = []
    lexelt_item = lexelt.attrib['item']
    lexelt_pos = lexelt.attrib['pos']
    for instance in lexelt.findall('instance'):
        instance_id = instance.attrib['id']
        answer_sense_id = instance.find('answer').attrib['senseid']
        context = instance.find('context')
        head_element = context.find('head')
        head = head_element.text.strip()
        # Rebuild the plain context text without modifying the parsed tree
        # (etree.strip_tags would remove <head> for good, so re-running this
        # cell would fail on context.find('head')).
        context_text = ((context.text or '')
                        + (head_element.text or '')
                        + (head_element.tail or ''))
        sentences = sent_tokenize(context_text)
        words = [word_tokenize(sentence) for sentence in sentences]  # List of token lists
        # First token equal to the head word, searched over all sentences.
        # NOTE(review): this matches by surface form, so an earlier occurrence
        # of the same word is picked over the tagged one.
        position = next(((s_index, w_index)
                         for s_index, sentence in enumerate(words)
                         for w_index, word in enumerate(sentence)
                         if word == head),
                        None)
        if position is None:  # Somehow the lexelt does not exist in the context
            continue
        sentence_index, word_index = position
        sentence = words[sentence_index]
        # One vector per window position so the target word always sits in the
        # middle; positions outside the sentence or unknown to word2vec get
        # the zero vector.
        input_vectors = []
        for w_index in range(word_index - window_size, word_index + window_size + 1):
            if 0 <= w_index < len(sentence) and sentence[w_index] in w2v_model:
                input_vectors.append(w2v_model[sentence[w_index]])
            else:
                input_vectors.append(w2v_vector_zero)
        # Record the instance only once it is known to be usable, so that
        # X, the labels and instance_indexer stay aligned.
        instance_indexer[len(X)] = instance_id
        X.append(input_vectors)  # Add list of w2v vectors to X
        sense_ids.append(answer_sense_id)  # Add sense id (answer)
    if not X:  # Nothing usable for this lexelt
        continue
    # Create model for the lexelt
    model = generate_model()
    number_of_classes = model.output_shape[-1]
    # Sense id -> class index, in sorted order so the mapping is reproducible.
    sense_indexer = {sense_id: index
                     for index, sense_id in enumerate(sorted(set(sense_ids)))}
    if len(sense_indexer) > number_of_classes:
        raise ValueError('{} has {} senses but the model only has {} outputs'.format(
            lexelt_item, len(sense_indexer), number_of_classes))
    # Keras needs arrays: X is (instances, window positions, vector size) and
    # Y is one-hot over the classes, as required by categorical_crossentropy.
    X = np.array(X, dtype='float32')
    Y = np.zeros((len(sense_ids), number_of_classes), dtype='float32')
    Y[np.arange(len(sense_ids)), [sense_indexer[sense_id] for sense_id in sense_ids]] = 1.0
    fit_model(model, X, Y)
    save_model_to_file(model, lexelt_item)
    lexelts.add(lexelt_item)
    # NOTE(review): only the first usable lexelt is trained - presumably a
    # debugging shortcut; remove to train every lexelt.
    break

# Observed when running this cell: "ValueError: The first layer in a Sequential model must get an `input_shape` or `batch_input_shape` argument." (the LSTM layer in `layers` needs an `input_shape`).

In [None]:
# Reload every saved per-lexelt model and print its evaluation score.
# evaluate_model() requires the inputs and targets as well as the model.
# NOTE(review): no held-out data is loaded in this notebook, so the X / Y left
# by the training cell are reused here; they belong to the lexelt trained last
# and give a training-set score - replace with test data when available.
for lexelt_item in lexelts:
    model = load_model_from_file(lexelt_item)
    score = evaluate_model(model, X, Y)
    print(score)