In [1]:
from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin',
                                              binary=True)



In [2]:
import numpy as np

w2v_dim = 300
w2v_vocab = len(w2v_model.vocab)

In [3]:
window_size = 2
phrase = 2 * window_size + 1
input_shape = (phrase, w2v_dim)

batch_size = 64
epochs = 3

dropout_probability = 0.2
final_activation = 'softmax'

loss_function = 'categorical_crossentropy'

model_file_path_string_template = 'models/{}.h5'

In [5]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

optimizer = Adam(clipnorm=1)
placeholder_dense_layer = Dense(1,
                                activation=final_activation)

layers = [
    LSTM(phrase,
         input_shape=input_shape,
         dropout=dropout_probability),
    placeholder_dense_layer
]

'''
layers = [
    LSTM(phrase,
         input_shape=input_shape,
         return_sequences=True),
    LSTM(phrase,
         input_shape=input_shape,
         return_sequences=True),
    LSTM(phrase,
         input_shape=input_shape,
         dropout=dropout_probability),
    placeholder_dense_layer
]
'''

def generate_model():
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(optimizer=optimizer,
                  loss=loss_function,
                  metrics=[
                      'accuracy'
                  ])
    return model

def fit_model(model, X, Y):
    model.fit(X,
              Y,
              batch_size=batch_size,
              epochs=epochs)
    return model

def save_model_to_file(model, file_name):
    model.save(model_file_path_string_template.format(file_name))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
from keras.models import Sequential, load_model

def load_model_from_file(file_name):
    model = load_model(model_file_path_string_template.format(file_name))
    return model
    
def evaluate_model(model, X, Y):
    score = model.evaluate(X,
                           Y,
                           batch_size=batch_size)
    return score

In [7]:
def get_lemma(lexelt):
    return lexelt.split('.')[0]

def get_pos(lexelt):
    return lexelt.split('.')[1]

def is_match_lexelt(synset, lexelt_item):
    lexelt_with_sense = synset.name()
    lemma_a, pos_a = get_lemma(lexelt_with_sense), get_pos(lexelt_with_sense)
    lemma_b, pos_b = get_lemma(lexelt_item), get_pos(lexelt_item)
    return (lemma_a.lower() == lemma_b.lower() and
            pos_a.lower() == pos_b.lower())

In [8]:
from keras.utils.np_utils import to_categorical
from lxml import etree
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

import pickle

data_file_path_string_template = 'data/semeval2007/{0}/lexical-sample/english-lexical-sample.{0}.xml'
answer_key_file_path = 'data/semeval2007/key/english-lexical-sample.test.key'
seen_lexelts_file_name = 'lexelts.bin'
sense_indexer_file_name = 'sense_indexer.bin'

In [9]:
train_or_test = 'train'
file = data_file_path_string_template.format(train_or_test)
root = etree.parse(file)

trained_lexelt_items = set()
lexelt_label_indexer = {} # { lexelt.item: { answer.sense_id: categorical one-hot vector index } }

instance_indexer = {} # { instance_id: X.index(instance_id) and also Y.index(instance_id) }

for lexelt in root.findall('lexelt'):
    lexelt_item = lexelt.attrib['item']
    lexelt_pos = lexelt.attrib['pos']
    instances = lexelt.findall('instance')

    number_of_instances = len(instances)
    if number_of_instances not in range(50, 100):
        continue

    number_of_classes = len(
        list(filter(
            lambda synset: is_match_lexelt(synset, lexelt_item),
            wordnet.synsets(get_lemma(lexelt_item))
        ))
    ) + 1 # because sense id begins from 1

    print(lexelt_item, number_of_classes)

    X = np.zeros((number_of_instances, phrase, w2v_dim), dtype=np.float64)
    Y = np.zeros((number_of_instances, number_of_classes), dtype=np.uint8)

    for instance_index, instance in enumerate(instances):
        instance_id = instance.attrib['id']
        instance_indexer[instance_id] = instance_index

        answer_sense_id = instance.find('answer').attrib['senseid']
        if lexelt_item not in lexelt_label_indexer:
            lexelt_label_indexer[lexelt_item] = {}
        if answer_sense_id not in lexelt_label_indexer[lexelt_item]:
            lexelt_label_indexer[lexelt_item][answer_sense_id] = len(lexelt_label_indexer[lexelt_item]) + 1 # because sense id
        try:
            label_index = lexelt_label_indexer[lexelt_item][answer_sense_id]
            Y[instance_index] = to_categorical(label_index, num_classes=number_of_classes)
        except: # IndexError because |senses| for the lexelt in WN 3.0 < |senses| in WN 1.7 or 2.1
            lexelt_label_indexer.pop(lexelt_item, None)
            break

        context = instance.find('context')
        head = context.find('head').text.strip()
        etree.strip_tags(context, 'head')
        words = list(map(lambda sentence: word_tokenize(sentence), sent_tokenize(context.text)))
        sentence_index, word_index = -1, -1
        for (s_index, sentence) in enumerate(words):
            for (w_index, word) in enumerate(sentence):
                if word == head:
                    sentence_index, word_index = s_index, w_index
                    break
        if sentence_index == -1 or word_index == -1: # Lexelt did not exist in the context
            continue

        sentence = words[sentence_index]
        lower_bound = max(0, word_index - window_size)
        upper_bound = min(word_index + window_size, len(sentence))
        w2v_vectors = np.empty((phrase, w2v_dim))
        for w_index in range(lower_bound, upper_bound):
            word = sentence[w_index]
            if word in w2v_model:
                w2v_vectors[w_index - lower_bound] = w2v_model[word] # Switch reference from np.empty to word2vec vector
        X[instance_index] = w2v_vectors

    if lexelt_item not in lexelt_label_indexer:
        continue

    layers[-1] = Dense(number_of_classes,
                       activation=final_activation)
    model = generate_model()
    fit_model(model, X, Y)
    save_model_to_file(model, lexelt_item)
    trained_lexelt_items.add(lexelt_item)
    print(model.summary())
    print()

with open(seen_lexelts_file_name, 'wb') as l:
    pickle.dump(trained_lexelt_items, l)

with open(sense_indexer_file_name, 'wb') as s:
    pickle.dump(lexelt_label_indexer, s)

announce.v 5
Epoch 1/3
Epoch 2/3
Epoch 3/3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 5)                 6120      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 30        
Total params: 6,150
Trainable params: 6,150
Non-trainable params: 0
_________________________________________________________________
None

approve.v 3
Epoch 1/3
Epoch 2/3
Epoch 3/3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 5)                 6120      
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 18        
Total params: 6,138
Trainable params: 6,138
Non-trainable params: 0
_______________________________________________

Epoch 2/3
Epoch 3/3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 5)                 6120      
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 12        
Total params: 6,132
Trainable params: 6,132
Non-trainable params: 0
_________________________________________________________________
None

exist.v 3
Epoch 1/3
Epoch 2/3
Epoch 3/3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 5)                 6120      
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 18        
Total params: 6,138
Trainable params: 6,138
Non-trainable params: 0
_________________________________________________________________
None



In [10]:
train_or_test = 'test'
file = data_file_path_string_template.format(train_or_test)
root = etree.parse(file)

with open(seen_lexelts_file_name, 'rb') as l:
    trained_lexelt_items = pickle.load(l)

with open(sense_indexer_file_name, 'rb') as s:
    lexelt_label_indexer = pickle.load(s)

answers = {}
with open(answer_key_file_path) as k:
    for line in k:
        lexelt_item, instance_id, answer_sense_id = line.strip().split(' ')
        answers[instance_id] = answer_sense_id

instance_indexer = {} # { instance_id: X.index(instance_id) and also Y.index(instance_id) }

for lexelt in root.findall('lexelt'):
    lexelt_item = lexelt.attrib['item']
    if lexelt_item not in trained_lexelt_items:
        continue
    lexelt_pos = lexelt.attrib['pos']
    instances = lexelt.findall('instance')

    number_of_instances = len(instances)

    number_of_classes = len(
        list(filter(
            lambda synset: is_match_lexelt(synset, lexelt_item),
            wordnet.synsets(get_lemma(lexelt_item))
        ))
    ) + 1 # because sense id begins from 1
    # len(lexelt_label_indexer[lexelt_item]) = number of seen sense classes of lexelt_item i.e. <= number_of_classes

    X = np.zeros((number_of_instances, phrase, w2v_dim), dtype=np.float64)
    Y = np.zeros((number_of_instances, number_of_classes), dtype=np.uint8)

    for instance_index, instance in enumerate(instances):
        instance_id = instance.attrib['id']
        instance_indexer[instance_id] = instance_index

        context = instance.find('context')
        head = context.find('head').text.strip()
        etree.strip_tags(context, 'head')
        words = list(map(lambda sentence: word_tokenize(sentence), sent_tokenize(context.text)))
        sentence_index, word_index = -1, -1
        for (s_index, sentence) in enumerate(words):
            for (w_index, word) in enumerate(sentence):
                if word == head:
                    sentence_index, word_index = s_index, w_index
                    break
        if sentence_index == -1 or word_index == -1: # Lexelt did not exist in the context
            continue

        sentence = words[sentence_index]
        lower_bound = max(0, word_index - window_size)
        upper_bound = min(word_index + window_size, len(sentence))
        w2v_vectors = np.empty((phrase, w2v_dim))
        for w_index in range(lower_bound, upper_bound):
            word = sentence[w_index]
            if word in w2v_model:
                w2v_vectors[w_index - lower_bound] = w2v_model[word] # Switch reference from np.empty to word2vec vector
        X[instance_index] = w2v_vectors

        answer_sense_id = answers[instance_id]
        if answer_sense_id not in lexelt_label_indexer[lexelt_item]:
            lexelt_label_indexer[lexelt_item][answer_sense_id] = len(lexelt_label_indexer[lexelt_item]) + 1 # because sense id
        label_index = lexelt_label_indexer[lexelt_item][answer_sense_id]
        try:
            Y[instance_index] = to_categorical(label_index, num_classes=number_of_classes)
        except:
            continue

    model = load_model_from_file(lexelt_item)
    score = evaluate_model(model, X, Y)
    print(lexelt_item)
    print(score)
    print()

announce.v
[nan, 0.0]

approve.v
[nan, 0.0]

authority.n
[nan, 0.0]

avoid.v
[nan, 0.0]

base.n
[nan, 0.0]

cause.v
[nan, 0.0]

claim.v
[nan, 0.0]

disclose.v
[nan, 0.0714285746216774]

enjoy.v
[nan, 0.0]

estimate.v
[nan, 0.0]

exist.v
[nan, 0.0]

explain.v
[nan, 0.0]

join.v
[nan, 0.0]

prepare.v
[nan, 0.0]

promise.v
[nan, 0.0]

space.n
[nan, 0.0]

