In [1]:
#!/usr/bin/env python
import codecs
import argparse
import csv
import random
#from gensim.models.word2vec import Word2Vec
from collections import defaultdict
import numpy as np
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    
from gensim.models import KeyedVectors

In [2]:
# Notebook configuration: the random seed and the embedding file to use.
# The command-line interface below is disabled; the values are assigned
# directly further down instead.
#parser = argparse.ArgumentParser(description='Preparation.')
#parser.add_argument('--w2v',  default='GoogleNews-vectors-negative300.bin', nargs='?', help='Path to the word2vec model.')
#parser.add_argument('--seed', default=228, type=int, nargs='?', help='Random seed.')
#args = vars(parser.parse_args())

# Seed for Python's `random` module only (other generators are not seeded here).
#RANDOM_SEED = 1337
RANDOM_SEED = 228
random.seed(RANDOM_SEED)
# Name of the embedding file used by the cells below; the commented-out
# assignments are alternative files that were tried.
#w2v_label = "GoogleNews-vectors-negative300.bin"
#w2v_label = "wiki-news-300d-1M-subword.vec"
#w2v_label = 'w2v.6B.300d.txt'
w2v_label = 'w2v.840B.300d.txt'

In [9]:
# Load the embeddings named by `w2v_label` and normalise them.
# `KeyedVectors` (imported in the first cell) is used because `Word2Vec` is
# never imported in this notebook, and the file is read as text
# (binary=False) because `w2v_label` names a word2vec-format .txt file.
w2v = KeyedVectors.load_word2vec_format(w2v_label, binary=False, unicode_errors='ignore')
w2v.init_sims(replace=True)
# `vector_size` is the dimensionality attribute exposed by KeyedVectors.
print('Using %d word2vec dimensions from "%s".' % (w2v.vector_size, w2v_label))


In [1]:
from gensim.models import KeyedVectors

# One-off conversion: read each word2vec-format text file and write a binary
# copy next to it as "<name>.bin".
w2v_labels = ['w2v.6B.300d.txt', 'w2v.840B.300d.txt']
for text_label in w2v_labels:
    # Dedicated loop names are used so that the notebook-level `w2v_label`
    # (the configured embedding file) and `w2v` are not silently rebound.
    text_vectors = KeyedVectors.load_word2vec_format(text_label, binary=False)
    text_vectors.save_word2vec_format(text_label + ".bin", binary=True)
    # Release the model before loading the next one.
    del text_vectors

In [11]:
# Load the binary copy of the embeddings (`w2v_label + ".bin"`).
# The two commented-out statements are the text -> binary conversion that
# produces that file; they are kept for reference only.
#w2v = KeyedVectors.load_word2vec_format(w2v_label, binary=False)
# save model in binary format
#w2v.save_word2vec_format(w2v_label+".bin", binary=True)
# load vectors from binary file
w2v = KeyedVectors.load_word2vec_format(w2v_label + ".bin", binary=True)
# presumably replaces the stored vectors with unit-length versions
# (cf. the "convert vectors to unit norm" note in a later cell) — confirm
w2v.init_sims(replace=True)

# NOTE(review): the message says "fasttext", but `w2v_label` as configured
# above names 'w2v.840B.300d.txt' — confirm which wording is intended.
print('Using %d fasttext dimensions from "%s".' % (w2v.vector_size, w2v_label))

In [3]:
# Convert the GloVe text file into word2vec text format, writing the result
# to the path held in `w2v_label`.
# Background: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
# The conversion has to happen before the vectors are loaded with
# KeyedVectors.load_word2vec_format in the next cell.
from gensim.scripts.glove2word2vec import glove2word2vec
# Input file; the commented-out line is the smaller 6B alternative.
#glove_input_file = 'glove.6B.300d.txt'
glove_input_file = 'glove.840B.300d.txt'
glove2word2vec(glove_input_file, w2v_label)


(2196017, 300)

In [4]:
# Load the converted GloVe vectors from the word2vec-format text file
# named by `w2v_label` (binary=False because the file is plain text).
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format(w2v_label, binary=False)

# Report the vector dimensionality and the file it was read from.
print('Using %d GloVe dimensions from "%s".' % (w2v.vector_size, w2v_label))

Using 300 GloVe dimensions from "w2v.840B.300d.txt".


In [5]:
# Sanity check kept for reference (disabled):
#w2v.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
# Convert the vectors to unit norm, i.e. divide every component of a vector
# by that vector's norm.
# NOTE(review): replace=True presumably discards the un-normalised vectors,
# so this step cannot be undone without reloading — confirm.
w2v.init_sims(replace=True)


In [6]:
def read_subsumptions(filename):
    """Read (hyponym, hypernym) pairs from a tab-separated UTF-8 file.

    Only the first two columns of each row are used; any further columns
    are ignored. Empty lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    list of (str, str)
        One tuple per non-empty row, in file order.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    """
    subsumptions = []

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            if not row:
                # csv.reader yields an empty list for an empty line
                # (e.g. a trailing blank line at the end of the file).
                continue
            subsumptions.append((row[0], row[1]))

    return subsumptions

def read_synonyms(filename):
    """Read a word -> synonyms mapping from a tab-separated UTF-8 file.

    Each row holds a word in the first column and a comma-separated list of
    its synonyms in the second. Rows that repeat a word extend its list.
    Empty lines are skipped and empty synonym tokens (e.g. from an empty
    second column or a trailing comma) are dropped.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    collections.defaultdict
        Maps each word to the list of its synonyms, in file order.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    """
    synonyms = defaultdict(list)

    # newline='' leaves line-ending handling to the csv module.
    with open(filename, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            if not row:
                # csv.reader yields an empty list for an empty line.
                continue
            synonyms[row[0]].extend(word for word in row[1].split(',') if word)

    # The default factory is deliberately left enabled, so looking up a word
    # that has no entry yields an empty list instead of raising KeyError.
    #synonyms.default_factory = None
    return synonyms

# Load the three subsumption splits (lists of 2-tuples taken from the first
# two columns of each row) and the word -> synonyms mapping.
subsumptions_train      = read_subsumptions('subsumptions-train.txt')
subsumptions_validation = read_subsumptions('subsumptions-validation.txt')
subsumptions_test       = read_subsumptions('subsumptions-test.txt')
synonyms                = read_synonyms('synonyms.txt')


In [7]:
# Report how the training and validation splits relate to each other.
# (The disabled expression below would list the pairs shared by both.)
#set(subsumptions_train).intersection(set(subsumptions_validation))
print ("Training set length %d" % (len(subsumptions_train)))


# Number of distinct validation pairs that do not occur in the training split.
print ("Validation entries not in training: %d" % (len(set(subsumptions_validation) - set(subsumptions_train))))
# Disabled experiment: add those validation-only pairs to the training split
# to see whether the results improve.
# NOTE(review): an earlier note here quoted 570 such relations, which does not
# match the count recorded in this cell's output — confirm before re-enabling.

#more_training = list(set(subsumptions_validation) - set(subsumptions_train))

#subsumptions_train.extend(more_training)

Training set length 4374
Validation entries not in training: 1483


In [8]:
def _prune_synonyms(synonyms, vocab):
    """Restrict `synonyms` (in place) to words that are present in `vocab`.

    For every entry:

    * If the key is in `vocab`, each synonym is kept as-is when it is in
      `vocab`, replaced by its lower-cased form when only that form is in
      `vocab`, and dropped otherwise.
    * If the key is not in `vocab` but its lower-cased form is, the entry is
      stored under the lower-cased key with lower-cased synonyms, which are
      filtered against `vocab` in the same way.
    * If neither spelling of the key is in `vocab`, the entry is dropped.

    Entries that end up under the same key are concatenated.

    Parameters
    ----------
    synonyms : dict-like
        Mapping word -> list of synonyms; modified in place.
    vocab : container of str
        Anything supporting ``word in vocab``.
    """
    cleaned = {}

    for key, words in synonyms.items():
        if key in vocab:
            target, candidates = key, words
        else:
            print ("Key not found: %s" % (key))
            target = key.lower()
            if target not in vocab:
                # Neither spelling has an embedding: drop the whole entry.
                continue
            candidates = [word.lower() for word in words]

        # A fresh list is built rather than removing from the list being
        # iterated, which would skip the element following each removal.
        kept = cleaned.setdefault(target, [])
        for word in candidates:
            if word not in vocab:
                print ("Synonym not found: %s" % (word))
                word = word.lower()
                if word not in vocab:
                    continue
            kept.append(word)

    # Refill the original mapping object so existing references to it
    # (and its default factory, if it has one) stay valid.
    synonyms.clear()
    synonyms.update(cleaned)


_prune_synonyms(synonyms, w2v.vocab)
                                                                        

Key not found: water_vapor
Synonym not found: cell_phone
Synonym not found: water_vapor
Key not found: soft_drink
Key not found: cell_phone
Synonym not found: soft_drink
Key not found: outer_space
Synonym not found: outer_space
Synonym not found: cell_phone
Synonym not found: soft_drink


In [9]:
# check that words exist in vocab prior to computing feature vectors
# of train, test and validation

#'john' in w2v.vocab
def confirmVocab(wordList, vocab=None):
    """Keep only the pairs whose first two words are both in the vocabulary.

    Parameters
    ----------
    wordList : iterable of sequences of str
        Pairs to filter; only elements 0 and 1 of each item are checked.
    vocab : container of str, optional
        Anything supporting ``word in vocab``. Defaults to ``w2v.vocab``,
        looked up at call time.

    Returns
    -------
    list
        The items of `wordList` that passed the check, in their original order.
    """
    if vocab is None:
        vocab = w2v.vocab
    return [pair for pair in wordList if pair[0] in vocab and pair[1] in vocab]

#print (len(subsumptions_validation), count)
# Drop, from every split, the pairs in which either word is missing from
# `w2v.vocab`. Each name is rebound to the filtered list.
subsumptions_train       = confirmVocab(subsumptions_train)
subsumptions_test        = confirmVocab(subsumptions_test)
subsumptions_validation  = confirmVocab(subsumptions_validation)

# Sizes of the training and test splits after filtering.
print (len(subsumptions_train))
print (len(subsumptions_test))

4369
1539


In [10]:
def compute_XZ(subsumptions, synonym_map=None, vectors=None):
    """Build the index array and the stacked hyponym/synonym vectors.

    For every (hyponym, hypernym) pair the vectors of the hyponym and of each
    of its synonyms are appended to ``Z_all``; the matching row of
    ``X_index`` records where that run starts and how long it is. The
    hypernym is not used here.

    Parameters
    ----------
    subsumptions : iterable of (str, str)
        (hyponym, hypernym) pairs.
    synonym_map : mapping of str -> sequence of str, optional
        Synonyms per word. Defaults to the notebook-level ``synonyms``,
        looked up at call time. The mapping is never modified.
    vectors : mapping of str -> array, optional
        Anything supporting ``vectors[word]``. Defaults to the notebook-level
        ``w2v``, looked up at call time.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_index``: int32 array of shape (n_pairs, 2) holding
        ``[offset, count]`` rows that address ``Z_all``.
        ``Z_all``: array built from the collected vectors, one per row.

    Raises
    ------
    KeyError
        Propagated from ``vectors[word]`` when a word has no vector
        (assuming the mapping raises KeyError for unknown words).
    """
    if synonym_map is None:
        synonym_map = synonyms
    if vectors is None:
        vectors = w2v

    X_index, Z_all = [], []

    for hyponym, _hypernym in subsumptions:
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty entry for every hyponym that has no synonyms.
        word_synonyms = [hyponym] + list(synonym_map.get(hyponym, ()))

        # The offset is the current length of Z_all, taken before extending it.
        X_index.append([len(Z_all), len(word_synonyms)])
        Z_all.extend(vectors[synonym] for synonym in word_synonyms)

    # reshape keeps the two-column layout even when there are no pairs.
    return (np.array(X_index, dtype='int32').reshape(-1, 2), np.array(Z_all))

# Inputs per split: X_index holds one [offset, count] row per pair, addressing
# the run of hyponym + synonym vectors stored in Z_all.
X_index_train,      Z_all_train      = compute_XZ(subsumptions_train)
X_index_validation, Z_all_validation = compute_XZ(subsumptions_validation)
X_index_test,       Z_all_test       = compute_XZ(subsumptions_test)

# Targets per split: the vector of the second word (the hypernym) of each pair,
# in the same order as the rows of X_index.
Y_all_train      = np.array([w2v[w] for _, w in subsumptions_train])
Y_all_validation = np.array([w2v[w] for _, w in subsumptions_validation])
Y_all_test       = np.array([w2v[w] for _, w in subsumptions_test])


In [11]:
# Persist each split as a compressed .npz archive containing the arrays
# under the keys X_index, Y_all and Z_all.
np.savez_compressed('train.npz',      X_index=X_index_train,
                                      Y_all=Y_all_train,
                                      Z_all=Z_all_train)

np.savez_compressed('validation.npz', X_index=X_index_validation,
                                      Y_all=Y_all_validation,
                                      Z_all=Z_all_validation)

np.savez_compressed('test.npz',       X_index=X_index_test,
                                      Y_all=Y_all_test,
                                      Z_all=Z_all_test)

# Summary: number of examples (rows of Y_all) in each split.
print('I have %d train, %d validation and %d test examples.' % (
    Y_all_train.shape[0],
    Y_all_validation.shape[0],
    Y_all_test.shape[0])
)

I have 4369 train, 1483 validation and 1539 test examples.
