In [1]:
# load the dataset

import pandas as pd

ad_train = pd.read_csv('eacl2017/adjectives.train', sep='\t', header=None)  # 5562
ad_test = pd.read_csv('eacl2017/adjectives.test', sep='\t', header=None)    # 1986
ad_val = pd.read_csv('eacl2017/adjectives.val', sep='\t', header=None)      # 398

noun_train = pd.read_csv('eacl2017/nouns.train', sep='\t', header=None) # 2836
noun_test = pd.read_csv('eacl2017/nouns.test', sep='\t', header=None)   # 1020
noun_val = pd.read_csv('eacl2017/nouns.val', sep='\t', header=None)     # 206

verb_train = pd.read_csv('eacl2017/verbs.train', sep='\t', header=None) # 2534
verb_test = pd.read_csv('eacl2017/verbs.test', sep='\t', header=None)   # 908
verb_val = pd.read_csv('eacl2017/verbs.val', sep='\t', header=None)     # 182

In [2]:
# combine the dataset

ad = pd.concat([ad_train, ad_test, ad_val])
noun = pd.concat([noun_train, noun_test, noun_val])
verb = pd.concat([verb_train, verb_test, verb_val])

# rearrage the index

ad = ad.reset_index(drop=True)
noun = noun.reset_index(drop=True)
verb = verb.reset_index(drop=True)

In [3]:
del ad_train, ad_test, ad_val, noun_train, noun_test, noun_val, verb_train, verb_test, verb_val

In [4]:
# check for missing values

print(ad.isnull().values.any())
print(noun.isnull().values.any())
print(verb.isnull().values.any())

True
True
False


In [5]:
# find null values

print(ad.iloc[ad[ad.isnull().any(axis=1)].index])
print(noun.iloc[noun[noun.isnull().any(axis=1)].index])

            0      1  2
9         NaN  valid  1
308       NaN   void  0
6171  invalid    NaN  0
6213      NaN  empty  0
              0    1  2
39  grandmother  NaN  0


In [6]:
# fill null values

ad = ad.fillna('null')
noun = noun.fillna('nan')

In [7]:
# import the word2vec model

import gensim

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [8]:
# get the distance between the words

def get_distance(data):
    distance = []
    for i in range(len(data)):
        try:
            distance.append(word2vec_model.distance(data[0][i], data[1][i]))
        except:
            distance.append(0.5)
    data['distance'] = distance
    return data

I append 0.5 if there is a word that is not in the vocabulary of the word2vec model.
0.5 seems to be a neutral value, so I think it's a good idea to use it as a default value.

In [9]:
# add the distance to the dataset

ad = get_distance(ad)
noun = get_distance(noun)
verb = get_distance(verb)

In [10]:
# divide the dataset into synonyms and antonyms

ad_syn = ad[ad[2] == 0]
ad_ant = ad[ad[2] == 1]

noun_syn = noun[noun[2] == 0]
noun_ant = noun[noun[2] == 1]

verb_syn = verb[verb[2] == 0]
verb_ant = verb[verb[2] == 1]

In [11]:
# get the vocabulary

ad_vocab = pd.concat([ad[0], ad[1]]).unique()
noun_vocab = pd.concat([noun[0], noun[1]]).unique()
verb_vocab = pd.concat([verb[0], verb[1]]).unique()

To compare the distances between synonyms and antonyms, I took the vocabularies for 3 categories: "adjective", "verb", and "noun",
and I calculate the average distance between d(word, synonym) and d(word, antonym) for each word in the vocabulary.

In [12]:
# the % of times that d(word, synonym) < d(word, antonym)

def compare_distance(syn, ant, vocab):
    count = 0
    for word in vocab:
        synonyms = syn.loc[syn[0] == word]
        antonyms = ant.loc[ant[0] == word]
        if synonyms.empty or antonyms.empty:
            pass
        else:
            if synonyms['distance'].mean() < antonyms['distance'].mean():
                count += 1
    return count / len(vocab) * 100
            
print('adjectives:', compare_distance(ad_syn, ad_ant, ad_vocab), "\n")
print('nouns:', compare_distance(noun_syn, noun_ant, noun_vocab), "\n")
print('verbs:', compare_distance(verb_syn, verb_ant, verb_vocab), "\n")

adjectives: 15.499606608969316 

nouns: 1.9269102990033222 

verbs: 8.421537045559136 



In [13]:
# get the average distance

ad_syn_avg = ad_syn['distance'].mean()
ad_ant_avg = ad_ant['distance'].mean()

noun_syn_avg = noun_syn['distance'].mean()
noun_ant_avg = noun_ant['distance'].mean()

verb_syn_avg = verb_syn['distance'].mean()
verb_ant_avg = verb_ant['distance'].mean()

In [14]:
print("synonym : ", ((ad_syn_avg + noun_syn_avg + verb_syn_avg) / 3).round(3))
print("antonym : ", ((ad_ant_avg + noun_ant_avg + verb_ant_avg) / 3).round(3))

synonym :  0.682
antonym :  0.65


In [15]:
print("ad_syn_avg : ", ad_syn_avg.round(3))
print("ad_ant_avg : ", ad_ant_avg.round(3))
print("noun_syn_avg : ", noun_syn_avg.round(3))
print("noun_ant_avg : ", noun_ant_avg.round(3))
print("verb_syn_avg : ", verb_syn_avg.round(3))
print("verb_ant_avg : ", verb_ant_avg.round(3))

ad_syn_avg :  0.64
ad_ant_avg :  0.658
noun_syn_avg :  0.683
noun_ant_avg :  0.589
verb_syn_avg :  0.723
verb_ant_avg :  0.703
