<b> Word Similarity </b> <br>
Your next step is to evaluate the three kinds of embeddings in the word similarity
task. The goal of this task is to compute similarity of two words and
evaluate the model-produced similarity against human judgements. Download
two commonly-used word similarity datasets:<br>
• SimLex: https://www.cl.cam.ac.uk/˜fh295/simlex.html<br>
• MEN: https://staff.fnwi.uva.nl/e.bruni/MEN<br>
Compute cosine similarity between the words using the three models. Evaluate
the model-produced similarities against human judgements in terms of
Pearson and Spearman correlation coefficients. <br>
Compare the performance of
the three models on this task. <br>
Analyze the data qualitatively and report what
are the differences in the kind of similarity captured by the three models.<br> We
are interested to see both quantitative results and qualitative analysis in your
report.

In [76]:
# Read in the SimLex dataset
SimLex_file = open('SimLex-999/SimLex-999.txt','r')
SimLex_set = {}
for line in SimLex_file:
    pair_data = [x.strip() for x in line.split('\t')]
    word1 = pair_data[0]
    word2 = pair_data[1]
    similarity_score = pair_data[4]
    SimLex_set[(word1, word2)] = {'human':similarity_score}


In [12]:
# Read in the MEN dataset
MEN_file = open('MEN/MEN_dataset_natural_form_full','r')
MEN_set = {}
for line in MEN_file:
    pair_data = [x.strip() for x in line.split(' ')]
    word1 = pair_data[0]
    word2 = pair_data[1]
    similarity_score = pair_data[2]
    MEN_set[(word1, word2)] = {'human':similarity_score}

In [13]:
# In order to reduce the size of the word embedding files,
#  I will delete all non-occuring words from the relevant model and
# store the word embeddings in a defaultdict for quick access.
occuring_words = []
for pair in list(MEN_set.keys())+list(SimLex_set.keys()):
    word1 = pair[0]
    word2 = pair[1]
    if word1 not in occuring_words:
        occuring_words.append(word1)
    if word2 not in occuring_words:
        occuring_words.append(word2)

In [37]:
# Read in the word vectors for a given word model
import os
import time
from collections import defaultdict

# switch between 'bow5.words', 'bow2.words' and 'deps.words'
def get_embeddings(filename):
    f = open(filename,'r')
    word_embeddings = defaultdict(list)
    i =0
    bla = []
    s = time.time()
    for line in f:
        entry = [x.strip() for x in line.split(' ')]
        word = entry[0]
        vector = entry[1:]
        if word in occuring_words:
            word_embeddings[word] = [np.float(x) for x in vector]
    e = time.time()
    print('It took {} seconds to read in dataset'.format(e-s))
    return word_embeddings

word_embeddings = get_embeddings('bow2.words')
word_embeddings = get_embeddings('bow5.words')
word_embeddings = get_embeddings('deps.words')

It took 11.932837009429932 seconds to read in dataset


In [38]:
import numpy as np
import math
def cosine_similarity(v1, v2):
    # Returns a value between -1 and 1, 1 meaning exactly same
    #  and -1 meaning exactly opposite.
    assert len(v1) == len(v2)
    numerator = sum([v1[i]*v2[i] for i in range(len(v1))])
    denominator = np.sqrt(sum([x**2 for x in v1])) \
                * np.sqrt(sum([x**2 for x in v2]))
    return (numerator/denominator)

In [79]:
a = np.zeros((10,2))
a[1,0] = 2
a[9,1] = 3
print(a)

SimLex_set


[[0. 0.]
 [2. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 3.]]


{('celebration', 'ceremony'): {'human': '3.47'},
 ('leave', 'go'): {'human': '2.53'},
 ('agree', 'differ'): {'human': '2.31'},
 ('money', 'salary'): {'human': '4.54'},
 ('effort', 'difficulty'): {'human': '2.33'},
 ('arm', 'neck'): {'human': '4.96'},
 ('guy', 'partner'): {'human': '4.68'},
 ('animal', 'person'): {'human': '4.61'},
 ('delightful', 'cheerful'): {'human': '2.38'},
 ('certain', 'sure'): {'human': '1.68'},
 ('agony', 'grief'): {'human': '2.38'},
 ('frustration', 'anger'): {'human': '2.06'},
 ('brother', 'son'): {'human': '4.43'},
 ('confident', 'sure'): {'human': '2.62'},
 ('uncle', 'aunt'): {'human': '4.24'},
 ('nice', 'generous'): {'human': '2.18'},
 ('rod', 'curtain'): {'human': '4.43'},
 ('heroine', 'hero'): {'human': '4.21'},
 ('stupid', 'dumb'): {'human': '1.75'},
 ('ceiling', 'cathedral'): {'human': '4.85'},
 ('dinner', 'breakfast'): {'human': '4.5'},
 ('container', 'mouse'): {'human': '4.85'},
 ('attend', 'arrive'): {'human': '3.17'},
 ('modest', 'ashamed'): {'human

In [75]:
# TODO:
# Read about human judgements in terms of Pearson and Spearman correlation coefficients
# Calculate cosine similarity between all pairs in the two sets
# compare them to human judgements
# Do for both sets and all three models
# Make some cool graphs about it
def add_cosine_similarity(dataset):
    #np.seterr(all='print')
    for pair in list(dataset.keys()):
        v1, v2 = word_embeddings[pair[0]], word_embeddings[pair[1]]
        if len(v1) != len(v2):
            if len(v1) == 0 or len(v2) == 0:
                print('Removing {} and {} from dataset'.format(pair[0],pair[1]))
                print('No word embedding found for one of them')
                del dataset[pair]
            else:
                print("Something weird happened with:")
                print(pair[0], pair[1])
        else:
            cos_sim = cosine_similarity(v1,v2)
            dataset[pair]['cos_sim'] = cos_sim
    return dataset

def get_just_data(dataset):
    # Returns the cosine similarities and human judgements 
    # as a nx2 numpy array for data anaylsis
    n = len(dataset)
    result = np.zeros(n,2)
    for pair in enumerate(list(dataset.items())):
        d = dataset[pair]
        result[i,0] = d['human']
        result[i,1] = d['cos_sim']
    return result

MEN_set = add_cosine_similarity(MEN_set)
SimLex_set = add_cosine_similarity(SimLex_set)
        
len(SimLex_set)

999

In [61]:
SimLex_set

{('absence', 'presence'): {'cos_sim': 0.5399605428793415, 'human': '2.31'},
 ('absorb', 'learn'): {'cos_sim': 0.3714282992010495, 'human': '3.11'},
 ('absorb', 'possess'): {'cos_sim': 0.45933417080681005, 'human': '3.11'},
 ('absorb', 'withdraw'): {'cos_sim': 0.26378489838637387, 'human': '3.11'},
 ('abundance', 'plenty'): {'cos_sim': 0.4303333150342512, 'human': '2.21'},
 ('accept', 'acknowledge'): {'cos_sim': 0.7209279451129317, 'human': '3.03'},
 ('accept', 'believe'): {'cos_sim': 0.48400150903838096, 'human': '3.03'},
 ('accept', 'deliver'): {'cos_sim': 0.4106108601129025, 'human': '3.03'},
 ('accept', 'deny'): {'cos_sim': 0.5982852916421576, 'human': '3.03'},
 ('accept', 'forgive'): {'cos_sim': 0.44394266487574363, 'human': '3.03'},
 ('accept', 'recommend'): {'cos_sim': 0.3851617136798934, 'human': '3.03'},
 ('accept', 'reject'): {'cos_sim': 0.6943018573220413, 'human': '3.03'},
 ('accident', 'catastrophe'): {'cos_sim': 0.35404701546414313,
  'human': '3.26'},
 ('accident', 'emerg