## Word Analogy Task

In [86]:
import os
import time
from collections import defaultdict
import numpy as np
import math
from tqdm import tqdm
import heapq

In [2]:
def cosine_similarity(v1, v2,n1, n2):
    """Return the cosine similarity of two equal-length vectors.

    The result lies between -1 and 1: 1 means the vectors point in exactly
    the same direction and -1 means exactly opposite directions.

    Parameters
    ----------
    v1, v2 : sequences of numbers (lists or numpy arrays) of equal length.
    n1, n2 : labels for the two vectors (e.g. the words they belong to);
        only used in the diagnostic printed when a vector has zero length.

    Returns
    -------
    The cosine of the angle between ``v1`` and ``v2``, or ``0.0`` when
    either vector has zero norm (the similarity is undefined in that case).

    Raises
    ------
    AssertionError if the two vectors differ in length.
    """
    assert len(v1) == len(v2)
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        print('division by zero with words:')
        print(n1)
        print(n2)
        # The similarity is undefined for a zero vector; report "no
        # similarity" instead of dividing and propagating nan/inf.
        return 0.0
    return np.dot(a, b) / denominator

In [75]:
# switch between 'bow5.words', 'bow2.words' and 'deps.words'
def get_embeddings(filename):
    """Read a space-separated word-embedding file.

    Each non-blank line is expected to hold a word followed by the
    components of its vector, separated by single spaces.

    Parameters
    ----------
    filename : path of the embedding file to read.

    Returns
    -------
    A ``defaultdict(list)`` mapping each word to a list of Python floats.
    Looking up a missing word yields (and stores) an empty list.

    Side effects
    ------------
    Prints how long reading the file took.
    """
    word_embeddings = defaultdict(list)
    s = time.time()
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:
            entry = [x.strip() for x in line.split(' ')]
            word = entry[0]
            if not word:
                # Blank line: nothing to store.
                continue
            # Builtin float replaces the removed `np.float` alias; empty
            # tokens (trailing space before the newline) are skipped.
            word_embeddings[word] = [float(x) for x in entry[1:] if x]
    e = time.time()
    print('It took {} seconds to read in dataset {}'.format(e-s,filename))
    return word_embeddings

# Load the 'bow5.words_small' embedding file into a word -> vector mapping
# (see get_embeddings); `main` below reads this module-level name.
bow5_embeddings = get_embeddings('bow5.words_small')

It took 1.25 seconds to read in dataset bow5.words_small


In [88]:
# Build a word -> unit-length float32 vector mapping from the embedding file.
vectors = {}
with open('bow5.words_small') as f:
    for line in f:
        if not line.strip():
            # Blank lines cannot be split into (word, vector); skip them.
            continue
        word, vector = line.split(" ",1)
        v = np.fromstring(vector, sep=' ', dtype='float32')
        norm = np.linalg.norm(v)
        # Dividing an all-zero vector by its norm would fill it with NaN;
        # leave such vectors unnormalized instead.
        vectors[word] = v / norm if norm > 0 else v
    

In [52]:
# File name of the analogy question set. NOTE(review): the reader cells
# visible in this notebook open the same file name as a literal rather than
# through this constant.
analogy_file = "questions-words.txt"

In [None]:
# Read the Google analogy test set: one lower-cased list of tokens per
# question line. Lines whose first token is ":" are section headers and are
# skipped, as are blank lines.
test = []
with open('questions-words.txt') as f:
    for line in f:
        analogy = line.strip().lower().split()
        # Compare by value: `is not ":"` tests object identity, which only
        # works by accident of string interning (SyntaxWarning on 3.8+).
        if analogy and analogy[0] != ":":
            test.append(analogy)

In [95]:
def sorted_by_similarity(words, base_vector):
    """Rank `words` by cosine similarity to `base_vector`, best match first.

    Each element of `words` must expose a `.vector` attribute. The result is
    a list of `(similarity, word)` tuples ordered from the highest similarity
    (closest to 1) down to the lowest.
    """
    scored = []
    for candidate in words:
        score = cosine_similarity(base_vector, candidate.vector, None, None)
        scored.append((score, candidate))
    # Larger similarity means a closer match, hence the descending order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored

In [64]:
def closest_analogies(a0, a1, b0, words) :
    """Return candidate answers to the analogy "a0 is to a1 as b0 is to ?".

    The three query words are looked up in `words`, the target vector
    ``(a1 - a0) + b0`` is built, and the ten entries of `words` most similar
    to it are taken. Candidates whose text contains any of the query words
    (case-insensitively) are dropped as trivial answers.

    Relies on `find_word`, `add_vectors` and `sub_vectors`, which are not
    defined in the visible part of this notebook.

    Returns
    -------
    A list of `(similarity, word)` tuples, most similar first; possibly empty.
    """
    word_a1 = find_word(a1, words)
    word_a0 = find_word(a0, words)
    word_b0 = find_word(b0, words)
    vector = add_vectors(
        sub_vectors(word_a1.vector, word_a0.vector),
        word_b0.vector)
    closest = sorted_by_similarity(words, vector)[:10]

    def is_redundant(word):
        # A candidate that merely contains one of the query words
        # (e.g. "kings" for "king") is not an informative answer.
        return (
            a1.lower() in word.lower() or
            a0.lower() in word.lower() or
            b0.lower() in word.lower())

    # Previously the function stopped after computing `closest` and
    # implicitly returned None, which broke callers using len() on the result.
    return [(dist, w) for (dist, w) in closest if not is_redundant(w.text)]

In [87]:
def is_redundant(word, terms=None):
    """Tell whether `word` contains any of the analogy query terms.

    Parameters
    ----------
    word : candidate answer text.
    terms : optional iterable of query strings to check against. When
        omitted, the module-level names `a1`, `a0` and `b0` are used, which
        must then be strings.

    Returns
    -------
    True if any term occurs in `word` as a case-insensitive substring.
    """
    if terms is None:
        # Backward-compatible fallback to the notebook's global query words.
        terms = (a1, a0, b0)
    lowered = word.lower()
    # An unreachable second `return` (referencing an undefined `closest`)
    # used to follow this statement; it was dead code and has been removed.
    return any(term.lower() in lowered for term in terms)

In [72]:
def print_analogy(a0, a1, b0, words):
    """Print the best answer to "a0 is to a1 as b0 is to ?".

    Uses `closest_analogies` to obtain candidates and prints the first one,
    or a "?" placeholder when there are none. `words` is passed straight
    through to `closest_analogies`.
    """
    analogies = closest_analogies(a0, a1, b0, words)
    if (len(analogies) == 0):
        # The templates were plain strings before, so the placeholders were
        # printed literally; fill them in explicitly.
        print("{a0}-{a1} is like {b0}-?".format(a0=a0, a1=a1, b0=b0))
    else:
        (dist, w) = analogies[0]
        print("{a0}-{a1} is like {b0}-{w.text}".format(a0=a0, a1=a1, b0=b0, w=w))

In [None]:
def compare(b1_estimate, embeddings):
    """Return the word whose vector is most similar to `b1_estimate`.

    Parameters
    ----------
    b1_estimate : the estimated answer vector.
    embeddings : mapping from word to vector.

    Returns
    -------
    The key of `embeddings` with the highest cosine similarity to
    `b1_estimate` (the first such key on ties), or None when `embeddings`
    is empty.
    """
    # cosine_similarity is 1 for identical directions, so the best match is
    # the maximum. The previous code tracked the minimum and therefore
    # returned the least similar word.
    max_cos = float('-inf')
    best_word = None
    for word, word_vector in embeddings.items():
        cos_sim = cosine_similarity(b1_estimate, word_vector, None, None)
        if cos_sim > max_cos:
            max_cos = cos_sim
            best_word = word

    return best_word

In [None]:
# bow2_embeddings = get_embeddings('bow2.words')
# deps_embeddings = get_embeddings('deps.words')


In [None]:
import heapq
from heapq import heappush, heappushpop
from scipy.spatial.distance import cosine


In [89]:
# Read the Google analogy test set: one lower-cased list of tokens per
# question line. Lines whose first token is ":" are section headers and are
# skipped, as are blank lines.
test = []
with open('questions-words.txt') as f:
    for line in f:
        analogy = line.strip().lower().split()
        # Compare by value: `is not ":"` tests object identity, which only
        # works by accident of string interning (SyntaxWarning on 3.8+).
        if analogy and analogy[0] != ":":
            test.append(analogy)

In [90]:
# Stack the analogy questions into one array, one row per question.
test_array = np.array(list(map(np.array, test)))

In [91]:
# Display (number of questions, words per question).
test_array.shape

(19544L, 4L)

In [92]:
# Split the question rows into four parallel lists: position 0 -> a0,
# 1 -> a1, 2 -> b0, 3 -> b1.
a0, a1, b0, b1 = (
    [row[position] for row in test_array] for position in range(4)
)

In [None]:
def offset_vector(a0, a1, b0, b1=None):
    """Estimate the answer vector of the analogy "a0 : a1 :: b0 : ?".

    Parameters
    ----------
    a0, a1, b0 : vectors supporting elementwise `-` and `+`
        (e.g. numpy arrays).
    b1 : unused; accepted only so existing four-argument calls keep working.
        It is optional because the estimate does not depend on it and the
        caller in this notebook passes just three vectors.

    Returns
    -------
    ``(a1 - a0) + b0``.
    """
    b1_estimate = (a1 - a0) + b0
    return b1_estimate

In [None]:
def get_nearest(vectors, distance, N = 10):
    """Return the `N` entries of `vectors` with the smallest `distance`.

    Parameters
    ----------
    vectors : mapping from word to vector.
    distance : callable taking one vector and returning a comparable score,
        where smaller means nearer.
    N : maximum number of entries to keep.

    Returns
    -------
    A list of `(nearness, word)` tuples with ``nearness == -distance(vector)``,
    sorted in descending order (nearest first).
    """
    heap = []
    for word, u in vectors.items():
        entry = (-distance(u), word)
        # Fill the min-heap up to N entries; after that each push evicts the
        # current smallest nearness so only the N nearest survive.
        push = heapq.heappush if len(heap) < N else heapq.heappushpop
        push(heap, entry)
    heap.sort(reverse=True)
    return heap

In [93]:
def print_nearest(vectors, word):
    """Print `word` with its vector norm, then its nearest neighbours.

    Neighbours come from `get_nearest` (default count), ranked by scipy's
    cosine distance to the vector of `word`; each is printed with its
    nearness score.
    """
    query = vectors[word]
    print(word, np.linalg.norm(query))
    neighbours = get_nearest(vectors, lambda candidate: cosine(candidate, query))
    for score, neighbour in neighbours:
        print(neighbour, score)
        

In [None]:
def get_analogy(vectors, a0, a1, b0, distance):
    """Rank the words in `vectors` as answers to "a0 : a1 :: b0 : ?".

    Parameters
    ----------
    vectors : mapping from word to vector.
    a0, a1, b0 : vectors of the three query words.
    distance : callable invoked as ``distance(a0, a1, b0, b1)`` that returns
        the estimated answer vector (`offset_vector` in this notebook).

    Returns
    -------
    The `(nearness, word)` list produced by `get_nearest`, where nearness is
    the negated cosine distance between a candidate and the estimate.
    """
    def candidate_distance(b1):
        # The candidate vector must take part in the score. The previous
        # lambda ignored `b1`, so every word received the same value, which
        # was a vector rather than a scalar.
        target = distance(a0, a1, b0, b1)
        return cosine(b1, target)

    return get_nearest(vectors, candidate_distance)


In [None]:
def print_analogy(vectors, a0, a1, b0, distance=offset_vector):
    """Print the ranked answers to the analogy "a0 : a1 :: b0 : ?".

    `a0`, `a1` and `b0` are keys of `vectors`; their vectors are handed to
    `get_analogy` together with `distance`, and every returned
    `(nearness, word)` pair is printed as `word nearness`.
    """
    vec_a0, vec_a1, vec_b0 = vectors[a0], vectors[a1], vectors[b0]
    ranked = get_analogy(vectors, vec_a0, vec_a1, vec_b0, distance)
    for nearness, word in ranked:
        print(word, nearness)

In [None]:
def main():
    """Print the ranked answers for every analogy question in the test set.

    Reads the module-level `bow5_embeddings` and the parallel question lists
    `a0`, `a1` and `b0`. Questions containing a word without an embedding
    are skipped.
    """
    # The loaded embeddings are plain lists; the offset arithmetic needs
    # numpy arrays.
    vectors = {word: np.asarray(vec) for word, vec in bow5_embeddings.items()}
    # `a0`, `a1`, `b0` hold one entry per question. Passing the whole lists,
    # as before, used unhashable lists as dictionary keys.
    for word_a0, word_a1, word_b0 in zip(a0, a1, b0):
        if word_a0 not in vectors or word_a1 not in vectors or word_b0 not in vectors:
            continue
        print_analogy(vectors, word_a0, word_a1, word_b0, distance=offset_vector)

In [None]:
# Run the analogy evaluation only when this code executes as the top-level
# program rather than being imported.
if __name__ == '__main__':
    main()