# Explore Word Similarity with Embeddings

## Setup

In [9]:
import os
import numpy as np

In [10]:
# Every artifact used by this notebook lives under one workspace directory.
WORKSPACE = './workspace'

# Tab-separated embeddings file and the index file, both inside WORKSPACE.
embeddings_file_path, index_file_path = (
    os.path.join(WORKSPACE, file_name)
    for file_name in ('embeddings.tsv', 'embed-ann.index')
)

## 1. Similarity Function

In [66]:
def calculate_consine_similarty(emb1, emb2):
    """Return the cosine similarity between two embedding vectors.

    The similarity is the dot product of the vectors divided by the product
    of their Euclidean norms.

    Args:
        emb1: First vector (any sequence accepted by ``np.dot``).
        emb2: Second vector, same length as ``emb1``.

    Returns:
        The cosine similarity as a float. When either vector has zero norm
        the cosine is undefined; 0.0 is returned in that case instead of
        dividing by zero and producing ``nan``.
    """
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    # Guard the division: a zero-norm vector would otherwise yield nan (and a
    # RuntimeWarning), which cannot be ordered meaningfully against real scores.
    if norm_product == 0:
        return 0.0
    return np.dot(emb1, emb2) / norm_product
    

## 2. Load Embeddings

In [67]:
def load_embeddings(embedding_file_path):
    """Read a tab-separated embeddings file into a word -> vector dictionary.

    Each non-blank line is expected to hold a word in the first column
    followed by the components of its embedding, all separated by tabs.

    Args:
        embedding_file_path: Path of the tab-separated file to read.

    Returns:
        A dict mapping each word to its embedding as a list of floats. If a
        word occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an embedding component cannot be parsed as a float.
    """
    embedding_lookup = {}
    # Open the path given by the caller (the parameter), not a notebook-level
    # global that merely has a similar name.
    with open(embedding_file_path) as embedding_file:
        for line in embedding_file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no embedding and must not become dictionary keys.
            if not line.strip():
                continue
            word, *values = line.rstrip('\r\n').split('\t')
            embedding_lookup[word] = [float(value) for value in values]
    return embedding_lookup
            
        

In [68]:
# Load every embedding from the TSV file into an in-memory dictionary, then
# display how many words the lookup contains.
embedding_lookup = load_embeddings(embeddings_file_path)
len(embedding_lookup)

4618

## 3. Get top similar words

In [69]:
from bisect import insort

def top_similar(word, k):
    """Return the ``k`` entries of ``embedding_lookup`` most similar to ``word``.

    The query word is lower-cased before it is looked up, and it is compared
    against every entry of the lookup, including itself.

    Args:
        word: Query word; its lower-cased form must be a key of
            ``embedding_lookup``.
        k: Number of results to keep.

    Returns:
        A list of ``(similarity, word)`` tuples sorted from most to least
        similar, truncated to the first ``k`` items.

    Raises:
        KeyError: If the lower-cased query word is not in ``embedding_lookup``.
    """
    input_word_embedding = embedding_lookup[word.lower()]

    # Score every candidate once, then sort a single time. Inserting each
    # score in order and re-sorting the whole list afterwards did the same
    # work twice, and each ordered insertion shifts list elements.
    scored = [
        (calculate_consine_similarty(input_word_embedding, embedding), candidate)
        for candidate, embedding in embedding_lookup.items()
    ]
    scored.sort(reverse=True)
    return scored[:k]
    

In [78]:
# Query word for the demo; top_similar lower-cases it before the lookup.
word = 'king'
# Keep the 15 best (similarity, word) pairs; the query word itself is among
# the candidates that get scored.
result = top_similar(word, 15)
result

[(1.0, 'king'),
 (0.4778072240400091, 'judah'),
 (0.4692095892725038, 'saying'),
 (0.43612269988519925, 'son'),
 (0.4358324544812592, 'went'),
 (0.43537897702967954, 'lord'),
 (0.42042494116771106, 'him'),
 (0.4204041092515079, 'men'),
 (0.4152552449098259, 'unto'),
 (0.40475038561214477, 'israel'),
 (0.4041881388350648, 'added'),
 (0.39528777470274484, 'sent'),
 (0.39497872833724684, 'thus'),
 (0.39201217758599044, 'hand'),
 (0.3838387243464861, 'pass')]