In [119]:
from typing import Dict, List, Optional, Tuple

import numpy as np

def load_embeddings(path: str) -> Tuple[Dict[str, int], List[str], np.ndarray]:
    """Load word embeddings from a whitespace-separated text file.

    The first line of the file is treated as a header and discarded. Every
    following non-blank line must hold a token followed by its vector
    components, all separated by whitespace. Blank lines are ignored.

    Args:
        path: Location of the embeddings file. It is read as UTF-8 text.

    Returns:
        A ``(lookup, vocab, vectors)`` tuple where ``vocab[i]`` is the token
        stored on row ``i``, ``lookup[token]`` is that row index, and
        ``vectors`` is a ``float64`` array with one row per token. If a token
        occurs more than once, ``lookup`` keeps the index of its last
        occurrence while ``vocab`` and ``vectors`` keep every occurrence.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    vocab: List[str] = []
    lookup: Dict[str, int] = {}
    vecs: List[np.ndarray] = []

    # Explicit encoding so the result does not depend on the platform's
    # default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()  # discard the header line

        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file): there
                # is no token to index, so skip it instead of failing.
                continue
            token, components = fields[0], fields[1:]
            # The next free row index is simply the number of tokens so far.
            lookup[token] = len(vocab)
            vocab.append(token)
            vecs.append(np.array(components, dtype=np.float64))

    return lookup, vocab, np.asarray(vecs, dtype=np.float64)

# Location of the embeddings file, named once so it is easy to find and change.
EMBEDDINGS_PATH = '/home/danlocke/fastText/filtered-100d.vec'

# Token -> row index, row index -> token, and the embedding matrix itself.
lookup, vocab, embs = load_embeddings(EMBEDDINGS_PATH)

In [132]:
# Free-text query, split on whitespace into the list of terms to embed.
query_text = "damages defamation compensate amongst other things emotional distress"
query = query_text.split()
# Alternative query kept for experimentation:
# query = "maintenance champerty requisite degree control".split()

In [133]:
def get_embedding(term: str, lookup: Dict[str, int], embs: np.ndarray) -> np.ndarray:
    """Return the embedding row stored for ``term``.

    Args:
        term: Token to look up.
        lookup: Mapping from token to its row index in ``embs``.
        embs: Embedding matrix indexed by the values of ``lookup``.

    Returns:
        ``embs[lookup[term]]``.

    Raises:
        KeyError: If ``term`` is not a key of ``lookup``. The missing term is
            the exception's sole argument. ``KeyError`` is a subclass of
            ``Exception``, so handlers written for the generic exception
            this function used to raise keep working.
    """
    if term not in lookup:
        raise KeyError(term)

    return embs[lookup[term]]

# Stack one embedding row per query term, then add the rows together
# component-wise. Note this is a sum of the term vectors, not their mean.
query_vectors = np.asarray([get_embedding(term, lookup, embs) for term in query])
centroid = query_vectors.sum(axis=0)
centroid.shape

(100,)

In [134]:
from sklearn.metrics.pairwise import cosine_similarity

In [135]:
def print_top(vec: np.ndarray, embs: np.ndarray, vocab: List[str], k: int = 20, sim: Optional[int] = None) -> None:
    """Print the ``k`` rows of ``embs`` most cosine-similar to ``vec``.

    Each printed line is ``<row index> <token>``. Lines are emitted in
    ascending order of similarity, so the best match is printed last.

    Args:
        vec: Query vector; it is reshaped to a single row before comparison.
        embs: Matrix whose rows are compared against ``vec``.
        vocab: Token for each row of ``embs`` (``vocab[i]`` labels row ``i``).
        k: Number of matches to print. ``0`` prints no matches; a value
            larger than the number of rows prints every row.
        sim: Optional row index. When given, the similarity between ``vec``
            and that row is printed before the list of matches.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # cosine_similarity returns a (1, n_rows) matrix; keep its only row.
    similarities = cosine_similarity(vec.reshape(1, -1), embs)[0]
    if sim is not None:
        print(similarities[sim])

    order = np.argsort(similarities)
    # Slice from an explicit start offset: a plain ``order[-k:]`` would return
    # the whole array when k == 0 because ``-0`` is just ``0``.
    start = max(len(order) - k, 0)
    for ind in order[start:]:
        print(ind, vocab[ind])

In [136]:
# Nearest vocabulary entries to the summed query vector (best match last).
print_top(vec=centroid, embs=embs, vocab=vocab)

764 damage
32775 overcompensation
3692 defamation
46452 humiliations
3237 emotional
27856 defamations
57591 distresses
59536 insolation
27618 defaming
4460 compensate
14927 anguish
11564 reputational
34148 indignities
7704 humiliation
5053 feelings
415 damages
37343 compensatable
38347 compensations
21439 consolation
4018 distress


In [131]:
# Nearest vocabulary entries to the single term 'damages'. The row index for
# `sim` is looked up rather than hard-coded, so the self-similarity printed
# first stays correct whatever row the term occupies in the loaded file.
print_top(get_embedding('damages', lookup, embs), embs, vocab, sim=lookup['damages'])

1.0
3647 recoverable
3692 defamation
2210 quantum
32659 Compensatory
10486 meruit
50111 overcompensatory
10116 restitutionary
38347 compensations
4909 liquidated
22103 Exemplary
10095 unliquidated
37343 compensatable
727 compensation
1974 awarded
57671 compensator
5309 exemplary
5875 compensatory
8426 Damages
43867 amages
415 damages


In [92]:
# Spot check: show which token is stored at row 35202 of the vocabulary.
vocab[35202]

'automate'

415