Notebook for retrieving the top-k embedding-based expansion terms for the Kuzi centroid method.

In [None]:
from typing import Dict, List, Optional, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def load_embeddings(path: str) -> Tuple[Dict[str, int], List[str], np.ndarray]:
    """Load whitespace-separated text embeddings from *path*.

    The first line of the file is always discarded (presumably a
    word2vec-style ``<count> <dim>`` header -- confirm for your files).
    Every following non-blank line is parsed as ``<term> <v1> <v2> ...``.

    Args:
        path: Location of the embedding text file; read as UTF-8.

    Returns:
        A ``(lookup, vocab, embs)`` tuple where ``vocab[i]`` is the term read
        from the i-th data line, ``lookup[term]`` is that row index, and
        ``embs`` is a float64 array whose i-th row is the vector of
        ``vocab[i]``. If a term occurs more than once, ``lookup`` keeps the
        index of its last occurrence.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    vocab: List[str] = []
    lookup: Dict[str, int] = {}
    vecs: List[np.ndarray] = []
    # Explicit encoding so non-ASCII terms load identically on every platform.
    with open(path, "r", encoding="utf-8") as f:
        f.readline()  # skip the first (header) line

        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank / whitespace-only lines (e.g. a trailing newline)
                # carry no term; skip them instead of failing on [0].
                continue
            term = split_line[0]
            # The next row index is simply the number of terms seen so far.
            lookup[term] = len(vocab)
            vocab.append(term)
            vecs.append(np.array(split_line[1:], dtype=np.float64))

    return lookup, vocab, np.asarray(vecs, dtype=np.float64)

In [None]:
# Read the embedding file once; the three results are reused by the cells below.
emb_path = '../embeddings/filtered-100d.vec'
lookup, vocab, embs = load_embeddings(path=emb_path)

In [None]:
# Query terms: the string is split on whitespace into a list of tokens.
query = "damages defamation compensate amongst other things emotional distress".split()
# Alternative query; uncomment to use it instead of the one above.
# query = "maintenance champerty requisite degree control".split()

In [None]:
def get_embedding(term: str, lookup: Dict[str, int], embs: np.ndarray) -> np.ndarray:
    """Return the row of *embs* that *lookup* associates with *term*.

    Args:
        term: The term to look up.
        lookup: Mapping from term to row index in *embs*.
        embs: Embedding matrix indexed by the values of *lookup*.

    Returns:
        ``embs[lookup[term]]``.

    Raises:
        KeyError: If *term* is not a key of *lookup*. The missing term is the
            exception's only argument.
    """
    # A single dict lookup; a missing term surfaces as KeyError(term), which
    # is still an Exception subclass for callers that catch broadly.
    return embs[lookup[term]]

# Add up the embeddings of all query terms, element-wise, into one vector.
centroid = np.asarray(
    [get_embedding(term, lookup, embs) for term in query]
).sum(axis=0)
centroid.shape

In [None]:
def print_top(vec: np.ndarray, embs: np.ndarray, vocab: List[str], k: int = 20, sim: Optional[int] = None) -> None:
    """Print the *k* rows of *embs* most cosine-similar to *vec*.

    Each printed line is ``<row index> <vocab term>``. Rows are printed in
    ascending order of similarity, so the most similar term comes last.

    Args:
        vec: Query vector; it is reshaped to a single row before comparison.
        embs: Matrix whose rows are compared against *vec*.
        vocab: Terms indexed by row position in *embs*.
        k: Number of top rows to print. Values ``<= 0`` print no rows.
        sim: Optional row index into *embs*; when given, the similarity
            between *vec* and that row is printed before the ranking.
    """
    # cosine_similarity returns a (1, n_rows) matrix here; keep the one row.
    similarities = cosine_similarity(vec.reshape(1, -1), embs)[0]
    if sim is not None:
        print(similarities[sim])
    if k <= 0:
        # Guard: a slice of [-0:] would select every row rather than none.
        return
    ranked = np.argsort(similarities)  # ascending similarity
    for ind in ranked[-k:]:
        print(ind, vocab[ind])

In [None]:
# Show the terms closest to the query centroid (default k).
print_top(vec=centroid, embs=embs, vocab=vocab)

In [None]:
# Neighbours of the single term 'damages'; also prints its similarity to
# row 415 (hard-coded index -- check vocab[415] to see which term that is).
print_top(
    get_embedding('damages', lookup, embs),
    embs,
    vocab,
    sim=415,
)