In [1]:
import io
import numpy as np

from tqdm import tqdm
from itertools import islice


def load_vectors(fname, limit=None):
    """Load word vectors from a text ``.vec`` file into a dictionary.

    The first line of the file is a header holding exactly two integers
    (presumably the number of entries and the vector dimension, as in the
    fastText text format). Every following line is a token followed by its
    space-separated float components.

    Args:
        fname: Path of the vectors file to read.
        limit: Maximum number of vector lines to read after the header.
            ``None`` (the default) reads the file to the end.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float64`` numpy array,
        in file order. If a token occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If the header is not exactly two integers, if a
            component cannot be parsed as a float, or if ``limit`` is
            negative.
    """
    # The context manager closes the handle even if parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, _dim = map(int, fin.readline().split())
        # Cap the progress-bar total at the header count so the bar is not
        # overstated when the file holds fewer rows than `limit`.
        total = n if limit is None else min(limit, n)
        data = {}
        for line in tqdm(islice(fin, limit), total=total):
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array([float(t) for t in tokens[1:]], dtype=np.float64)
    return data

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -O crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip

--2022-05-25 10:22:14--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 188.114.99.144
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|188.114.99.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1,4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2022-05-25 10:23:45 (16,1 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  crawl-300d-2M.vec.zip
  inflating: crawl-300d-2M.vec       


In [3]:
# Read only the first 100000 vector lines of the file (load_vectors stops after `limit` lines).
vecs = load_vectors('crawl-300d-2M.vec', 100000)

100%|██████████| 100000/100000 [00:04<00:00, 22501.92it/s]


In [4]:
def get_k_nearest_neighbors(vec, k, vectors=None):
    """Return the ``k`` words whose vectors are closest to ``vec``.

    Closeness is the Euclidean (L2) distance ``np.linalg.norm(vec - other)``.
    Ties on distance are broken by comparing the words themselves. No word is
    excluded from the candidates, so a word used to build ``vec`` can appear
    in the result.

    Args:
        vec: Query vector; must be broadcast-compatible with the stored vectors.
        k: Number of neighbors to return (applied as the slice ``[:k]``).
        vectors: Optional mapping of word -> vector to search. Defaults to
            the notebook-level ``vecs`` dictionary.

    Returns:
        Tuple of words ordered from nearest to farthest; empty when there
        are no candidates.
    """
    if vectors is None:
        vectors = vecs
    # Sort (distance, word) pairs so equal distances fall back to word order.
    ranked = sorted(
        (np.linalg.norm(vec - candidate), word)
        for word, candidate in vectors.items()
    )
    return tuple(word for _, word in ranked[:k])

# Show the 20 vocabulary entries closest (by Euclidean distance) to two sample words.
# Each query word is itself in the vocabulary, so it comes back first.
print(get_k_nearest_neighbors(vecs['Paris'], 20))
print(get_k_nearest_neighbors(vecs['brother'], 20))

('Paris', 'France', 'Parisian', 'paris', 'Lyon', 'London', 'PARIS', 'French', 'Lille', 'Marseille', 'Toulouse', 'Bordeaux', 'Marseilles', 'Strasbourg', 'Berlin', 'Le', 'Versailles', 'Nantes', 'Brussels', 'Grenoble')
('brother', 'sister', 'cousin', 'brothers', 'brother-in-law', 'uncle', 'nephew', 'father', 'son', 'sister-in-law', 'aunt', 'sisters', 'daughter', 'niece', 'dad', 'cousins', 'Brother', 'mother', 'siblings', 'grandfather')


In [5]:
# Analogy by vector arithmetic: Paris - France + Germany, then the single nearest word.
get_k_nearest_neighbors(vecs['Paris'] - vecs['France'] + vecs['Germany'], 1)

('Berlin',)

In [6]:
# Analogy by vector arithmetic: brother - man + woman, then the single nearest word.
get_k_nearest_neighbors(vecs['brother'] - vecs['man'] + vecs['woman'], 1)

('sister',)

In [7]:
# Same arithmetic for king - man + woman, asking for the 5 nearest words.
# The operand words are not filtered out of the candidates, so 'king' itself
# can rank ahead of the analogy answer.
get_k_nearest_neighbors(vecs['king'] - vecs['man'] + vecs['woman'], 5)

('king', 'queen', 'King', 'kings', 'Queen')