# Install dependencies

In [None]:
!pip install annoy

In [None]:
!pip install spacy
!pip install --user spacy-transformers

## Download a model
The model is pretty heavy; the download will take ~30 min to complete.

In [None]:
!python -m spacy download en_trf_distilbertbaseuncased_lg

In [3]:
import spacy
# Load the model package downloaded in the previous step. `nlp` is used below
# to turn a text into a vector via `nlp(text).vector`.
nlp = spacy.load("en_trf_distilbertbaseuncased_lg")

# Read the file
Please unpack the `JEOPARDY_CSV.zip` file into the `datasets/nlp` folder.

In [4]:
import csv
import re

# Non-greedy pattern matching anything of the form <...>; used to strip
# markup-like tags from the question and answer texts.
tag_cleaner = re.compile('<.*?>')

# `dataset` is a list of (question, answer) tuples with tags removed.
# newline='' is what the csv module asks for when it is handed a file object:
# it lets the reader interpret line endings (including those embedded in
# quoted fields) itself.
with open('datasets/nlp/JEOPARDY_CSV.csv', encoding='utf8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    dataset = [
        (tag_cleaner.sub('', row['Question']), tag_cleaner.sub('', row['Answer']))
        for row in reader
    ]

In [5]:
# Number of (question, answer) pairs that were loaded.
print(len(dataset))
# Show a small slice of the loaded pairs as the cell output.
dataset[100:105]

216930


[('This dog breed seen here is a loyal and protective companion',
  'a German Shepherd'),
 ("Say the name of this bug; don't worry, it doesn't breathe fire",
  'the dragonfly'),
 ('(Jimmy of the Clue Crew demonstrates, putting his arm over his mouth.) To avoid spreading germs & maybe flu, we learned the sneeze named for this character made famous in an 1897 book',
  'Dracula'),
 ('Maize is another word for this', 'corn'),
 ('Of the 6 noble gases on the periodic table, it is the lightest', 'helium')]

# Create embeddings
Either choose a smaller subset, or load the already prepared index in the `Annoy` section below.

It took ~30 min on a CPU machine to process 10K items.

In [None]:
import tqdm

# Collected (embedding, dataset_row_index) pairs: two entries per row, one for
# the question text and one for the answer text.
vectors = []

# Only the first 10000 rows are embedded; shrink the slice (e.g. to 1000) to
# fit your time budget.
for i, (question, answer) in enumerate(tqdm.tqdm(dataset[:10000])):
    vectors.append((nlp(question).vector, i))
    vectors.append((nlp(answer).vector, i))

 54%|█████▍    | 5425/10000 [21:12<25:42,  2.97it/s]  

# Annoy

In [14]:
from annoy import AnnoyIndex

# Index settings. The vector size is taken from the first stored embedding.
n_trees = 50
n_neighbours = 10
n_dimensions = len(vectors[0][0])

index = AnnoyIndex(n_dimensions, 'angular')
# NOTE(review): each dataset row contributes two entries to `vectors` (question
# and answer) that share the same id. Annoy item ids are expected to be unique,
# so the second add_item for an id presumably replaces the first, which would
# leave only the answer vectors in the index. Confirm; if both should be
# searchable, give every vector its own id and map ids back to dataset rows.
for embedding, row_id in vectors:
    index.add_item(row_id, embedding)

index.build(n_trees)
# Uncomment to persist the built index, or to load a previously saved one
# instead of recomputing the embeddings:
# index.save('jeopardy.annoy')
# index = AnnoyIndex(n_dimensions, 'angular')
# index.load('jeopardy.annoy')

True

In [None]:
def get_nearest_texts(query, index, dataset, n_neighbours=5):
    """Return the dataset entries whose indexed vectors are closest to ``query``.

    The query text is embedded with the notebook-level ``nlp`` pipeline and the
    resulting vector is looked up in ``index``.

    Args:
        query: Text to search for.
        index: Object exposing ``get_nns_by_vector(vector, n)`` and returning
            item ids that are valid positions in ``dataset``.
        dataset: Sequence addressed by the ids stored in ``index``.
        n_neighbours: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``dataset`` entries without repeats, in the order the index
        returned their ids.
    """
    # Embed the `query` argument itself rather than any notebook-global variable.
    query_vector = nlp(query).vector
    neighbour_ids = index.get_nns_by_vector(query_vector, n_neighbours)
    # dict.fromkeys removes repeated ids while keeping the order produced by the
    # index; a set would discard that ranking.
    return [dataset[item_id] for item_id in dict.fromkeys(neighbour_ids)]

In [None]:
import time

# Example free-text queries to run against the index.
queries = ["Horses", "Actors", "life facts"]

for q in queries:
    # perf_counter is the clock intended for measuring short durations; it is
    # not affected by system clock adjustments the way time.time() is.
    started = time.perf_counter()
    result = get_nearest_texts(q, index, dataset)
    elapsed = time.perf_counter() - started
    print("Q:", q, "time:", elapsed)
    # One matching dataset entry per line, tab-indented under its query.
    for r in result:
        print("\t", r)

# TODO
Implement exhaustive (brute-force) nearest-neighbour search and compare its speed and result quality against the Annoy index.