# Install dependencies

In [None]:
!pip install annoy

In [None]:
!pip install spacy
!pip install --user spacy-transformers

## Download a model
The model is fairly large; the download can take ~30 min to complete.

In [None]:
!python -m spacy download en_trf_distilbertbaseuncased_lg

In [3]:
import spacy

# Name of the pretrained pipeline fetched by the download step.
MODEL_NAME = "en_trf_distilbertbaseuncased_lg"

# Load the pipeline once; later cells call `nlp(text).vector` to embed text.
nlp = spacy.load(MODEL_NAME)

# Read the file
Please unpack the `JEOPARDY_CSV.zip` file into the `datasets/nlp` folder.

In [4]:
import csv
import re

# Non-greedy match for anything that looks like a markup tag, e.g. <a href="...">.
tag_cleaner = re.compile('<.*?>')

# Each entry is a (question, answer) tuple with markup tags stripped.
dataset = []
# newline='' is what the csv module documentation asks for when passing a file
# object, so line breaks inside quoted fields are handled by the csv parser.
with open('datasets/nlp/JEOPARDY_CSV.csv', encoding='utf8', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        dataset.append((
            tag_cleaner.sub('', row['Question']),
            tag_cleaner.sub('', row['Answer']),
        ))

In [5]:
# Sanity check: report how many rows were loaded, then show a small slice.
total_rows = len(dataset)
print(total_rows)
dataset[100:105]

216930


[('This dog breed seen here is a loyal and protective companion',
  'a German Shepherd'),
 ("Say the name of this bug; don't worry, it doesn't breathe fire",
  'the dragonfly'),
 ('(Jimmy of the Clue Crew demonstrates, putting his arm over his mouth.) To avoid spreading germs & maybe flu, we learned the sneeze named for this character made famous in an 1897 book',
  'Dracula'),
 ('Maize is another word for this', 'corn'),
 ('Of the 6 noble gases on the periodic table, it is the lightest', 'helium')]

# Create embeddings
Either choose a smaller subset, or use the already prepared index in the `Annoy` section below.

It took ~40 min on a CPU-only machine to embed 10K items.

In [21]:
import tqdm

# Number of leading dataset rows to embed. Tune this part of the dataset to fit
# your time budget; say, take 1000 items for a quick run.
SHARD_SIZE = 10000

# Flat list of (vector, dataset_row_index) pairs: two entries per row, the
# question vector first and then the answer vector, both tagged with the same index.
vectors = []
for i, (question, answer) in enumerate(tqdm.tqdm(dataset[:SHARD_SIZE])):
    question_vec, answer_vec = nlp(question).vector, nlp(answer).vector
    vectors.append((question_vec, i))
    vectors.append((answer_vec, i))

100%|██████████| 10000/10000 [36:56<00:00,  4.51it/s] 


## Save vectors

In [23]:
import pickle

VECTORS_PATH = 'datasets/nlp/jeopardy.pickle'

# Persist the computed (vector, row index) pairs so the slow embedding step
# does not have to be repeated.
with open(VECTORS_PATH, 'wb') as sink:
    pickle.dump(vectors, sink)

# Read them straight back. Only unpickle files you produced yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(VECTORS_PATH, 'rb') as source:
    vectors = pickle.load(source)

# Annoy

In [24]:
from annoy import AnnoyIndex

# Index parameters.
n_dimensions = len(vectors[0][0])  # length of the first stored vector
n_trees = 50
n_neighbours = 10

index = AnnoyIndex(n_dimensions, 'angular')
# NOTE(review): `vectors` holds two entries per dataset row (question and answer)
# under the same row id, so the second add_item call for an id presumably
# replaces the first — confirm against the Annoy documentation.
for embedding, row_id in vectors:
    index.add_item(row_id, embedding)

index.build(n_trees)
index.save('datasets/nlp/jeopardy.annoy')

# Use these lines to load index from repository
# index = AnnoyIndex(n_dimensions, 'angular')
# index.load('datasets/nlp/jeopardy.annoy')

True

In [25]:
def get_nearest_texts(query, index, dataset, n_neighbours=5):
    """Return the dataset entries closest to ``query`` according to ``index``.

    Args:
        query: Text to search for; embedded with the module-level ``nlp`` pipeline.
        index: Object exposing ``get_nns_by_vector(vector, n)`` that returns item ids
            (e.g. the Annoy index built in this notebook).
        dataset: Sequence indexed by the ids returned from ``index``.
        n_neighbours: Number of neighbours requested from the index.

    Returns:
        List of ``dataset`` entries in the order the index returned their ids,
        with repeated ids dropped.
    """
    # Embed the `query` argument itself rather than reading a global loop variable.
    neighbour_ids = index.get_nns_by_vector(nlp(query).vector, n_neighbours)
    # dict.fromkeys de-duplicates like set() would, but keeps the index's ordering.
    return [dataset[item_id] for item_id in dict.fromkeys(neighbour_ids)]

In [26]:
import time

queries = ["Horses", "Actors", "life facts"]

# Time each lookup and print the matched question/answer pairs.
for q in queries:
    started = time.time()
    matches = get_nearest_texts(q, index, dataset)
    elapsed = time.time() - started
    print("Query:", q, "time:", elapsed)
    for match in matches:
        print("\tQ:", match[0])
        print("\tA:", match[1])

Query: Horses time: 0.3620262145996094
	Q: Sometimes called "Irish Cobs" or "Gypsy Cobs", Irish Tinkers are a type of this animal
	A: horse
	Q: Jordan & Bird hit "nothing but net" playing this shot-for-shot basketball game in 1990s TV ads for McDonald's
	A: HORSE
	Q: A long-standing tradition in France, hippophagy is the consumption of this
	A: horse
	Q: Trainers shout, "Tail Up!" when they want these performers to follow each other trunk to tail
	A: Elephants
	Q: Lincoln once said not to "swap" these "while crossing a stream"
	A: horses
Query: Actors time: 0.0500028133392334
	Q: Examples of this TV format include "Leave It to Beaver" & "The King of Queens"
	A: sitcom
	Q: Oliver Wendell Holmes said not to falsely yell "Fire" in one of these, where 850 Viennese died Dec. 8, 1881
	A: Theater
	Q: Kangaroos, monkeys & Boy Scouts all come in these groups
	A: troops
	Q: Crowds flock to Dodona, Philippi & Thassos to see festivals of this art performed in ancient venues
	A: Theater
	Q: Joshua


# TODO
Implement exhaustive (brute-force) search and compare its speed and result quality against the Annoy index.