In [2]:
import pickle
import numpy as np

# Input artifacts, both stored in the same directory (paths are relative).
JIGSAW_DIR = "../data/jigsaw"
ft_compiled_path = JIGSAW_DIR + "/ft_compiled.npy"  # Embeddings generated from the vocabulary
data_vocab_path = JIGSAW_DIR + "/data_vocab.bin"  # Vocabulary object, read with pickle below

In [3]:
# Load the saved embedding array; later cells index its rows by
# main-vocabulary token ID.
fasttext_embeds = np.load(ft_compiled_path)

In [4]:
# Load the pickled vocabulary object. The context manager closes the file
# handle as soon as unpickling finishes, instead of leaving an open handle
# behind for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(data_vocab_path, 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [5]:
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Pretrained tokenizer whose `.vocab` mapping supplies the BERT token list
# used in the following cells.
BERT_MODEL_NAME = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL_NAME,
    do_lower_case=True,
)

In [6]:
# Token strings known to BERT (the keys of the tokenizer's vocab mapping).
bert_vocab_toks = bert_tokenizer.vocab.keys()
# Distinct token strings of the main vocabulary. Only the values of the
# index -> token mapping are needed, so the indices are never unpacked.
vocab_toks = set(vocab.get_index_to_token_vocabulary().values())
# Cell output: (number of main-vocabulary tokens, number of BERT tokens).
(len(vocab_toks), len(bert_vocab_toks))

(305140, 30522)

In [7]:
# Look up every BERT token in the *main* vocabulary. The resulting IDs are
# main-vocabulary IDs, NOT IDs in the BERT vocab. Only IDs greater than 1
# are kept.
# NOTE(review): presumably indices 0 and 1 are the padding / unknown-token
# entries, so the filter drops BERT tokens absent from the main vocabulary
# -- confirm against the vocabulary class.
bert_vocab_ids = np.array(
    [main_id for main_id in map(vocab.get_token_index, bert_vocab_toks) if main_id > 1]
)

In [8]:
# Shape of the embedding rows selected by the main-vocabulary IDs of the
# BERT tokens (one row per retained token).
fasttext_embeds[bert_vocab_ids].shape

(22778, 300)

In [9]:
import nmslib, time

# Index-construction settings. `num_threads` is also reused by the query
# cell further down.
M = 25
efC = 200
num_threads = 0

index_time_params = dict(M=M, indexThreadQty=num_threads, efConstruction=efC, post=0)
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 25, 'indexThreadQty': 0, 'efConstruction': 200, 'post': 0}


In [10]:
# The space name must match the one used for any brute-force search the
# results are compared against.
space_name = 'cosinesimil'

# Initialize the library (HNSW method, dense vectors, the space chosen
# above), then add one data point per retained BERT token, keyed by that
# token's main-vocabulary ID.
index = nmslib.init(
    space=space_name,
    method='hnsw',
    data_type=nmslib.DataType.DENSE_VECTOR,
)
index.addDataPointBatch(fasttext_embeds[bert_vocab_ids], bert_vocab_ids)

22778

In [11]:
# Build the index and report the wall-clock construction time.
# Note: index_time_params is rebound here (this time without a 'post' key);
# this dict is the one actually handed to createIndex.
build_started = time.time()
index_time_params = dict(M=M, indexThreadQty=num_threads, efConstruction=efC)
index.createIndex(index_time_params)
build_finished = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (build_finished - build_started))

Index-time parameters {'M': 25, 'indexThreadQty': 0, 'efConstruction': 200}
Indexing time = 5.127838


In [12]:
# Query-time settings: the efSearch value applied to the index, and K, the
# number of neighbours requested per query in the query cell.
efS = 1000
K = 10
query_time_params = dict(efSearch=efS)
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 1000}


In [13]:
# Build a one-row query matrix from the embedding of a single token.
QUERY_TOKEN = 'doin'
tok_id = vocab.get_token_index(QUERY_TOKEN)
query_arr = [fasttext_embeds[tok_id]]
query_matrix = np.array(query_arr)
# K is assigned again here with the same value as in the query-parameter cell.
K = 10
# Cell output: (query matrix shape, main-vocabulary ID of the query token).
(query_matrix.shape, tok_id)

((1, 300), 15741)

In [14]:
# Run the batched k-NN query and report wall-clock timings.
query_qty = query_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)
end = time.time()
elapsed = end - start
# NOTE(review): the last figure is multiplied by num_threads, so it is
# always 0 while num_threads is 0 and carries no information in that case.
timing_report = 'kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)'
print(timing_report %
      (elapsed, float(elapsed)/query_qty, num_threads*float(elapsed)/query_qty))

kNN time total=0.006099 (sec), per query=0.006099 (sec), per query adjusted for thread number=0.000000 (sec)


In [15]:
# Result for the first (and only) query: element 0 holds the neighbour IDs
# (main-vocabulary IDs, as supplied when the data points were added);
# element 1 presumably holds the matching distances in the chosen space
# -- confirm against the nmslib documentation.
nbrs[0][0], nbrs[0][1]

(array([15741, 28278,  8782, 26640, 30965, 15506,  2943,  2653, 23717,
         6678], dtype=int32),
 array([1.7881393e-07, 3.3468032e-01, 3.4271854e-01, 3.4451097e-01,
        3.5699421e-01, 3.6550194e-01, 3.8245523e-01, 3.8343334e-01,
        4.0046120e-01, 4.2676270e-01], dtype=float32))

In [17]:
# Translate the neighbour IDs of the first query back into token strings.
# The IDs are main-vocabulary IDs (the index was keyed by them), so the
# main vocabulary is the right lookup table.
neighbor_ids = nbrs[0][0]
for main_vocab_id in neighbor_ids:
    print(vocab.get_token_from_index(main_vocab_id))

doin
somethin
gonna
nothin
gotta
goin
alright
wanna
boogie
daddy
