**Model Initialization**

In [35]:
from sentence_transformers import SentenceTransformer, util
import torch
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
# `device` stays a plain string because later cells compare it to 'cuda'.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

# Load the pretrained sentence encoder, then move it to the selected device.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embedder = embedder.to(device)

cuda


In [36]:
import csv

# Loading MSMARCO passages from file.
# Every row must unpack into exactly three tab-separated fields: the first is
# stored as the passage id, the second is the passage text, the third is unused.
# The dict is keyed on the text, so a repeated passage keeps only its last id.
psg_dict = dict()
# newline='' lets the csv module (not the file object) interpret line endings,
# as the csv documentation requires for files handed to csv.reader.
with open('msmarco_dpr_dev_context.csv', 'r', encoding='UTF-8', newline='') as context_file:
  reader = csv.reader(context_file, delimiter='\t')
  for pid, psg, _ in reader:
    psg_dict[psg] = pid
# Unique passage texts in first-seen order; a passage's position in this list
# is the row number it gets when the embeddings are added to the FAISS index.
passages = list(psg_dict.keys())

# Loading MSMARCO queries from file.
# Every row must unpack into exactly three tab-separated fields: the first is
# the query text, the second is stored as the query id, the third is unused.
# The dict is keyed on the text, so a repeated query keeps only its last id.
query_dict = dict()
# newline='' lets the csv module (not the file object) interpret line endings,
# as the csv documentation requires for files handed to csv.reader.
with open('msmarco_dpr_dev_mult_query.csv', 'r', encoding='UTF-8', newline='') as query_file:
  reader = csv.reader(query_file, delimiter='\t')
  for que, qid, _ in reader:
    query_dict[que] = qid
# Unique query texts in first-seen order.
queries = list(query_dict.keys())

In [37]:
# Encode Passage Embeddings: embed every passage, tracking no gradients,
# and ask for a torch tensor back rather than a NumPy array.
with torch.no_grad():
  passage_embeddings = embedder.encode(
    passages,
    convert_to_tensor=True,
    show_progress_bar=True,
  )

# When the encoder ran on the GPU, pull the result back to the CPU so the
# later `.numpy()` conversion can work on it.
if device == 'cuda':
  passage_embeddings = passage_embeddings.detach().cpu()

Batches: 100%|██████████| 1847/1847 [00:19<00:00, 95.55it/s] 


In [38]:
# Embed the queries with the same encoder used for the passages.
with torch.no_grad():
  query_embeddings = embedder.encode(
    queries,
    convert_to_tensor=True,
    show_progress_bar=True,
  )

# Same GPU -> CPU hand-off as for the passage embeddings.
if device == 'cuda':
  query_embeddings = query_embeddings.detach().cpu()

# Normalize both sets of embeddings with the sentence-transformers helper;
# the index built later ranks by inner product.
norm_passage_embeddings = util.normalize_embeddings(passage_embeddings)
norm_query_embeddings = util.normalize_embeddings(query_embeddings)

Batches: 100%|██████████| 11/11 [00:00<00:00, 215.64it/s]


In [39]:
import faiss
import numpy as np

In [40]:
# Retrieval settings.
topk = 5          # hits requested per query
n_clusters = 256  # number of inverted lists (coarse centroids) in the IVF index
embedding_size = norm_passage_embeddings.shape[1]  # width of one embedding row

# Building FAISS Index: a flat inner-product quantizer assigns vectors to
# clusters, and the IVF index stores the uncompressed vectors per cluster.
quantizer = faiss.IndexFlatIP(embedding_size)
index = faiss.IndexIVFFlat(
  quantizer,
  embedding_size,
  n_clusters,
  faiss.METRIC_INNER_PRODUCT,
)
# NOTE(review): nprobe is never set, so FAISS's default applies at search
# time -- confirm that is the intended speed/recall trade-off.

In [41]:
# Convert the normalized torch tensors to NumPy arrays for FAISS.
# The names are rebound in place, so guard on the type: without the check a
# second run of this cell would call `.numpy()` on an ndarray and raise
# AttributeError.
if isinstance(norm_passage_embeddings, torch.Tensor):
  norm_passage_embeddings = norm_passage_embeddings.numpy()
if isinstance(norm_query_embeddings, torch.Tensor):
  norm_query_embeddings = norm_query_embeddings.numpy()

In [42]:
# Training FAISS Index. (Clustering)
# Drop any vectors stored by an earlier run of this cell first. Without the
# reset, re-running would append every passage again and the ids returned by
# search would no longer match positions in `passages`. On a freshly built
# index the reset is a no-op.
index.reset()
# Learn the cluster centroids from the passage vectors, then store the vectors.
# Ids are assigned sequentially, so id i corresponds to passages[i].
index.train(norm_passage_embeddings)
index.add(norm_passage_embeddings)

In [43]:
# Run every query through the index in one batched call.
# `scores` holds the inner-product similarities and `psg_ids` the matching
# row ids in the index, one row of `topk` entries per query.
scores, psg_ids = index.search(
  norm_query_embeddings,
  topk,
)

In [44]:
# Show the retrieved passages for each query.
# `psg_ids` has one row per query, in the same order as `queries`.
for query, hit_ids in zip(queries, psg_ids):
  print("\n\n======================\n\n")
  print("Query:", query)
  print("Query ID:", query_dict[query])
  # Use `topk` so the header stays correct if the search depth is changed.
  print(f"\nTop {topk} most similar sentences in corpus:")
  print(len(passages))
  for pid in hit_ids:
    # FAISS fills missing results with id -1 when it finds fewer than `topk`
    # hits; indexing `passages[-1]` would silently print the last passage.
    if pid < 0:
      continue
    # `pid` is the row position in the index (and in `passages`), not the id
    # read from the passage file; that one is psg_dict[passages[pid]].
    print("Passage ID:", pid)
    print("Passage:", passages[pid])





Query: how many brothers and sisters did martin luther king have
Query ID: 275688

Top 5 most similar sentences in corpus:
59096
Passage ID: 145
Passage: Martin Luther King Jr. had two siblings. These were Christine KingFarris and Alfred Daniel Williams King. He had four children.
Passage ID: 22404
Passage: Louis has 6 half-sisters. Charlotte (Lottie) Tomlinson, Georgia Austin, Felicite (Fizzy) Tomlinson, and the twins: Phoebe and Daisy Tomlinson. *Georgia is Louis's sister from his biological dad's side. His newest sister,Doris, is a twin of half-brother Ernest, born February 12, 2014.
Passage ID: 23703
Passage: Best Answer: Adam Levine has one brother, Michael (44); a stepsister, Julia (19); and two half siblings, Sam (11) and Liza (13) i'm not sure if these are correct, but thats the info i got... Source(s): James Â· 8 years ago.
Passage ID: 2731
Passage: The sibling tag questions are the series of questions that you must ask your sisters or brothers to know how much they unders