In [None]:
# potential code/framework to reproduce simlm-base-msmarco-finetuned

# Steps for getting a search model running:
# - create embeddings for each passage 
# - make model specific index

# Thoughts / Questions
# - How would the logistics of our own model work if each model needs its own kind of index?
#    - Lots of overhead...
# - Look into location-based search?
#
# - add specific tokens into the passage embeddings

In [None]:
!pip install pyserini transformers torch tqdm

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
from tqdm import tqdm
import numpy as np

# Checkpoint used for both the tokenizer and the encoder weights
model_name = "intfloat/simlm-base-msmarco-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Put the encoder in inference mode right away; eval() returns the module itself
model = AutoModel.from_pretrained(model_name).eval()

# Function to encode passages into dense embeddings
def encode_passage(passage):
    """Encode a piece of text into a dense embedding with the loaded model.

    The text is tokenized (truncated to at most 256 tokens), passed through
    ``model`` with gradient tracking disabled, and the hidden state at the
    first token position ([CLS]) is used as the embedding.

    Args:
        passage: Text to embed (a passage, or a query when reused for search).

    Returns:
        A numpy array holding the [CLS] hidden state with singleton
        dimensions squeezed out (1-D for a single input string).
    """
    # padding=True pads only up to the longest sequence in this call, so a
    # single text is no longer padded out to 256 tokens. The padded positions
    # were masked out by the attention mask anyway, so the embedding is
    # unchanged up to floating-point rounding while the forward pass is cheaper.
    inputs = tokenizer(passage, return_tensors="pt", max_length=256, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        # Use [CLS] token output as the embedding
        embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return embedding

# *** CHECK PATH *** : Path to the MS MARCO passage collection
# Expected format: one passage per line, "<doc_id>\t<passage text>".
passage_file = "msmarco-passage-collection.tsv"

# Encode passages
embeddings = []
doc_ids = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(passage_file, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(tqdm(f), start=1):
        line = line.rstrip("\r\n")
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpacking.
            continue
        # Split on the FIRST tab only, so a passage that itself contains a
        # tab stays intact instead of breaking the two-field unpacking.
        doc_id, separator, passage = line.partition("\t")
        if not separator:
            raise ValueError(
                f"{passage_file}:{line_number}: expected '<doc_id>\\t<passage>' but found no tab"
            )
        embeddings.append(encode_passage(passage.strip()))
        doc_ids.append(doc_id.strip())

# Save embeddings and doc_ids as numpy files (row i of embeddings.npy belongs
# to entry i of doc_ids.npy)
np.save("embeddings.npy", np.array(embeddings))
np.save("doc_ids.npy", np.array(doc_ids))


In [None]:
# Import Pyserini
# NOTE(review): the original cell imported `pyserini.index.DenseIndexer`, which
# does not appear to be part of Pyserini's public API. The index is written
# with `FaissRepresentationWriter` instead, which produces a flat
# inner-product FAISS index plus a docid list -- confirm against the installed
# Pyserini version.
from pyserini.encode import FaissRepresentationWriter
from pyserini.index import IndexReader

index_path = "msmarco_dense_index"  # Directory the index is written to

# Inputs are the arrays written by the encoding step via np.save. (The
# original cell read "msmarco_dense_embeddings.json", a file that no step in
# this notebook produces.)
embeddings_file = "embeddings.npy"
doc_ids_file = "doc_ids.npy"

# Memory-map the embedding matrix so the whole collection does not have to
# fit in RAM at once; rows are read from disk batch by batch below.
passage_embeddings = np.load(embeddings_file, mmap_mode="r")
passage_ids = np.load(doc_ids_file)

if passage_embeddings.ndim != 2 or passage_embeddings.shape[0] != len(passage_ids):
    raise ValueError(
        f"{embeddings_file} has shape {passage_embeddings.shape}, which does not match "
        f"the {len(passage_ids)} ids in {doc_ids_file}"
    )

# Initialize the index writer with the embedding dimensionality of the data
indexer = FaissRepresentationWriter(index_path, dimension=passage_embeddings.shape[1])

# Add encoded passages to the index in batches. Leaving the `with` block
# finalizes the index and writes it to disk.
batch_size = 10_000
with indexer:
    for start in tqdm(range(0, len(passage_ids), batch_size), desc="Indexing embeddings"):
        stop = start + batch_size
        indexer.write({
            "id": passage_ids[start:stop].tolist(),
            # FAISS expects a C-contiguous float32 matrix.
            "vector": np.ascontiguousarray(passage_embeddings[start:stop], dtype=np.float32),
        })

print("Dense index created and saved to", index_path)

In [None]:
# Retrieval

In [None]:
# NOTE(review): `SimpleDenseSearcher` is the legacy name of Pyserini's dense
# searcher and does not appear to be exported from `pyserini.search`; the
# current class is `FaissSearcher`. Confirm against the installed version.
from pyserini.search.faiss import FaissSearcher, QueryEncoder


class SimLMQueryEncoder(QueryEncoder):
    """Query encoder that embeds queries with this notebook's `encode_passage`.

    FaissSearcher expects a QueryEncoder instance (or the name of a built-in
    encoder) rather than None, so this thin adapter supplies one.
    """

    def encode(self, query):
        return encode_passage(query)


# Load the custom dense index
# (expects `index_path` to be a directory in Pyserini's FAISS index layout)
searcher = FaissSearcher(index_path, SimLMQueryEncoder())

# Encode a query
query = "What is machine learning?"
query_vector = encode_passage(query)  # Reuse the encode_passage function

# Perform the search
# FAISS searches a 2-D batch of queries, so the single embedding is reshaped
# from (dim,) to (1, dim) before it is handed to the searcher.
top_k = 10
hits = searcher.search(query_vector.reshape(1, -1), k=top_k)

# Display results
print(f"Top {top_k} Results for Query: '{query}'")
for rank, hit in enumerate(hits, start=1):
    print(f"{rank}: {hit.docid} (score: {hit.score})")