In [1]:
import os
import json
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
from beir.datasets.data_loader import GenericDataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# All locations are resolved relative to the directory one level above the
# kernel's current working directory.
project_root = Path.cwd().parent

# Results are written here; the folder is created on first run and reused afterwards.
output_dir = project_root / "outputs"
output_dir.mkdir(exist_ok=True)

# Folder handed to the BEIR loader in the dataset-loading cell.
data_path = project_root.joinpath("datasets", "scifact")

In [7]:
# Read the "test" split from data_path; load() returns the corpus, the queries
# and the relevance judgements (qrels) in that order.
print("Loading SciFact dataset...")
scifact_loader = GenericDataLoader(data_folder=str(data_path))
corpus, queries, qrels = scifact_loader.load(split="test")

Loading SciFact dataset...


100%|██████████| 5183/5183 [00:00<00:00, 345388.18it/s]


In [3]:
# Sentence encoder used for both documents and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Report the vector width the model produces; the index-building cell checks
# the embedding matrix against this same value.
embedding_dimension = model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embedding_dimension}")

Embedding dimension: 384


In [8]:
print(f"Embedding {len(corpus)} documents...")

# Freeze the document order once: row i of corpus_embeddings belongs to
# corpus_ids[i], and later cells rely on that alignment to map hits back to ids.
corpus_ids = list(corpus)

# One string per document: title and body separated by a single space, with
# surrounding whitespace stripped (covers documents missing either field).
corpus_texts = [
    " ".join((doc.get("title", ""), doc.get("text", ""))).strip()
    for doc in map(corpus.__getitem__, corpus_ids)
]

# Encode the whole corpus and cast to float32 in one step.
corpus_embeddings = model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
).astype('float32')

Embedding 5183 documents...


Batches: 100%|██████████| 162/162 [00:02<00:00, 67.14it/s]


In [14]:
# Take the vector width from the embedding matrix and cross-check it against
# what the model reports before sizing the index.
dimension = corpus_embeddings.shape[1]
expected_dimension = model.get_sentence_embedding_dimension()
assert dimension == expected_dimension

# FAISS flat inner-product index; it is empty until vectors are added.
index = faiss.IndexFlatIP(dimension)

In [None]:
# The index scores by inner product, so unit-normalising the vectors turns
# those scores into cosine similarities. normalize_L2 modifies
# corpus_embeddings in place.
faiss.normalize_L2(corpus_embeddings)

# index.add() appends. Without the reset, re-running this cell would store the
# corpus twice, so searches could return duplicate hits and row ids that are
# out of range for corpus_ids. Clearing first makes the cell idempotent.
index.reset()
index.add(corpus_embeddings)

# Row i of the index must correspond to row i of corpus_embeddings.
assert index.ntotal == corpus_embeddings.shape[0], (
    f"index holds {index.ntotal} vectors, expected {corpus_embeddings.shape[0]}"
)


In [17]:
# Number of hits requested per query (single source of truth for the message
# and the search call).
top_k = 100

# Searching an empty index returns only padding ids (-1); fail loudly instead,
# since that means the cell that adds the corpus vectors has not been run.
if index.ntotal == 0:
    raise RuntimeError(
        "FAISS index is empty - run the cell that adds corpus_embeddings first"
    )

# FAISS pads with id -1 when k exceeds the number of stored vectors, so never
# ask for more hits than the index can supply.
search_k = min(top_k, index.ntotal)

print(f"Retrieving top {top_k} for {len(queries)} queries")

# Freeze the query order: row i of scores/indices belongs to query_ids[i].
query_ids = list(queries.keys())
query_texts = [queries[q_id] for q_id in query_ids]

query_embeddings = model.encode(query_texts, convert_to_numpy=True).astype('float32')
# Unit-normalise in place, as is done for the corpus vectors, so the
# inner-product scores are cosine similarities.
faiss.normalize_L2(query_embeddings)

# scores[i, j] is the similarity of query i to its j-th hit; indices[i, j] is
# that hit's row number in the index.
scores, indices = index.search(query_embeddings, k=search_k)


Retrieving top 100 for 300 queries


In [20]:
# Build {query_id: {doc_id: score}} from the raw FAISS output.
dense_results = {}
for q_id, q_scores, q_indices in zip(query_ids, scores, indices):
    dense_results[q_id] = {
        corpus_ids[idx]: float(score)  # plain float so json can serialise it
        for score, idx in zip(q_scores, q_indices)
        # FAISS marks "no result" slots with id -1; indexing corpus_ids with -1
        # would silently attribute the hit to the last document, so skip them.
        if idx >= 0
    }

# Persist the run under output_dir.
output_file = output_dir / "dense_results.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(dense_results, f)

print("Success")
    

Success
