In [5]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

  from tqdm.autonotebook import tqdm


In [6]:
#### Download scifact.zip dataset and unzip the dataset
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path('__file__').parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

2024-12-08 10:04:40 - Downloading scifact.zip ...


c:\vs_project\beir\self\datasets\scifact.zip: 100%|██████████| 2.69M/2.69M [00:06<00:00, 448kiB/s]


2024-12-08 10:04:48 - Unzipping scifact.zip ...


In [23]:
#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2024-12-08 10:11:12 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 153612.43it/s]

2024-12-08 10:11:12 - Loaded 5183 TEST Documents.
2024-12-08 10:11:12 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers




In [26]:
queries

{'1': '0-dimensional biomaterials show inductive properties.',
 '3': '1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.',
 '5': '1/2000 in UK have abnormal PrP positivity.',
 '13': '5% of perinatal mortality is due to low birth weight.',
 '36': 'A deficiency of vitamin B12 increases blood levels of homocysteine.',
 '42': 'A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.',
 '48': 'A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.',
 '49': 'ADAR1 binds to Dicer to cleave pre-miRNA.',
 '50': 'AIRE is expressed in some skin tumors.',
 '51': 'ALDH1 expression is associated with better breast cancer outcomes.',
 '53': 'ALDH1 expression is associated with poorer prognosis in breast cancer.',
 '54': 'AMP-activated protein kinase (AMPK) activation increases inflammation-related fibrosis in the lung

In [80]:
corpus
first_corpus = {'31715818': corpus['31715818'], '14717500': corpus['14717500']}
print(first_corpus)

{'31715818': {'text': 'Nanotechnologies are emerging platforms that could be useful in measuring, understanding, and manipulating stem cells. Examples include magnetic nanoparticles and quantum dots for stem cell labeling and in vivo tracking; nanoparticles, carbon nanotubes, and polyplexes for the intracellular delivery of genes/oligonucleotides and protein/peptides; and engineered nanometer-scale scaffolds for stem cell differentiation and transplantation. This review examines the use of nanotechnologies for stem cell tracking, differentiation, and transplantation. We further discuss their utility and the potential concerns regarding their cytotoxicity.', 'title': 'New opportunities: the use of nanotechnologies to manipulate and track stem cells.'}, '14717500': {'text': 'Genome-wide association studies (GWAS) have now identified at least 2,000 common variants that appear associated with common diseases or related traits (http://www.genome.gov/gwastudies), hundreds of which have been 

In [79]:
queries
first_queries = {list(queries.keys())[0]: list(queries.values())[0], list(queries.keys())[1]: list(queries.values())[1]}
print(first_queries)

{'1': '0-dimensional biomaterials show inductive properties.', '3': '1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.'}


In [83]:
#### Load the SBERT model and retrieve using cosine-similarity
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="cos_sim") # or "cos_sim" for cosine similarity
results = retriever.retrieve(first_corpus, first_queries)

2024-12-08 10:53:18 - Use pytorch device_name: cpu
2024-12-08 10:53:18 - Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b
2024-12-08 10:53:22 - Encoding Queries...


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.95it/s]


2024-12-08 10:53:22 - Sorting Corpus by document length (Longest first)...
2024-12-08 10:53:22 - Scoring Function: Cosine Similarity (cos_sim)
2024-12-08 10:53:22 - Encoding Batch 1/1...


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]


In [84]:
results

{'1': {'14717500': 0.7089304327964783, '31715818': 0.7242918610572815},
 '3': {'31715818': 0.7258668541908264, '14717500': 0.7888635993003845}}

In [None]:
#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)