In [2]:
download_path = "C:\\Users\\kade\\datasets\\beir"

from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Download scifact.zip dataset and unzip the dataset
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
#out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
out_dir = os.path.join(pathlib.Path(download_path).parent.absolute(), "datasets")

data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

#### Load the SBERT model and retrieve using cosine-similarity
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(corpus, queries)

#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

2024-07-11 23:58:03 - Downloading scifact.zip ...


C:\Users\kade\datasets\datasets\scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

2024-07-11 23:58:09 - Unzipping scifact.zip ...
2024-07-11 23:58:09 - Loading Corpus...


  0%|          | 0/5183 [00:00<?, ?it/s]

2024-07-11 23:58:09 - Loaded 5183 TEST Documents.
2024-07-11 23:58:09 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/2c01ce345cedea6d10c7fb148658a2bf51aa580b79655106fbc377417b421efa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1720994292&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMDk5NDI5Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9zZW50ZW5jZS10cmFuc2Zvcm1lcnMvbXNtYXJjby1kaXN0aWxiZXJ0LWJhc2UtdGFzLWIvMmMwMWNlMzQ1Y2VkZWE2ZDEwYzdmYjE0ODY1OGEyYmY1MWFhNTgwYjc5NjU1MTA2ZmJjMzc3NDE3YjQyMWVmYT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=QTG91XilhGJn1pqnejr0ZXZJ5EHt5Dq0YNjD1oVMD-fSNE-F51DDW%7Etm3uynURYsOcOaqb6woJc3nzrcvAxgHDARd8z7QDRKu2%7EZwv79vBYK5f23Zdcb-uDM9taMENKD6I5Yq-ED9vG2V%7ERJfau6vS2pBlspppTvYbOypXw7p1LNSPROd%7EbrYB70Y4z9uPGqZWgXk7bYM4MmNeBkI4uR15MFf1jw6yEJKjz0KDTWZZncF4QpExsvNYLyLA9wJJiHDWCh6DMlRl27K5P9iHRGptWz1Exo5qO5BUdjIFN5vrbz8

2024-07-12 00:03:17 - Error while downloading from https://cdn-lfs.huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/2c01ce345cedea6d10c7fb148658a2bf51aa580b79655106fbc377417b421efa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1720994292&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMDk5NDI5Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9zZW50ZW5jZS10cmFuc2Zvcm1lcnMvbXNtYXJjby1kaXN0aWxiZXJ0LWJhc2UtdGFzLWIvMmMwMWNlMzQ1Y2VkZWE2ZDEwYzdmYjE0ODY1OGEyYmY1MWFhNTgwYjc5NjU1MTA2ZmJjMzc3NDE3YjQyMWVmYT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=QTG91XilhGJn1pqnejr0ZXZJ5EHt5Dq0YNjD1oVMD-fSNE-F51DDW%7Etm3uynURYsOcOaqb6woJc3nzrcvAxgHDARd8z7QDRKu2%7EZwv79vBYK5f23Zdcb-uDM9taMENKD6I5Yq-ED9vG2V%7ERJfau6vS2pBlspppTvYbOypXw7p1LNSPROd%7EbrYB70Y4z9uPGqZWgXk7bYM4MmNeBkI4uR15MFf1jw6yEJKjz0KDTWZZncF4QpExsvNYLyLA9wJJiHDWCh6DMlRl27K5P9iHRGptW

model.safetensors:  67%|######7   | 178M/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2024-07-12 00:05:37 - Encoding Queries...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

2024-07-12 00:05:49 - Sorting Corpus by document length (Longest first)...
2024-07-12 00:05:50 - Scoring Function: Dot Product (dot)
2024-07-12 00:05:50 - Encoding Batch 1/1...


Batches:   0%|          | 0/324 [00:00<?, ?it/s]

2024-07-12 00:47:22 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-07-12 00:47:22 - 

2024-07-12 00:47:22 - NDCG@1: 0.5333
2024-07-12 00:47:22 - NDCG@3: 0.5990
2024-07-12 00:47:22 - NDCG@5: 0.6215
2024-07-12 00:47:22 - NDCG@10: 0.6428
2024-07-12 00:47:22 - NDCG@100: 0.6698
2024-07-12 00:47:22 - NDCG@1000: 0.6811
2024-07-12 00:47:22 - 

2024-07-12 00:47:22 - MAP@1: 0.5086
2024-07-12 00:47:22 - MAP@3: 0.5730
2024-07-12 00:47:22 - MAP@5: 0.5892
2024-07-12 00:47:22 - MAP@10: 0.5992
2024-07-12 00:47:22 - MAP@100: 0.6046
2024-07-12 00:47:22 - MAP@1000: 0.6049
2024-07-12 00:47:22 - 

2024-07-12 00:47:22 - Recall@1: 0.5086
2024-07-12 00:47:22 - Recall@3: 0.6473
2024-07-12 00:47:22 - Recall@5: 0.6998
2024-07-12 00:47:22 - Recall@10: 0.7615
2024-07-12 00:47:22 - Recall@100: 0.8910
2024-07-12 00:47:22 - Recall@1000: 0.9833
2024-07-12 00:47:22 - 

2024-07-12 00:47:22 - P@1: 0.5333
2024-07-12 00:47:22