In [1]:
import os
# Enforces CPU-only execution of torch
# (CUDA_VISIBLE_DEVICES is set to the empty string, i.e. no device ids are listed).
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Configure environment to ensure single-threaded execution.
# NOTE(review): these variables are assigned before `import torch` below,
# presumably so the MKL / NumExpr / OpenMP runtimes see them when they are
# first loaded — keep this ordering when editing the cell.
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"]= "1"
os.environ["OMP_NUM_THREADS"] = "1"

import torch
# Additionally cap torch's own thread count at one.
torch.set_num_threads(1)

In [2]:
from xtr.datasets import BEIR, BEIRDataset, LoTTE, LoTTEDataset
from xtr.config import XTRConfig, XTRModel, XTRScaNNIndexConfig, XTRBruteForceIndexConfig, XTRFAISSIndexConfig
from xtr.utils import xtr_tracker, canonical_index_name
from xtr.modeling.xtr import XTR

import json
from datetime import datetime

# How many times the full experiment sweep is repeated by the driver loop.
NUM_RUNS_PER_EXPERIMENT = 3

  from tqdm.autonotebook import tqdm


In [3]:
def current_time_str():
    """Return the current local time as a ``YYYY-MM-DD-HH-MM-SS`` string."""
    timestamp_format = '%Y-%m-%d-%H-%M-%S'
    now = datetime.today()
    return now.strftime(timestamp_format)

def xtr_eval_latency(dataset, index_config, document_top_k, token_top_k):
    """Retrieve documents for ``dataset.queries`` with XTR on CPU and evaluate them.

    The index name is derived from ``dataset`` and ``index_config`` via
    ``canonical_index_name``; the XTR configuration is created with
    ``override=False``.

    Returns:
        A ``(tracker, metrics)`` tuple: the tracker that was handed to
        ``retrieve_docs`` and the value of ``dataset.eval`` on the rankings.
    """
    name = canonical_index_name(dataset=dataset, index_config=index_config)
    xtr_config = XTRConfig(
        index_name=name,
        model=XTRModel.BASE_EN,
        index_config=index_config,
        override=False,
    )
    retriever = XTR(
        config=xtr_config,
        collection=dataset.collection,
        device=torch.device("cpu"),
    )
    latency_tracker = xtr_tracker(name=name)
    ranked = retriever.retrieve_docs(
        dataset.queries,
        document_top_k=document_top_k,
        token_top_k=token_top_k,
        tracker=latency_tracker,
    )
    return latency_tracker, dataset.eval(ranked)

def xtr_run_configuration(dataset, index_config, document_top_k, token_top_k):
    """Evaluate a single configuration and bundle the outcome into one record.

    Returns:
        A dict with three entries: ``"config"`` (dataset name, index name and
        the two top-k settings), ``"metrics"`` (the evaluation result) and
        ``"tracker"`` (the tracker converted with ``as_dict()``).
    """
    latency_tracker, eval_metrics = xtr_eval_latency(
        dataset, index_config, document_top_k, token_top_k
    )
    record = {}
    record["config"] = {
        "dataset": dataset.name,
        "index": index_config.name,
        "document_top_k": document_top_k,
        "token_top_k": token_top_k,
    }
    record["metrics"] = eval_metrics
    record["tracker"] = latency_tracker.as_dict()
    return record

def xtr_run_configurations(datasets, index_configs, document_top_k, token_top_k_values, label):
    """Run every (dataset, index config, token_top_k) combination and checkpoint results.

    Results are accumulated in a list and written to
    ``results/run_<label>_<timestamp>.json`` after each finished configuration,
    so an aborted sweep keeps everything completed so far.

    Args:
        datasets: Iterable of datasets to evaluate.
        index_configs: Iterable of index configurations to evaluate.
        document_top_k: Document cut-off forwarded to every run.
        token_top_k_values: Iterable of ``token_top_k`` settings to sweep.
        label: Tag embedded in the output file name.

    Returns:
        None. The results are only persisted to disk.
    """
    ctime = current_time_str()
    os.makedirs("results", exist_ok=True)
    filename = os.path.join("results", f"run_{label}_{ctime}.json")
    results = []
    for dataset in datasets:
        for index_config in index_configs:
            for token_top_k in token_top_k_values:
                results.append(xtr_run_configuration(dataset, index_config, document_top_k=document_top_k, token_top_k=token_top_k))
                # Checkpoint after every configuration. Writing in place would
                # truncate the previous checkpoint first, so an interrupt or a
                # serialization error mid-dump would lose all earlier results.
                _write_json_atomically(filename, results)


def _write_json_atomically(path, payload):
    """Serialize ``payload`` as JSON to ``path`` without ever exposing a partial file."""
    partial_path = f"{path}.tmp"
    try:
        with open(partial_path, "w") as file:
            json.dump(payload, file)
    except BaseException:
        # Includes KeyboardInterrupt: drop the half-written temp file and
        # leave the last good checkpoint at `path` untouched.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    # Swap the fully written file into place in a single rename.
    os.replace(partial_path, path)

In [4]:
# ScaNN sweep: three test splits, one index configuration, two token_top_k settings.
DATASETS = [BEIRDataset(dataset=BEIR.SCIFACT, datasplit="test"),
            LoTTEDataset(dataset=LoTTE.LIFESTYLE, datasplit="test"),
            LoTTEDataset(dataset=LoTTE.TECHNOLOGY, datasplit="test")]
INDEX_CONFIGS = [XTRScaNNIndexConfig()]
# 1,000 and 40,000 retrieved tokens per query token. The second value must be
# grouped as 40_000: a literal spelled `40_0000` evaluates to 400,000.
# NOTE(review): 40,000 matches the k' setting reported for XTR — confirm this
# is the intended sweep.
TOKEN_TOP_K_VALUES = [1_000, 40_000]

# Repeat the whole sweep; each call writes its own timestamped results file.
for _ in range(NUM_RUNS_PER_EXPERIMENT):
    xtr_run_configurations(datasets=DATASETS, index_configs=INDEX_CONFIGS,
                           document_top_k=100, token_top_k_values=TOKEN_TOP_K_VALUES, label="scann")

  0%|          | 0/5183 [00:00<?, ?it/s]

#> Preparing corpus for BEIR BEIR.SCIFACT/test


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:00<00:00, 569607.69it/s]


Loading existing index from /future/u/scheerer/home/data/xtr-eval/indexes/BEIR.SCIFACT.split=test.XTRIndexType.SCANN.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:58<00:00,  5.15it/s]


Loading existing index from /future/u/scheerer/home/data/xtr-eval/indexes/BEIR.SCIFACT.split=test.XTRIndexType.SCANN.


  7%|██████▋                                                                                              | 20/300 [03:42<51:53, 11.12s/it]


KeyboardInterrupt: 