In [None]:
!pip install mteb==1.1.1 datasets beir sentence_transformers

In [None]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from sentence_transformers import SentenceTransformer
from mteb.abstasks.AbsTaskRetrieval import DRESModel

import pandas as pd
import logging
import pathlib, os

In [None]:
# Set your Hugging Face token here, or leave it empty to keep whatever
# HF_TOKEN is already exported in the environment.
HF_TOKEN = ""

# Only write the variable when a token was actually provided; assigning an
# empty string unconditionally would overwrite a token that is already set.
if HF_TOKEN:
  os.environ['HF_TOKEN'] = HF_TOKEN

# CQADupStack

Download dataset if necessary

In [None]:
# Flip to True to fetch the CQADupStack archive; leave it False when the
# dataset is already available under ./datasets.
DOWNLOAD_DATASET = False

if DOWNLOAD_DATASET:
  # Configure logging (INFO level, timestamped messages) using beir's LoggingHandler
  logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
  )

  # Download the cqadupstack archive and unzip it into the "datasets" folder
  # located in the current working directory
  dataset = "cqadupstack"
  url_template = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
  url = url_template.format(dataset)
  out_dir = os.path.join(pathlib.Path('.').absolute(), "datasets")
  data_path = util.download_and_unzip(url, out_dir)

## Benchmark: Sort Questions into Bins by Word Count

In [None]:
# Accumulator for the benchmark: one row per (model, subset, word-count bin).
benchmark_results = pd.DataFrame(
    columns=['model_name', 'subset', 'bin_label', 'NDCG@10', 'count']
)

# CQADupStack sub-forums to evaluate; each one is loaded from
# datasets/cqadupstack/<subset>.
subsets = [
    "android", "english", "gaming", "gis",
    "mathematica", "physics", "programmers", "stats",
    "tex", "unix", "webmasters", "wordpress",
]

# Hugging Face model identifiers of the embedding models to compare.
model_names = ["thenlper/gte-base", "jinaai/jina-embeddings-v2-base-en"]

In [None]:
# Word-count bin edges. Every bin is half-open, [start, end), so that it agrees
# with its "start-(end-1)" label: a 20-word document belongs to "20-49".
# Documents with 0 words or with 2000+ words fall outside all bins.
bins = [1, 20, 50, 100, 200, 500, 1000, 2000]
bin_labels = [f"{bins[i]}-{bins[i+1]-1}" for i in range(len(bins)-1)]

# Create the output folder up front so a missing directory cannot fail the
# run after the (expensive) retrieval has already finished.
output_dir = 'results/retrieval/cqadupstack'
os.makedirs(output_dir, exist_ok=True)

# Result rows are collected in a plain list and converted to a DataFrame once
# per model (DataFrame.append was removed in pandas 2.0 and is quadratic).
# Rows already present in benchmark_results are kept.
result_columns = list(benchmark_results.columns)
benchmark_records = benchmark_results.to_dict('records')

for model_name in model_names:
  print(f"Benchmarking {model_name}")

  # Set model and retriever
  # NOTE(review): trust_remote_code=True allows code shipped with the model
  # repository to run locally; only use it with models you trust.
  model = SentenceTransformer(model_name, trust_remote_code=True)
  model = DRESModel(model)
  model = DRES(model, batch_size=1, corpus_chunk_size=50000)
  retriever = EvaluateRetrieval(model, score_function="cos_sim")

  for subset in subsets:
    print(f"Benchmarking {subset}")

    # Load dataset
    corpus, queries, qrels = GenericDataLoader(data_folder=f"datasets/cqadupstack/{subset}").load(split="test")

    # Create dataframe to calculate word count and assign bins.
    # right=False makes the intervals left-closed so they match bin_labels.
    corpus_df = pd.DataFrame.from_dict(corpus, orient='index')
    corpus_df['word_count'] = corpus_df['text'].apply(lambda x: len(x.split()))
    corpus_df['bin'] = pd.cut(corpus_df['word_count'], bins=bins, labels=bin_labels, right=False)

    # Plain dict lookup (doc id -> bin label, NaN when outside every bin)
    # instead of a DataFrame .loc access per relation and per bin.
    doc_bin = corpus_df['bin'].to_dict()

    # Retrieve results
    results = retriever.retrieve(corpus, queries)

    # Single pass over the relevance judgements: a query belongs to a bin as
    # soon as one of its judged documents falls into that bin. Documents that
    # are missing from the corpus or outside every bin are ignored.
    queries_by_bin = {label: set() for label in bin_labels}
    for query_id, judged_docs in qrels.items():
      for doc_id in judged_docs:
        label = doc_bin.get(doc_id)
        if label in queries_by_bin:
          queries_by_bin[label].add(query_id)

    for bin_label in bin_labels:
      # Keep the full judgement dict of every query assigned to this bin
      bin_qrels = {query_id: qrels[query_id] for query_id in qrels if query_id in queries_by_bin[bin_label]}

      # Skip this bin if bin_qrels is empty
      if len(bin_qrels) == 0:
        continue

      # Evaluate results for current bin
      ndcg, _map, recall, precision = retriever.evaluate(bin_qrels, results, retriever.k_values)

      # Save NDCG@10 together with the number of queries in the bin
      benchmark_records.append({
        'model_name': model_name,
        'subset': subset,
        'bin_label': bin_label,
        'NDCG@10': ndcg['NDCG@10'],
        'count': len(bin_qrels),
      })

  # NOTE(review): benchmark_results holds the rows of every model processed so
  # far, so each per-model CSV also contains the rows of the earlier models.
  benchmark_results = pd.DataFrame(benchmark_records, columns=result_columns)
  csv_name = f'benchmark_results_{model_name.replace("/", "_")}.csv'
  benchmark_results.to_csv(os.path.join(output_dir, csv_name), index=False)