## Basic Benchmarking using Chroma

#### Using custom functions

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from evals import evaluate
from utils import custom_results, run_mteb_default, run_mteb_openai, run_mteb_jina
import pandas as pd
import chromadb
import os
from dotenv import load_dotenv
from openai import OpenAI
from chromadb.utils import embedding_functions
from tqdm import tqdm
import mteb
import json
from openai_model import OpenAIEmbedder
from jina_model import JinaAIEmbedder
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
chroma_x_token = os.getenv('CHROMA_X_TOKEN')
chroma_tenant = os.getenv('CHROMA_TENANT')
jina_api_key = os.getenv('JINA_API_KEY')

#### cqadupstack-english

query <> text

In [22]:
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [27]:
jinaai_ef = embedding_functions.JinaEmbeddingFunction(
                api_key=jina_api_key,
                model_name="jina-embeddings-v3"
            )

In [28]:
openai_ef_small = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-small"
)

In [29]:
openai_ef_large = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-large"
)

In [163]:
chroma_client = chromadb.Client()

cqadupstack_minilm_title = chroma_client.get_or_create_collection(name="cqadupstack_minilm_title", embedding_function=default_ef)

In [3]:
cqadupstack_corpus_df = pd.read_parquet("datasets/cqadupstack_corpus_text.parquet")
cqadupstack_qrels_df = pd.read_parquet("datasets/cqadupstack_default_text.parquet")
cqadupstack_queries_df = pd.read_parquet("datasets/cqadupstack_queries.parquet")

In [5]:
len(cqadupstack_qrels_df)

3765

In [6]:
cqadupstack_qrels_df

Unnamed: 0,query-id,corpus-id,score
0,19399,102236,1.0
1,19399,91901,1.0
2,19399,177507,1.0
3,19399,80798,1.0
4,19399,112990,1.0
...,...,...,...
3760,107236,107238,1.0
3761,49404,85483,1.0
3762,20908,29680,1.0
3763,145631,167819,1.0


In [7]:
len(cqadupstack_queries_df)

1570

In [20]:
cqadupstack_queries = cqadupstack_queries_df["text"].tolist()
cqadupstack_queries_ids = cqadupstack_queries_df["_id"].tolist()

In [21]:
cqadupstack_text_docs = cqadupstack_corpus_df["text"].tolist()
cqadupstack_corpus_text_ids = cqadupstack_corpus_df["_id"].tolist()

In [33]:
chroma_client = chromadb.Client()

In [59]:
jina_collection = chroma_client.get_collection(name="CQADupstackEnglishRetrieval_jina-embeddings-v3-revised", embedding_function=jinaai_ef)

In [63]:
jina_collection.count()

33299

In [61]:
len(cqadupstack_text_docs)

40221

In [54]:
cqadupstack_corpus_text_ids[30200]

'133619'

In [55]:
cqadupstack_corpus_text_ids[30199]

'183064'

In [64]:
# jina
custom_results(
    chroma_client=chroma_client,
    qrels_df=cqadupstack_qrels_df,
    queries=cqadupstack_queries,
    query_ids=cqadupstack_queries_ids,
    corpus=cqadupstack_text_docs[33299:],
    corpus_ids=cqadupstack_corpus_text_ids[33299:],
    ef=jinaai_ef,
    dataset_name="CQADupstackEnglishRetrieval",
    model_name="jina-embeddings-v3-revised"
)

CQADupstackEnglishRetrieval_jina-embeddings-v3-revised collection created


Processing Batches: 100%|██████████| 70/70 [01:27<00:00,  1.25s/it]


embedding complete


Processing Batches: 100%|██████████| 16/16 [00:14<00:00,  1.09it/s]

evaluating





In [30]:
#openai small
custom_results(
    qrels_df=cqadupstack_qrels_df,
    queries=cqadupstack_queries,
    query_ids=cqadupstack_queries_ids,
    corpus=cqadupstack_text_docs,
    corpus_ids=cqadupstack_corpus_text_ids,
    ef=openai_ef_small,
    dataset_name="CQADupstackEnglishRetrieval",
    model_name="text-embedding-3-small"
)

CQADupstackEnglishRetrieval_text-embedding-3-small collection created


Processing Batches: 100%|██████████| 403/403 [07:05<00:00,  1.06s/it]


embedding complete


Processing Batches: 100%|██████████| 16/16 [00:13<00:00,  1.20it/s]

evaluating





In [31]:
#openai large
custom_results(
    qrels_df=cqadupstack_qrels_df,
    queries=cqadupstack_queries,
    query_ids=cqadupstack_queries_ids,
    corpus=cqadupstack_text_docs,
    corpus_ids=cqadupstack_corpus_text_ids,
    ef=openai_ef_large,
    dataset_name="CQADupstackEnglishRetrieval",
    model_name="text-embedding-3-large"
)

CQADupstackEnglishRetrieval_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 403/403 [07:50<00:00,  1.17s/it]


embedding complete


Processing Batches: 100%|██████████| 16/16 [00:14<00:00,  1.11it/s]

evaluating





In [65]:
run_mteb_openai(task_name="CQADupstackEnglishRetrieval", size="small")

Failed to extract metadata from model: 'OpenAIEmbedder' object has no attribute 'model_card_data'. Upgrading to sentence-transformers v3.0.0 or above is recommended.


In [66]:
run_mteb_openai(task_name="CQADupstackEnglishRetrieval", size="large")



In [67]:
run_mteb_jina("CQADupstackEnglishRetrieval")



In [173]:
cqadupstack_minilm_text = chroma_client.get_or_create_collection(
    name="cqadupstack_minilm", 
    embedding_function=default_ef,
    metadata={
        "hnsw:space": "cosine"
    }
)

In [184]:
cqadupstack_minilm_text.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [185]:
batch_size = 100

for i in tqdm(range(0, len(cqadupstack_text_docs), batch_size), desc="Processing Batches"):
    batch_documents = cqadupstack_text_docs[i:i + batch_size]
    batch_ids = cqadupstack_corpus_text_ids[i:i + batch_size]

    cqadupstack_minilm_text.add(
        documents=batch_documents,
        ids=batch_ids
    )

Processing Batches: 100%|██████████| 403/403 [34:37<00:00,  5.16s/it]


In [186]:
cqadupstack_minilm_text_results = dict()
cqadupstack_minilm_text_results["query-id"] = []
cqadupstack_minilm_text_results["corpus-id"] = []
cqadupstack_minilm_text_results["score"] = []

for i in tqdm(range(0, len(cqadupstack_queries), batch_size), desc="Processing Batches"):
    batch_documents = cqadupstack_queries[i:i + batch_size]
    batch_ids = cqadupstack_queries_ids[i:i + batch_size]

    cqadupstack_minilm_text_results["query-id"].extend(batch_ids)

    batch_result = cqadupstack_minilm_text.query(
        query_texts=batch_documents,
        n_results=10
    )

    scores = [[1 - item for item in sublist] for sublist in batch_result["distances"]]

    cqadupstack_minilm_text_results["corpus-id"].extend(batch_result["ids"])
    cqadupstack_minilm_text_results["score"].extend(scores)

print(f"Processed batch {i // batch_size + 1}/{(len(cqadupstack_text_docs) + batch_size - 1) // batch_size}")

cqadupstack_minilm_text_results_df = pd.DataFrame(cqadupstack_minilm_text_results)

cqadupstack_minilm_text_results_df.to_parquet("all_results/cqadupstack_minilm_text_results_df.parquet", engine="pyarrow", index=False)


Processing Batches: 100%|██████████| 16/16 [01:17<00:00,  4.86s/it]


Processed batch 16/403


OSError: Cannot save file into a non-existent directory: 'all_results'

In [None]:
cqadupstack_minilm_text_results_df.to_parquet("df_results/cqadupstack_minilm_text_results_df.parquet", engine="pyarrow", index=False)

parameters for custom evaluation

In [189]:
cqadupstack_qrels_text_dict = cqadupstack_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()
k_values = [1, 3, 5, 10]
# just use cqadupstack_minilm_text_results directly

  cqadupstack_qrels_text_dict = cqadupstack_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()


In [193]:
cqadupstack_qrels_text_dict = {
    qid: {doc_id: int(score) for doc_id, score in doc_dict.items()}
    for qid, doc_dict in cqadupstack_qrels_text_dict.items()
}


In [197]:
cqadupstack_minilm_text_results.keys()

dict_keys(['query-id', 'corpus-id', 'score'])

In [200]:
cqadupstack_minilm_text_results_dict = {}

for query_id, doc_ids, scores in zip(
    cqadupstack_minilm_text_results["query-id"],
    cqadupstack_minilm_text_results["corpus-id"],
    cqadupstack_minilm_text_results["score"],
):
    if query_id not in cqadupstack_minilm_text_results_dict:
        cqadupstack_minilm_text_results_dict[query_id] = {}

    for doc_id, score in zip(doc_ids, scores):
        cqadupstack_minilm_text_results_dict[query_id][doc_id] = score  # Ensure float

In [207]:
ndcg_cqadupstack_minilm, _map_cqadupstack_minilm, recall_cqadupstack_minilm, precision_cqadupstack_minilm, top_k_accuracy_cqadupstack_minilm = evaluate(
    qrels=cqadupstack_qrels_text_dict, 
    results=cqadupstack_minilm_text_results_dict, 
    k_values=k_values)

In [208]:
ndcg_cqadupstack_minilm

{'NDCG@1': 0.38025, 'NDCG@3': 0.40742, 'NDCG@5': 0.42999, 'NDCG@10': 0.44803}

In [210]:
_map_cqadupstack_minilm

{'MAP@1': 0.29252, 'MAP@3': 0.36016, 'MAP@5': 0.37953, 'MAP@10': 0.39099}

In [213]:
custom_results_cqadupstack_minilm = {
    "NDCG": ndcg_cqadupstack_minilm,
    "MAP": _map_cqadupstack_minilm,
    "Recall": recall_cqadupstack_minilm,
    "Precision": precision_cqadupstack_minilm,
    "Top-K Accuracy": top_k_accuracy_cqadupstack_minilm
}

with open("custom_results/CQADupstackEnglishRetrieval/all-MiniLM-L6-v2.json", "w") as f:
    json.dump(custom_results_cqadupstack_minilm, f, indent=4)

comparison with MTEB

In [148]:
model_name = "all-MiniLM-L6-v2"

model = mteb.get_model(model_name)
tasks = mteb.get_tasks(tasks=["CQADupstackEnglishRetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/CQADupstackEnglishRetrieval/{model_name}")

Failed to extract metadata from model: 401 Client Error. (Request ID: Root=1-679c0d9d-32c40f8353df94292b4d43a3;3b0ec723-6ffa-4299-81f8-32349df01bff)

Repository Not Found for url: https://huggingface.co/all-MiniLM-L6-v2/resolve/main/README.md.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Invalid username or password..
Loader not specified for model all-MiniLM-L6-v2, loading using sentence transformers.


Batches: 100%|██████████| 13/13 [00:01<00:00,  9.32it/s]
Batches: 100%|██████████| 315/315 [00:41<00:00,  7.65it/s]


#### text-embedding-3-small

In [232]:
openai_ef_small = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-small"
)

In [None]:
cqadupstack_openai_small = chroma_client.get_or_create_collection(
    name="cqadupstack_openai_small", 
    embedding_function=openai_ef_small,
    metadata={
        "hnsw:space": "cosine"
    }
)

In [None]:
cqadupstack_openai_small.add(
    documents=cqadupstack_minilm_text_docs[0],
    ids=cqadupstack_corpus_text_ids[0]
)

In [None]:
for i in tqdm(range(0, len(cqadupstack_minilm_text_docs), batch_size), desc="Processing Batches"):
    batch_documents = cqadupstack_minilm_text_docs[i:i + batch_size]
    batch_ids = cqadupstack_corpus_text_ids[i:i + batch_size]

    cqadupstack_openai_small.add(
        documents=batch_documents,
        ids=batch_ids
    )

In [None]:
cqadupstack_openai_small_results = dict()
cqadupstack_openai_small_results["query-id"] = []
cqadupstack_openai_small_results["corpus-id"] = []
cqadupstack_openai_small_results["score"] = []

for i in tqdm(range(0, len(cqadupstack_queries_text), batch_size), desc="Processing Batches"):
    batch_documents = cqadupstack_queries_text[i:i + batch_size]
    batch_ids = cqadupstack_queries_ids[i:i + batch_size]

    cqadupstack_openai_small_results["query-id"].extend(batch_ids)

    batch_result = cqadupstack_openai_small.query(
        query_texts=batch_documents,
        n_results=10
    )

    scores = [[1 - item for item in sublist] for sublist in batch_result["distances"]]

    cqadupstack_openai_small_results["corpus-id"].extend(batch_result["ids"])
    cqadupstack_openai_small_results["score"].extend(scores)

print(f"Processed batch {i // batch_size + 1}/{(len(cqadupstack_minilm_title_docs) + batch_size - 1) // batch_size}")

pd.DataFrame(cqadupstack_openai_small_results)

In [None]:
ndcg_cqadupstack_openai_small, _map_cqadupstack_openai_small, recall_cqadupstack_openai_small, precision_cqadupstack_openai_small, top_k_accuracy_cqadupstack_openai_small = evaluate(
    qrels=cqadupstack_qrels_text_dict, 
    results=cqadupstack_openai_small_results, 
    k_values=k_values)

In [None]:
model_name = "all-MiniLM-L6-v2"

model = mteb.get_model(model_name)
tasks = mteb.get_tasks(tasks=["CQADupstackEnglishRetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/CQADupstackEnglishRetrieval/{model_name}")

#### MedicalQA

In [8]:
medical_qa_corpus_df = pd.read_parquet("datasets/medical_qa_corpus.parquet")
medical_qa_qrels_df = pd.read_parquet("datasets/medical_qa_default.parquet")
medical_qa_queries_df = pd.read_parquet("datasets/medical_qa_queries.parquet")

In [6]:
medical_qa_queries = medical_qa_queries_df["text"].tolist()
medical_qa_queries_ids = medical_qa_queries_df["_id"].tolist()
medical_qa_docs = medical_qa_corpus_df["text"].tolist()
medical_qa_corpus_ids = medical_qa_corpus_df["_id"].tolist()

In [8]:
medical_qa_qrels_df.head()

Unnamed: 0,query-id,corpus-id,score
0,2d3e200a-8ddf-4062-a678-f5d0401c54ad,0377f7ec-d481-4bb2-a878-5d279a2a11fe,1.0
1,bf3b28a5-db6f-412d-96f1-d67d17bcaef8,2f249bba-2201-424f-8515-182cd5272cd9,1.0
2,2d3e200a-8ddf-4062-a678-f5d0401c54ad,935eb313-ccf5-4135-9de6-f6b9cfb6a8f5,1.0
3,ebb4ab78-ff5f-4e78-ac14-bb98760c2a0a,f28f93b1-0df2-47cf-9349-e85683402cf8,1.0
4,9fd75f16-1bd5-4451-af69-5885b7d5267a,8e253cda-fa64-4f77-ae17-e1d8774f96bb,1.0


In [9]:
dataset_name = "MedicalQA"
model_name = "jina-embeddings-v3"

In [16]:
jinaai_ef = embedding_functions.JinaEmbeddingFunction(
                api_key=jina_api_key,
                model_name="jina-embeddings-v3"
            )

In [18]:
custom_results(
    qrels_df=medical_qa_qrels_df,
    queries=medical_qa_queries,
    query_ids=medical_qa_queries_ids,
    corpus=medical_qa_docs,
    corpus_ids=medical_qa_corpus_ids,
    ef=jinaai_ef,
    dataset_name=dataset_name,
    model_name=model_name
)

MedicalQA_jina-embeddings-v3 collection created


Processing Batches: 100%|██████████| 21/21 [00:35<00:00,  1.71s/it]


embedding complete


Processing Batches: 100%|██████████| 19/19 [00:18<00:00,  1.04it/s]

evaluating





In [218]:
medical_qa_openai_small = chroma_client.get_or_create_collection(
    name="medicalqa_openai_small", 
    embedding_function=openai_ef_small,
    metadata={
        "hnsw:space": "cosine"
    }
)

In [219]:
medical_qa_openai_small.add(
    documents=medical_qa_docs[0],
    ids=medical_qa_corpus_ids[0]
)

In [220]:
medical_qa_openai_small.peek()

{'ids': ['0377f7ec-d481-4bb2-a878-5d279a2a11fe'],
 'embeddings': array([[ 0.03245638,  0.00600431,  0.04077287, ...,  0.02248395,
         -0.02963515,  0.04320158]]),
 'documents': ['LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.'],
 'uris': None,
 'data': None,
 'metadatas': [None],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [221]:
medical_qa_openai_small.delete(ids=['0377f7ec-d481-4bb2-a878-5d279a2a11fe'])

In [222]:
medical_qa_openai_small.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [223]:
batch_size = 100

for i in tqdm(range(0, len(medical_qa_docs), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_docs[i:i + batch_size]
    batch_ids = medical_qa_corpus_ids[i:i + batch_size]

    medical_qa_openai_small.add(
        documents=batch_documents,
        ids=batch_ids
    )

Processing Batches: 100%|██████████| 21/21 [00:33<00:00,  1.61s/it]


In [224]:
medical_qa_openai_small_results = dict()
medical_qa_openai_small_results["query-id"] = []
medical_qa_openai_small_results["corpus-id"] = []
medical_qa_openai_small_results["score"] = []

for i in tqdm(range(0, len(medical_qa_queries), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_queries[i:i + batch_size]
    batch_ids = medical_qa_queries_ids[i:i + batch_size]

    medical_qa_openai_small_results["query-id"].extend(batch_ids)

    batch_result = medical_qa_openai_small.query(
        query_texts=batch_documents,
        n_results=10
    )

    scores = [[1 - item for item in sublist] for sublist in batch_result["distances"]]

    medical_qa_openai_small_results["corpus-id"].extend(batch_result["ids"])
    medical_qa_openai_small_results["score"].extend(scores)

medical_qa_openai_small_results_df = pd.DataFrame(medical_qa_openai_small_results)

medical_qa_openai_small_results_df.to_parquet("df_results/medical_qa_openai_small_results_df.parquet", engine="pyarrow", index=False)


Processing Batches: 100%|██████████| 19/19 [00:21<00:00,  1.14s/it]


In [225]:
medical_qa_openai_small_results_df

Unnamed: 0,query-id,corpus-id,score
0,2d3e200a-8ddf-4062-a678-f5d0401c54ad,"[2f249bba-2201-424f-8515-182cd5272cd9, e273d4a...","[0.6244661808013916, 0.5691493153572083, 0.532..."
1,bf3b28a5-db6f-412d-96f1-d67d17bcaef8,"[2f249bba-2201-424f-8515-182cd5272cd9, 8e253cd...","[0.6741660237312317, 0.5143333673477173, 0.509..."
2,ebb4ab78-ff5f-4e78-ac14-bb98760c2a0a,"[2f249bba-2201-424f-8515-182cd5272cd9, e273d4a...","[0.5992770791053772, 0.5436180233955383, 0.534..."
3,9fd75f16-1bd5-4451-af69-5885b7d5267a,"[8e253cda-fa64-4f77-ae17-e1d8774f96bb, 2f249bb...","[0.6424867510795593, 0.5795997381210327, 0.528..."
4,0f303917-6e53-449c-8242-7ab82e8fb78f,"[e273d4a7-96e7-4d03-848c-7bf991820771, 2f249bb...","[0.6200345754623413, 0.6041606068611145, 0.581..."
...,...,...,...
1815,8e88d11c-e574-450a-ade4-c3f96756cb51,"[5345361d-b741-4428-befe-f5d86db1035c, 0c79e70...","[0.7634917497634888, 0.698533296585083, 0.6356..."
1816,8a105170-77a8-4706-8fbc-c33b65fe7a69,"[87f80156-6d4d-424f-b17a-4aad8d73d32d, 687829c...","[0.7524518966674805, 0.6602441668510437, 0.650..."
1817,2fdc3c5b-fdd7-4402-b5f2-edbadf83889d,"[6e249abc-2caf-489d-89c7-b3152c9a6dc6, 0c79e70...","[0.7308936715126038, 0.6427885293960571, 0.626..."
1818,2ac6e1b0-09b3-4457-93a4-933e1c72103b,"[6e249abc-2caf-489d-89c7-b3152c9a6dc6, 49e8250...","[0.7471401691436768, 0.6871441602706909, 0.681..."


In [226]:
medical_qa_openai_small_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()
k_values = [1, 3, 5, 10]

medical_qa_openai_small_dict = {
    qid: {doc_id: int(score) for doc_id, score in doc_dict.items()}
    for qid, doc_dict in medical_qa_openai_small_dict.items()
}

medical_qa_openai_small_results_dict = {}

for query_id, doc_ids, scores in zip(
    medical_qa_openai_small_results["query-id"],
    medical_qa_openai_small_results["corpus-id"],
    medical_qa_openai_small_results["score"],
):
    if query_id not in medical_qa_openai_small_results_dict:
        medical_qa_openai_small_results_dict[query_id] = {}

    for doc_id, score in zip(doc_ids, scores):
        medical_qa_openai_small_results_dict[query_id][doc_id] = score  # Ensure float


  medical_qa_openai_small_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()


In [228]:
ndcg_medical_qa_openai_small, _map_medical_qa_openai_small, recall_medical_qa_openai_small, precision_medical_qa_openai_small, top_k_accuracy_medical_qa_openai_small = evaluate(
    qrels=medical_qa_openai_small_dict, 
    results=medical_qa_openai_small_results_dict, 
    k_values=k_values)

custom_medical_qa_openai_small = {
    "NDCG": ndcg_medical_qa_openai_small,
    "MAP": _map_medical_qa_openai_small,
    "Recall": recall_medical_qa_openai_small,
    "Precision": precision_medical_qa_openai_small,
    "Top-K Accuracy": top_k_accuracy_medical_qa_openai_small
}

with open("custom_results/MedicalQA/text-embedding-3-small.json", "w") as f:
    json.dump(custom_medical_qa_openai_small, f, indent=4)

MTEB

In [243]:
model = OpenAIEmbedder(engine="text-embedding-3-small")
tasks = mteb.get_tasks(tasks=["MedicalQARetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/MedicalQARetrieval/text-embedding-3-small")



text-embedding-3-large

In [231]:
openai_ef_large = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-3-large"
)

In [233]:
medical_qa_openai_large = chroma_client.get_or_create_collection(
    name="medicalqa_openai_large", 
    embedding_function=openai_ef_large,
    metadata={
        "hnsw:space": "cosine"
    }
)

In [234]:
medical_qa_openai_large.add(
    documents=medical_qa_docs[0],
    ids=medical_qa_corpus_ids[0]
)

medical_qa_openai_large.peek()

{'ids': ['0377f7ec-d481-4bb2-a878-5d279a2a11fe'],
 'embeddings': array([[-0.05502633,  0.02624224, -0.00510723, ...,  0.00343915,
          0.00771381, -0.01212086]]),
 'documents': ['LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.'],
 'uris': None,
 'data': None,
 'metadatas': [None],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [235]:
medical_qa_openai_large.delete(ids=['0377f7ec-d481-4bb2-a878-5d279a2a11fe'])

In [237]:
medical_qa_openai_large.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [238]:
batch_size = 100

for i in tqdm(range(0, len(medical_qa_docs), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_docs[i:i + batch_size]
    batch_ids = medical_qa_corpus_ids[i:i + batch_size]

    medical_qa_openai_large.add(
        documents=batch_documents,
        ids=batch_ids
    )

Processing Batches: 100%|██████████| 21/21 [00:27<00:00,  1.31s/it]


In [239]:
medical_qa_openai_large_results = dict()
medical_qa_openai_large_results["query-id"] = []
medical_qa_openai_large_results["corpus-id"] = []
medical_qa_openai_large_results["score"] = []

for i in tqdm(range(0, len(medical_qa_queries), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_queries[i:i + batch_size]
    batch_ids = medical_qa_queries_ids[i:i + batch_size]

    medical_qa_openai_large_results["query-id"].extend(batch_ids)

    batch_result = medical_qa_openai_large.query(
        query_texts=batch_documents,
        n_results=10
    )

    scores = [[1 - item for item in sublist] for sublist in batch_result["distances"]]

    medical_qa_openai_large_results["corpus-id"].extend(batch_result["ids"])
    medical_qa_openai_large_results["score"].extend(scores)

medical_qa_openai_large_results_df = pd.DataFrame(medical_qa_openai_large_results)

medical_qa_openai_large_results_df.to_parquet("df_results/medical_qa_openai_large_results_df.parquet", engine="pyarrow", index=False)

Processing Batches: 100%|██████████| 19/19 [00:22<00:00,  1.17s/it]


In [240]:
medical_qa_openai_large_results_df

Unnamed: 0,query-id,corpus-id,score
0,2d3e200a-8ddf-4062-a678-f5d0401c54ad,"[935eb313-ccf5-4135-9de6-f6b9cfb6a8f5, e273d4a...","[0.7172563672065735, 0.6540296673774719, 0.631..."
1,bf3b28a5-db6f-412d-96f1-d67d17bcaef8,"[2f249bba-2201-424f-8515-182cd5272cd9, 12b3b47...","[0.73685622215271, 0.5617017149925232, 0.55098..."
2,ebb4ab78-ff5f-4e78-ac14-bb98760c2a0a,"[2f249bba-2201-424f-8515-182cd5272cd9, e273d4a...","[0.605175256729126, 0.5850441455841064, 0.5801..."
3,9fd75f16-1bd5-4451-af69-5885b7d5267a,"[8e253cda-fa64-4f77-ae17-e1d8774f96bb, 2f249bb...","[0.6906461119651794, 0.5933429598808289, 0.572..."
4,0f303917-6e53-449c-8242-7ab82e8fb78f,"[e273d4a7-96e7-4d03-848c-7bf991820771, 935eb31...","[0.7198009490966797, 0.6264835000038147, 0.595..."
...,...,...,...
1815,8e88d11c-e574-450a-ade4-c3f96756cb51,"[5345361d-b741-4428-befe-f5d86db1035c, 0c79e70...","[0.735322117805481, 0.6824033856391907, 0.5890..."
1816,8a105170-77a8-4706-8fbc-c33b65fe7a69,"[687829c7-bf8e-4151-9baf-9c8dbc1a0e43, 87f8015...","[0.7039521336555481, 0.6767681241035461, 0.645..."
1817,2fdc3c5b-fdd7-4402-b5f2-edbadf83889d,"[6e249abc-2caf-489d-89c7-b3152c9a6dc6, 0c79e70...","[0.7330617904663086, 0.6596018671989441, 0.559..."
1818,2ac6e1b0-09b3-4457-93a4-933e1c72103b,"[6e249abc-2caf-489d-89c7-b3152c9a6dc6, 0c79e70...","[0.7358267307281494, 0.7023440003395081, 0.631..."


In [241]:
medical_qa_openai_large_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()
k_values = [1, 3, 5, 10]

medical_qa_openai_large_dict = {
    qid: {doc_id: int(score) for doc_id, score in doc_dict.items()}
    for qid, doc_dict in medical_qa_openai_large_dict.items()
}

medical_qa_openai_large_results_dict = {}

for query_id, doc_ids, scores in zip(
    medical_qa_openai_large_results["query-id"],
    medical_qa_openai_large_results["corpus-id"],
    medical_qa_openai_large_results["score"],
):
    if query_id not in medical_qa_openai_large_results_dict:
        medical_qa_openai_large_results_dict[query_id] = {}

    for doc_id, score in zip(doc_ids, scores):
        medical_qa_openai_large_results_dict[query_id][doc_id] = score

ndcg_medical_qa_openai_large, _map_medical_qa_openai_large, recall_medical_qa_openai_large, precision_medical_qa_openai_large, top_k_accuracy_medical_qa_openai_large = evaluate(
    qrels=medical_qa_openai_large_dict, 
    results=medical_qa_openai_large_results_dict, 
    k_values=k_values
)

custom_medical_qa_openai_large = {
    "NDCG": ndcg_medical_qa_openai_large,
    "MAP": _map_medical_qa_openai_large,
    "Recall": recall_medical_qa_openai_large,
    "Precision": precision_medical_qa_openai_large,
    "Top-K Accuracy": top_k_accuracy_medical_qa_openai_large
}

with open("custom_results/MedicalQA/text-embedding-3-large.json", "w") as f:
    json.dump(custom_medical_qa_openai_large, f, indent=4)

  medical_qa_openai_large_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()


MTEB

In [244]:
model = OpenAIEmbedder(engine="text-embedding-3-large")
tasks = mteb.get_tasks(tasks=["MedicalQARetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/MedicalQARetrieval/text-embedding-3-large")



minilm model

In [245]:
medical_qa_minilm = chroma_client.get_or_create_collection(
    name="medical_qa_minilm", 
    embedding_function=default_ef,
    metadata={
        "hnsw:space": "cosine"
    }
)

In [246]:
for i in tqdm(range(0, len(medical_qa_docs), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_docs[i:i + batch_size]
    batch_ids = medical_qa_corpus_ids[i:i + batch_size]

    medical_qa_minilm.add(
        documents=batch_documents,
        ids=batch_ids
    )

Processing Batches: 100%|██████████| 21/21 [01:48<00:00,  5.19s/it]


In [247]:
medical_qa_minilm_results = dict()
medical_qa_minilm_results["query-id"] = []
medical_qa_minilm_results["corpus-id"] = []
medical_qa_minilm_results["score"] = []

for i in tqdm(range(0, len(medical_qa_queries), batch_size), desc="Processing Batches"):
    batch_documents = medical_qa_queries[i:i + batch_size]
    batch_ids = medical_qa_queries_ids[i:i + batch_size]

    medical_qa_minilm_results["query-id"].extend(batch_ids)

    batch_result = medical_qa_minilm.query(
        query_texts=batch_documents,
        n_results=10
    )

    scores = [[1 - item for item in sublist] for sublist in batch_result["distances"]]

    medical_qa_minilm_results["corpus-id"].extend(batch_result["ids"])
    medical_qa_minilm_results["score"].extend(scores)

medical_qa_minilm_results_df = pd.DataFrame(medical_qa_minilm_results)

medical_qa_minilm_results_df.to_parquet("df_results/medical_qa_minilm_results_df.parquet", engine="pyarrow", index=False)


Processing Batches: 100%|██████████| 19/19 [01:36<00:00,  5.07s/it]


In [248]:
medical_qa_minilm_results_df

Unnamed: 0,query-id,corpus-id,score
0,2d3e200a-8ddf-4062-a678-f5d0401c54ad,"[2f249bba-2201-424f-8515-182cd5272cd9, 8e253cd...","[0.4581689238548279, 0.4542955160140991, 0.433..."
1,bf3b28a5-db6f-412d-96f1-d67d17bcaef8,"[2f249bba-2201-424f-8515-182cd5272cd9, f28f93b...","[0.5394981503486633, 0.453566312789917, 0.4519..."
2,ebb4ab78-ff5f-4e78-ac14-bb98760c2a0a,"[2f249bba-2201-424f-8515-182cd5272cd9, f28f93b...","[0.5055281519889832, 0.4633753299713135, 0.450..."
3,9fd75f16-1bd5-4451-af69-5885b7d5267a,"[60910493-5224-4d80-a1f0-2c04be430b58, 8e253cd...","[0.5783251523971558, 0.5436692237854004, 0.488..."
4,0f303917-6e53-449c-8242-7ab82e8fb78f,"[8e253cda-fa64-4f77-ae17-e1d8774f96bb, e273d4a...","[0.48761630058288574, 0.48056530952453613, 0.4..."
...,...,...,...
1815,8e88d11c-e574-450a-ade4-c3f96756cb51,"[5345361d-b741-4428-befe-f5d86db1035c, 0c79e70...","[0.8258951902389526, 0.6904723048210144, 0.689..."
1816,8a105170-77a8-4706-8fbc-c33b65fe7a69,"[5345361d-b741-4428-befe-f5d86db1035c, 687829c...","[0.7740635275840759, 0.7201942801475525, 0.720..."
1817,2fdc3c5b-fdd7-4402-b5f2-edbadf83889d,"[49f4df79-6f9e-42a8-9d19-515b5deb7ada, 5345361...","[0.6518778204917908, 0.6235338449478149, 0.608..."
1818,2ac6e1b0-09b3-4457-93a4-933e1c72103b,"[49f4df79-6f9e-42a8-9d19-515b5deb7ada, 5345361...","[0.6747296452522278, 0.6607894897460938, 0.644..."


In [251]:
medical_qa_minilm_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()
k_values = [1, 3, 5, 10]

medical_qa_minilm_dict = {
    qid: {doc_id: int(score) for doc_id, score in doc_dict.items()}
    for qid, doc_dict in medical_qa_minilm_dict.items()
}

medical_qa_minilm_results_dict = {}

for query_id, doc_ids, scores in zip(
    medical_qa_minilm_results["query-id"],
    medical_qa_minilm_results["corpus-id"],
    medical_qa_minilm_results["score"],
):
    if query_id not in medical_qa_minilm_results_dict:
        medical_qa_minilm_results_dict[query_id] = {}

    for doc_id, score in zip(doc_ids, scores):
        medical_qa_minilm_results_dict[query_id][doc_id] = score  # Ensure float


  medical_qa_minilm_dict = medical_qa_qrels_df.groupby("query-id").apply(lambda g: dict(zip(g["corpus-id"], g["score"]))).to_dict()


In [252]:
ndcg_medical_qa_minilm, _map_medical_qa_minilm, recall_medical_qa_minilm, precision_medical_qa_minilm, top_k_accuracy_medical_qa_minilm = evaluate(
    qrels=medical_qa_minilm_dict, 
    results=medical_qa_minilm_results_dict, 
    k_values=k_values)

custom_medical_qa_minilm = {
    "NDCG": ndcg_medical_qa_minilm,
    "MAP": _map_medical_qa_minilm,
    "Recall": recall_medical_qa_minilm,
    "Precision": precision_medical_qa_minilm,
    "Top-K Accuracy": top_k_accuracy_medical_qa_minilm
}

with open("custom_results/MedicalQA/all-MiniLM-L6-v2.json", "w") as f:
    json.dump(custom_medical_qa_minilm, f, indent=4)

In [253]:
model = mteb.get_model("all-MiniLM-L6-v2")
tasks = mteb.get_tasks(tasks=["MedicalQARetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/MedicalQARetrieval/all-MiniLM-L6-v2")



Batches: 100%|██████████| 16/16 [00:01<00:00, 13.79it/s]
Batches: 100%|██████████| 16/16 [00:03<00:00,  4.52it/s]


jina ai on mteb

In [256]:
model = JinaAIEmbedder(engine="jina-embeddings-v3")
tasks = mteb.get_tasks(tasks=["MedicalQARetrieval"])
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder=f"mteb_results/MedicalQARetrieval/jina-embeddings-v3")



In [258]:
jinaai_ef = embedding_functions.JinaEmbeddingFunction(
                api_key=jina_api_key,
                model_name="jina-embeddings-v3"
            )

### LitSearch

### Multilingual

In [12]:
multi_corpus_en_df = pd.read_parquet("datasets/multi_corpus_en.parquet")
multi_qrels_en_df = pd.read_parquet("datasets/multi_qrels_en.parquet")
multi_queries_en_df = pd.read_parquet("datasets/multi_queries_en.parquet")

multi_queries_en = multi_queries_en_df["text"].tolist()
multi_query_ids_en = multi_queries_en_df["_id"].tolist()
multi_corpus_en = multi_corpus_en_df["text"].tolist()
multi_corpus_ids_en = multi_corpus_en_df["_id"].tolist()

In [16]:
multi_corpus_en_df[10:]

Unnamed: 0,_id,title,text
10,20231101.en_5843419_75,France,France is a member of the North Atlantic Treat...
11,20231101.en_5843419_76,France,France retains strong political and economic i...
12,20231101.en_5843419_77,France,"In 2017, France was the world's fourth-largest..."
13,20231101.en_5843419_78,France,The French Armed Forces () are the military an...
14,20231101.en_5843419_79,France,France has been a recognised nuclear state sin...
...,...,...,...
13495,20231101.en_1488463_17,Agroforestry,"Especially in recent years, poor smallholder f..."
13496,20231101.en_1488463_18,Agroforestry,Research with Faidherbia albida in Zambia show...
13497,20231101.en_1488463_19,Agroforestry,A well-studied example of an agroforestry hill...
13498,20231101.en_1488463_20,Agroforestry,"Thin and prune Hillside secondary forest, leav..."


In [19]:
multi_queries_en_df

Unnamed: 0,_id,text
0,q20231101.en_399353_52,"When did the show ""Bewitched"" start airing on ..."
1,q20231101.en_47595_14,What is the geological history of Manchuria?
2,q20231101.en_392095_57,What are the facilities and capabilities of Ch...
3,q20231101.en_21345189_22,What role did Vladivostok play during the Grea...
4,q20231101.en_633_54,How can algae be used to manage farm runoff an...
...,...,...
1495,q20231101.en_58022477_4,"What is the meaning of the term ""Ahegao""?"
1496,q20231101.en_47325_6,How can you determine the sex of a perch using...
1497,q20231101.en_14401500_17,"What bands participated in the November 2009 ""..."
1498,q20231101.en_48774_13,What role did the lance play in the Charge of ...


In [13]:
len(multi_corpus_en_df)

13500

In [14]:
len(multi_queries_en_df)

1500

In [73]:
multi_qrels_en_df.head()

Unnamed: 0,query-id,corpus-id,score
0,q20231101.en_45492650_20,20231101.en_45492650_16,0.5
1,q20231101.en_45492650_20,20231101.en_45492650_17,0.5
2,q20231101.en_45492650_20,20231101.en_45492650_18,0.5
3,q20231101.en_45492650_20,20231101.en_45492650_19,0.5
4,q20231101.en_45492650_20,20231101.en_45492650_20,1.0


In [81]:
results_df = pd.read_parquet(f"df_results/WikipediaRetrievalMultilingual_en_text-embedding-3-small_results_df.parquet")

In [85]:
results_dict = results_df.to_dict(orient="list")


In [86]:
results_dict

{'query-id': ['q20231101.en_399353_52',
  'q20231101.en_47595_14',
  'q20231101.en_392095_57',
  'q20231101.en_21345189_22',
  'q20231101.en_633_54',
  'q20231101.en_2752_5',
  'q20231101.en_77363_18',
  'q20231101.en_610191_12',
  'q20231101.en_1812_10',
  'q20231101.en_5237_21',
  'q20231101.en_19230475_34',
  'q20231101.en_164646_20',
  'q20231101.en_23473595_4',
  'q20231101.en_241559_32',
  'q20231101.en_21069333_27',
  'q20231101.en_5371_128',
  'q20231101.en_66196_38',
  'q20231101.en_175442_5',
  'q20231101.en_13873779_11',
  'q20231101.en_1145676_15',
  'q20231101.en_88931_16',
  'q20231101.en_47734_61',
  'q20231101.en_1178_5',
  'q20231101.en_142818_38',
  'q20231101.en_1056447_9',
  'q20231101.en_98534_9',
  'q20231101.en_2953963_17',
  'q20231101.en_18933194_74',
  'q20231101.en_378464_15',
  'q20231101.en_44668_8',
  'q20231101.en_93084_26',
  'q20231101.en_65119_17',
  'q20231101.en_22877693_24',
  'q20231101.en_1372_25',
  'q20231101.en_1797_9',
  'q20231101.en_51928_67

In [87]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_en_df,
    queries=multi_queries_en,
    query_ids=multi_query_ids_en,
    corpus=multi_corpus_en,
    corpus_ids=multi_corpus_ids_en,
    ef=openai_ef_small,
    dataset_name="WikipediaRetrievalMultilingual_en",
    model_name="text-embedding-3-small"
)

evaluating


In [88]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_en_df,
    queries=multi_queries_en,
    query_ids=multi_query_ids_en,
    corpus=multi_corpus_en,
    corpus_ids=multi_corpus_ids_en,
    ef=openai_ef_large,
    dataset_name="WikipediaRetrievalMultilingual_en",
    model_name="text-embedding-3-large"
)

WikipediaRetrievalMultilingual_en_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 135/135 [02:04<00:00,  1.09it/s]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:13<00:00,  1.14it/s]

evaluating





In [94]:
jina_collection = chroma_client.get_collection(name="WikipediaRetrievalMultilingual_en_jina-embeddings-v3", embedding_function=jinaai_ef)
jina_collection.count()

7400

In [95]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_en_df,
    queries=multi_queries_en,
    query_ids=multi_query_ids_en,
    corpus=multi_corpus_en[7400:],
    corpus_ids=multi_corpus_ids_en[7400:],
    ef=jinaai_ef,
    dataset_name="WikipediaRetrievalMultilingual_en",
    model_name="jina-embeddings-v3"
)

WikipediaRetrievalMultilingual_en_jina-embeddings-v3 collection created


Processing Batches: 100%|██████████| 61/61 [01:12<00:00,  1.19s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:18<00:00,  1.22s/it]

evaluating





In [78]:
run_mteb_openai(task_name="WikipediaRetrievalMultilingual", size="small")



In [79]:
run_mteb_openai(task_name="WikipediaRetrievalMultilingual", size="large")



KeyboardInterrupt: 

In [None]:
run_mteb_jina(task_name="WikipediaRetrievalMultilingual")

In [90]:
multi_corpus_it_df = pd.read_parquet("datasets/multi_corpus_it.parquet")
multi_qrels_it_df = pd.read_parquet("datasets/multi_qrels_it.parquet")
multi_queries_it_df = pd.read_parquet("datasets/multi_queries_it.parquet")

multi_queries_it = multi_queries_it_df["text"].tolist()
multi_query_ids_it = multi_queries_it_df["_id"].tolist()
multi_corpus_it = multi_corpus_it_df["text"].tolist()
multi_corpus_ids_it = multi_corpus_it_df["_id"].tolist()

In [93]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_it_df,
    queries=multi_queries_it,
    query_ids=multi_query_ids_it,
    corpus=multi_corpus_it,
    corpus_ids=multi_corpus_ids_it,
    ef=openai_ef_small,
    dataset_name="WikipediaRetrievalMultilingual_it",
    model_name="text-embedding-3-small"
)

evaluating


In [96]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_it_df,
    queries=multi_queries_it,
    query_ids=multi_query_ids_it,
    corpus=multi_corpus_it,
    corpus_ids=multi_corpus_ids_it,
    ef=openai_ef_large,
    dataset_name="WikipediaRetrievalMultilingual_it",
    model_name="text-embedding-3-large"
)

WikipediaRetrievalMultilingual_it_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 135/135 [02:16<00:00,  1.01s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:14<00:00,  1.04it/s]

evaluating





In [97]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_it_df,
    queries=multi_queries_it,
    query_ids=multi_query_ids_it,
    corpus=multi_corpus_it,
    corpus_ids=multi_corpus_ids_it,
    ef=jinaai_ef,
    dataset_name="WikipediaRetrievalMultilingual_it",
    model_name="jina-embeddings-v3"
)

WikipediaRetrievalMultilingual_it_jina-embeddings-v3 collection created


Processing Batches: 100%|██████████| 135/135 [02:38<00:00,  1.17s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:14<00:00,  1.06it/s]

evaluating





In [98]:
multi_corpus_nl_df = pd.read_parquet("datasets/multi_corpus_nl.parquet")
multi_qrels_nl_df = pd.read_parquet("datasets/multi_qrels_nl.parquet")
multi_queries_nl_df = pd.read_parquet("datasets/multi_queries_nl.parquet")

multi_queries_nl = multi_queries_nl_df["text"].tolist()
multi_query_ids_nl = multi_queries_nl_df["_id"].tolist()
multi_corpus_nl = multi_corpus_nl_df["text"].tolist()
multi_corpus_ids_nl = multi_corpus_nl_df["_id"].tolist()

In [99]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_nl_df,
    queries=multi_queries_nl,
    query_ids=multi_query_ids_nl,
    corpus=multi_corpus_nl,
    corpus_ids=multi_corpus_ids_nl,
    ef=openai_ef_small,
    dataset_name="WikipediaRetrievalMultilingual_nl",
    model_name="text-embedding-3-small"
)

WikipediaRetrievalMultilingual_nl_text-embedding-3-small collection created


Processing Batches: 100%|██████████| 135/135 [01:52<00:00,  1.20it/s]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:14<00:00,  1.02it/s]

evaluating





In [101]:
jina_collection = chroma_client.get_collection(name="WikipediaRetrievalMultilingual_nl_jina-embeddings-v3", embedding_function=jinaai_ef)
jina_collection.count()

1500

In [102]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_nl_df,
    queries=multi_queries_nl,
    query_ids=multi_query_ids_nl,
    corpus=multi_corpus_nl[1500:],
    corpus_ids=multi_corpus_ids_nl[1500:],
    ef=jinaai_ef,
    dataset_name="WikipediaRetrievalMultilingual_nl",
    model_name="jina-embeddings-v3"
)

WikipediaRetrievalMultilingual_nl_jina-embeddings-v3 collection created


Processing Batches:   0%|          | 0/120 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 120/120 [02:22<00:00,  1.19s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:18<00:00,  1.21s/it]

evaluating





In [103]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_nl_df,
    queries=multi_queries_nl,
    query_ids=multi_query_ids_nl,
    corpus=multi_corpus_nl,
    corpus_ids=multi_corpus_ids_nl,
    ef=openai_ef_large,
    dataset_name="WikipediaRetrievalMultilingual_nl",
    model_name="text-embedding-3-large"
)

WikipediaRetrievalMultilingual_nl_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 135/135 [02:16<00:00,  1.01s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:14<00:00,  1.04it/s]

evaluating





In [104]:
multi_corpus_pt_df = pd.read_parquet("datasets/multi_corpus_pt.parquet")
multi_qrels_pt_df = pd.read_parquet("datasets/multi_qrels_pt.parquet")
multi_queries_pt_df = pd.read_parquet("datasets/multi_queries_pt.parquet")

multi_queries_pt = multi_queries_pt_df["text"].tolist()
multi_query_ids_pt = multi_queries_pt_df["_id"].tolist()
multi_corpus_pt = multi_corpus_pt_df["text"].tolist()
multi_corpus_ids_pt = multi_corpus_pt_df["_id"].tolist()

custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_pt_df,
    queries=multi_queries_pt,
    query_ids=multi_query_ids_pt,
    corpus=multi_corpus_pt,
    corpus_ids=multi_corpus_ids_pt,
    ef=openai_ef_small,
    dataset_name="WikipediaRetrievalMultilingual_pt",
    model_name="text-embedding-3-small"
)

custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_pt_df,
    queries=multi_queries_pt,
    query_ids=multi_query_ids_pt,
    corpus=multi_corpus_pt,
    corpus_ids=multi_corpus_ids_pt,
    ef=jinaai_ef,
    dataset_name="WikipediaRetrievalMultilingual_pt",
    model_name="jina-embeddings-v3"
)

custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_pt_df,
    queries=multi_queries_pt,
    query_ids=multi_query_ids_pt,
    corpus=multi_corpus_pt,
    corpus_ids=multi_corpus_ids_pt,
    ef=openai_ef_large,
    dataset_name="WikipediaRetrievalMultilingual_pt",
    model_name="text-embedding-3-large"
)

WikipediaRetrievalMultilingual_pt_text-embedding-3-small collection created


Processing Batches: 100%|██████████| 135/135 [02:04<00:00,  1.09it/s]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:13<00:00,  1.13it/s]


evaluating
WikipediaRetrievalMultilingual_pt_jina-embeddings-v3 collection created


Processing Batches: 100%|██████████| 135/135 [02:34<00:00,  1.15s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:13<00:00,  1.08it/s]


evaluating
WikipediaRetrievalMultilingual_pt_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 135/135 [02:12<00:00,  1.02it/s]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:12<00:00,  1.18it/s]

evaluating





In [109]:
multi_corpus_pt_df

Unnamed: 0,_id,title,text
0,20231101.pt_330581_25,YouTube,"Em junho de 2010, o YouTube lançou o editor de..."
1,20231101.pt_330581_26,YouTube,"O canal Youtube Edu, abreviatura de educação, ..."
2,20231101.pt_330581_27,YouTube,"Em março de 2006, cerca de vinte mil novos víd..."
3,20231101.pt_330581_28,YouTube,O escritório do YouTube fica na Califórnia. Na...
4,20231101.pt_330581_29,YouTube,A tecnologia de reprodução dos vídeos do YouTu...
...,...,...,...
13484,20231101.pt_432187_4,Miquerinos,Miquerinos era filho de Quéfren e Camerernebet...
13485,20231101.pt_432187_5,Miquerinos,"Além dela, Miquerinos teve duas outras esposas..."
13486,20231101.pt_432187_6,Miquerinos,Ao contrário dos faraós antecessores Quéops e ...
13487,20231101.pt_432187_7,Miquerinos,A Pirâmide de Miquerinos foi construída na bor...


In [113]:
multi_corpus_hi_df = pd.read_parquet("datasets/multi_corpus_hi.parquet")
multi_qrels_hi_df = pd.read_parquet("datasets/multi_qrels_hi.parquet")
multi_queries_hi_df = pd.read_parquet("datasets/multi_queries_hi.parquet")

multi_queries_hi = multi_queries_hi_df["text"].tolist()
multi_query_ids_hi = multi_queries_hi_df["_id"].tolist()
multi_corpus_hi = multi_corpus_hi_df["text"].tolist()
multi_corpus_ids_hi = multi_corpus_hi_df["_id"].tolist()

custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_hi_df,
    queries=multi_queries_hi,
    query_ids=multi_query_ids_hi,
    corpus=multi_corpus_hi,
    corpus_ids=multi_corpus_ids_hi,
    ef=openai_ef_small,
    dataset_name="WikipediaRetrievalMultilingual_hi",
    model_name="text-embedding-3-small"
)

WikipediaRetrievalMultilingual_hi_text-embedding-3-small collection created


Processing Batches: 100%|██████████| 135/135 [02:18<00:00,  1.02s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:09<00:00,  1.52it/s]


evaluating
WikipediaRetrievalMultilingual_hi_jina-embeddings-v3 collection created


Processing Batches:   7%|▋         | 9/135 [00:20<04:46,  2.28s/it]


ReadTimeout: The read operation timed out in add.

In [116]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_hi_df,
    queries=multi_queries_hi,
    query_ids=multi_query_ids_hi,
    corpus=multi_corpus_hi,
    corpus_ids=multi_corpus_ids_hi,
    ef=openai_ef_large,
    dataset_name="WikipediaRetrievalMultilingual_hi",
    model_name="text-embedding-3-large"
)

WikipediaRetrievalMultilingual_hi_text-embedding-3-large collection created


Processing Batches: 100%|██████████| 135/135 [02:28<00:00,  1.10s/it]


embedding complete


Processing Batches: 100%|██████████| 15/15 [00:14<00:00,  1.06it/s]

evaluating





In [135]:
jina_collection = chroma_client.get_collection(name="WikipediaRetrievalMultilingual_hi_jina-embeddings-v3", embedding_function=jinaai_ef)
jina_collection.count()

10300

In [137]:
jinaai_ef = embedding_functions.JinaEmbeddingFunction(
                api_key=jina_api_key,
                model_name="jina-embeddings-v3"
            )

In [139]:
custom_results(
    chroma_client=chroma_client,
    qrels_df=multi_qrels_hi_df,
    queries=multi_queries_hi,
    query_ids=multi_query_ids_hi,
    corpus=multi_corpus_hi[10300:],
    corpus_ids=multi_corpus_ids_hi[10300:],
    ef=jinaai_ef,
    dataset_name="WikipediaRetrievalMultilingual_hi",
    model_name="jina-embeddings-v3"
)

evaluating


### Visualizations

In [142]:
directory = "custom_results"
visuals_dir = "visuals"
os.makedirs(visuals_dir, exist_ok=True)

data = []

for task in os.listdir(directory):
    task_path = os.path.join(directory, task)
    if os.path.isdir(task_path):
        for filename in os.listdir(task_path):
            if filename.endswith(".json"):
                model_name = filename.replace(".json", "")
                filepath = os.path.join(task_path, filename)
                
                with open(filepath, "r") as f:
                    results = json.load(f)
                    
                    data.append({
                        "Task": task,
                        "Model": model_name,
                        "NDCG@10": results["NDCG"].get("NDCG@10", 0),
                        "MAP@10": results["MAP"].get("MAP@10", 0),
                        "Recall@10": results["Recall"].get("Recall@10", 0),
                        "P@10": results["Precision"].get("P@10", 0),
                        "Accuracy@10": results["Top-K Accuracy"].get("Accuracy@10", 0)
                    })

df = pd.DataFrame(data)

unique_models = df["Model"].unique()
colors = plt.cm.get_cmap("tab10", len(unique_models))
color_map = {model: colors(i) for i, model in enumerate(unique_models)}

metrics = ["NDCG@10", "MAP@10", "Recall@10", "P@10", "Accuracy@10"]
for task in df["Task"].unique():
    subset = df[df["Task"] == task]
    for metric in metrics:
        plt.figure(figsize=(10, 5))
        bars = plt.bar(subset["Model"], subset[metric], color=[color_map[model] for model in subset["Model"]])
        plt.xlabel("Model")
        plt.ylabel(metric)
        plt.title(f"{task} - {metric}")
        plt.xticks(rotation=45, ha="right")
        plt.ylim(0, 1.5)
        plt.tight_layout()
        
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2, height, f"{height:.4f}", ha='center', va='bottom', fontsize=10)
        
        plt.savefig(os.path.join(visuals_dir, f"{task}_{metric}.png"))
        plt.close()



  colors = plt.cm.get_cmap("tab10", len(unique_models))
