# BEIR Benchmarking for Azure Cosmos DB for MongoDB (vCore) (Part 2)


## Preparation


### Configuration


In [None]:
import os
import json

In [None]:
# When True, the query texts are embedded with generate_embeddings() and the
# vectors are written to a JSON cache file; when False, that cache file is
# loaded instead of calling the embedding service.
vectorize_query = False

# Name of the BEIR dataset to download (inserted into the download URL).
dataset_name = "scifact"

### Environment variables


In [None]:
# Load environment variables from a .env file so that the os.environ /
# os.getenv lookups in the following cells can see them.
from dotenv import load_dotenv

load_dotenv()

### Connect to Azure Cosmos DB for MongoDB (vCore)


In [None]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# Credentials are read from the environment; a missing variable raises
# KeyError immediately instead of producing a broken connection string.
admin_user = os.environ["ADMIN_USER"]
admin_password = os.environ["ADMIN_PASSWORD"]

# The user name and password are percent-encoded before being placed in the
# URI: reserved characters such as '@', ':', '/', '%' or '+' would otherwise
# make the URI invalid or be decoded to a different value by the driver.
# NOTE: the values stored in the environment must therefore be the raw
# (not already percent-encoded) credentials.
client = MongoClient(
    f"mongodb+srv://{quote_plus(admin_user)}:{quote_plus(admin_password)}@vector-cluster.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
)

### Download BEIR datasets


In [None]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25

In [None]:
# Download location of the zipped dataset; the placeholder receives the
# dataset name chosen in the configuration cell.
url_template = (
    "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
)
url = url_template.format(dataset_name)

# Local directory handed to the download helper as its output location.
out_dir = "./datasets"

# The returned value is what the data loader in the next cell is pointed at.
data_path = util.download_and_unzip(url, out_dir)

In [None]:
# Load the "test" split from the downloaded dataset: the corpus, the queries
# (query id -> query text) and the qrels used by the evaluation step.
loader = GenericDataLoader(data_path)
corpus, queries, qrels = loader.load(split="test")

### Vectorize Queries


In [None]:
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Azure OpenAI client. The API key and endpoint are read from the environment;
# os.getenv yields None (rather than raising) when a variable is unset.
openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Default value of the `model` argument of generate_embeddings().
# NOTE(review): presumably this is the Azure deployment name — confirm.
model = "text-embedding-ada-002-v2"


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model=model):
    """Return the embedding vector for a single piece of text.

    The text is sent to the embeddings endpoint as a one-element batch and the
    embedding of that single item is returned. The call is wrapped by the
    tenacity ``retry`` decorator above: randomized exponential waits
    (min=1, max=20) and a stop after 6 attempts.

    Args:
        text: The text to embed (used here for the query texts).
        model: Model identifier forwarded to ``embeddings.create``; defaults
            to the module-level ``model`` value at definition time.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Cache file for the query embeddings. It is derived from `dataset_name` so
# that switching datasets cannot silently overwrite or reuse the vectors of
# another dataset (for "scifact" this is still scifact_query_vector.json).
query_vector_path = f"{dataset_name}_query_vector.json"

if vectorize_query:
    # Embed every query text (one generate_embeddings call per query) and
    # persist the {query_id: embedding} mapping for later runs.
    queriesVector = {
        query_id: generate_embeddings(query_text)
        for query_id, query_text in queries.items()
    }
    with open(query_vector_path, "w", encoding="utf-8") as f:
        json.dump(queriesVector, f)
else:
    # Reuse the previously stored embeddings; the context manager makes sure
    # the file handle is closed after loading.
    with open(query_vector_path, encoding="utf-8") as f:
        queriesVector = json.load(f)

## Search the BEIR dataset


### Full Text Search


In [None]:
# Run every query as a `$text` search against the collection and collect, per
# query id, a {corpusId: textScore} mapping for the evaluation step.
collection = client["scifact"]["scifact-collection"]

# These request options are the same for every query, so they are built once
# instead of on every loop iteration.
project = {"score": {"$meta": "textScore"}, "_id": 0, "corpusId": 1}
sort = [("score", {"$meta": "textScore"})]  # order hits by text score
collation = {}  # empty collation document, as in the original request
limit = 100  # keep at most 100 hits per query

query_ids = list(queries)
dict_results = {}
for query_id in query_ids:
    query = queries[query_id]
    # Named `text_filter` so the builtin `filter` is not shadowed for the
    # rest of the notebook session.
    text_filter = {
        "$text": {
            "$search": query,
        }
    }
    cursor = collection.find(
        filter=text_filter,
        projection=project,
        sort=sort,
        collation=collation,
        limit=limit,
    )
    dict_results[query_id] = {doc["corpusId"]: doc["score"] for doc in cursor}

# Score the full-text search results against the relevance judgments.
from beir.retrieval.evaluation import EvaluateRetrieval

# Cut-off values passed to the evaluator.
k_values = [1, 3, 5, 10, 50, 100]

ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(
    qrels, dict_results, k_values
)
print(ndcg, _map, recall, precision)