# BEIR Benchmarking for Azure Cosmos DB for MongoDB (vCore) (Part 1)


## Preparation


### Configuration


In [None]:
import os

In [None]:
# Flag meant to request that the index be recreated when True.
# NOTE(review): nothing in the visible cells reads this flag yet (index creation
# is still a TODO) — confirm how it is consumed.
recreate_index = False

# Name of the BEIR dataset to download; it is substituted into the archive URL
# that the download cell builds.
dataset_name = "scifact"

### Environment variables


In [None]:
# Load environment variables from .env file
from dotenv import load_dotenv

load_dotenv()

### Download BEIR datasets


In [None]:
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.lexical import BM25Search as BM25

In [None]:
# Directory that receives the downloaded archive and its extracted contents
out_dir = "./datasets"

# The archive on the public BEIR mirror is named after the dataset
url_template = (
    "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
)
url = url_template.format(dataset_name)

# Fetch and extract the archive; the returned path is what the loader reads from
data_path = util.download_and_unzip(url, out_dir)

In [None]:
# Read the "test" split of the downloaded dataset: the document corpus, the
# queries, and the qrels that accompany them
loader = GenericDataLoader(data_path)
corpus, queries, qrels = loader.load(split="test")

### Embedding data using OpenAI model


In [None]:
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Value passed as `model` to the embeddings API.
# NOTE(review): presumably the name of an Azure OpenAI deployment — confirm.
model = "text-embedding-ada-002-v2"

# Azure OpenAI client. The endpoint and key are read from environment variables;
# os.getenv yields None for a variable that is not set.
openai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)


# Embedding helper shared by the document fields (title, text) and by queries.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model=model):
    """Return the embedding vector for a single piece of text.

    The call is retried with a randomized exponential wait (bounded by 1 and
    20) and abandoned after 6 attempts, as configured on the decorator.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Value forwarded as ``model`` to the embeddings API. Defaults to
            the module-level ``model`` captured when this function is defined.

    Returns:
        The ``embedding`` of the first (and only) item in the response.
    """
    response = openai_client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding


# Build one record per corpus entry, embedding the title and the text separately.
# Every generate_embeddings call hits the embeddings API, so records are appended
# one at a time: if a later call fails, the already-embedded documents are kept.
# The loop variable is named corpus_id (not id) so the builtin id() is not shadowed.
documents = []
for corpus_id, doc in corpus.items():
    documents.append(
        {
            "corpusId": corpus_id,
            "title": doc["title"],
            "text": doc["text"],
            "titleVector": generate_embeddings(doc["title"]),
            "textVector": generate_embeddings(doc["text"]),
        }
    )

## Azure Cosmos DB for MongoDB (vCore)


### Connect to Azure Cosmos DB for MongoDB (vCore)


In [None]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# Credentials come from the environment; os.environ[...] raises KeyError right
# away when a variable is missing instead of building a URI with "None" in it.
admin_user = os.environ["ADMIN_USER"]
admin_password = os.environ["ADMIN_PASSWORD"]

# A username/password embedded in a MongoDB URI must be percent-escaped;
# otherwise characters such as "@", ":", "/" or "%" in the password make the
# URI invalid or cause it to be parsed incorrectly.
# NOTE(review): assumes the environment variables hold the raw (unescaped)
# values — confirm they are not already percent-encoded.
escaped_user = quote_plus(admin_user)
escaped_password = quote_plus(admin_password)

client = MongoClient(
    f"mongodb+srv://{escaped_user}:{escaped_password}@vector-cluster.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000"
)

### Create Index


In [None]:
# TODO

## Test


### Full Text Search


In [None]:
# Sample query used to smoke-test full-text search against the collection.
query = "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging."

# $text runs a full-text search for the query string. Named text_filter rather
# than filter so the builtin filter() is not shadowed for the rest of the notebook.
# NOTE(review): $text needs a text index on the collection, and the index
# creation cell in this notebook is still a TODO — confirm the index exists.
text_filter = {"$text": {"$search": query}}

# Return only the corpus id and the text-match score for each hit.
project = {"score": {"$meta": "textScore"}, "_id": 0, "corpusId": 1}

# Order the hits by their text-match score.
sort = [("score", {"$meta": "textScore"})]

# Kept as an empty document, exactly as before.
collation = {}
limit = 10000

# NOTE(review): the database and collection names are hard-coded and happen to
# match dataset_name == "scifact" — confirm whether they should follow it.
collection = client["scifact"]["scifact-collection"]
result = collection.find(
    filter=text_filter,
    projection=project,
    sort=sort,
    collation=collation,
    limit=limit,
)