In [40]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [54]:
#### Download scifact.zip dataset and unzip the dataset
# dataset = "scifact"
# url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# out_dir = os.path.join(pathlib.Path('__file__').parent.absolute(), "datasets")
# data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where learncorpus dataset is located
data_path = "./datasets/learncorpus"


In [55]:
#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2024-12-12 17:12:50 - Loading Corpus...


100%|██████████| 50/50 [00:00<00:00, 6245.06it/s]

2024-12-12 17:12:50 - Loaded 50 TEST Documents.
2024-12-12 17:12:50 - Doc Example: {'text': '# Connect functions to Azure services using bindings (programming-language-csharp)\r\n\r\nWhen you create a function, language-specific trigger code is added in your project from a set of trigger templates. If you want to connect your function to other services by using input or output bindings, you have to add specific binding definitions in your function. To learn more about bindings, see [Azure Functions triggers and bindings concepts](functions-triggers-bindings).\r\n\r\n## Local development\r\n\r\nWhen you develop functions locally, you need to update the function code to add bindings. For languages that use function.json, Visual Studio Code provides tooling to add bindings to a function.\r\n\r\n### Manually add bindings based on examples\r\n\r\nWhen adding a binding to an existing function, you need to add binding-specific attributes to the function definition in code.\r\n\r\nThe followin




In [56]:
qrels

{'0': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '1': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '2': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '3': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '4': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '5': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '6': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '7': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '8': {'cf374970-651e-b078-cf54-7a1117d11405': 1},
 '9': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '10': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '11': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '12': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '13': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '14': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '15': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '16': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '17': {'1dcc4d8f-79e5-35b9-7fc7-750cab3fc03d': 1},
 '18': {'350da97f-fa07-6ab9-6df0-548f2c00a651': 1},
 '19': {'350da97f-fa07

In [58]:
#### Load the SBERT model and retrieve using cosine-similarity
# model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
# retriever = EvaluateRetrieval(model, score_function="cos_sim") # or "cos_sim" for cosine similarity
# results = retriever.retrieve(first_corpus, first_queries)
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from urllib.parse import parse_qs, urlparse
import json
import requests


def convert_v2_to_v1(res: dict) -> list:
    # The Knowledge Service response schema has been changed from v1 to v2
    # To align with the format in Learn Copilot Journey DB, we take v1 format as the inner contract
    lst = res.get("items", [])
    ret = [
        {
            "itemId": item["id"],
            "score": item["rerankerScore"],
            "metadata": {
                "content": item["content"],
                "lastUpdated": item["lastModifiedDateTime"],
                "url": item["contentUrl"],
                "title": item["title"],
            },
        }
        for item in lst
    ]
    return ret

def fetch_knowledge_service_response(
    token: str,
    query: str,
    endpoint: str,
    category: str,
    filter: str,
    scope: str,
    top: int,
    threshold: float,
) -> list[dict]:
    # Asserts
    assert category in {
        "document",
        "conceptual",
        "training",
        "evaluation",
    }, f"Invalid category {category}"
    assert top > 0, f"Invalid top {top} when invoke knowledge service, must be > 0."
    assert (
        0 <= threshold <= 1
    ), f"Invalid threshold {threshold} when invoke knowledge service, must be [0, 1]."

    # if endpoint.startswith("https://learn.microsoft.com"):
    #     assert (
    #         category != "evaluation"
    #     ), "Evaluation category is not supported in public KS."

    # Fetch response
    endpoint_url = endpoint.replace("{category}", category)
    endpoint_url_parsed = urlparse(endpoint_url)

    url_params = parse_qs(endpoint_url_parsed.query)
    endpoint_url_parsed = endpoint_url_parsed._replace(params="", query="", fragment="")

    request_params = {
        "url": endpoint_url_parsed.geturl(),
        "json": {
            # "queryType": "vector",
            "input": query,
            "filter": filter,
        },
        "headers": {"Authorization": f"Bearer {token}"},
        "params": {
            **url_params,
            "top": top,
            "scorethreshold": threshold,
        },
    }

    use_api_v2 = category == "document"
    if use_api_v2:
        request_params["params"]["api-version"] = "v2"
    # if scope is not None or scope.strip("\n ") != "":
    #     request_params["params"]["scope"] = scope

    # response = post_request(request_params=request_params)
    response = requests.post(
        request_params["url"],
        json=request_params["json"],
        headers=request_params["headers"],
        params=request_params["params"],
    )
    response.raise_for_status()

    return convert_v2_to_v1(response.json()) if use_api_v2 else response.json()
    #return response.json()

def retrieve_from_ks(
    token: str,
    query: str,
    endpoint: str,
    ks_category: str = "document",
    ks_filter: str = None,  # type: ignore
    ks_scope: str = None,  # type: ignore
    ks_top: int = 10,
    ks_threshold: float = 0.8,
) -> object:

    # try:
    #     ks_const = json.loads(ks_source)

    #     if isinstance(ks_const, list):
    #         if validate_ks_const(ks_const):
    #             return ks_const
    # except AssertionError as e:
    #     raise e
    # except Exception:
    #     pass

    return fetch_knowledge_service_response(
        token,
        query,
        endpoint,
        ks_category,
        ks_filter,
        ks_scope,
        ks_top,
        threshold=ks_threshold,
    )

token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "api://5405974b-a0ac-4de0-80e0-9efe337ea291/.default"
)

results = {qid: {} for qid in queries.keys()}

for key, value in queries.items():
    ks_result = retrieve_from_ks(token_provider(), value, "https://learn.microsoft.com/api/knowledge/{category}/relevantitems", "conceptual", "search.ismatch('\"https://learn.microsoft.com/en-us/azure/azure-functions/\"', 'url')", None, 10, 0.1)
    results[key] = {item['itemId']: item['score'] for item in ks_result}
    #results.append({key: {item['itemId']: item['score'] for item in ks_result}})

2024-12-12 17:13:25 - No environment configuration found.
2024-12-12 17:13:25 - ManagedIdentityCredential will use IMDS
2024-12-12 17:13:25 - Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.17.1 Python/3.12.4 (Windows-11-10.0.26100-SP0)'
No body was attached to the request
2024-12-12 17:13:29 - DefaultAzureCredential acquired a token from AzureCliCredential


In [52]:
results

{'0': {'2bad2db5-5cd7-ebd7-0d05-03ed117b6330': 0.7149272,
  '4a6e9898-18ad-a627-dcae-0e1e1a6618d8': 0.70749193,
  '66b9d8b2-282d-ddf5-774f-36cf53e8f014': 0.6938638,
  '8b032441-76ed-ac63-abbb-4b3eecde418b': 0.6828429,
  '29406b12-36a3-8e73-f133-2189ebd6b8a6': 0.68241507,
  'a114085c-4803-ec5c-6bbd-174d90ac6184': 0.680539,
  'cf374970-651e-b078-cf54-7a1117d11405': 0.67543995,
  '63921b90-d217-c2a2-7945-128b16a8e2ed': 0.67340493,
  '749aaf48-068a-c14f-6ebd-31764b7d6bb2': 0.6724152,
  '998595dd-3530-ae8d-4dd6-6a8482f0903c': 0.6712451},
 '1': {'3fa6d840-7c14-aeb6-b403-3d1c739938e0': 0.72756755,
  'a114085c-4803-ec5c-6bbd-174d90ac6184': 0.7046081,
  '21070952-9c0b-f679-ef32-3a81bd26b933': 0.7008698,
  'fe363ef6-b68a-56f3-82a1-99cf8289c5aa': 0.69865507,
  'eb3f063e-2e12-b19d-7167-5c6eb1187cc4': 0.6960976,
  '9b63e42d-7b16-f13d-edd3-662f4735e424': 0.6893577,
  '29406b12-36a3-8e73-f133-2189ebd6b8a6': 0.6870575,
  '2bad2db5-5cd7-ebd7-0d05-03ed117b6330': 0.6828825,
  '63921b90-d217-c2a2-7945-128

In [53]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 

model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="cos_sim") # or "cos_sim" for cosine similarity
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

2024-12-12 14:11:23 - Use pytorch device_name: cpu
2024-12-12 14:11:23 - Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b
2024-12-12 14:11:27 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-12-12 14:11:27 - 

2024-12-12 14:11:27 - NDCG@1: 0.3297
2024-12-12 14:11:27 - NDCG@3: 0.4927
2024-12-12 14:11:27 - NDCG@5: 0.5385
2024-12-12 14:11:27 - NDCG@10: 0.5795
2024-12-12 14:11:27 - NDCG@100: 0.5795
2024-12-12 14:11:27 - NDCG@1000: 0.5795
2024-12-12 14:11:27 - 

2024-12-12 14:11:27 - MAP@1: 0.3297
2024-12-12 14:11:27 - MAP@3: 0.4506
2024-12-12 14:11:27 - MAP@5: 0.4764
2024-12-12 14:11:27 - MAP@10: 0.4944
2024-12-12 14:11:27 - MAP@100: 0.4944
2024-12-12 14:11:27 - MAP@1000: 0.4944
2024-12-12 14:11:27 - 

2024-12-12 14:11:27 - Recall@1: 0.3297
2024-12-12 14:11:27 - Recall@3: 0.6154
2024-12-12 14:11:27 - Recall@5: 0.7253
2024-12-12 14:11:27 - Recall@10: 0.8461
2024-12-12 14:11:27 