Reference
- https://mlflow.org/docs/latest/genai/datasets/sdk-guide/
- https://mlflow.org/docs/latest/genai/concepts/evaluation-datasets/
- https://mlflow.org/docs/3.0.1/llms/llm-evaluate?#heuristic-based-metrics
- https://www.mlflow.org/docs/2.20.3/llms/rag/notebooks/retriever-evaluation-tutorial.html

Note:
- From mlflow 3, there is less focus on deterministic IR metrics (precision@K, recall@K, NDCG@K)
- However, mlflow.models.evaluate() still has these functionalities

In [1]:
# Initial records might have simple structure
initial_record = {
    "inputs": {"question": "What is MLflow?"},
    "expectations": {
        "answer": "MLflow is an open source platform for managing ML lifecycle"
    },
}

# Later records can add new fields
enhanced_record = {
    "inputs": {
        "question": "What is MLflow?",
        "context": "MLflow provides experiment tracking, model registry, and deployment tools",  # New field
        "max_tokens": 150,  # New field
    },
    "expectations": {
        "answer": "MLflow is an open source platform for managing the ML lifecycle, including experimentation, reproducibility, deployment, and a central model registry",
        "relevance_score": 0.95,  # New field
        "factual_accuracy": 1.0,  # New field
    },
}

In [8]:
import json
import pandas as pd
import mlflow
from mlflow.genai.datasets import (
    create_dataset,
    get_dataset,
    search_datasets,
    set_dataset_tags,
    delete_dataset_tag,
)


EXPERIMENT_NAME = "wixqa_expertwritten_dataset_creation"
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = experiment.experiment_id

In [13]:
# Create a new dataset with tags for organization
dataset = create_dataset(
    name="wixqa_expertwritten_sample",
    experiment_id=[experiment_id],  # Link to experiments ("0" is default)
    tags={
        "version": "1.0",
        "purpose": "regression_testing",
        "model": "gpt-4o",
        "team": "personal",
        "status": "development",
    },
)

In [14]:
# Test cases can be manually defined as dictionaries
# merge_records() accepts both dict and pandas.DataFrame formats for manual
# record additions
{
    "question": "Can I start accepting payments on my site while my Wix Payments account is still under verification?",
    "answer": "You can start accepting payments on your site using [Wix Payments](https:\/\/support.wix.com\/en\/article\/about-wix-payments) almost immediately. However, we need to verify your identity before your account can be fully activated.",
    "article_ids": ["49d9e88fadbf11fa4e685c847590078ff9394c2fe7566094f504f53ca4aca465"],
}

{
    "question": "I am inquiring about purchasing the yearly premium plan for $17.00, which includes a free domain for 1 year. However, the voucher does not show up at checkout. Do I need to purchase the plan first and then will the voucher be available to activate?",
    "answer": "When you purchase a yearly Premium plan, the free domain voucher is not immediately visible at checkout. Instead, the voucher becomes available after you complete the purchase of the plan. You can then claim the voucher by visiting the [Premium Vouchers page](https:\/\/manage.wix.com\/account\/vouchers?referralAdditionalInfo=%7Btheir) in your Wix account. The voucher is valid for two months from the date of purchase, and you can use it to register a domain for free for one year. If you have already purchased the plan and still do not see the voucher, ensure that you meet all eligibility criteria, such as not having purchased a monthly plan or a domain prior to the plan. If issues persist, you can [contact Wix Customer Care](https:\/\/support.wix.com\/en\/article\/contacting-wix-customer-care-for-support) for further assistance.",
    "article_ids": ["06535db983ea0ffe0214af14497a1d158f279d92c92f211e87b8820aa95dbe43"],
}

test_cases = [
    {
        "inputs": {
            "question": "Can I start accepting payments on my site while my Wix Payments account is still under verification?",
        },
        "expectations": {
            "response": "You can start accepting payments on your site using [Wix Payments](https:\/\/support.wix.com\/en\/article\/about-wix-payments) almost immediately. However, we need to verify your identity before your account can be fully activated.",
        },
        "tags": {
            "reviewed_by": "expert_team",
        },
    },
    {
        "inputs": {
            "question": "I am inquiring about purchasing the yearly premium plan for $17.00, which includes a free domain for 1 year. However, the voucher does not show up at checkout. Do I need to purchase the plan first and then will the voucher be available to activate?",
        },
        "expectations": {
            "response": "When you purchase a yearly Premium plan, the free domain voucher is not immediately visible at checkout. Instead, the voucher becomes available after you complete the purchase of the plan. You can then claim the voucher by visiting the [Premium Vouchers page](https:\/\/manage.wix.com\/account\/vouchers?referralAdditionalInfo=%7Btheir) in your Wix account. The voucher is valid for two months from the date of purchase, and you can use it to register a domain for free for one year. If you have already purchased the plan and still do not see the voucher, ensure that you meet all eligibility criteria, such as not having purchased a monthly plan or a domain prior to the plan. If issues persist, you can [contact Wix Customer Care](https:\/\/support.wix.com\/en\/article\/contacting-wix-customer-care-for-support) for further assistance.",
        },
        "tags": {
            "reviewed_by": "expert_team",
        },
    },
]

# Add to your dataset (accepts list[dict], list[Trace] or pandas.DataFrame)
dataset.merge_records(test_cases)

<mlflow.genai.datasets.evaluation_dataset.EvaluationDataset at 0x76c937c45ed0>

In [15]:
print(dataset.dataset_id)
print(experiment_id)

d-7377856557a24756805aad24cef5c85a
14


In [16]:
# from mlflow.genai.datasets import (
#     add_dataset_to_experiments,
#     remove_dataset_from_experiments,
# )

# # Add dataset to additional experiments
# # SKIPPED for now: KeyError: <class 'service_pb2.AddDatasetToExperiments'>
# dataset = add_dataset_to_experiments(
#     dataset_id=dataset.dataset_id, experiment_ids=[experiment_id]
# )
# print(f"Dataset now linked to experiments: {dataset.experiment_ids}")

The evaluation dataset should contain three columns: 
- question, 
- ground truth doc IDs, 
- retrieved relevant doc IDs. 

A “doc ID” is a unique string identifier of the documents in you RAG application. For example, it could be the URL of a documentation web page, or the file path of a PDF document.

In [25]:
# CORPUS_PATH = "./wixqa/wixqa_expertwritten.jsonl"
# docs = []
# # with open(CORPUS_PATH, "r", encoding="utf-8") as f:
# #     for line in f:
# #         record = {}
# #         obj = json.loads(line)
# #         record["inputs"] = {"question": obj["question"]}
# #         record["expectations"] = {"expected_response": obj["answer"]}
# #         record["tag"] = {"reviewed_by": "expert_team"}
# #         docs.append(record)

In [27]:
CORPUS_PATH = "./wixqa/wixqa_expertwritten.jsonl"
records = []
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        record = {}
        obj = json.loads(line)
        record["inputs"] = {"question": obj["question"]}
        record["expectations"] = {"expected_response": obj["answer"], "ground_truth_doc_ids": obj["article_ids"]}
        record["tag"] = {"reviewed_by": "expert_team"}
        records.append(record)

In [28]:
dataset = create_dataset(
    name="wixqa_expertwritten",
    experiment_id=[experiment_id],  # Link to experiments ("0" is default)
    tags={
        "version": "1.0",
        "purpose": "retrieval_evaluation",
        "model": "gpt-4o",
        "team": "personal",
        "status": "development",
    },
)
dataset.merge_records(records)

<mlflow.genai.datasets.evaluation_dataset.EvaluationDataset at 0x76c93461de90>

In [33]:
dataset_df = dataset.to_df()
dataset_df.head(5)

Unnamed: 0,inputs,outputs,expectations,tags,source_type,source_id,created_time,dataset_record_id
0,{'question': 'I want to completely remove the ...,{},{'expected_response': ' To completely remove ...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-005306a1cff042748027fe8b59e188be
1,{'question': 'Im having trouble changing the b...,{},{'expected_response': 'To change the browser t...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-0167e516c5b94df1acc467ca65d579d1
2,{'question': 'My automated emails triggered by...,{},{'expected_response': 'If your automated email...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-02f59e84d34045f79b8e35c19f40688e
3,{'question': 'I would like to change my billin...,{},{'expected_response': 'To change your billing ...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-0538499ae8564e8697b719e90258993c
4,{'question': 'I need help setting up an email ...,{},{'expected_response': 'To set up an email for ...,{'mlflow.user': 'joshuale'},HUMAN,,1763054984266,dr-05a3d3aa899642d3bccfb5c66c641259


In [37]:
dataset_df.to_csv("wixqa/wixqa_expertwritten_eval_dataset.csv", index=False)