# 04. Evaluate with code-based scorers

In [None]:
%run ./00_setup

## Load evaluation data as records

In [None]:
# Fetch the MLflow evaluation dataset stored in the table named
# "<CATALOG>.<SCHEMA>.<EVAL_TABLE>".
# NOTE(review): `mlflow`, CATALOG, SCHEMA and EVAL_TABLE are not defined in
# this cell — presumably they come from `%run ./00_setup`; confirm.
eval_dataset = mlflow.genai.datasets.get_dataset(
    uc_table_name=f"{CATALOG}.{SCHEMA}.{EVAL_TABLE}",
)

# Keep only the "inputs" and "expectations" columns and convert them with
# orient="records" (assuming `to_df()` returns a pandas DataFrame, this yields
# a list with one {"inputs": ..., "expectations": ...} dict per row). The
# result is what gets passed as `data=` to the evaluation run further down.
eval_records = eval_dataset.to_df()[["inputs", "expectations"]].to_dict(
    orient="records"
)

## Define code-based scorers

In [None]:
from typing import Any
import mlflow
import json
import difflib
from mlflow.entities import Trace, Feedback, SpanType
from mlflow.genai.scorers import scorer

In [None]:
def fuzzy_match_score(predicted_value: str, expected_value: str) -> float:
    """
    Calculate fuzzy match score between predicted and expected values.

    Both values are converted to strings, lower-cased and stripped of
    surrounding whitespace *before* the emptiness checks, so that
    whitespace-only strings count as empty and falsy non-string values such
    as the integer ``0`` are compared by their text ("0") instead of being
    mistaken for a missing value. ``None`` is treated as empty.

    Args:
        predicted_value: Value produced by the model (any type; stringified).
        expected_value: Ground-truth value (any type; stringified).

    Returns:
        A score between 0.0 and 1.0, where 1.0 is a perfect match:
        1.0 when both values are empty, 0.0 when exactly one is empty,
        otherwise the ``difflib.SequenceMatcher`` ratio of the two
        normalized strings.
    """
    # Normalize first; only None maps to the empty string.
    pred_str = "" if predicted_value is None else str(predicted_value).lower().strip()
    exp_str = "" if expected_value is None else str(expected_value).lower().strip()

    # Two missing values agree; a single missing value is a complete miss.
    if not pred_str and not exp_str:
        return 1.0
    if not pred_str or not exp_str:
        return 0.0

    # Use difflib SequenceMatcher for fuzzy matching
    return difflib.SequenceMatcher(None, pred_str, exp_str).ratio()


def extract_field_from_json(json_data, field_name: str) -> str:
    """
    Extract a specific field value from various JSON structures.

    Supported inputs, checked in this order:
      1. An object with a non-empty ``choices`` attribute (ChatCompletion
         style): ``choices[0].message.content`` is parsed as JSON.
      2. A dict containing ``"expected_response"``: that value is used
         directly if it is already a dict, otherwise it is parsed as JSON.
      3. Any other dict: used as-is.
      4. Anything else: ``str(json_data)`` is parsed as JSON.

    Args:
        json_data: The payload to read the field from (see shapes above).
        field_name: Key to look up in the resolved JSON object.

    Returns:
        The field value converted to ``str`` and stripped of surrounding
        whitespace. Returns an empty string if the field is missing or
        ``None``, if the JSON is invalid, or if the resolved payload is not
        a JSON object (dict).
    """
    try:
        # Handle ChatCompletion object
        if hasattr(json_data, "choices") and len(json_data.choices) > 0:
            data = json.loads(json_data.choices[0].message.content)
        # Handle dict with expected_response
        elif isinstance(json_data, dict) and "expected_response" in json_data:
            nested = json_data["expected_response"]
            # The nested payload may already be deserialized; json.loads on a
            # dict would raise TypeError and silently lose every field.
            data = nested if isinstance(nested, dict) else json.loads(nested)
        # Handle direct dict
        elif isinstance(json_data, dict):
            data = json_data
        # Handle JSON string
        else:
            data = json.loads(str(json_data))
    except (json.JSONDecodeError, TypeError, KeyError, AttributeError):
        return ""

    # Valid JSON that is not an object (list, number, string, ...) has no fields.
    if not isinstance(data, dict):
        return ""

    field_value = data.get(field_name)
    if field_value is None:
        return ""
    return str(field_value).strip()

In [None]:
# Create fuzzy matching scorers for each field


# Minimum similarity ratio (0.0-1.0) at which a field is reported as a match.
_FUZZY_MATCH_THRESHOLD = 0.7


def _fuzzy_field_feedback(
    outputs: Any,
    expectations: Any,
    field_name: str,
    threshold: float = _FUZZY_MATCH_THRESHOLD,
) -> Feedback:
    """
    Build the Feedback for one field by fuzzy-comparing prediction and expectation.

    Args:
        outputs: Payload holding the predicted fields; any shape accepted by
            ``extract_field_from_json``.
        expectations: Payload holding the expected fields; any shape accepted
            by ``extract_field_from_json``.
        field_name: Name of the field to extract from both payloads.
        threshold: Similarity ratio at or above which the value is "yes".

    Returns:
        A Feedback whose value is "yes" when the fuzzy match score reaches
        ``threshold`` and "no" otherwise; the rationale reports the score and
        both extracted values.
    """
    predicted = extract_field_from_json(outputs, field_name)
    expected = extract_field_from_json(expectations, field_name)
    score = fuzzy_match_score(predicted, expected)

    return Feedback(
        value="yes" if score >= threshold else "no",
        rationale=f"Fuzzy match score for {field_name}: {score:.1%}. Predicted: '{predicted}', Expected: '{expected}'",
    )


# Each scorer below stays an explicitly named, decorated function with the
# original name and parameter names; only the duplicated body is delegated
# to the shared helper above.


@scorer
def start_date(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for start_date field"""
    return _fuzzy_field_feedback(outputs, expectations, "start_date")


@scorer
def end_date(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for end_date field"""
    return _fuzzy_field_feedback(outputs, expectations, "end_date")


@scorer
def leased_space(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for leased_space field"""
    return _fuzzy_field_feedback(outputs, expectations, "leased_space")


@scorer
def lessee(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for lessee field"""
    return _fuzzy_field_feedback(outputs, expectations, "lessee")


@scorer
def lessor(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for lessor field"""
    return _fuzzy_field_feedback(outputs, expectations, "lessor")


@scorer
def signing_date(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for signing_date field"""
    return _fuzzy_field_feedback(outputs, expectations, "signing_date")


@scorer
def term_of_payment(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for term_of_payment field"""
    return _fuzzy_field_feedback(outputs, expectations, "term_of_payment")


@scorer
def designated_use(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for designated_use field"""
    return _fuzzy_field_feedback(outputs, expectations, "designated_use")


@scorer
def extension_period(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for extension_period field"""
    return _fuzzy_field_feedback(outputs, expectations, "extension_period")


@scorer
def expiration_date_of_lease(outputs: Any, expectations: Any) -> Feedback:
    """Fuzzy match scorer for expiration_date_of_lease field"""
    return _fuzzy_field_feedback(outputs, expectations, "expiration_date_of_lease")

## Run evaluation with custom fuzzy matching scorers

In [None]:
# Collect all fuzzy matching scorers: one scorer per extracted lease field,
# each defined above with the `@scorer` decorator.
fuzzy_scorers = [
    start_date,
    end_date,
    leased_space,
    lessee,
    lessor,
    signing_date,
    term_of_payment,
    designated_use,
    extension_period,
    expiration_date_of_lease,
]

# Run evaluation with fuzzy matching scorers inside a dedicated MLflow run
# named "Eval with code-based scorers". `eval_records` supplies the data,
# and `fuzzy_scorers` is the list of scorers passed to the evaluation.
# NOTE(review): `extract_lease_data` is not defined in this notebook section —
# presumably it is provided by `%run ./00_setup`; confirm.
with mlflow.start_run(run_name="Eval with code-based scorers"):
    fuzzy_results = mlflow.genai.evaluate(
        data=eval_records, predict_fn=extract_lease_data, scorers=fuzzy_scorers
    )