In [4]:
import sys
# Put the parent directory first on the module search path. Presumably this is
# what lets the `docs` module be imported later in the notebook — TODO confirm.
# Note that '..' is resolved relative to the kernel's working directory.
sys.path.insert(0, '..')

In [5]:
import pickle

In [18]:
# Load the pickled rows (going by the file name, a saved evaluation run).
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only open files that were produced by a trusted run.
with open('./eval-run-v2-2025-10-24-21-42.bin', 'rb') as f_in:
    rows = pickle.load(f_in)

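Categories used to grade each answer against the reference:

- match
- partial match
- mismatch
- not found (the answer says the information is not available)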

In [28]:
import docs

In [29]:
# Read the raw data and parse it using the helpers from the `docs` module.
# NOTE(review): parse_data is assumed to yield dicts with 'filename' and
# 'content' keys (that is how the result is indexed afterwards) — confirm
# against the `docs` module.
github_data = docs.read_github_data()
parsed_data = docs.parse_data(github_data)

In [31]:
# Lookup table from file name to that file's content. If the same file name
# occurs more than once in parsed_data, the last occurrence wins.
file_index = dict(
    (doc['filename'], doc['content'])
    for doc in parsed_data
)

In [19]:
import pandas as pd

In [20]:
# Turn the unpickled rows into a DataFrame (presumably one evaluation record
# per row — TODO confirm the structure of `rows`).
df_evals = pd.DataFrame(rows)

In [40]:
# Pull the source file name out of each 'original_question' record.
df_evals['filename'] = df_evals['original_question'].map(
    lambda question: question['filename']
)
# Attach the content of that file as the reference text; file names that are
# missing from file_index yield None.
df_evals['reference'] = df_evals['filename'].map(
    lambda name: file_index.get(name)
)

In [43]:
from evidently import Dataset, DataDefinition
from evidently.descriptors import LLMEval
from evidently.llm.templates import MulticlassClassificationPromptTemplate

In [13]:
# Prompt template for an LLM judge that assigns each answer one of the four
# categories listed in `category_criteria`, based on how it compares to a
# reference text.
# The {question} and {reference} placeholders are presumably filled per row
# from the columns passed as `additional_columns` to LLMEval — confirm against
# the Evidently documentation.
matcher = MulticlassClassificationPromptTemplate(
    pre_messages=[("system", "You are a judge that evaluates the factual alignment of two chatbot answers.")],
    criteria="""
    You are given a new answer and a reference answer and also the question.
    Classify the new answer based on how it compares to the reference.
    ===
    Question: {question}
    Reference: {reference}
    """,
    # Category label -> criterion text for that label.
    category_criteria={
        "match": "The answer matches the reference in all factual and semantic details.",
        "partial_match": "The answer is correct in what it says but leaves out details from the reference.",
        "mismatch": "The answer doesn't match the reference answer.",
        "not_available": "The answer says that information is not available.",
    },
    uncertainty="unknown",
    # Reasoning is requested (it is read later from the 'eval reasoning'
    # column); numeric scores are not.
    include_reasoning=True,
    include_scores=False
)

In [44]:
# Wrap df_evals in an Evidently Dataset with a single LLM-judge descriptor on
# the "answer" column. The "reference" and "question" columns are passed along
# under the same names, which match the placeholders used in the `matcher`
# template. Results are stored under the "eval" alias.
evals_dataset = Dataset.from_pandas(
    data=df_evals,
    data_definition=DataDefinition(),
    descriptors=[
        LLMEval(
            "answer",
            alias="eval",
            provider="openai",
            model="gpt-4o-mini",
            template=matcher,
            additional_columns={column: column for column in ("reference", "question")},
        ),
    ],
)

In [45]:
# Export the dataset as a pandas DataFrame. Presumably this includes the
# descriptor output columns ('eval', 'eval reasoning'), which are read from
# this frame afterwards — TODO confirm.
df_result = evals_dataset.as_dataframe()

In [48]:
# Spot check: show the judge's reasoning for the second row (positional index 1).
df_result['eval reasoning'].iloc[1]

"The new answer provides an overview of dataset evaluation metrics, mentioning model quality, classification quality metrics, and regression metrics, which aligns with the reference's focus. However, it lacks specifics about dataset-level metrics mentioned in the reference, like examples of parameters used and different metric options. It also does not reference the contents of the accordion details present in the original reference, thus missing some details."

In [51]:
# Share of rows per judge label (normalize=True returns proportions, not counts).
df_result['eval'].value_counts(normalize=True)

eval
partial_match    0.653846
match            0.346154
Name: proportion, dtype: float64

In [53]:
from evidently import Report
from evidently.presets import TextEvals

In [55]:
# Build a report that contains only the TextEvals preset and run it on the
# judged dataset. The second positional argument to run() is None — presumably
# the optional reference dataset; confirm against the Evidently documentation.
report = Report([TextEvals()])
my_eval = report.run(evals_dataset, None)