# This notebook shows how to do three things:
1. Specify or define evaluators.
2. Run evaluations on a local dataset (held in a pandas DataFrame).
3. Run evaluations _after_ running your RAG query system.

In each case, the results are saved to your LastMile account and will be visible in the UI.

In [4]:
from lastmile_eval.rag.debugger.api.evaluation import evaluate_rag_outputs
import pandas as pd

from lastmile_eval.text.metrics import calculate_rouge1_score

# Project the evaluation is filed under; "123" looks like a placeholder — TODO replace with your own project ID.
project_id = "123"

def rouge1(df: pd.DataFrame):
    """Score the ``output`` column against the ``groundTruth`` column.

    Both columns are converted to plain Python lists and passed to
    ``calculate_rouge1_score`` (outputs first, ground truth second). The
    value that function returns is passed back unchanged.
    """
    predictions = df["output"].tolist()
    references = df["groundTruth"].tolist()
    return calculate_rouge1_score(predictions, references)

# Metric name -> function that scores the evaluation dataframe.
trace_level_evaluators = {"rouge1": rouge1}

# Two-row dataset whose "output" column is already filled in, so no RAG
# query function has to be run before evaluating.
df = pd.DataFrame(
    {
        "query": ["hello", "world"],
        "output": ["hello", "world"],
        "groundTruth": ["hello", "world!"],
    }
)

# Evaluate the prepared outputs. No dataset-level evaluators are used here.
# NOTE(review): "token" looks like a placeholder API token — confirm and
# substitute a real one before running.
eval_result = evaluate_rag_outputs(
    project_id=project_id,
    df=df,
    trace_level_evaluators=trace_level_evaluators,
    dataset_level_evaluators={},
    evaluation_set_name="Hello world: with output prepared",
    lastmile_api_token="token",
)

2024-05-01 14:59:23,944 - Starting new HTTPS connection (1): lastmileai.dev:443
2024-05-01 14:59:24,063 - https://lastmileai.dev:443 "POST /api/evaluation_test_sets/create HTTP/1.1" 200 305
2024-05-01 14:59:24,066 - Starting new HTTPS connection (1): lastmileai.dev:443
2024-05-01 14:59:24,177 - https://lastmileai.dev:443 "GET /api/evaluation_test_cases/list HTTP/1.1" 200 None
2024-05-01 14:59:24,195 - Starting new HTTPS connection (1): s3.amazonaws.com:443
2024-05-01 14:59:24,295 - https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/rouge/evaluate-metric/rouge.py HTTP/1.1" 404 0
2024-05-01 14:59:24,298 - Starting new HTTPS connection (1): huggingface.co:443
2024-05-01 14:59:24,373 - https://huggingface.co:443 "HEAD /spaces/evaluate-metric/rouge/resolve/v0.4.1/rouge.py HTTP/1.1" 404 0
2024-05-01 14:59:24,375 - Starting new HTTPS connection (1): huggingface.co:443
2024-05-01 14:59:24,445 - https://huggingface.co:443 "HEAD /spaces/evaluate-metric/r

In [6]:
# Display the result object returned by the evaluate_rag_outputs call.
eval_result

CreateEvaluationsResult(success=True, message='{"id":"clvo6lg3g0013pebq1q8jdvcx","createdAt":"2024-05-01T18:59:24.987Z","updatedAt":"2024-05-01T18:59:24.987Z","name":"Hello world: with output prepared","paramSet":null,"testSetId":"clvo6lfdr0048qyj0nmwwbhui","creatorId":"clkrgxm850004phi6ee5mvhd1","projectId":null,"organizationId":null,"visibility":"MEMBER","metadata":null,"active":true}')

In [2]:
from functools import partial
from typing import Sequence
from lastmile_eval.rag.debugger.api.evaluation import run_queries_and_evaluate_outputs
from lastmile_eval.text.metrics import calculate_faithfulness_score, calculate_qa_score


# Project the evaluation is filed under; "123" looks like a placeholder — TODO replace with your own project ID.
project_id = "123"

def calculate_qa_score(
    texts_to_evaluate: Sequence[str],
    references: Sequence[str],
    questions: Sequence[str],
    model_name: str = "gpt-3.5-turbo",
) -> list[float]:
    """Return placeholder QA scores, one per entry in ``texts_to_evaluate``.

    This is a mock used because the real implementation was not working.
    ``references``, ``questions`` and ``model_name`` are accepted so the
    signature stays usable as an evaluator, but they are ignored.

    The score at position ``i`` of ``n`` texts is ``0.1 + i / n``, so the
    values depend only on how many texts are passed. An empty input yields
    an empty list.
    """
    total = len(texts_to_evaluate)
    scores = []
    for position in range(total):
        # Evenly spaced fake scores starting at 0.1.
        scores.append(0.1 + position / total)
    return scores


# Metric name -> evaluator callable used for this run.
trace_level_evaluators = {
    # partial() pins model_name explicitly; the value matches the function's own default.
    "qa_score": partial(calculate_qa_score, model_name="gpt-3.5-turbo"),
    # Disabled faithfulness metric. Re-enabling it needs a `lastmile_api_token`
    # variable, which is not defined in the code shown here.
    # "p_faithful": partial(calculate_faithfulness_score, lastmile_api_token=lastmile_api_token),
}

def rag_query_fn_example(query: str):
    """Toy stand-in for a RAG query function.

    Performs no retrieval or generation: it returns a fixed sentence with
    ``query`` quoted inside it.
    """
    canned_answer = f"Survey says: the answer to '{query}' is 42."
    return canned_answer

# Going by the function name, this runs the query function over the queries
# first and then evaluates the outputs — TODO confirm against the library docs.
# NOTE(review): "token" looks like a placeholder API token — confirm and
# substitute a real one before running.
eval_result = run_queries_and_evaluate_outputs(
    project_id=project_id,
    # What to run and what to compare against.
    rag_query_fn=rag_query_fn_example,
    queries=["hello", "world"],
    ground_truth=["hello", "world!"],
    # How to score it; no dataset-level evaluators are used here.
    trace_level_evaluators=trace_level_evaluators,
    dataset_level_evaluators={},
    # Name of the evaluation set and the API token.
    evaluation_set_name="Hello world: run RAG fn first, v6",
    lastmile_api_token="token",
)

2024-05-01 16:18:27,529 - Starting new HTTPS connection (1): lastmileai.dev:443
2024-05-01 16:18:27,852 - https://lastmileai.dev:443 "POST /api/evaluation_test_sets/create HTTP/1.1" 200 305
2024-05-01 16:18:27,855 - Starting new HTTPS connection (1): lastmileai.dev:443
2024-05-01 16:18:28,091 - https://lastmileai.dev:443 "GET /api/evaluation_test_cases/list HTTP/1.1" 200 None
2024-05-01 16:18:28,102 - Starting new HTTPS connection (1): lastmileai.dev:443
2024-05-01 16:18:28,212 - https://lastmileai.dev:443 "POST /api/evaluation_sets/create HTTP/1.1" 200 342


In [3]:
# Display the result object returned by the run_queries_and_evaluate_outputs call.
eval_result

CreateEvaluationsResult(success=True, message='{"id":"clvo9f3zo005lpebqcz7dfb2d","createdAt":"2024-05-01T20:18:28.211Z","updatedAt":"2024-05-01T20:18:28.211Z","name":"Hello world: run RAG fn first, v6","paramSet":null,"testSetId":"clvo9f3pt005gpebq9gpffa6w","creatorId":"clkrgxm850004phi6ee5mvhd1","projectId":null,"organizationId":null,"visibility":"MEMBER","metadata":null,"active":true}')