# Langfuse + Ragas

## Prepare the Data

In [19]:
from datasets import load_dataset

fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")['baseline']
fiqa_eval

Dataset({
    features: ['question', 'ground_truths', 'answer', 'contexts'],
    num_rows: 30
})

## Score with Trace

In [22]:
ENV_HOST = "https://cloud.langfuse.com"
ENV_SECRET_KEY = "sk-lf-9ef3e2b8-c5ec-4d51-b530-277e5bb98b26"
ENV_PUBLIC_KEY = "pk-lf-83fe3fe9-b4c5-4269-b387-86257297cc3a"

In [23]:
from langfuse import Langfuse
 
langfuse = Langfuse(ENV_PUBLIC_KEY, ENV_SECRET_KEY, ENV_HOST)

In [25]:
row = fiqa_eval[0]

In [28]:
from ragas.metrics import faithfulness, answer_relevancy

In [34]:
from langfuse.model import CreateTrace, CreateSpan, CreateGeneration, CreateEvent, CreateScore
 
trace = langfuse.trace(CreateTrace(name = "rag"))
trace.span(CreateSpan(
    name = "retrieval", input={'query': row['question']}, output={'chunks': row['contexts']}
))
trace.span(CreateSpan(
    name = "generation", input={'contexts': row['contexts']}, output={'answer': row['answer']}
))

faithfulness_score = faithfulness.score_single(
    {'question': row['question'], 'contexts': row['contexts'], 'answer': row['answer']}
)
answer_relevancy.init_model()
answer_relevancy_score = answer_relevancy.score_single(
    {'question': row['question'], 'answer': row['answer']}
)

In [38]:
trace.score(CreateScore(name='faithfulness', value=faithfulness_score))
trace.score(CreateScore(name='answer_relevancy', value=answer_relevancy_score))

<langfuse.client.StatefulClient at 0x7efd5192bf40>

# Scoring as batch

In [42]:
langfuse.client.trace.list(name='summary-generation')

ValidationError: 2 validation errors for ParsingModel[Traces]
__root__ -> data -> 0 -> scores
  field required (type=value_error.missing)
__root__ -> data -> 1 -> scores
  field required (type=value_error.missing)

In [None]:
for i in 

In [7]:
def fetch_all_pages(name=None, user_id = None, limit=50):
    page = 1
    all_data = []
 
    while True:
        response = langfuse.get_generations(name=name, limit=limit, user_id=user_id, page=page)
        if not response.data:
            break
 
        all_data.extend(response.data)
        page += 1
 
    return all_data

In [8]:
generations = fetch_all_pages(name="summary-generation")
print(len(generations))

2


In [17]:
import os
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain import PromptTemplate, OpenAI, LLMChain
from langchain.evaluation.criteria import LabeledCriteriaEvalChain


EVAL_TYPES={
    "hallucination": True,
    "conciseness": True,
    "relevance": True,
    "coherence": True,
    "harmfulness": True,
    "maliciousness": True,
    "helpfulness": True,
    "controversiality": True,
    "misogyny": True,
    "criminality": True,
    "insensitivity": True
}


def get_evaluator_for_key(key: str):
  llm = OpenAI(temperature=0)
  return load_evaluator("criteria", criteria=key, llm=llm)
 
def get_hallucination_eval():
  criteria = {
    "hallucination": (
      "Does this submission contain information"
      " not present in the input or reference?"
    ),
  }
  llm = OpenAI(temperature=0)
 
  return LabeledCriteriaEvalChain.from_llm(
      llm=llm,
      criteria=criteria,
  )

In [18]:
from langfuse.model import InitialScore
 
 
def execute_eval_and_score():
 
  for generation in generations:
    criteria = [key for key, value in EVAL_TYPES.items() if value and key != "hallucination"]
 
    for criterion in criteria:
      eval_result = get_evaluator_for_key(criterion).evaluate_strings(
          prediction=generation.completion,
          input=generation.prompt,
      )
      print(eval_result)
 
      langfuse.score(InitialScore(name=criterion, traceId=generation.trace_id, observationId=generation.id, value=eval_result["score"], comment=eval_result['reasoning']))
 
execute_eval_and_score()
 

AttributeError: 'Observation' object has no attribute 'completion'