In [None]:
# Run RAG.ipynb first. Later cells use `rags_for_eval`, which is not defined
# anywhere in this notebook — presumably it is created by RAG.ipynb (confirm there).
%run RAG.ipynb

In [17]:
import os
import openai

from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# Read the key from the environment instead of hardcoding it in the notebook.
# The lookup yields None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Path of the evaluation-question file, as reported by the first RAG in the list.
eval_questions_path = rags_for_eval[0].get_evaluation_questions()

# One question per line. Lines that are empty after stripping whitespace
# (e.g. a trailing blank line) are dropped so that no empty string is later
# sent to a RAG as a question.
with open(eval_questions_path, 'r') as file:
    stripped_lines = (line.strip() for line in file)
    eval_questions = [question for question in stripped_lines if question]

In [19]:
from trulens.apps.custom import instrument
from trulens.core import TruSession

# Session object shared by the later cells (leaderboard and dashboard).
session = TruSession()
# NOTE(review): judging by the method name, this clears data stored by earlier
# runs so every execution of the notebook starts from an empty database —
# confirm against the TruLens documentation before relying on it.
session.reset_database()

In [20]:
def get_trulens_app(rag, feedbacks:list):
  """
  Wrap a RAG object in a TruCustomApp configured with the given feedbacks.

  The app is always registered under the name "RAG"; the RAG's own `name`
  attribute is used as the app version.

  Args:
    rag: RAG object. Its `name` attribute is read for the app version.
    feedbacks: List of feedbacks handed to TruCustomApp unchanged.

  Returns:
    The TruCustomApp instance built around `rag`.
  """
  from trulens.apps.custom import TruCustomApp

  app_options = {
    "app_name": "RAG",
    "app_version": rag.name,
    "feedbacks": feedbacks,
  }
  return TruCustomApp(rag, **app_options)
  

 


def build_trulens_recorder(rag, model_engine:str="gpt-4o-mini"):
  """
  Build trulens recorder for a given rag, setting the feedback functions and building the recorder app.

  Three feedback functions are created from one OpenAI provider:
    - "Groundedness": collected return values of the `retrieve` calls vs. the app output.
    - "Answer Relevance": app input vs. app output.
    - "Context Relevance": app input vs. each return value of `retrieve`,
      with the per-item scores averaged using `np.mean`.

  Args:
    rag: RAG object. The selectors refer to calls of a method named `retrieve`
      on the recorded app (assumes `rag` exposes such a method to TruLens —
      confirm in the RAG definition).
    model_engine: Name of the OpenAI model used by the feedback provider.
      Defaults to "gpt-4o-mini", the value that used to be hard-coded.

  Returns:
    trulens recorder app, as produced by `get_trulens_app`.
  """
  from trulens.providers.openai import OpenAI
  from trulens.core import Feedback
  from trulens.core import Select
  import numpy as np

  provider = OpenAI(model_engine=model_engine)

  # Groundedness: all `retrieve` return values, gathered into one argument,
  # are compared against the final output.
  f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
  )

  # Question/answer relevance between overall question and answer.
  f_answer_relevance = (
    Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input()
    .on_output()
  )

  # Question/statement relevance between question and each context chunk;
  # the `[:]` selector yields one evaluation per item, reduced with the mean.
  f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(Select.RecordCalls.retrieve.rets[:])
    .aggregate(np.mean)
  )

  feedbacks = [f_groundedness, f_answer_relevance, f_context_relevance]

  return get_trulens_app(rag, feedbacks)


def eval_rags(rags:list, questions:list):
  """
  Run every question through every RAG while a TruLens recorder is active.

  For each RAG a recorder is built with `build_trulens_recorder`, and all of
  that RAG's queries are issued inside the recorder's `with` block. The values
  returned by `query` are not kept.

  Args:
    rags: List of RAG objects; each needs a `name` attribute and a `query` method.
    questions: List of questions, passed one at a time to `query`.

  Returns:
    None
  """
  from tqdm import tqdm

  for rag in rags:
    print("Evaluating: ", rag.name)

    # The recorder is built after the banner is printed and entered right away;
    # the object bound by `as` is not used afterwards.
    with build_trulens_recorder(rag) as _recording:
      for question in tqdm(questions):
        rag.query(question)

    print("\nFinished evaluation\n")


In [None]:
# Only the first two questions are evaluated here (slice `[:2]`);
# widen or remove the slice to run the full question list.
eval_rags(rags=rags_for_eval, questions=eval_questions[:2])

In [None]:
# Last expression of the cell, so the returned leaderboard is rendered as the
# cell output. Presumably it summarises the feedback results recorded by the
# evaluation cell above — confirm the exact contents in the TruLens docs.
session.get_leaderboard()

In [None]:
# Bare expression: shows the session object's repr as the cell output.
session

In [None]:
from trulens.dashboard import run_dashboard

# Start the TruLens dashboard for the session used in this notebook.
run_dashboard(session)

In [24]:
# from trulens.dashboard import stop_dashboard

# stop_dashboard(
#     session
# )