In [None]:
import os
from time import sleep

import pandas as pd
import phoenix as px
from datasets import Dataset
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from phoenix.session.evaluation import get_qa_with_reference
from phoenix.trace import SpanEvaluations, using_project
from phoenix.trace.langchain import LangChainInstrumentor
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness, answer_correctness,
)

In [None]:
# load the .env file before any of the settings below are looked up
load_dotenv()

# Azure OpenAI settings taken from the environment; a value is None when its
# variable is not set.
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
api_version = os.environ.get("AZURE_OPENAI_VERSION")
deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
embedding_deployment_name = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
# NOTE(review): read here, but the chat model in this notebook is created with a
# literal temperature=0.0, so this value is not used by the visible cells.
temperature = os.environ.get("TEMPERATURE")

In [None]:
def build_chain(embeddings_model, model, persist_directory="./chroma_db"):
    """Assemble a retrieval-augmented question-answering chain.

    Args:
        embeddings_model: Embedding function handed to the Chroma vector store.
        model: Chat model that is combined with the prompt to produce answers.
        persist_directory: Directory of the persisted Chroma store. Defaults to
            "./chroma_db", the location that used to be hard-coded, so existing
            two-argument calls behave as before.

    Returns:
        The chain returned by ``create_retrieval_chain``, wired to a retriever
        over the vector store and to the question-answering prompt below.
    """
    # load vectorstore
    vectorstore = Chroma(
        embedding_function=embeddings_model,
        persist_directory=persist_directory,
    )

    # initialize a retriever from the vectorstore
    retriever = vectorstore.as_retriever()

    # create a system prompt that tells the LLM to answer questions based on the given context
    # and use a variable that represents the context
    system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise."
        "\n\n"
        "{context}"
    )

    # create a prompt template with the system prompt; "{input}" is the
    # placeholder for the user's question
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    # create a helper chain that inserts the retrieved documents into the prompt
    question_answer_chain = create_stuff_documents_chain(model, prompt)

    # create the final RAG chain
    chain = create_retrieval_chain(retriever, question_answer_chain)

    return chain

In [None]:
def generate_ragas_dataset(chain, test_data_df):
    """Run the chain on every test question and package the results as a Dataset.

    Args:
        chain: Object whose ``invoke({"input": question})`` returns a mapping
            with an "answer" entry and a "context" entry; every context item
            must expose a ``page_content`` attribute.
        test_data_df: DataFrame with a "question" column. It is left unmodified.

    Returns:
        A ``Dataset`` built from a copy of ``test_data_df`` extended with an
        "answer" column and a "contexts" column (one list of page contents per
        question).
    """
    answers = []
    contexts = []

    # execute chain and store answers and retrieved context
    for question in test_data_df["question"]:
        response = chain.invoke({"input": question})
        answers.append(response["answer"])
        contexts.append([doc.page_content for doc in response["context"]])

    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain the "answer" and "contexts" columns
    enriched_df = test_data_df.assign(answer=answers, contexts=contexts)

    return Dataset.from_pandas(enriched_df)

In [None]:
# initialize the Embedding Model
# The API key is passed explicitly, exactly as for the chat model below, so that
# both clients are configured from the same `api_key` value instead of one of
# them depending on the library looking the key up by itself.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_endpoint,
    azure_deployment=embedding_deployment_name,
    api_key=api_key,
    api_version=api_version,
    openai_api_type="azure",
)

# initialize the Azure OpenAI Model
# temperature is fixed at 0.0 here; the TEMPERATURE environment value read in
# the configuration cell is not used.
model = AzureChatOpenAI(
    azure_endpoint=azure_endpoint,
    deployment_name=deployment_name,
    api_key=api_key,
    api_version=api_version,
    openai_api_type="azure",
    temperature=0.0,
    streaming=True,
)

In [None]:
# build the RAG chain from the embedding model and the chat model
chain = build_chain(embeddings_model=embeddings, model=model)

In [None]:
# load only the question and ground-truth columns of the test set into a dataframe
evaluation_data = pd.read_csv("testdata.csv", usecols=["question", "ground_truth"])

In [None]:
# start phoenix session and client
# NOTE(review): use_temp_dir=False presumably keeps Phoenix data outside a
# temporary directory so it survives between runs — confirm against the docs.
session = px.launch_app(use_temp_dir=False)
# `client` is used by a later cell to query the recorded spans
client = px.Client()

# initialize Langchain auto-instrumentation
# NOTE(review): presumably this has to run before the chain is invoked so that
# the invocations are recorded — keep this cell ahead of the dataset generation.
LangChainInstrumentor().instrument()

In [None]:
# run the chain over the test questions inside the separate "test" project
with using_project("test"):
    ragas_eval_dataset = generate_ragas_dataset(chain=chain, test_data_df=evaluation_data)

# tabular copy of the generated dataset, shown as the cell output
ragas_evals_df = pd.DataFrame(data=ragas_eval_dataset)
ragas_evals_df

In [None]:
# wait a few seconds in case data hasn't become fully available yet
# NOTE(review): this is a fixed delay, not a readiness check — if the spans are
# still missing after 5 seconds the query below may come back incomplete.
sleep(5)

# collect information about rag spans
# (queried from the "test" project, the same project name the chain
# invocations were run under)
spans_dataframe = get_qa_with_reference(client, project_name="test")

spans_dataframe

In [None]:
# run the ragas evaluation inside its own "ragas-evals" project so that what
# ragas does under the hood can be inspected separately
with using_project("ragas-evals"):
    evaluation_result = evaluate(
        dataset=ragas_eval_dataset,
        metrics=[
            faithfulness,
            answer_correctness,
            context_recall,
            context_precision,
        ],
        llm=model,
        embeddings=embeddings,
    )

# evaluation scores as a dataframe
eval_scores_df = pd.DataFrame(data=evaluation_result.scores)

# evaluated data as a dataframe
eval_data_df = pd.DataFrame(data=evaluation_result.dataset)

In [None]:
# assign span ids to the ragas evaluation scores (needed so the scores can be
# logged against the matching spans in Phoenix).
# Build a lookup with one row per distinct "input" value; reset_index() turns
# the span frame's index into a regular column and "input" is renamed to
# "question" so it can be used as the merge key.
# NOTE(review): the set_index calls below expect that column to be named
# "context.span_id" — confirm this is the index name of spans_dataframe.
# NOTE(review): when the same input occurs in several spans only one row is
# kept; verify that it is the span the scores should be attached to.
span_questions = (
    spans_dataframe[["input"]]
    .sort_values("input")
    .drop_duplicates(subset=["input"], keep="first")
    .reset_index()
    .rename({"input": "question"}, axis=1)
)

# merge() is called without `how`, i.e. with the pandas default join; rows
# whose question has no matching span do not get a span id.
ragas_evals_df = ragas_evals_df.merge(span_questions, on="question").set_index("context.span_id")
eval_data_df = eval_data_df.merge(span_questions, on="question").set_index("context.span_id")
# The scores take over the span-id index purely by position.
# NOTE(review): assumes eval_scores_df still has the same number of rows, in
# the same order, as the merged eval_data_df — confirm.
eval_scores_df.index = eval_data_df.index

In [None]:
# log every metric column as its own set of span evaluations
for eval_name in eval_scores_df.columns:
    # single-column frame with the metric's values under the name "score"
    evals_df = eval_scores_df[[eval_name]].rename(columns={eval_name: "score"})
    evals = SpanEvaluations(eval_name, evals_df)
    # reuse the client created when the Phoenix session was started instead of
    # constructing a new px.Client() on every iteration
    client.log_evaluations(evals)

In [None]:
# shut down the Phoenix app that was started with px.launch_app() earlier in the notebook
px.close_app()