In [7]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub

%load_ext dotenv
%dotenv

os.environ["LANGCHAIN_TRACING_V2"] = "true"
#os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Two gpt-4o chat clients: `answer_model` (temperature 0.5) produces the
# answers, `eval_model` (temperature 0.1) is used to grade them.
answer_model = ChatOpenAI(
    temperature=0.5,
    model="gpt-4o",
)
eval_model = ChatOpenAI(
    temperature=0.1,
    model="gpt-4o",
)

# Fields the evaluator's reply is parsed into: free-text feedback plus a
# relevance score expressed as a percentage.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("evaluation", "feedback on answer"),
        ("relevance", "evaluation of answer, should be a percentage"),
    )
]

# The parser both generates the formatting instructions that are injected into
# the evaluation prompt and later parses the evaluator's reply.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Fetch the relevance-evaluation prompt from the LangSmith hub. No version is
# given, so whatever the hub serves for this name is used.
relevance_prompt = hub.pull("benchmarking_relevance_v1")

# Re-wrap the pulled template so that the parser's format instructions are
# pre-filled; only question, answer and reference remain to be supplied per call.
# NOTE(review): assumes the hub template contains a {format_instructions}
# placeholder alongside the three input variables - confirm.
eval_prompt = PromptTemplate(
    input_variables=["question", "answer", "reference"],
    partial_variables={"format_instructions": format_instructions},
    template=relevance_prompt.template,
)

# Load test cases (expects "question" and "reference" columns).
testcases_df = pd.read_csv("relevance_gemini_01a.csv")
eval_store_data = []

# The evaluation chain is identical for every test case, so build it once
# rather than on each iteration.
eval_prompt_and_model = eval_prompt | eval_model

# For each test case: get an answer from the answering model, then have the
# evaluation model grade that answer against the reference.
for _, row in testcases_df.iterrows():
    question = row["question"]
    reference = row["reference"]

    # Get answer from LLM
    answer = answer_model.invoke([HumanMessage(content=question)]).content

    # Evaluate response
    output = eval_prompt_and_model.invoke(
        {"question": question, "answer": answer, "reference": reference}
    )

    # Parse the evaluator's reply into a dict keyed by the response schema names
    parsed_result = output_parser.invoke(output)

    # Store one complete record per test case. The record is assembled before
    # appending instead of writing to eval_store_data[index]: the label that
    # iterrows() yields is a DataFrame index label, not a list position, so
    # using it as a list index breaks as soon as the frame is filtered,
    # sampled or otherwise not indexed 0..n-1.
    eval_store_data.append(
        {
            **parsed_result,
            "question": question,
            "reference": reference,
            "answer": answer,
        }
    )

answer_headers = ["question", "reference", "answer", "evaluation", "relevance"]
eval_df = pd.DataFrame(eval_store_data, columns=answer_headers)
%store eval_df    

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
Stored 'eval_df' (DataFrame)


  db[ 'autorestore/' + arg ] = obj


In [8]:
# Display the evaluation results table built in the previous cell.
eval_df

Unnamed: 0,question,reference,answer,evaluation,relevance
0,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"I'm sorry, but I don't have access to real-tim...",The system answer does not provide any of the ...,0%
