In [121]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI

# Load the IPython dotenv extension and read variables from a local .env file
# into the process environment.
%load_ext dotenv
%dotenv

# Set the LangChain tracing flag (presumably enables LangSmith tracing — confirm).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Interactive key prompts are disabled; the API keys are presumably supplied by
# the .env file loaded above — TODO confirm.
#os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Load test cases and data store
# testcases_df rows are read by evaluate_criteria through the columns
# "question", "reference" and "nps_advisor_answer".
testcases_df = pd.read_csv("relevance_gemini_01.csv")
# NOTE(review): manual_scores_df is not referenced again in the visible cells;
# the comparison cells re-read the same CSV themselves.
manual_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Define output headers
# Column order used when the per-criterion results are turned into DataFrames.
relevance_answer_headers = ["question", "reference", "answer", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference", "answer", "evaluation", "depth"]

# Define evaluation function
def evaluate_criteria(parser, testcases_df, eval_llm, tested_llm, prompt):
    """Evaluate the stored answer of every test case with the evaluator LLM.

    For each row of ``testcases_df`` the prompt is filled with the row's
    question, pre-recorded answer and reference, sent to ``eval_llm``, and the
    raw output is parsed with ``parser``.

    Args:
        parser: Object whose ``invoke(output)`` returns a mutable mapping
            holding the parsed evaluation fields.
        testcases_df: DataFrame with the columns ``"question"``,
            ``"reference"`` and ``"nps_advisor_answer"``. Any index is
            accepted; it is not used.
        eval_llm: Evaluator model, composed with ``prompt`` via ``|``.
        tested_llm: Model under test. Currently unused because answers are
            read from the ``"nps_advisor_answer"`` column instead of being
            generated; kept for interface compatibility.
        prompt: Prompt accepting ``question``, ``answer`` and ``reference``.

    Returns:
        A list with one parsed result per row, in row order, each extended
        with the keys ``"question"``, ``"reference"`` and ``"answer"``.
    """
    # The chain is the same for every row, so build it once up front.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]

        # Answers come from the pre-recorded column. To generate them live
        # instead, use:
        #   answer = tested_llm.invoke([HumanMessage(content=question)]).content
        answer = row["nps_advisor_answer"]

        # Evaluate the answer and parse the evaluator's raw output.
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference}
        )
        parsed_result = parser.invoke(output)

        # Annotate the record directly before storing it. Indexing the result
        # list with the DataFrame index label breaks whenever the frame does
        # not carry a default 0..n-1 index.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)
    return eval_store_data

# Create a prompt 
def create_prompt(prompt_template, criteria):
    """Build the evaluation prompt and its matching structured-output parser.

    Args:
        prompt_template: Object exposing a ``template`` attribute with the
            prompt text (here, the prompts pulled from the hub).
        criteria: Name of the score field the evaluator must return, e.g.
            ``"relevance"`` or ``"depth"``.

    Returns:
        A ``(eval_prompt, output_parser)`` tuple: a ``PromptTemplate`` taking
        ``question``, ``answer`` and ``reference`` with the parser's format
        instructions pre-filled, and the ``StructuredOutputParser`` that reads
        the ``"evaluation"`` and ``criteria`` fields.
    """
    # Two fields are requested from the evaluator: free-text feedback and the
    # score for the requested criterion.
    feedback_schema = ResponseSchema(name="evaluation", description="feedback on answer")
    score_schema = ResponseSchema(
        name=criteria,
        description="evaluation of answer, should be a percentage",
    )

    # Structured output parser (not a Pydantic parser) built from the schemas.
    output_parser = StructuredOutputParser.from_response_schemas(
        [feedback_schema, score_schema]
    )
    format_instructions = output_parser.get_format_instructions()

    prompt_kwargs = {
        "template": prompt_template.template,
        "input_variables": ["question", "answer", "reference"],
        "partial_variables": {"format_instructions": format_instructions},
    }
    eval_prompt = PromptTemplate(**prompt_kwargs)
    return eval_prompt, output_parser

# Pull latest prompt from LangSmith
# Fetch the two evaluation prompts by name via hub.pull.
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")

# Initialize models to test
# NOTE(review): gpt4_model is not referenced again in the visible cells, and
# gemini_model is only passed as `tested_llm`, which evaluate_criteria does not
# invoke while answers are read from the CSV.
gpt4_model  = ChatOpenAI(model="gpt-4o", temperature=0.5)
gemini_model = ChatGoogleGenerativeAI(model="gemini-pro")

# Initialize evaluator model
# temperature=0.0 is presumably chosen for more repeatable scoring — confirm.
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Toggles selecting which benchmark(s) this cell runs.
test_relevence = True
test_depth = False

# Run relevance benchmark
# rel_df is only defined (and stored) when test_relevence is True.
if test_relevence:
    rel_prompt, parser = create_prompt(relevance_prompt, "relevance")
    relevance_store_data = evaluate_criteria(parser, testcases_df, eval_model, gemini_model, rel_prompt)
    rel_df = pd.DataFrame(relevance_store_data, columns=relevance_answer_headers)
    # Persist the result with IPython's %store so it can be restored later.
    %store rel_df

# Run depth benchmark
# dep_df is only defined (and stored) when test_depth is True; the depth
# comparison cell depends on it.
if test_depth:
    dep_prompt, parser = create_prompt(depth_prompt, "depth")
    depth_store_data = evaluate_criteria(parser, testcases_df, eval_model, gemini_model, dep_prompt)
    dep_df = pd.DataFrame(depth_store_data, columns=depth_answer_headers)
    # Persist the result with IPython's %store so it can be restored later.
    %store dep_df
    

# Store outputs
# eval_df = pd.merge(rel_df, dep_df, how="inner", on=["question", "reference"])

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
Stored 'rel_df' (DataFrame)


  db[ 'autorestore/' + arg ] = obj


In [120]:
#Depth
# Compare the evaluator's depth scores with the reference scores and report
# the mean absolute difference.
import pandas as pd

golden_reference = pd.read_csv("reference_scoring_v1.csv")

# Keep only the merge keys and the reference depth score.
golden_reference = golden_reference.loc[:,["question", "reference", "nps_advisor_depth"]]
# NOTE: dep_df is only created by the benchmark cell when test_depth is True;
# on a fresh kernel it must be produced (or restored) before running this cell.
compare_df = pd.merge(dep_df, golden_reference, how="inner", on=["question", "reference"])

# Both score columns are expected to hold percentage strings (a trailing '%'
# is stripped); convert them to fractions.
compare_df['depth'] = compare_df['depth'].str.rstrip('%').astype('float')/100.0
compare_df['nps_advisor_depth'] = compare_df['nps_advisor_depth'].str.rstrip('%').astype('float')/100.0
# Vectorized absolute difference: same values as a row-wise apply, and it also
# works when the merge produced no rows.
compare_df['delta'] = (compare_df['depth'] - compare_df['nps_advisor_depth']).abs()

print("Delta: "+str(compare_df['delta'].mean()))
compare_df.to_excel("dep_scoring.xlsx")

Delta: 0.13738666666666666


In [122]:
#Relevance
# Compare the evaluator's relevance scores with the reference scores and
# report the mean absolute difference.
import pandas as pd

golden_reference = pd.read_csv("reference_scoring_v1.csv")

# Keep only the merge keys and the reference relevance score.
golden_reference = golden_reference.loc[:,["question", "reference", "nps_advisor_relevance"]]
# NOTE: rel_df is only created by the benchmark cell when test_relevence is
# True; on a fresh kernel it must be produced (or restored) before running
# this cell.
compare_df = pd.merge(rel_df, golden_reference, how="inner", on=["question", "reference"])

# Both score columns are expected to hold percentage strings (a trailing '%'
# is stripped); convert them to fractions.
compare_df['relevance'] = compare_df['relevance'].str.rstrip('%').astype('float')/100.0
compare_df['nps_advisor_relevance'] = compare_df['nps_advisor_relevance'].str.rstrip('%').astype('float')/100.0
# Vectorized absolute difference: same values as a row-wise apply, and it also
# works when the merge produced no rows.
compare_df['delta'] = (compare_df['relevance'] - compare_df['nps_advisor_relevance']).abs()

print("Delta: "+str(compare_df['delta'].mean()))
compare_df.to_excel("rel_scoring.xlsx")

Delta: 0.09943999999999999
