In [24]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI

# Load the IPython `dotenv` extension, then run its `%dotenv` magic
# (presumably populates os.environ from a local .env file — confirm).
# NOTE(review): on a kernel where the extension is already loaded, `%load_ext`
# prints an "already loaded" notice suggesting `%reload_ext dotenv`.
%load_ext dotenv
%dotenv

# Set the LangChain tracing flag in the process environment.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Interactive key prompts are disabled; NOTE(review): the API keys are
# presumably expected to come from the environment instead — confirm.
#os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Column names (and order) used when building the per-criterion result
# DataFrames from the lists returned by evaluate_criteria.
relevance_answer_headers = ["question", "reference", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference", "evaluation", "depth"]

# Define evaluation function
def evaluate_criteria(parser, testcases_df, eval_llm, tested_llm, prompt):
    """Answer every test case with ``tested_llm`` and grade it with ``eval_llm``.

    Args:
        parser: Output parser; ``parser.invoke`` must return a mutable mapping,
            because ``question`` and ``reference`` keys are added to its result.
        testcases_df: DataFrame with ``question`` and ``reference`` columns.
            Its index may be anything; rows are processed in iteration order.
        eval_llm: Evaluator model, composed with ``prompt`` as ``prompt | eval_llm``.
        tested_llm: Model under test; receives each question as a single
            ``HumanMessage`` and must return an object with a ``content`` attribute.
        prompt: Evaluation prompt, invoked with the ``question``, ``answer``
            and ``reference`` variables.

    Returns:
        list: One parsed evaluation per test case, in row order, each extended
        with the ``question`` and ``reference`` it belongs to. The tested
        model's answer itself is not stored in the record.
    """
    # The evaluation chain does not depend on the row, so compose it once.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    # The index label from iterrows() is deliberately ignored: it is not a list
    # position and would mis-address records for any non-default index.
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]

        # Get the answer from the model under test.
        answer = tested_llm.invoke([HumanMessage(content=question)]).content

        # Have the evaluator model grade the answer against the reference.
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference}
        )

        # Parse the evaluator output, then attach the test-case context to the
        # record itself before storing it.
        parsed_result = parser.invoke(output)
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        eval_store_data.append(parsed_result)

    return eval_store_data

# Create a prompt 
def create_prompt(prompt_template, criteria):
    """Build the evaluation prompt and the matching structured output parser.

    Args:
        prompt_template: Object exposing a ``template`` attribute holding the
            prompt text (here: prompts pulled from the hub).
        criteria: Name of the score field the evaluator must fill in,
            e.g. ``"relevance"`` or ``"depth"``.

    Returns:
        tuple: ``(eval_prompt, output_parser)`` where ``eval_prompt`` is a
        ``PromptTemplate`` expecting ``question``, ``answer`` and ``reference``
        with ``format_instructions`` pre-filled, and ``output_parser`` is the
        ``StructuredOutputParser`` for the two-field response.
    """
    # Two-field response: free-text feedback plus the score for `criteria`.
    feedback_field = ResponseSchema(name="evaluation", description="feedback on answer")
    score_field = ResponseSchema(
        name=criteria,
        description="evaluation of answer, should be a percentage",
    )

    # Structured output parser and the formatting instructions it generates.
    output_parser = StructuredOutputParser.from_response_schemas(
        [feedback_field, score_field]
    )
    instructions = output_parser.get_format_instructions()

    # Re-wrap the template text so the format instructions are bound up front.
    eval_prompt = PromptTemplate(
        template=prompt_template.template,
        input_variables=["question", "answer", "reference"],
        partial_variables={"format_instructions": instructions},
    )

    return eval_prompt, output_parser

# Pull the relevance and depth evaluation prompts from the prompt hub.
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")

# Initialize models to test.
# NOTE(review): gpt4_model is created but only gemini_model is passed to
# evaluate_criteria in this cell.
gpt4_model  = ChatOpenAI(model="gpt-4o", temperature=0.5)
gemini_model = ChatGoogleGenerativeAI(model="gemini-pro")

# Initialize evaluator model (the model that grades the answers).
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.1)

# Load test cases; evaluate_criteria reads the "question" and "reference" columns.
testcases_df = pd.read_csv("relevance_gemini_01.csv")

# Run relevance benchmark: build prompt + parser, evaluate every test case,
# then tabulate the records using the relevance column order.
rel_prompt, parser = create_prompt(relevance_prompt, "relevance")
relevance_store_data = evaluate_criteria(parser, testcases_df, eval_model, gemini_model, rel_prompt)
rel_df = pd.DataFrame(relevance_store_data, columns=relevance_answer_headers)

# Run depth benchmark the same way; note that `parser` is rebound here.
dep_prompt, parser = create_prompt(depth_prompt, "depth")
depth_store_data = evaluate_criteria(parser, testcases_df, eval_model, gemini_model, dep_prompt)
dep_df = pd.DataFrame(depth_store_data, columns=depth_answer_headers)

# Join both result tables on the test case; the shared "evaluation" column
# comes out as evaluation_x (relevance) and evaluation_y (depth).
eval_df = pd.merge(rel_df, dep_df, how="inner", on=["question", "reference"])
# NOTE(review): this bare expression is not the last statement of the cell,
# so it displays nothing; the table is shown by the following cell instead.
eval_df
# Persist eval_df with IPython's %store magic.
%store eval_df    

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
Stored 'eval_df' (DataFrame)


  db[ 'autorestore/' + arg ] = obj


In [25]:
# Display the merged relevance/depth evaluation table as the cell output.
eval_df

Unnamed: 0,question,reference,evaluation_x,relevance,evaluation_y,depth
0,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,The system answer does not provide any of the ...,0%,The system_answer does not provide any specifi...,0%
1,What trends have Circles life observed in NPS ...,"""2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...",The system answer does not address the specifi...,0%,The system_answer does not provide any of the ...,0%
2,What are the top issues mentioned in the App S...,Network Reliability: A significant number of u...,The system answer covers several key issues me...,67%,The system answer covers several key issues me...,66%
3,What emerging themes or trends for Circles.Lif...,"""Analyzing the provided NPS data and customer ...",The system answer covers several key points fr...,80%,The system answer covers several key themes an...,70%
4,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...",The system answer does not provide the specifi...,0%,The system_answer provides a comprehensive lis...,50%
5,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,The system answer covers several key issues me...,60%,The system_answer addresses several key issues...,60%
6,Compare the NPS of Circles with other Singapor...,Circles.Life has achieved a Net Promoter Score...,The system answer provides a detailed comparis...,70%,The system_answer provides a detailed comparis...,70%
7,What key drivers contribute to customers' loya...,4 Key Strategies:\n\nExceptional Customer Serv...,The system answer covers several key drivers o...,50%,The system answer provides a comprehensive lis...,50%
8,"Based on the reasons for NPS, which aspects sh...",Positive Aspects (Promoter Reasons):\nValue fo...,The system answer covers several key points fr...,80%,The system_answer covers several relevant aspe...,50%
9,What are the primary reasons for promoters' hi...,Value for Money: Affordable plans that provide...,The system answer does not address the questio...,0%,The system_answer does not address any of the ...,0%
