In [None]:
import os
import pandas as pd
import urllib
import requests
import math
import json
from pathlib import Path
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Define evaluation function
def evaluate_criteria_from_file(parser, eval_llm, prompt, testcases_df):
    eval_store_data = []

    # Iterate over test cases and evaluate answers from file
    for index, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]
        answer = row["nps_advisor_answer"]

        # Evaluate response
        eval_prompt_and_model = prompt | eval_llm
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference})

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Store results
        eval_store_data.append(parsed_result)
        eval_store_data[index]["question"] = question
        eval_store_data[index]["reference"] = reference
        eval_store_data[index]["answer"] = answer
    return eval_store_data

# Define evaluation function
def evaluate_llm_output(parser, eval_llm, prompt, testcases_df, answers):
    eval_store_data = []
 
    # Evaluate responses
    for index, row in testcases_df.iterrows():
        print(row)
        question = testcases_df.loc[index]['question']
        reference = testcases_df.loc[index]['reference']
        
        eval_prompt_and_model = prompt | eval_llm
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answers[index], "reference": reference})

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Store results
        eval_store_data.append(parsed_result)
        eval_store_data[index]["question"] = question
        eval_store_data[index]["reference"] = reference
        eval_store_data[index]["answer"] = answers[index]
        
    return eval_store_data

# Create a prompt
def create_prompt(prompt_template, criteria):
    # Define output schema
    response_schemas = [
        ResponseSchema(name="evaluation", description="feedback on answer"),
        ResponseSchema(
            name=criteria,
            description="evaluation of answer, must be a percentage",
        ),
    ]

    # Define pydanthic output parser
    output_parser = StructuredOutputParser.from_response_schemas(
        response_schemas)
    format_instructions = output_parser.get_format_instructions()

    eval_prompt = PromptTemplate(
        template=prompt_template.template,
        input_variables=["question", "answer", "reference"],
        partial_variables={"format_instructions": format_instructions},
    )
    return eval_prompt, output_parser

# Format score into float
def format_score(criteria, dataframe):
    dataframe[criteria] = dataframe[criteria].str.rstrip(
        '%').astype('float')/100.0
    return dataframe

def calculate_score(relevance, depth, prioritization):
    if math.isnan(prioritization):
        return relevance * 0.500 + depth * 0.500
    return relevance * 0.450 + depth * 0.450 + prioritization * 0.100

def apply_calculate_score(dataframe):
    return calculate_score(dataframe['relevance'], dataframe['depth'], dataframe['prioritization'])

def get_answers(testcases_df, tested_llm, payload_params):
    answers = []    
    
    if tested_llm['name'] == 'nps_advisor':
        url = payload_params[0]
        reframe = payload_params[1]
        guardrails = payload_params[2]
        telco_operator = payload_params[3]
        
        # Loop through question list and get responses
        for index, row in testcases_df.iterrows():
            question = row["question"]
            question_reframed = urllib.parse.quote_plus(question)

            # # Get answer from LLM
            payload = "/".join([question_reframed, reframe,
                            guardrails, telco_operator])
            answer = requests.get(url + payload)
            answers.append((index, json.loads(answer.content)['response'][0]))
            
    else:
        for index, row in testcases_df.iterrows():
            question = row["question"]

            # # Get answer from LLM
            answer = tested_llm['model'].invoke([HumanMessage(content=question)]).content
            answers.append((index, answer))
    
    return answers
  
# Main exec
def execute_benchmarks(prompt, testcases, criteria, 
                       headers, test_target, answers):
    test_models = ['nps_advisor', 'gpt', 'gemini']
    
    prompt, parser = create_prompt(prompt, criteria)
    if criteria == "prioritization":  # Select only rows with prioritization flag
        testcases = testcases.loc[testcases['prioritization_flag'] == 1]

    if test_target['name'] in test_models:
        store_data = evaluate_llm_output(
            parser, eval_model, prompt, testcases, answers)
    else:
        store_data = evaluate_criteria_from_file(
            parser, eval_model, prompt, testcases)
    df = pd.DataFrame(store_data, columns=headers)
    df = format_score(criteria, df)

    # Save output
    
    Path(os.getcwd() + output_dir + test_target['name'] + "/").mkdir(parents=True, exist_ok=True)
    df.to_excel(os.getcwd() + output_dir + test_target['name'] + "/" + criteria + "_scoring.xlsx")
    return df


# Load test cases and data store
testcases_df = pd.read_csv("testcases_v1.csv")
reference_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Define variables
relevance = "relevance"
depth = "depth"
prioritization = "prioritization"

relevance_reference_header = "nps_advisor_relevance"
depth_reference_header = "nps_advisor_depth"
prioritization_reference_header = "nps_advisor_priority"

relevance_answer_headers = ["question",
                            "reference", "answer", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference",
                        "answer", "evaluation", "depth"]
priority_answer_headers = ["question", "reference",
                           "answer", "evaluation", "prioritization"]

url = 'https://agenticworkflows.onrender.com/invoke/'
reframe = 'true'
guardrails = 'true'
telco_operator = 'Circles.Life'
payload_params = [url, reframe, guardrails, telco_operator]

output_dir = '/output/'

# Pull latest prompt from LangSmith
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")
priority_prompt = hub.pull("benchmarking_prioritization_v1")

# Initialize models to test
gpt4_model = {'name': 'gpt', 'model':ChatOpenAI(model="gpt-4o", temperature=0.5)}
gemini_model = {'name': 'gemini', 'model': ChatGoogleGenerativeAI(model="gemini-pro")}
nps_advisor = {'name': 'nps_advisor'}

test_models = ['nps_advisor', 'gpt', 'gemini']

# Initialize evaluator model
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Target
test_target = nps_advisor

# Switches
test_relevence = True
test_depth = True
test_priority = True

# Run relevance benchmark
if test_target['name'] in test_models:
    answers = get_answers(testcases_df, test_target, payload_params)
    # print(answers)
else:
    answers = ''
                      
if test_relevence:
    rel_df = execute_benchmarks(relevance_prompt, testcases_df,
                                relevance, relevance_answer_headers, 
                                test_target, answers)
    display(rel_df)
    %store rel_df

if test_depth:
    dep_df = execute_benchmarks(depth_prompt, testcases_df,
                                depth, depth_answer_headers, 
                                test_target, answers)
    display(dep_df)
    %store dep_df

if test_priority:
    pri_df = execute_benchmarks(priority_prompt, testcases_df,
                                prioritization, priority_answer_headers, 
                                test_target, answers)
    display(pri_df)
    %store pri_df
    
# Calculate weighted score
output_df = pd.merge(rel_df, dep_df, how="inner", on=[
                     "question", "reference"])
output_df = pd.merge(output_df, pri_df, how="outer",
                     on=["question", "reference"])
output_df["weighted_score"] = output_df.apply(apply_calculate_score, axis=1)

Path(os.getcwd() + output_dir).mkdir(parents=True, exist_ok=True)
output_df.to_excel(os.getcwd() + output_dir + test_target['name'] + "_weighted_score.xlsx")


Unnamed: 0                                                             1
question               Summarize the NPS for this year (2024) for Cir...
prioritization_flag                                                    1
reference              Average NPS of -17 in 2024. \n\nJanuary 2024: ...
nps_advisor_answer     As of 2024, Circles.Life has experienced fluct...
Name: 0, dtype: object
Unnamed: 0                                                             2
question               What trends have Circles life observed in NPS ...
prioritization_flag                                                    1
reference              "2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...
nps_advisor_answer     ### Circles.Life NPS Trends Over the Past 5 Ye...
Name: 1, dtype: object
Unnamed: 0                                                             3
question               What are the top issues mentioned in the App S...
prioritization_flag                                                    1
refer

Unnamed: 0,question,reference,answer,evaluation,relevance
0,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"(0, <Response [200]>)",The answer does not provide any of the key poi...,0.0
1,What trends have Circles life observed in NPS ...,"""2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...","(1, <Response [200]>)",The answer does not provide any information ab...,0.0
2,What are the top issues mentioned in the App S...,Network Reliability: A significant number of u...,"(2, <Response [200]>)",The answer provided does not address any of th...,0.0
3,What emerging themes or trends for Circles.Lif...,"""Analyzing the provided NPS data and customer ...","(3, <Response [200]>)",The answer provided does not contain any of th...,0.0
4,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...","(4, <Response [200]>)",The answer does not provide any of the key poi...,0.0
5,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,"(5, <Response [200]>)",The answer does not provide any information re...,0.0
6,Compare the NPS of Circles with other Singapor...,Circles.Life has achieved a Net Promoter Score...,"(6, <Response [200]>)",The answer provided does not address the quest...,0.0
7,What key drivers contribute to customers' loya...,4 Key Strategies:\n\nExceptional Customer Serv...,"(7, <Response [200]>)",The answer provided does not contain any infor...,0.0
8,"Based on the reasons for NPS, which aspects sh...",Positive Aspects (Promoter Reasons):\nValue fo...,"(8, <Response [200]>)",The answer provided does not contain any key p...,0.0
9,What are the primary reasons for promoters' hi...,Value for Money: Affordable plans that provide...,"(9, <Response [200]>)",The answer provided does not list any key poin...,0.0


Stored 'rel_df' (DataFrame)
Unnamed: 0                                                             1
question               Summarize the NPS for this year (2024) for Cir...
prioritization_flag                                                    1
reference              Average NPS of -17 in 2024. \n\nJanuary 2024: ...
nps_advisor_answer     As of 2024, Circles.Life has experienced fluct...
Name: 0, dtype: object
Unnamed: 0                                                             2
question               What trends have Circles life observed in NPS ...
prioritization_flag                                                    1
reference              "2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...
nps_advisor_answer     ### Circles.Life NPS Trends Over the Past 5 Ye...
Name: 1, dtype: object
Unnamed: 0                                                             3
question               What are the top issues mentioned in the App S...
prioritization_flag                               

KeyboardInterrupt: 

In [None]:
import pandas as pd
import math
import os


def calculate_score(relevance, depth, prioritization):
    if math.isnan(prioritization):
        return relevance * 0.500 + depth * 0.500
    return relevance * 0.450 + depth * 0.450 + prioritization * 0.100


def apply_calculate_score(dataframe):
    return calculate_score(dataframe['relevance'], dataframe['depth'], dataframe['prioritization'])


output_df = pd.merge(rel_df, dep_df, how="inner", on=[
                     "question", "reference"])
output_df = pd.merge(output_df, pri_df, how="outer",
                     on=["question", "reference"])

# display(output_df)
output_df["weighted_score"] = output_df.apply(apply_calculate_score, axis=1)
output_df.to_excel(os.getcwd() + "/data/weighted_score.xlsx")

In [None]:
from pathlib import Path
import os
Path(os.getcwd()+"/my/directory").mkdir(parents=True, exist_ok=True)

In [39]:
index = 0
testcases_df.loc[index]

Unnamed: 0                                                             1
question               Summarize the NPS for this year (2024) for Cir...
prioritization_flag                                                    1
reference              Average NPS of -17 in 2024. \n\nJanuary 2024: ...
nps_advisor_answer     As of 2024, Circles.Life has experienced fluct...
Name: 0, dtype: object