In [None]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) via python-dotenv before any client is created.
load_dotenv()

# NOTE(review): presumably switches on LangSmith tracing for LangChain calls — confirm.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Alternative to the .env file: prompt for the keys interactively.
#os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Evaluate pre-recorded answers that are stored in the test-case file
def evaluate_criteria_from_file(parser, testcases_df, eval_llm, prompt):
    """Score the pre-recorded answers in ``testcases_df`` with the evaluator LLM.

    Args:
        parser: Object whose ``invoke`` converts the evaluator output into a
            mutable mapping (a dict is expected).
        testcases_df: DataFrame with "question", "reference" and
            "nps_advisor_answer" columns. Any index is accepted, including
            the gapped index left behind by row filtering.
        eval_llm: Evaluator model, composed with ``prompt`` via ``|``.
        prompt: Prompt invoked with "question", "answer" and "reference".

    Returns:
        A list with one parsed result per row, in row order, each extended
        with "question", "reference" and "answer" keys.
    """
    # The chain does not depend on the row, so build it once.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]
        answer = row["nps_advisor_answer"]

        # Evaluate response
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference}
        )

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Annotate the result itself rather than looking it up by the
        # DataFrame index label, which is not a valid list position once
        # the frame has been filtered.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)
    return eval_store_data

# Evaluate answers generated live by the LLM under test
def evaluate_llm_output(parser, testcases_df, eval_llm, tested_llm, prompt):
    """Ask ``tested_llm`` each question and score its answer with the evaluator LLM.

    Args:
        parser: Object whose ``invoke`` converts the evaluator output into a
            mutable mapping (a dict is expected).
        testcases_df: DataFrame with "question" and "reference" columns. Any
            index is accepted, including the gapped index left behind by row
            filtering.
        eval_llm: Evaluator model, composed with ``prompt`` via ``|``.
        tested_llm: Model under test; invoked with a single ``HumanMessage``
            and read through the ``content`` attribute of its reply.
        prompt: Prompt invoked with "question", "answer" and "reference".

    Returns:
        A list with one parsed result per row, in row order, each extended
        with "question", "reference" and "answer" keys.
    """
    # The chain does not depend on the row, so build it once.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]

        # Get answer from the LLM under test
        answer = tested_llm.invoke([HumanMessage(content=question)]).content

        # Evaluate response
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference}
        )

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Annotate the result itself rather than looking it up by the
        # DataFrame index label, which is not a valid list position once
        # the frame has been filtered.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)
    return eval_store_data

# Create a prompt 
def create_prompt(prompt_template, criteria):
    """Build the evaluation prompt together with the parser for its output.

    Args:
        prompt_template: Object exposing the raw template text as ``template``.
        criteria: Name of the score field the evaluator must return
            (for example "relevance").

    Returns:
        An ``(eval_prompt, output_parser)`` tuple. The prompt takes
        "question", "answer" and "reference" and has the parser's format
        instructions pre-filled as "format_instructions".
    """
    # Two structured fields: free-text feedback and the score for `criteria`.
    feedback_schema = ResponseSchema(name="evaluation", description="feedback on answer")
    score_schema = ResponseSchema(
        name=criteria,
        description="evaluation of answer, must be a percentage",
    )

    # Structured output parser (not a Pydantic parser) built from the schemas.
    output_parser = StructuredOutputParser.from_response_schemas(
        [feedback_schema, score_schema]
    )
    instructions = output_parser.get_format_instructions()

    eval_prompt = PromptTemplate(
        template=prompt_template.template,
        input_variables=["question", "answer", "reference"],
        partial_variables={"format_instructions": instructions},
    )
    return eval_prompt, output_parser

def execute_benchmarks(prompt, criteria, headers):
    """Run one benchmark criterion over the module-level test cases.

    Reads the module-level ``testcases_df`` and ``eval_model``; neither is
    modified.

    Args:
        prompt: Prompt template object passed on to ``create_prompt``.
        criteria: Name of the score being evaluated. For "prioritization"
            only rows whose ``prioritization_flag`` equals 1 are evaluated.
        headers: Column names (and order) of the returned DataFrame.

    Returns:
        A DataFrame of evaluation results restricted to ``headers``.
    """
    eval_prompt, parser = create_prompt(prompt, criteria)

    # Work on a separate local name: assigning to `testcases_df` here would
    # make it a function-local variable and raise UnboundLocalError on every
    # call before the global could be read.
    cases = testcases_df
    if criteria == "prioritization":  # Select only rows with prioritization flag
        # Reset the index so the evaluator sees positions 0..n-1.
        cases = cases.loc[cases['prioritization_flag'] == 1].reset_index(drop=True)

    store_data = evaluate_criteria_from_file(parser, cases, eval_model, eval_prompt)
    return pd.DataFrame(store_data, columns=headers)

def calculate_delta(reference_data, criteria, criteria_header, dataframe):
    """Compare evaluator scores against manual reference scores.

    Args:
        reference_data: DataFrame holding "question", "reference" and the
            ``criteria_header`` column with the manual scores.
        criteria: Column in ``dataframe`` with the evaluator scores.
        criteria_header: Column in ``reference_data`` with the manual scores.
        dataframe: Evaluation results containing "question", "reference"
            and the ``criteria`` column.

    Both score columns are expected to hold percentage strings such as
    "80%"; they are converted to fractions (0.8) in the result.

    Returns:
        The inner join of ``dataframe`` and the reference scores on
        ("question", "reference"), with both score columns as fractions and
        a "delta" column holding their absolute difference.
    """
    reference_scores = reference_data.loc[:, ["question", "reference", criteria_header]]
    output_df = pd.merge(
        dataframe, reference_scores, how="inner", on=["question", "reference"]
    )

    # "80%" -> 0.8 for both the evaluator and the manual score.
    for column in (criteria, criteria_header):
        output_df[column] = output_df[column].str.rstrip('%').astype('float') / 100.0

    # Vectorized absolute difference. Unlike a row-wise apply, this also
    # yields a proper (empty) column when the merge matches no rows.
    output_df['delta'] = (output_df[criteria] - output_df[criteria_header]).abs()
    return output_df

def database_llm_switch(switch):
    """Select the evaluation function for the given answer source.

    Args:
        switch: 'llm' to evaluate answers generated live by a model, or
            'database' to evaluate answers stored in the test-case file.

    Returns:
        ``evaluate_llm_output`` for 'llm', ``evaluate_criteria_from_file``
        for 'database', and ``None`` for any other value. The function is
        returned, not called.
    """
    if switch == 'llm':
        selected = evaluate_llm_output
    elif switch == 'database':
        selected = evaluate_criteria_from_file
    else:
        selected = None
    return selected
    
# Load test cases and data store
# testcases_df holds the questions, reference answers and recorded answers;
# manual_scores_df holds the manually assigned reference scores used for deltas.
testcases_df = pd.read_csv("testcases_v1.csv")
manual_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Define variables
# Column order of the result frames; the last entry is the criterion's score column.
relevance_answer_headers = ["question", "reference", "answer", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference", "answer", "evaluation", "depth"]
priority_answer_headers = ["question", "reference", "answer", "evaluation", "prioritization"]

# Pull latest prompt from LangSmith
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")
priority_prompt = hub.pull("benchmarking_prioritization_v1")

# Initialize models to test
gpt4_model  = ChatOpenAI(model="gpt-4o", temperature=0.5)
gemini_model = ChatGoogleGenerativeAI(model="gemini-pro")

# Initialize evaluator model
# Temperature 0.0 for the evaluator (the tested GPT model above uses 0.5).
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Toggles selecting which benchmark sections below are executed.
test_relevence = True
test_depth = False
test_priority = False

# Run relevance benchmark
# Scores the recorded answers for relevance and compares them with the manual scores.


if test_relevence:
    rel_prompt, rel_parser = create_prompt(relevance_prompt, "relevance")
    store_data = evaluate_criteria_from_file(rel_parser, testcases_df, eval_model, rel_prompt)
    rel_df = pd.DataFrame(store_data, columns=relevance_answer_headers)
    display(rel_df)
    
    # Calculate delta
    criteria_header = "nps_advisor_relevance"  # manual score column in manual_scores_df
    criteria = "relevance"  # evaluator score column in rel_df
    rel_delta_df = calculate_delta(manual_scores_df, criteria, criteria_header, rel_df)
    # IPython magic: persist the frame so other cells/sessions can reuse it.
    %store rel_delta_df
    
    # Output
    # Mean absolute difference between evaluator and manual relevance scores.
    print("Relevance Delta: "+str(rel_delta_df['delta'].mean()))
    rel_delta_df.to_excel("rel_scoring.xlsx")

# Run depth benchmark
# Scores the recorded answers for depth and compares them with the manual scores.
if test_depth:
    dep_prompt, dep_parser = create_prompt(depth_prompt, "depth")
    store_data = evaluate_criteria_from_file(dep_parser, testcases_df, eval_model, dep_prompt)
    dep_df = pd.DataFrame(store_data, columns=depth_answer_headers)
    display(dep_df)
    
    # Calculate delta
    criteria_header = "nps_advisor_depth"  # manual score column in manual_scores_df
    criteria = "depth"  # evaluator score column in dep_df
    dep_delta_df = calculate_delta(manual_scores_df, criteria, criteria_header, dep_df)
    # IPython magic: persist the frame so other cells/sessions can reuse it.
    %store dep_delta_df

    # Output
    # Mean absolute difference between evaluator and manual depth scores.
    print("Depth Delta: "+str(dep_delta_df['delta'].mean()))
    dep_delta_df.to_excel("dep_scoring.xlsx")
    
# Run priority benchmark
if test_priority:
    prompt, parser = create_prompt(priority_prompt, "prioritization")
    testcases_df = testcases_df.loc[testcases_df['prioritization_flag'] == 1] # Select only rows with prioritization flag
    store_data = evaluate_criteria_from_file(parser, testcases_df, eval_model, prompt)
    pri_df = pd.DataFrame(store_data, columns=priority_answer_headers)
    display(pri_df)
    

    # Calculate delta
    criteria_header = "nps_advisor_priority"
    criteria = "prioritization" 
    pri_delta_df = calculate_delta(manual_scores_df, criteria, criteria_header, pri_df)
    %store pri_delta_df

    # Output
    print("Prioritization Delta: "+str(pri_delta_df['delta'].mean()))
    pri_delta_df.to_excel("pri_scoring.xlsx")

    

TypeError: database_llm_switch() takes 1 positional argument but 4 were given

In [47]:
import pandas as pd
import math

def calculate_score(relevance, depth, prioritization):
    """Combine the per-criterion scores into one weighted score.

    Args:
        relevance: Relevance score.
        depth: Depth score.
        prioritization: Prioritization score, or NaN when the question has
            no prioritization score.

    Returns:
        45% relevance + 45% depth + 10% prioritization when a prioritization
        score is present, otherwise the 50/50 blend of relevance and depth.
    """
    has_prioritization = not math.isnan(prioritization)
    if has_prioritization:
        return relevance * 0.450 + depth * 0.450 + prioritization * 0.100
    return relevance * 0.500 + depth * 0.500

def apply_calculate_score(dataframe):
    """Row adapter for ``calculate_score``.

    Args:
        dataframe: Mapping-like row (the script passes each row via
            ``DataFrame.apply(..., axis=1)``) providing 'relevance', 'depth'
            and 'prioritization'.

    Returns:
        The weighted score computed by ``calculate_score`` for that row.
    """
    relevance = dataframe['relevance']
    depth = dataframe['depth']
    prioritization = dataframe['prioritization']
    return calculate_score(relevance, depth, prioritization)

# Combine the three delta frames into one table and compute the weighted score.
# Requires rel_delta_df, dep_delta_df and pri_delta_df to exist in the session;
# the benchmark cell only creates each one when its test_* flag is enabled.
# Relevance and depth are inner-joined: only answers scored on both are kept.
output_df = pd.merge(rel_delta_df, dep_delta_df, how="inner", on=["question", "reference", "answer"])
# Outer join: answers without a prioritization score stay in the table with NaN,
# which calculate_score handles by falling back to the relevance/depth blend.
output_df = pd.merge(output_df, pri_delta_df, how="outer", on=["question", "reference", "answer"])

# Shown before the "score" column is added.
display(output_df)
output_df["score"] = output_df.apply(apply_calculate_score, axis=1)
output_df.to_excel("weighted_score.xlsx")


Unnamed: 0,question,reference,answer,evaluation_x,relevance,nps_advisor_relevance,delta_x,evaluation_y,depth,nps_advisor_depth,delta_y,evaluation,prioritization,nps_advisor_priority,delta
0,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,Detractors of Circles.Life have expressed diss...,The answer addresses some of the issues mentio...,0.4,0.6,0.2,The answer covers several issues and solutions...,0.4,0.625,0.225,The answer identifies three main issues: rewar...,0.4,0.2,0.2
1,At which stages do customers encounter either ...,Onboarding:\nDelight: Promoters in the data of...,Customers encounter friction or delight at var...,The answer effectively covers the key points m...,1.0,1.0,0.0,The answer effectively covers the key subpoint...,0.9,0.7273,0.1727,,,,
2,"Based on the reasons for NPS, which aspects sh...",Positive Aspects (Promoter Reasons):\nValue fo...,To improve the Net Promoter Score (NPS) for Ci...,The answer covers several key aspects that ali...,0.5,0.5,0.0,The answer provides a comprehensive list of as...,0.5,0.4667,0.0333,,,,
3,Compare the NPS of Circles with other Singapor...,Circles.Life has achieved a Net Promoter Score...,"As of September 2024, Circles.Life Singapore h...",The answer provides a detailed comparison of C...,0.71,0.6667,0.0433,The answer provides a good overview of the fac...,0.5714,0.8182,0.2468,,,,
4,Do certain customer segments experience signif...,Segment-Specific NPS Outcomes\nPlan Type:\n\nH...,"Yes, certain customer segments do experience s...",The answer identifies that certain customer se...,0.5,0.6,0.1,The answer provides a comprehensive overview o...,0.5,0.5455,0.0455,,,,
5,How do NPS scores vary across customer segment...,1. Geography (Roaming vs. Non-Roaming Segments...,NPS scores for Circles.Life vary significantly...,The answer provides a detailed analysis of NPS...,0.25,0.625,0.375,The answer provides a detailed analysis of NPS...,0.5,0.5556,0.0556,,,,
6,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...",The current number of passive customers for Ci...,The answer provides a detailed explanation of ...,0.2,0.4,0.2,The answer provides a detailed breakdown of pa...,0.25,0.25,0.0,The answer provides a detailed analysis of the...,0.4,0.5,0.1
7,Is there a difference in NPS score between por...,The average NPS score for New Number customers...,"Yes, there is a difference in NPS scores betwe...",The answer provides a detailed analysis of the...,0.5,0.5,0.0,The answer provides a detailed analysis of the...,0.5,0.5,0.0,,,,
8,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"As of 2024, Circles.Life has experienced fluct...",The answer provides a detailed month-by-month ...,0.9,1.0,0.1,The answer provides a detailed month-by-month ...,0.7,0.4286,0.2714,The answer provides a detailed month-by-month ...,1.0,1.0,0.0
9,What are the primary reasons for promoters' hi...,Value for Money: Affordable plans that provide...,Promoters of Circles.Life have provided severa...,The answer covers several key points related t...,0.33,0.3333,0.0033,The answer provides several reasons for promot...,0.22,0.3636,0.1436,,,,


In [68]:
import requests
import urllib.parse
import json
import pprint as pp

# Send one question to the hosted agent endpoint and pretty-print its answer.
question = 'What is Circles NPS this month'
url = 'https://agenticworkflows.onrender.com/run/'

# The question is appended to the URL, so it has to be URL-encoded first.
payload = urllib.parse.quote_plus(question)
print(payload)

# requests never times out by default; bound the wait so a stalled service
# cannot hang the notebook (generous limit to allow for a slow first response).
response = requests.get(url + payload, timeout=120)
# Surface HTTP errors directly instead of an opaque JSON decode failure on an
# error page.
response.raise_for_status()
pp.pprint(response.json()["response"])


What+is+Circles+NPS+this+month
['As of November 2024, Circles.Life has an NPS (Net Promoter Score) of '
 '-12.99%. This score is derived from the following breakdown of customer '
 'responses:\n'
 '\n'
 '- **Detractors**: 48.31%\n'
 '- **Passives**: 16.36%\n'
 '- **Promoters**: 35.32%\n'
 '\n'
 'The NPS score is calculated by subtracting the percentage of detractors from '
 'the percentage of promoters. A negative NPS indicates that there are more '
 'detractors than promoters among the customer base.']


In [67]:
# Print the "response" field of the last HTTP response without pretty-printing.
# Relies on `response` and `json` from the request cell having been run first.
print(json.loads(response.content)["response"])

['As of November 2024, Circles.Life has an NPS (Net Promoter Score) of -24.43%. This score reflects the percentage of detractors, passives, and promoters among its customers, indicating a challenging customer satisfaction landscape. \n\nHere’s a breakdown of the NPS components for November 2024:\n- **Detractor Percentage**: 52.98%\n- **Passive Percentage**: 18.47%\n- **Promoter Percentage**: 28.55%\n\nThe NPS score is calculated by subtracting the percentage of detractors from the percentage of promoters. In this case, the negative score suggests that there are more detractors than promoters, which can be a focus area for improvement in customer experience.']
