In [10]:
import os
import pandas as pd
import urllib
import requests
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Load environment variables via python-dotenv (presumably the API keys that the
# commented-out prompts below would otherwise ask for — TODO confirm).
load_dotenv()

# Set the LangChain tracing flag for every chain invoked later in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Interactive alternative to the .env file, left disabled.
# NOTE(review): `getpass` is not imported in this cell, so re-enabling these
# lines also requires `import getpass`.
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Grade answers that are already stored in the test-case file
def evaluate_criteria_from_file(parser, testcases_df, eval_llm, prompt):
    """Grade the pre-recorded answers in ``testcases_df`` with the evaluator model.

    Args:
        parser: Object whose ``invoke(output)`` turns the evaluator's raw
            output into a mutable mapping (one record per test case).
        testcases_df: DataFrame with the columns ``question``, ``reference``
            and ``nps_advisor_answer``. Its index may be arbitrary (for
            example after row filtering).
        eval_llm: Evaluator model; combined with ``prompt`` as ``prompt | eval_llm``.
        prompt: Prompt receiving ``question``, ``answer`` and ``reference``.

    Returns:
        list: One parsed record per row, in row order, each extended with the
        keys ``question``, ``reference`` and ``answer``.
    """
    # The chain does not depend on the row, so build it once.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]
        answer = row["nps_advisor_answer"]

        # Evaluate the stored answer and parse the evaluator's verdict
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference})
        parsed_result = parser.invoke(output)

        # Annotate the record directly. The DataFrame index is a label, not a
        # list position, so indexing the result list with it breaks as soon as
        # the frame has been filtered or re-indexed.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)
    return eval_store_data

# Collect fresh answers from the system under test, then grade them
def _fetch_advisor_answer(url, question, reframe, guardrails, telco_operator):
    """Query the NPS advisor endpoint and return its answer as plain text."""
    # Imported locally so the submodule is guaranteed to be loaded; a bare
    # `import urllib` does not promise that `urllib.parse` is available.
    from urllib.parse import quote_plus

    payload = "/".join([quote_plus(question), reframe,
                        guardrails, telco_operator])
    response = requests.get(url + payload)
    # Fail loudly instead of grading an HTTP error page.
    response.raise_for_status()

    try:
        body = response.json()["response"]
    except (ValueError, KeyError, TypeError):
        # Not the expected JSON envelope: grade the raw body text instead.
        return response.text
    # The endpoint has been observed to return a list of strings.
    if isinstance(body, (list, tuple)):
        return "\n".join(str(part) for part in body)
    return str(body)


def evaluate_llm_output(parser, testcases_df, eval_llm, tested_llm, prompt, payload_params):
    """Ask the system under test every question and grade its answers.

    Args:
        parser: Object whose ``invoke(output)`` turns the evaluator's raw
            output into a mutable mapping (one record per test case).
        testcases_df: DataFrame with the columns ``question`` and
            ``reference``. Its index may be arbitrary.
        eval_llm: Evaluator model; combined with ``prompt`` as ``prompt | eval_llm``.
        tested_llm: Sequence whose first item names the target. For
            ``'nps_advisor'`` the answer is fetched over HTTP; otherwise the
            second item must be a chat model exposing ``invoke``.
        prompt: Prompt receiving ``question``, ``answer`` and ``reference``.
        payload_params: ``[url, reframe, guardrails, telco_operator]``; only
            read when the target is ``'nps_advisor'``.

    Returns:
        list: One parsed record per row, in row order, each extended with the
        keys ``question``, ``reference`` and ``answer`` (the answer text).

    Raises:
        requests.HTTPError: If the advisor endpoint answers with an error status.
    """
    if tested_llm[0] == 'nps_advisor':
        url = payload_params[0]
        reframe = payload_params[1]
        guardrails = payload_params[2]
        telco_operator = payload_params[3]

        def get_answer(question):
            # Hand the grader the answer text, not the Response object.
            return _fetch_advisor_answer(
                url, question, reframe, guardrails, telco_operator)
    else:
        def get_answer(question):
            return tested_llm[1].invoke([HumanMessage(content=question)]).content

    # The chain does not depend on the row, so build it once.
    eval_prompt_and_model = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]

        # Get answer from the system under test
        answer = get_answer(question)

        # Evaluate the answer and parse the evaluator's verdict
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference})
        parsed_result = parser.invoke(output)

        # Annotate the record directly. The DataFrame index is a label, not a
        # list position, so it must not be used to index the result list.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)

    return eval_store_data

# Build the evaluation prompt together with its matching output parser
def create_prompt(prompt_template, criteria):
    """Return ``(eval_prompt, output_parser)`` for one evaluation criterion.

    Args:
        prompt_template: Object whose ``template`` attribute holds the prompt text.
        criteria: Name of the score field the evaluator must fill in.

    Returns:
        tuple: A ``PromptTemplate`` expecting ``question``, ``answer`` and
        ``reference`` (with the parser's format instructions pre-filled), and
        the ``StructuredOutputParser`` that reads the model's reply.
    """
    # Two fields are requested from the evaluator: free-text feedback and the score.
    feedback_field = ResponseSchema(
        name="evaluation", description="feedback on answer")
    score_field = ResponseSchema(
        name=criteria,
        description="evaluation of answer, must be a percentage",
    )

    # Structured output parser built from the two field schemas
    output_parser = StructuredOutputParser.from_response_schemas(
        [feedback_field, score_field])

    eval_prompt = PromptTemplate(
        template=prompt_template.template,
        input_variables=["question", "answer", "reference"],
        partial_variables={
            "format_instructions": output_parser.get_format_instructions()},
    )
    return eval_prompt, output_parser


def execute_benchmarks(prompt, testcases, reference_scores, criteria, 
                       headers, criteria_header, test_target, payload_params):
    """Run one criterion's benchmark and report its delta to the reference scores.

    The evaluator model is read from the module-level name ``eval_model``.

    Args:
        prompt: Prompt template handed to ``create_prompt``.
        testcases: DataFrame of test cases; for the ``"prioritization"``
            criterion only rows with ``prioritization_flag == 1`` are used.
        reference_scores: DataFrame of reference scores for ``calculate_delta``.
        criteria: Name of the criterion being graded.
        headers: Column names for the result frame.
        criteria_header: Reference-score column matching ``criteria``.
        test_target: Sequence whose first item selects the answer source.
        payload_params: Passed through to ``evaluate_llm_output``.

    Returns:
        DataFrame: The merged results including a ``delta`` column. The frame
        is also written to ``<criteria>_scoring.xlsx`` and the mean delta is
        printed.
    """
    prompt, parser = create_prompt(prompt, criteria)

    # Prioritization is only graded on the rows flagged for it
    if criteria == "prioritization":
        flagged = testcases['prioritization_flag'] == 1
        testcases = testcases.loc[flagged]

    # Known targets are queried live; anything else uses answers from the file
    live_targets = ('nps_advisor', 'gpt', 'gemini')
    if test_target[0] not in live_targets:
        store_data = evaluate_criteria_from_file(
            parser, testcases, eval_model, prompt)
    else:
        store_data = evaluate_llm_output(
            parser, testcases, eval_model, test_target, prompt, payload_params)
    results = pd.DataFrame(store_data, columns=headers)

    # Compare against the reference scores
    delta_df = calculate_delta(
        reference_scores, criteria, criteria_header, results)

    # Report and persist
    mean_delta = delta_df['delta'].mean()
    print(criteria + " Delta: " + str(mean_delta))
    delta_df.to_excel(criteria + "_scoring.xlsx")
    return delta_df


def _percent_to_fraction(series):
    """Convert percentage values such as ``"80%"``, ``"80 %"`` or ``80`` to 0.8."""
    missing = series.isna()
    # Going through str lets numeric scores share the same clean-up path as
    # "NN%" strings instead of being turned into NaN by the .str accessor.
    cleaned = series.astype(str).str.strip().str.rstrip('%').str.strip()
    # Missing scores stay missing; anything else non-numeric still raises.
    return cleaned.mask(missing).astype('float') / 100.0


def calculate_delta(reference_data, criteria, criteria_header, dataframe):
    """Join evaluated scores with reference scores and compute their absolute gap.

    Args:
        reference_data: DataFrame containing ``question``, ``reference`` and
            the ``criteria_header`` column; other columns are ignored.
        criteria: Column of ``dataframe`` holding the evaluated percentage.
        criteria_header: Column of ``reference_data`` holding the reference
            percentage.
        dataframe: Evaluation results containing ``question``, ``reference``
            and the ``criteria`` column.

    Returns:
        DataFrame: Inner join on ``question`` and ``reference`` where both
        score columns are converted from percentages to fractions and
        ``delta`` is the absolute difference between them. Values are always
        interpreted as percentages, so a numeric ``80`` becomes ``0.8``.
    """
    reference_data = reference_data.loc[:, [
        "question", "reference", criteria_header]]
    output_df = pd.merge(dataframe, reference_data,
                         how="inner", on=["question", "reference"])

    output_df[criteria] = _percent_to_fraction(output_df[criteria])
    output_df[criteria_header] = _percent_to_fraction(
        output_df[criteria_header])

    # Vectorised difference; unlike a row-wise apply it also works when the
    # join produced no rows.
    output_df['delta'] = (
        output_df[criteria] - output_df[criteria_header]).abs()
    return output_df


# Load the test cases and the reference scores the evaluator is compared against
testcases_df = pd.read_csv("testcases_v1.csv")
reference_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Criterion names; each is passed to execute_benchmarks as `criteria`
relevance = "relevance"
depth = "depth"
prioritization = "prioritization"

# Columns of reference_scores_df that hold the reference score per criterion
relevance_reference_header = "nps_advisor_relevance"
depth_reference_header = "nps_advisor_depth"
prioritization_reference_header = "nps_advisor_priority"

# Column layout of the result frame built for each criterion
relevance_answer_headers = ["question",
                            "reference", "answer", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference",
                        "answer", "evaluation", "depth"]
priority_answer_headers = ["question", "reference",
                           "answer", "evaluation", "prioritization"]

# Advisor endpoint settings, packed in the order evaluate_llm_output reads them
# (index 0 = url, 1 = reframe, 2 = guardrails, 3 = telco_operator)
url = 'https://agenticworkflows.onrender.com/invoke/'
reframe = 'true'
guardrails = 'true'
telco_operator = 'Circles.Life'
payload_params = [url, reframe, guardrails, telco_operator]

# Pull latest prompt from LangSmith
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")
priority_prompt = hub.pull("benchmarking_prioritization_v1")

# Initialize models to test. Each target is a list: [name, model].
# nps_advisor carries no model object because evaluate_llm_output reaches it
# over HTTP using payload_params.
gpt4_model = ['gpt', ChatOpenAI(model="gpt-4o", temperature=0.5)]
gemini_model = ['gemini', ChatGoogleGenerativeAI(model="gemini-pro")]
nps_advisor = ['nps_advisor']

# Initialize evaluator model (execute_benchmarks reads this name as a global)
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Switches: choose which benchmarks the run section below executes
test_relevence = False
test_depth = False
test_priority = True

# Target: the first item selects the answer source in execute_benchmarks.
# A name outside ['nps_advisor', 'gpt', 'gemini'] (such as '') makes it grade
# the answers already stored in the test-case file.
test_target = ['']

# Each enabled switch runs one benchmark, displays the resulting frame and
# persists it with %store so another kernel session can restore it.
# NOTE(review): the switch name `test_relevence` is misspelled ("relevance").
# NOTE(review): the weighted-score cell further down reads rel_delta_df,
# dep_delta_df and pri_delta_df, which this cell never defines (it produces
# rel_df, dep_df and pri_df) — confirm which names are intended.

# Run relevance benchmark
if test_relevence:
    rel_df = execute_benchmarks(relevance_prompt, testcases_df, reference_scores_df,
                                relevance, relevance_answer_headers, relevance_reference_header, 
                                test_target, payload_params)
    display(rel_df)
    %store rel_df

# Run depth benchmark
if test_depth:
    dep_df = execute_benchmarks(depth_prompt, testcases_df, reference_scores_df,
                                depth, depth_answer_headers, depth_reference_header, 
                                test_target, payload_params)
    display(dep_df)
    %store dep_df

# Run priority benchmark
if test_priority:
    pri_df = execute_benchmarks(priority_prompt, testcases_df, reference_scores_df,
                                prioritization, priority_answer_headers, prioritization_reference_header, 
                                test_target, payload_params)
    display(pri_df)
    %store pri_df


prioritization Delta: 0.105


Unnamed: 0,question,reference,answer,evaluation,prioritization,nps_advisor_priority,delta
0,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"As of 2024, Circles.Life has experienced fluct...",The answer provides a detailed month-by-month ...,1.0,1.0,0.0
1,What trends have Circles life observed in NPS ...,"""2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...",### Circles.Life NPS Trends Over the Past 5 Ye...,The answer provides a detailed narrative of th...,0.2,0.5,0.3
2,What are the top issues mentioned in the App S...,Network Reliability: A significant number of u...,Based on the reviews from the App Store and Go...,The answer provides a comprehensive list of is...,0.57,0.6,0.03
3,What emerging themes or trends for Circles.Lif...,"""Analyzing the provided NPS data and customer ...",Emerging themes and trends in customer feedbac...,The answer identifies several key themes that ...,0.5,0.5,0.0
4,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...",The current number of passive customers for Ci...,The answer provides a detailed list of factors...,0.4,0.5,0.1
5,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,Detractors of Circles.Life have expressed diss...,The answer identifies three main issues: rewar...,0.4,0.2,0.2


Stored 'pri_df' (DataFrame)


In [None]:
import pandas as pd
import math


def calculate_score(relevance, depth, prioritization):
    """Combine the per-criterion scores into one weighted score.

    When ``prioritization`` is NaN the score is the even split of relevance
    and depth; otherwise relevance and depth weigh 0.45 each and
    prioritization 0.10.
    """
    has_prioritization = not math.isnan(prioritization)
    if has_prioritization:
        return relevance * 0.450 + depth * 0.450 + prioritization * 0.100
    return relevance * 0.500 + depth * 0.500


def apply_calculate_score(dataframe):
    """Row adapter for ``calculate_score``.

    Reads the ``relevance``, ``depth`` and ``prioritization`` entries of
    ``dataframe`` and returns their weighted score.
    """
    relevance = dataframe['relevance']
    depth = dataframe['depth']
    prioritization = dataframe['prioritization']
    return calculate_score(relevance, depth, prioritization)


# Relevance and depth must both exist for a row (inner join); prioritization
# is optional, so it is attached with an outer join and may be NaN.
# NOTE(review): rel_delta_df, dep_delta_df and pri_delta_df are not defined in
# any cell shown in this notebook — confirm where they come from.
output_df = rel_delta_df.merge(
    dep_delta_df, how="inner", on=["question", "reference", "answer"]
).merge(
    pri_delta_df, how="outer", on=["question", "reference", "answer"]
)

# Show the merged frame before the score column is added
display(output_df)

# One weighted score per row, then persist the result
output_df["score"] = output_df.apply(apply_calculate_score, axis=1)
output_df.to_excel("weighted_score.xlsx")

Unnamed: 0,question,reference,answer,evaluation_x,relevance,nps_advisor_relevance,delta_x,evaluation_y,depth,nps_advisor_depth,delta_y,evaluation,prioritization,nps_advisor_priority,delta
0,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,Detractors of Circles.Life have expressed diss...,The answer addresses some of the issues mentio...,0.4,0.6,0.2,The answer covers several issues and solutions...,0.4,0.625,0.225,The answer identifies three main issues: rewar...,0.4,0.2,0.2
1,At which stages do customers encounter either ...,Onboarding:\nDelight: Promoters in the data of...,Customers encounter friction or delight at var...,The answer effectively covers the key points m...,1.0,1.0,0.0,The answer effectively covers the key subpoint...,0.9,0.7273,0.1727,,,,
2,"Based on the reasons for NPS, which aspects sh...",Positive Aspects (Promoter Reasons):\nValue fo...,To improve the Net Promoter Score (NPS) for Ci...,The answer covers several key aspects that ali...,0.5,0.5,0.0,The answer provides a comprehensive list of as...,0.5,0.4667,0.0333,,,,
3,Compare the NPS of Circles with other Singapor...,Circles.Life has achieved a Net Promoter Score...,"As of September 2024, Circles.Life Singapore h...",The answer provides a detailed comparison of C...,0.71,0.6667,0.0433,The answer provides a good overview of the fac...,0.5714,0.8182,0.2468,,,,
4,Do certain customer segments experience signif...,Segment-Specific NPS Outcomes\nPlan Type:\n\nH...,"Yes, certain customer segments do experience s...",The answer identifies that certain customer se...,0.5,0.6,0.1,The answer provides a comprehensive overview o...,0.5,0.5455,0.0455,,,,
5,How do NPS scores vary across customer segment...,1. Geography (Roaming vs. Non-Roaming Segments...,NPS scores for Circles.Life vary significantly...,The answer provides a detailed analysis of NPS...,0.25,0.625,0.375,The answer provides a detailed analysis of NPS...,0.5,0.5556,0.0556,,,,
6,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...",The current number of passive customers for Ci...,The answer provides a detailed explanation of ...,0.2,0.4,0.2,The answer provides a detailed breakdown of pa...,0.25,0.25,0.0,The answer provides a detailed analysis of the...,0.4,0.5,0.1
7,Is there a difference in NPS score between por...,The average NPS score for New Number customers...,"Yes, there is a difference in NPS scores betwe...",The answer provides a detailed analysis of the...,0.5,0.5,0.0,The answer provides a detailed analysis of the...,0.5,0.5,0.0,,,,
8,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"As of 2024, Circles.Life has experienced fluct...",The answer provides a detailed month-by-month ...,0.9,1.0,0.1,The answer provides a detailed month-by-month ...,0.7,0.4286,0.2714,The answer provides a detailed month-by-month ...,1.0,1.0,0.0
9,What are the primary reasons for promoters' hi...,Value for Money: Affordable plans that provide...,Promoters of Circles.Life have provided severa...,The answer covers several key points related t...,0.33,0.3333,0.0033,The answer provides several reasons for promot...,0.22,0.3636,0.1436,,,,


In [14]:
import requests
import urllib.parse
import json
import pprint as pp


def send_advisor_payload(url, question_reframed, reframe, guardrails, telco_operator):
    """Send one GET request to the advisor endpoint and return the response.

    The four string arguments after ``url`` are joined with ``/`` and appended
    to ``url`` to form the request address.
    """
    path_segments = (question_reframed, reframe, guardrails, telco_operator)
    return requests.get(url + "/".join(path_segments))


# Endpoint and flags for a single manual request to the advisor
url = 'https://agenticworkflows.onrender.com/invoke/'
telco_operator = 'Circles.Life'
reframe = 'true'
guardrails = 'true'

# The question travels as a URL path segment, so it is URL-quoted first
question = 'What is Circles NPS this month'
question_reframed = urllib.parse.quote_plus(question)

response = send_advisor_payload(url, question_reframed,
                                reframe, guardrails, telco_operator)

In [15]:
# Decode the JSON body of the advisor reply and print its 'response' field
print(json.loads(response.content)['response'])

['As of November 2024, the current Net Promoter Score (NPS) for Circles.Life is not available. However, the most recent data indicates that in September 2024, the NPS was -24.43%. This score reflects a higher percentage of detractors (52.98%) compared to promoters (28.55%), indicating challenges in customer satisfaction during that period.\n\nIf you need further details or updates, please let me know!']


In [5]:
# Show the current value of gemini_model.
# NOTE(review): the saved output is a bare ChatGoogleGenerativeAI object, while
# the benchmark cell defines gemini_model as a [name, model] list — the output
# looks stale; re-run the notebook from a fresh kernel to confirm.
gemini_model

ChatGoogleGenerativeAI(model='models/gemini-pro', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x12e8bcfc0>, default_metadata=())

ValidationError: 1 validation error for ChatGoogleGenerativeAI
model
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing