In [None]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Call python-dotenv's loader before any client is constructed
# (presumably the API keys are supplied through a local .env file — confirm).
load_dotenv()

# Set the LangChain tracing flag for every chain invoked in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Alternative to the .env file: prompt for the keys interactively.
#os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
#os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Define evaluation function
def evaluate_criteria(parser, testcases_df, eval_llm, tested_llm, prompt):
    """Grade the pre-recorded answer of every test case with the evaluator LLM.

    Args:
        parser: Object whose ``invoke(output)`` turns the raw evaluator output
            into a mutable mapping of evaluation fields.
        testcases_df: DataFrame with the columns ``"question"``,
            ``"reference"`` and ``"nps_advisor_answer"``. Its index may be
            arbitrary (filtered, re-indexed, non-integer).
        eval_llm: Evaluator model; combined with ``prompt`` as
            ``prompt | eval_llm``.
        tested_llm: Model under test. Currently unused because answers are
            read from ``testcases_df`` instead of being generated live; the
            parameter is kept so existing callers keep working.
        prompt: Evaluation prompt expecting the variables ``question``,
            ``answer`` and ``reference``.

    Returns:
        A list with one parsed evaluation mapping per test case, in row order,
        each extended with the ``"question"``, ``"reference"`` and ``"answer"``
        that produced it.
    """
    # The chain does not depend on the row, so build it once.
    eval_chain = prompt | eval_llm

    eval_store_data = []
    for _, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]

        # Answers come from the test-case file. To benchmark a live model
        # instead, use:
        #   tested_llm.invoke([HumanMessage(content=question)]).content
        answer = row["nps_advisor_answer"]

        output = eval_chain.invoke(
            {"question": question, "answer": answer, "reference": reference}
        )
        parsed_result = parser.invoke(output)

        # Annotate the record itself rather than looking it up again through
        # the DataFrame index label, which is not a list position.
        parsed_result["question"] = question
        parsed_result["reference"] = reference
        parsed_result["answer"] = answer
        eval_store_data.append(parsed_result)
    return eval_store_data

# Create a prompt 
def create_prompt(prompt_template, criteria):
    """Build the evaluation prompt and the parser matching its output format.

    Args:
        prompt_template: Object exposing the raw template text as ``.template``.
        criteria: Name of the score field the evaluator must return
            (used as the second response-schema name).

    Returns:
        A ``(eval_prompt, output_parser)`` tuple: a ``PromptTemplate`` with the
        parser's format instructions pre-filled, and the
        ``StructuredOutputParser`` that reads the evaluator's reply.
    """
    # Two fields are requested from the evaluator: free-text feedback and a
    # score named after the criteria being benchmarked.
    feedback_field = ResponseSchema(name="evaluation", description="feedback on answer")
    score_field = ResponseSchema(
        name=criteria,
        description="evaluation of answer, must be a percentage",
    )

    # Structured (not pydantic) output parser built from the two schemas.
    output_parser = StructuredOutputParser.from_response_schemas(
        [feedback_field, score_field]
    )
    instructions = output_parser.get_format_instructions()

    # The format instructions are bound up front; question/answer/reference
    # are supplied per test case at invoke time.
    prompt_kwargs = {
        "template": prompt_template.template,
        "input_variables": ["question", "answer", "reference"],
        "partial_variables": {"format_instructions": instructions},
    }
    return PromptTemplate(**prompt_kwargs), output_parser

# def create_priority_prompt(prompt_template, criteria):
#     # Define output schema
#     response_schemas = [
#         ResponseSchema(name="evaluation", description="feedback on answer"),
#         ResponseSchema(
#             name=criteria,
#             description="evaluation of answer, must be a percentage",
#         ),
#     ]

#     # Define pydanthic output parser
#     output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
#     format_instructions = output_parser.get_format_instructions()
    
#     eval_prompt = PromptTemplate(
#         template = prompt_template.template,
#         input_variables=["answer_list", "reference_list"],
#         partial_variables={"format_instructions": format_instructions},
#     )
#     return eval_prompt, output_parser

def execute_benchmarks(prompt, criteria, headers):
    """Run one benchmark over the notebook's test cases and tabulate the result.

    Reads the notebook-level globals ``testcases_df``, ``eval_model`` and
    ``gemini_model``.

    Args:
        prompt: Prompt template handed to ``create_prompt``.
        criteria: Name of the score field to evaluate.
        headers: Column names (and order) of the returned DataFrame.

    Returns:
        A DataFrame built from the evaluation records, restricted to ``headers``.
    """
    eval_prompt, output_parser = create_prompt(prompt, criteria)
    records = evaluate_criteria(
        output_parser,
        testcases_df,
        eval_model,
        gemini_model,
        eval_prompt,
    )
    return pd.DataFrame(records, columns=headers)

def calculate_delta(reference_data, criteria, criteria_header, dataframe):
    """Compare evaluator scores with reference scores for the same test cases.

    Args:
        reference_data: DataFrame containing ``"question"``, ``"reference"``
            and the reference score column ``criteria_header``.
        criteria: Name of the evaluator score column in ``dataframe``.
        criteria_header: Name of the reference score column in
            ``reference_data``.
        dataframe: Evaluation results containing ``"question"``,
            ``"reference"`` and ``criteria``.

    Returns:
        The inner join of ``dataframe`` and the selected reference columns on
        ``question``/``reference``, with both score columns converted from
        percentages to fractions (``"40%"`` -> ``0.4``) and a ``"delta"``
        column holding their absolute difference.

    Raises:
        ValueError: If a score cannot be parsed as a number.
    """
    def _to_fraction(scores):
        # Only strings carry a "%" suffix. Numeric cells (e.g. a CSV column
        # parsed as int/float) pass through unchanged rather than breaking the
        # ``.str`` accessor or being turned into NaN.
        cleaned = scores.map(
            lambda value: value.strip().rstrip('%').strip()
            if isinstance(value, str)
            else value
        )
        return pd.to_numeric(cleaned).astype('float') / 100.0

    reference_data = reference_data.loc[:, ["question", "reference", criteria_header]]
    output_df = pd.merge(dataframe, reference_data, how="inner", on=["question", "reference"])
    output_df[criteria] = _to_fraction(output_df[criteria])
    output_df[criteria_header] = _to_fraction(output_df[criteria_header])
    # Vectorised difference; unlike a row-wise apply it also works when the
    # merge produced no rows.
    output_df['delta'] = (output_df[criteria] - output_df[criteria_header]).abs()
    return output_df
    
# Load the test cases (read by evaluate_criteria via execute_benchmarks) and
# the reference scores (read by calculate_delta) from CSV files in the
# working directory.
testcases_df = pd.read_csv("relevance_gemini_01.csv")
manual_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Column order of each benchmark's result DataFrame; the last entry is the
# criteria name passed to execute_benchmarks.
relevance_answer_headers = ["question", "reference", "answer", "evaluation", "relevance"]
depth_answer_headers = ["question", "reference", "answer", "evaluation", "depth"]
priority_answer_headers = ["question", "reference", "answer", "evaluation", "prioritization"]

# Pull the prompts by name from the hub (each call is a remote fetch).
# NOTE(review): extraction_prompt is not used by any benchmark in this cell.
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")
extraction_prompt = hub.pull("benchmarking_keypoints_extraction_v1")
priority_prompt = hub.pull("benchmarking_prioritization_v1")

# Initialize models to test
# NOTE(review): gemini_model is only forwarded as the unused tested_llm
# argument, and gpt4_model is not referenced in this cell.
gpt4_model  = ChatOpenAI(model="gpt-4o", temperature=0.5)
gemini_model = ChatGoogleGenerativeAI(model="gemini-pro")

# Initialize evaluator model (temperature 0.0 for the grading calls)
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Switches selecting which benchmarks run below.
# NOTE(review): "test_relevence" is a misspelling of "relevance"; it is left
# as is because other cells may reference the name.
test_relevence = False
test_depth = False
test_priority = True

# Each section below follows the same steps: evaluate the test cases for one
# criteria, show and persist the result (display / %store are IPython
# facilities, so this cell only runs inside a notebook), compare against the
# manual reference scores, then print the mean delta and export to Excel.

# Run relevance benchmark
if test_relevence:
    rel_df = execute_benchmarks(relevance_prompt, "relevance", relevance_answer_headers)
    display(rel_df)
    %store rel_df
    
    # Calculate delta
    # criteria_header is the reference-score column in manual_scores_df;
    # criteria is the evaluator-score column in rel_df.
    criteria_header = "nps_advisor_relevance"
    criteria = "relevance" 
    compare_df = calculate_delta(manual_scores_df, criteria, criteria_header, rel_df)

    # Output
    print("Relevance Delta: "+str(compare_df['delta'].mean()))
    compare_df.to_excel("rel_scoring.xlsx")

# Run depth benchmark
if test_depth:
    dep_df = execute_benchmarks(depth_prompt, "depth", depth_answer_headers)
    display(dep_df)
    %store dep_df
    
    # Calculate delta
    # Same comparison as above, using the depth columns.
    criteria_header = "nps_advisor_depth"
    criteria = "depth" 
    compare_df = calculate_delta(manual_scores_df, criteria, criteria_header, dep_df)

    # Output
    print("Depth Delta: "+str(compare_df['delta'].mean()))
    compare_df.to_excel("dep_scoring.xlsx")
    
# Run priority benchmark
if test_priority:
    pri_df = execute_benchmarks(priority_prompt, "prioritization", priority_answer_headers)
    display(pri_df)
    %store pri_df

    # Calculate delta
    # Note the naming mismatch: the reference column is "nps_advisor_priority"
    # while the evaluator column is "prioritization".
    criteria_header = "nps_advisor_priority"
    criteria = "prioritization" 
    compare_df = calculate_delta(manual_scores_df, criteria, criteria_header, pri_df)

    # Output
    print("Prioritization Delta: "+str(compare_df['delta'].mean()))
    compare_df.to_excel("pri_scoring.xlsx")
    

Unnamed: 0,question,reference,answer,evaluation,prioritization
0,Summarize the NPS for this year (2024) for Cir...,Average NPS of -17 in 2024. \n\nJanuary 2024: ...,"As of 2024, Circles.Life has experienced fluct...",The answer provides a detailed month-by-month ...,100%
1,What trends have Circles life observed in NPS ...,"""2017: +54 \n2018: +50\n2019: +45\n2020: +40\n...",### Circles.Life NPS Trends Over the Past 5 Ye...,The answer provides a detailed narrative of th...,40%
2,What are the top issues mentioned in the App S...,Network Reliability: A significant number of u...,Based on the reviews from the App Store and Go...,The answer provides a comprehensive list of is...,57%
3,What emerging themes or trends for Circles.Lif...,"""Analyzing the provided NPS data and customer ...",Emerging themes and trends in customer feedbac...,The answer identifies several key themes that ...,50%
4,How many customers are passive in Circles.Life...,"Circles.Life has approximately 1,478 passive c...",The current number of passive customers for Ci...,The answer provides a detailed analysis of the...,40%
5,\nWhat are the top issues causing detractors f...,Network Coverage and Reliability: Customers ha...,Detractors of Circles.Life have expressed diss...,The answer identifies three main issues: rewar...,40%
6,Compare the NPS of Circles with other Singapor...,Circles.Life has achieved a Net Promoter Score...,"As of September 2024, Circles.Life Singapore h...",The answer provides a comprehensive overview o...,50%
7,What key drivers contribute to customers' loya...,4 Key Strategies:\n\nExceptional Customer Serv...,Customers' loyalty and advocacy for Circles.Li...,The answer provides a comprehensive list of fa...,40%
8,"Based on the reasons for NPS, which aspects sh...",Positive Aspects (Promoter Reasons):\nValue fo...,To improve the Net Promoter Score (NPS) for Ci...,The answer identifies several key areas for im...,50%
9,What are the primary reasons for promoters' hi...,Value for Money: Affordable plans that provide...,Promoters of Circles.Life have provided severa...,The answer highlights several key reasons for ...,22%


Stored 'pri_df' (DataFrame)
Prioritization Delta: 0.07166666666666667


In [219]:
import getpass
import os
import pprint
import pandas as pd
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field, validator
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv

# Call python-dotenv's loader before any client is constructed
# (presumably the API keys are supplied through a local .env file — confirm).
load_dotenv()

# Set the LangChain tracing flag for the chain invoked in this cell.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Sample answer text passed to the key-point extraction chain below.
# NOTE(review): the leading '"' and the doubled quotes around Circle of Joy
# look like CSV-escaping artifacts from wherever this text was copied —
# confirm whether they should be part of the input.
paragraph = """
"Emerging themes and trends in customer feedback for Circles.Life that could impact future NPS include:

### Positive Trends:
1. **Responsive Customer Support**: Customers appreciate the quick and helpful responses from customer service, particularly through live chat. This responsiveness is a strong point that can enhance customer loyalty and satisfaction.

2. **User-Friendly App**: The app's ease of navigation and functionality is frequently praised, indicating that a seamless digital experience is valued by users. This can positively influence NPS as customers enjoy managing their services easily.

3. **Customer-Centric Innovations**: There is a growing interest in personalized services and customer-centric digital solutions, such as the ""Circle of Joy"" rewards program, which enhances customer engagement and satisfaction.

### Negative Trends:
1. **Inconsistent Service Quality**: Reports of poor customer service experiences, including long response times and unresolved issues, highlight a significant area for improvement. Inconsistencies in service quality can lead to dissatisfaction and negatively impact NPS.

2. **Technical Issues with the App**: Users have expressed frustration with bugs and usability issues in the app, which can detract from the overall customer experience and lead to lower satisfaction scores.

3. **Network Reliability Concerns**: Complaints about network coverage and connectivity issues are prevalent, particularly in crowded areas. This can significantly affect customer perceptions of value and reliability, impacting their likelihood to recommend Circles.Life.

4. **Dissatisfaction with Rewards Programs**: Feedback indicates that customers feel the rewards program lacks variety and attractiveness, which can lead to frustration and a desire to switch providers.

### Conclusion:
To maintain and improve NPS, Circles.Life should focus on enhancing service consistency, addressing technical issues in the app, improving network reliability, and revamping the rewards program to better meet customer expectations. By leveraging positive feedback and addressing negative trends, Circles.Life can foster greater customer loyalty and satisfaction."""

# Evaluator model for the extraction call (temperature 0.0).
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)
# Pull the key-point extraction prompt by name from the hub.
extraction_prompt = hub.pull("benchmarking_keypoints_extraction_v1")
# NOTE(review): priority_prompt is pulled (variant "v1a") but not used in this
# cell; it rebinds the notebook-level name set by the benchmark cell.
priority_prompt = hub.pull("benchmarking_prioritization_v1a")
# print(priority_prompt)
# Run the extraction prompt through the evaluator model on the sample text.
chain = extraction_prompt | eval_model
output = chain.invoke({"paragraph":paragraph})
# The result is indexed like a mapping with a 'key points' list of strings
# (presumably the pulled prompt carries a structured-output schema — confirm);
# print one key point per line.
print("\n".join(output['key points']))

Responsive Customer Support
User-Friendly App
Customer-Centric Innovations
Inconsistent Service Quality
Technical Issues with the App
Network Reliability Concerns
Dissatisfaction with Rewards Programs
