In [None]:
import os
import pandas as pd
import urllib
import requests
import math
import json
import gspread
from pathlib import Path
from datetime import datetime
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langchain_core.messages import AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers import RetryOutputParser
from dotenv import load_dotenv

load_dotenv()

os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Define evaluation function


def evaluate_criteria_from_file(parser, eval_llm, prompt, testcases_df):
    eval_store_data = []

    # Iterate over test cases and evaluate answers from file
    for index, row in testcases_df.iterrows():
        question = row["question"]
        reference = row["reference"]
        answer = row["nps_advisor_answer"]

        # Evaluate response
        eval_prompt_and_model = prompt | eval_llm
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answer, "reference": reference})

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Store results
        eval_store_data.append(parsed_result)
        eval_store_data[index]["question"] = question
        eval_store_data[index]["reference"] = reference
        eval_store_data[index]["answer"] = answer
    return eval_store_data

# Define evaluation function


def evaluate_llm_output(parser, eval_llm, prompt, testcases_df, answers):
    eval_store_data = []

    # Evaluate responses
    for index, row in testcases_df.iterrows():
        question = testcases_df.loc[index]['question']
        reference = testcases_df.loc[index]['reference']

        eval_prompt_and_model = prompt | eval_llm
        output = eval_prompt_and_model.invoke(
            {"question": question, "answer": answers[index], "reference": reference})

        # Parse the output using the parser
        parsed_result = parser.invoke(output)

        # Store results
        eval_store_data.append(parsed_result)
        eval_store_data[index]["question"] = question
        eval_store_data[index]["reference"] = reference
        eval_store_data[index]["answer"] = answers[index][1]

    return eval_store_data

# Create a prompt


def create_prompt(prompt_template, criteria):
    # Define output schema
    response_schemas = [
        ResponseSchema(name="evaluation", description="feedback on answer"),
        ResponseSchema(
            name=criteria,
            description="evaluation of answer, must be a percentage",
        ),
    ]

    # Define pydanthic output parser
    output_parser = StructuredOutputParser.from_response_schemas(
        response_schemas)
    format_instructions = output_parser.get_format_instructions()

    eval_prompt = PromptTemplate(
        template=prompt_template.template,
        input_variables=["question", "answer", "reference"],
        partial_variables={"format_instructions": format_instructions},
    )
    return eval_prompt, output_parser

# Format score into float


def format_score(criteria, dataframe):
    dataframe[criteria] = dataframe[criteria].str.rstrip(
        '%').astype('float')/100.0
    return dataframe


def calculate_score(relevance, depth, prioritization):
    if math.isnan(prioritization):
        return relevance * 0.500 + depth * 0.500
    return relevance * 0.450 + depth * 0.450 + prioritization * 0.100


def apply_calculate_score(dataframe):
    return calculate_score(dataframe['relevance'], dataframe['depth'], dataframe['prioritization'])


def get_answers(testcases_df, tested_llm, payload_params):
    answers = []

    if tested_llm['name'] == 'nps_advisor':
        url = payload_params[0]
        reframe = payload_params[1]
        guardrails = payload_params[2]
        telco_operator = payload_params[3]

        # Loop through question list and get responses
        for index, row in testcases_df.iterrows():
            question = row["question"]
            question_reframed = urllib.parse.quote_plus(question)

            # # Get answer from LLM
            payload = "/".join([question_reframed, reframe,
                                guardrails, telco_operator])
            answer = requests.get(url + payload)
            answers.append((index, json.loads(answer.content)['response'][0]))

    else:
        for index, row in testcases_df.iterrows():
            question = row["question"]

            # # Get answer from LLM
            answer = tested_llm['model'].invoke(
                [HumanMessage(content=question)]).content
            answers.append((index, answer))

    return answers


def evaluation(prompt, criteria, testcases, test_target, answers):
    test_models = ['nps_advisor', 'gpt', 'gemini']
    headers = ["question", "reference", "answer", "evaluation"]
    headers.append(criteria)

    prompt, parser = create_prompt(prompt, criteria)
    if criteria == "prioritization":  # Select only rows with prioritization flag
        testcases = testcases.loc[testcases['prioritization_flag'] == 1]

    if test_target['name'] == 'reference':
        store_data = evaluate_criteria_from_file(
            parser, eval_model, prompt, testcases)
    else:
        store_data = evaluate_llm_output(
            parser, eval_model, prompt, testcases, answers)
    df = pd.DataFrame(store_data, columns=headers)
    df = format_score(criteria, df)

    # Save output
    Path(os.getcwd() + output_dir +
         test_target['name'] + "/").mkdir(parents=True, exist_ok=True)
    df.to_excel(os.getcwd() + output_dir + test_target['name'] + "/" +
                datetime.today().strftime('%Y%m%d') + "_" +
                criteria +
                "_scoring.xlsx")
    return df

def colnum_string(n):
    string = ""
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        string = chr(65 + remainder) + string
    return string

# Main exec
def execute_benchmarks(checks, prompts, testcases, test_target, answers):
    for check in checks:
        match check:
            case "relevance":
                criteria = 'relevance'
                prompt = prompts[criteria]
                rel_df = evaluation(
                    prompt, criteria, testcases, test_target, answers)
            case "depth":
                criteria = 'depth'
                prompt = prompts[criteria]
                dep_df = evaluation(
                    prompt, criteria, testcases, test_target, answers)
            case "prioritization":
                criteria = 'prioritization'
                prompt = prompts[criteria]
                pri_df = evaluation(
                    prompt, criteria, testcases, test_target, answers)

    # Calculate weighted score
    if len(checks) == 3:
        dep_join = dep_df.columns.difference(rel_df.columns)
        pri_join = pri_df.columns.difference(rel_df.columns)
        output_df = pd.merge(rel_df, dep_df[dep_join], left_index=True,
                             right_index=True, how="outer")
        output_df = pd.merge(output_df, pri_df[pri_join], left_index=True,
                             right_index=True, how="outer")
        output_df["weighted_score"] = output_df.apply(
            apply_calculate_score, axis=1)

        return output_df
    else:
        return ''

# Write to gsheet
gc = gspread.oauth(
    credentials_filename='C:/Users/njeny/Documents/Projects/benchmarking_automation/credentials.json',
    authorized_user_filename='C:/Users/njeny/Documents/Projects/benchmarking_automation/token.json'
)

SHEET_ID = '1ifHt5uJJ4uUeaF2O_qOJXmfLR-LCF64w8ZUopWJ2xmQ'

# Load test cases and data store
testcases_df = pd.read_csv("testcases_v1.csv")
reference_scores_df = pd.read_csv("reference_scoring_v1.csv")

# Define variables
relevance = "relevance"
depth = "depth"
prioritization = "prioritization"

relevance_reference_header = "nps_advisor_relevance"
depth_reference_header = "nps_advisor_depth"
prioritization_reference_header = "nps_advisor_priority"

url = 'https://agenticworkflows.onrender.com/invoke/'
reframe = 'true'
guardrails = 'true'
telco_operator = 'Circles.Life'
payload_params = [url, reframe, guardrails, telco_operator]

output_dir = '/output/'

# Pull latest prompt from LangSmith
relevance_prompt = hub.pull("benchmarking_relevance_v1")
depth_prompt = hub.pull("benchmarking_depth_v1")
priority_prompt = hub.pull("benchmarking_prioritization_v1")

prompts = {'relevance': relevance_prompt,
           'depth': depth_prompt,
           'prioritization': priority_prompt}

# Initialize models to test
gpt4_model = {'name': 'gpt', 'model': ChatOpenAI(
    model="gpt-4o", temperature=0.5)}
gemini_model = {'name': 'gemini',
                'model': ChatGoogleGenerativeAI(model="gemini-pro")}
nps_advisor = {'name': 'nps_advisor'}
reference_model = {'name': 'reference'}

test_models = ['nps_advisor', 'gpt', 'gemini', 'reference']

# Initialize evaluator model
eval_model = ChatOpenAI(model="gpt-4o", temperature=0.0)

# Target
test_target = nps_advisor

# Switches
checks = ['relevance', 'depth', 'prioritization']

# Run relevance benchmark
def main():
    if test_target['name'] == 'reference':
        answers = ''
    else:
        answers = get_answers(testcases_df, test_target, payload_params)

    weighted_score_df = execute_benchmarks(checks,
                                        prompts,
                                        testcases_df,
                                        test_target,
                                        answers)

    Path(os.getcwd() + output_dir).mkdir(parents=True, exist_ok=True)
    weighted_score_df.to_excel(os.getcwd() + output_dir +
                            datetime.today().strftime('%Y%m%d') + "_" +
                            test_target['name'] +
                            "_weighted_score.xlsx")

    if test_target['name'] == 'nps_advisor':
        weighted_today = weighted_score_df.loc[:, 'weighted_score'].astype(str)

        ss = gc.open_by_key(SHEET_ID)
        worksheet = ss.worksheet("Sheet1")

        values = worksheet.get_all_values()
        col = colnum_string(max([len(r) for r in values]) + 1)
        worksheet.update(col + '1', [[datetime.today().strftime('%Y%m%d')]] + [[e]
                        for e in weighted_today.tolist()], value_input_option='USER_ENTERED')

main()

  from .autonotebook import tqdm as notebook_tqdm
Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=2aa59935-2826-4d4e-8526-016fd427f8a9,id=2aa59935-2826-4d4e-8526-016fd427f8a9; trace=2aa59935-2826-4d4e-8526-016fd427f8a9,id=801696b8-61b9-44c7-9cf2-90ff1f269c91; trace=2aa59935-2826-4d4e-8526-016fd427f8a9,id=fd8367fc-b3af-4df4-abd0-a2a9fc2abc89
Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=f67bed07-2d9a-4bd4-8b35-361295f11a17,id=f67bed07-2d9a-4bd4-8b35-361295f11a1

Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=4111c00a-04d8-416d-8f20-9d9efe54ad40,id=4111c00a-04d8-416d-8f20-9d9efe54ad40; trace=0857fa9c-1c0b-4d17-8af6-93aaa3722c1a,id=0857fa9c-1c0b-4d17-8af6-93aaa3722c1a; trace=0857fa9c-1c0b-4d17-8af6-93aaa3722c1a,id=66df5d6a-c4b3-4882-aca6-60d91f8d01ac


In [None]:
import pandas as pd
import math
import os


def calculate_score(relevance, depth, prioritization):
    if math.isnan(prioritization):
        return relevance * 0.500 + depth * 0.500
    return relevance * 0.450 + depth * 0.450 + prioritization * 0.100


def apply_calculate_score(dataframe):
    return calculate_score(dataframe['relevance'], dataframe['depth'], dataframe['prioritization'])


dep_join = dep_df.columns.difference(rel_df.columns)
pri_join = pri_df.columns.difference(rel_df.columns)
output_df = pd.merge(rel_df, dep_df[dep_join], left_index=True,
                     right_index=True, how="outer")
output_df = pd.merge(output_df, pri_df[pri_join], left_index=True,
                     right_index=True, how="outer")

# display(output_df)
output_df["weighted_score"] = output_df.apply(apply_calculate_score, axis=1)

Path(os.getcwd() + output_dir).mkdir(parents=True, exist_ok=True)
output_df.to_excel(os.getcwd() + output_dir +
                   test_target['name'] + "_weighted_score.xlsx")

In [59]:
display(weighted_score_df.loc[:, 'weighted_score'])
weighted_today = weighted_score_df.loc[:, 'weighted_score']
weighted_today.values.to_list()

0     0.820000
1     0.132500
2     0.582015
3     0.710015
4     0.290000
5     0.400000
6     0.600000
7     0.575000
8     0.600000
9     0.275000
10    0.500000
11    0.375000
12    1.000000
13    0.500000
14    0.675000
Name: weighted_score, dtype: float64

AttributeError: 'numpy.ndarray' object has no attribute 'to_list'

In [None]:
import gspread
import pandas as pd




weighted_today = weighted_score_df.loc[:, 'weighted_score'].astype(str)
weighted_today.describe

gc = gspread.oauth(
    credentials_filename='C:/Users/njeny/Documents/Projects/benchmarking_automation/credentials.json',
    authorized_user_filename='C:/Users/njeny/Documents/Projects/benchmarking_automation/token.json'
)
# gc = gspread.service_account(filename='benchmarking-443208-52f8a50d0528.json')
SHEET_ID = '1ifHt5uJJ4uUeaF2O_qOJXmfLR-LCF64w8ZUopWJ2xmQ'

ss = gc.open_by_key(SHEET_ID)
worksheet = ss.worksheet("Sheet1")

# values_list = worksheet.row_values(1)
# print(values_list)
# rang = 'A' + str(len(values_list)+1)
# worksheet.update(rang, [["20240911"]] + [[e] for e in weighted_today.tolist()])


values = worksheet.get_all_values()
col = colnum_string(max([len(r) for r in values]) + 1)
worksheet.update(col + '1', [["20240911"]] + [[e]
                 for e in weighted_today.tolist()], value_input_option='USER_ENTERED')

  worksheet.update(col + '1', [["20240911"]] + [[e]


{'spreadsheetId': '1ifHt5uJJ4uUeaF2O_qOJXmfLR-LCF64w8ZUopWJ2xmQ',
 'updatedRange': 'Sheet1!J1:J16',
 'updatedRows': 16,
 'updatedColumns': 1,
 'updatedCells': 16}