In [7]:
import os

from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load the .env file *before* building the credential so that any AZURE_*
# identity settings defined there are already present in os.environ when the
# credential chain is constructed. override=True lets .env values win over
# variables already set in the process environment.
load_dotenv(override=True)  # take environment variables from .env.

# Shared credential object reused by the evaluators further down.
credential = DefaultAzureCredential()


True

In [8]:
import json
import pandas as pd

# Placeholder answer that marks a query with no usable response; such rows are dropped.
no_answer_response = 'The requested information is not available in the retrieved data. Please try another query or topic.'

# Columns kept from the raw results file, mapped to the short names used afterwards.
column_renames = {
    'inputs.context': 'context',
    'inputs.ground_truth': 'ground_truth',
    'inputs.query': 'query',
    'inputs.response': 'response',
}

# Read the JSON Lines results, keep only the input columns, drop the
# placeholder responses, renumber the rows and shorten the column names.
flow_output_dataframe = (
    pd.read_json("instance_results.jsonl", lines=True)
    [list(column_renames)]
    .loc[lambda frame: frame["inputs.response"] != no_answer_response]
    .reset_index(drop=True)
    .rename(columns=column_renames)
)
flow_output_dataframe

Unnamed: 0,context,ground_truth,query,response
0,"[{'content': 'Table of Contents\nreasons, incl...",['Our International Operations Expose Us to a ...,What are some of the risks associated with int...,International operations expose companies to a...
1,"[{'content': ""Table of Contents\nOur digital c...",['Title: Operating Risks Content: Table of Con...,Wht are the risks assoociated with holding cas...,Holding cash equivalents in British Pounds (GB...
2,"[{'content': ""Alphabet Inc.\nTable of Contents...",['The Loss of Key Senior Management Personnel ...,Why is it challenging for businesses to hire a...,Businesses face challenges in hiring and retai...
3,[{'content': 'Table of Contents\nWe Face Risks...,"[""Title: Risks Related to Data Security, Syste...",How can supplier relationships impact the oper...,Supplier relationships can significantly impac...
4,[{'content': 'Exhibit A ALPHABET INC. CLAWBACK...,"['Title: Claims, Litigation, Government Invest...",How do governments and regulatory authorities ...,Governments and regulatory authorities impact ...
5,[{'content': 'Table of Contents\nAlphabet Inc....,"['Risk Factors."" Overview Our primary source o...",What be free cash flows and how they help busi...,Free cash flow represents the cash a company g...
6,"[{'content': ""Alphabet Inc.\nTable of Contents...",['<1-hop>\n\nNote 10 - SEGMENT INFORMATION Con...,How do tax contingencies and segment performan...,Tax contingencies and segment performance eval...
7,[{'content': '3\nCONTRACT OR TORT OR OTHERWISE...,"[""<1-hop>\n\nTitle: Risk Factors Content: Tabl...",How do the risks associated with artificial in...,The risks associated with artificial intellige...
8,"[{'content': '(stating that it is a ""notice of...",['<1-hop>\n\nTitle: Notice of Default Content:...,What happens when the Administrative Agent res...,"When the Administrative Agent resigns, the fol..."
9,[{'content': 'Table of Contents\nand related t...,['<1-hop>\n\nTitle: Cost of Sales and Operatin...,How do fulfillment costs and marketing costs i...,Fulfillment costs and marketing costs signific...


In [51]:
# Show the raw context value of the first row (index label 0).
flow_output_dataframe.loc[0, 'context']

'[{\'content\': \'Table of Contents\\nreasons, including if we are unable to realize the expected benefits of new technologies or if we do not successfully plan or execute new strategies, which could harm our business or damage our reputation.\\nOur International Operations Expose Us to a Number of Risks\\nOur international activities are significant to our revenues and profits, and we plan to further expand internationally. In certain international market segments, we have relatively little operating experience and may not benefit from any first-to-market advantages or otherwise succeed. It is costly to establish, develop, and maintain international operations and stores, and promote our brand internationally. Our international operations may not become profitable on a sustained basis.\\nIn addition to risks described elsewhere in this section, our international sales and operations are subject to a number of risks, including:\\n· local economic and political conditions;\\n· governmen

In [9]:
output_path = "data.jsonl"

# force_ascii=False keeps non-ASCII characters as-is in the JSON text, so the
# file encoding is pinned to UTF-8 instead of relying on the platform default
# (which is not UTF-8 on every system, e.g. Windows).
with open(output_path, "w", encoding="utf-8") as f:
    # One JSON object per line (JSON Lines), one line per DataFrame row.
    f.write(flow_output_dataframe.to_json(orient='records', lines=True, force_ascii=False))

### Basic evaluator

In [11]:
# Initialize the Azure AI project and Azure OpenAI connection from environment variables.
# A variable that is not set yields None for the corresponding entry.
azure_ai_project = dict(
    subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
    resource_group_name=os.getenv("AZURE_RESOURCE_GROUP"),
    project_name=os.getenv("AZURE_PROJECT_NAME"),
)

model_config = dict(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

from azure.ai.evaluation import GroundednessProEvaluator, GroundednessEvaluator

# Initializing the Groundedness and Groundedness Pro evaluators.
groundedness_eval = GroundednessEvaluator(model_config)
groundedness_pro_eval = GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential)

# A single hand-written sample used as a smoke test for the evaluator.
query_response = {
    "query": "Which tent is the most waterproof?",
    "context": "The Alpine Explorer Tent is the most water-proof of all tents available.",
    "response": "The Alpine Explorer Tent is the most waterproof.",
}

# Run the Groundedness evaluator on the query/context/response sample.
groundedness_score = groundedness_eval(**query_response)
print(groundedness_score)

{'groundedness': 5.0, 'gpt_groundedness': 5.0, 'groundedness_reason': 'The response is fully correct and complete, directly addressing the query with precise information from the context.'}
{'answer_length': 27}
2025-01-30 13:31:43 -0500   21200 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-01-30 13:31:49 -0500   21200 execution.bulk     INFO     Finished 2 / 29 lines.
2025-01-30 13:31:49 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.03 seconds. Estimated time for incomplete lines: 81.81 seconds.
2025-01-30 13:31:50 -0500   21200 execution.bulk     INFO     Finished 5 / 29 lines.
2025-01-30 13:31:50 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.34 seconds. Estimated time for incomplete lines: 32.16 seconds.
2025-01-30 13:31:51 -0500   21200 execution.bulk     INFO     Finished 9 / 29 lines.
2025-01-30 13:31:51 -0500   21200 execution

### Custom Evaluator

In [43]:
from answer_len.answer_length import AnswerLengthEvaluator

# Instantiate the custom evaluator and try it on a single sample string.
answer_length_evaluator = AnswerLengthEvaluator()

sample_answer = "What is the speed of light?"
answer_length = answer_length_evaluator(answer=sample_answer)

# Bare last expression so the notebook displays the result.
answer_length

{'answer_length': 27}

### Local Evaluator on Datasets

In [47]:
from azure.ai.evaluation import evaluate

# Column mappings: evaluator input name -> column of the dataset file.
groundedness_columns = {
    "query": "${data.query}",
    "context": "${data.context}",
    "response": "${data.response}",
}
answer_length_columns = {
    "answer": "${data.response}",
}

result = evaluate(
    data="data.jsonl",  # provide your data here
    evaluators={
        "groundedness": groundedness_eval,
        "answer_length": answer_length_evaluator,
    },
    # column mapping
    evaluator_config={
        "groundedness": {"column_mapping": groundedness_columns},
        "answer_length": {"column_mapping": answer_length_columns},
    },
    # Optionally provide your Azure AI project information to track your evaluation results in your Azure AI project
    #azure_ai_project = azure_ai_project,
    # Optionally provide an output path to dump a json of metric summary, row level data and metric and Azure AI project URL
    output_path="./myevalresults.json",
)

[2025-01-30 13:57:08 -0500][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_m9iynfbk_20250130_135708_236404, log path: C:\Users\jomedin\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_m9iynfbk_20250130_135708_236404\logs.txt
[2025-01-30 13:57:08 -0500][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run answer_len_answer_length_answerlengthevaluator_meca6tnj_20250130_135708_249667, log path: C:\Users\jomedin\.promptflow\.runs\answer_len_answer_length_answerlengthevaluator_meca6tnj_20250130_135708_249667\logs.txt


Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_m9iynfbk_20250130_135708_236404
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=answer_len_answer_length_answerlengthevaluator_meca6tnj_20250130_135708_249667
2025-01-30 13:57:23 -0500   21200 execution.bulk     INFO     Finished 20 / 29 lines.
2025-01-30 13:57:23 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 0.71 seconds. Estimated time for incomplete lines: 6.39 seconds.
2025-01-30 13:57:25 -0500   21200 execution.bulk     INFO     Finished 22 / 29 lines.
2025-01-30 13:57:25 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 0.74 seconds. Estimated time for incomplete lines: 5.18 seconds.
2025-01-30 13:57:25 -0500   21200 execution.bulk     INFO     Finished 24 

In [49]:
import json

# Read the evaluation results back. The encoding is set explicitly because
# JSON text is UTF-8 by specification, while the platform default used by
# open() is not UTF-8 on every system.
with open("./myevalresults.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Aggregated metrics only; the per-row results are under other keys.
data['metrics']

{'groundedness.groundedness': 4.793103448275862,
 'groundedness.gpt_groundedness': 4.793103448275862,
 'answer_length.answer_length': 1505.551724137931}

In [16]:
## https://github.com/Azure-Samples/azureai-samples/blob/main/scenarios/evaluate/Supported_Evaluation_Metrics/AI_Judge_Evaluators_Quality/AI_Judge_Evaluators_Quality.ipynb

In [33]:
import pathlib

from azure.ai.evaluation import evaluate
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    FluencyEvaluator,
    SimilarityEvaluator,
    RetrievalEvaluator,
)

# Content safety runs against the Azure AI project, so it takes the project
# info and a credential; the shared `credential` object is reused here, as for
# the Groundedness Pro evaluator, instead of building a second one.
content_safety_evaluator = ContentSafetyEvaluator(
    azure_ai_project=azure_ai_project, credential=credential
)
# The remaining evaluators are configured with the Azure OpenAI model settings.
relevance_evaluator = RelevanceEvaluator(model_config)
coherence_evaluator = CoherenceEvaluator(model_config)
groundedness_evaluator = GroundednessEvaluator(model_config)
fluency_evaluator = FluencyEvaluator(model_config)
similarity_evaluator = SimilarityEvaluator(model_config)
retrieval_evaluator = RetrievalEvaluator(model_config)

path = "data.jsonl"

results = evaluate(
    # Single hyphen between the prefix and the deployment name (the previous
    # concatenation produced "Eval-Run--<name>").
    evaluation_name="Eval-Run-" + model_config["azure_deployment"].title(),
    data=path,
    evaluators={
        "content_safety": content_safety_evaluator,
        "coherence": coherence_evaluator,
        "relevance": relevance_evaluator,
        "groundedness": groundedness_evaluator,
        "fluency": fluency_evaluator,
        "similarity": similarity_evaluator,
        "retrieval": retrieval_evaluator,
    },
    # Per-evaluator mapping from evaluator input names to dataset columns.
    evaluator_config={
        "content_safety": {"column_mapping": {"query": "${data.query}", "response": "${data.response}"}},
        "coherence": {"column_mapping": {"response": "${data.response}", "query": "${data.query}"}},
        "relevance": {
            "column_mapping": {"response": "${data.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "groundedness": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            }
        },
        "fluency": {
            "column_mapping": {"response": "${data.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "similarity": {
            "column_mapping": {"ground_truth": "${data.ground_truth}", "response": "${data.response}", "context": "${data.context}", "query": "${data.query}"}
        },
        "retrieval":{
            "column_mapping": {"context": "${data.context}", "query": "${data.query}"}
        }
    },
    output_path="./combined_results.json"
)

[2025-01-30 13:47:16 -0500][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_similarity_similarity_asyncsimilarityevaluator_19jzuv0_20250130_134715_705730, log path: C:\Users\jomedin\.promptflow\.runs\azure_ai_evaluation_evaluators_similarity_similarity_asyncsimilarityevaluator_19jzuv0_20250130_134715_705730\logs.txt
[2025-01-30 13:47:16 -0500][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tovcwj26_20250130_134715_699238, log path: C:\Users\jomedin\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tovcwj26_20250130_134715_699238\logs.txt
[2025-01-30 13:47:16 -0500][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_mjsnhqov_20250130_134715_705730, log path: C:\Users\jomedin\.promptflow\.runs\azure_ai_evaluation_evaluators_common_bas

Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_similarity_similarity_asyncsimilarityevaluator_19jzuv0_20250130_134715_705730
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_mjsnhqov_20250130_134715_705730
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_tovcwj26_20250130_134715_699238
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_ob7128f9_20250130_134715_699238
You can view the trac

[2025-01-30 13:47:42 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 43 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:42 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 43 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:42 -0500][promptflow.core._prompty_utils][ERROR] - Exception occur

2025-01-30 13:47:44 -0500   21200 execution.bulk     INFO     Finished 18 / 29 lines.
2025-01-30 13:47:44 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.54 seconds. Estimated time for incomplete lines: 16.94 seconds.
2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Finished 19 / 29 lines.
2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.51 seconds. Estimated time for incomplete lines: 15.1 seconds.
2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Finished 20 / 29 lines.
2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.43 seconds. Estimated time for incomplete lines: 12.87 seconds.


[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 40 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 40 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occur

2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Finished 20 / 29 lines.
2025-01-30 13:47:45 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.44 seconds. Estimated time for incomplete lines: 12.96 seconds.


[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 40 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 40 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:45 -0500][promptflow.core._prompty_utils][ERROR] - Exception occur

2025-01-30 13:47:47 -0500   21200 execution.bulk     INFO     Finished 12 / 29 lines.
2025-01-30 13:47:47 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 2.54 seconds. Estimated time for incomplete lines: 43.18 seconds.


[2025-01-30 13:47:47 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 39 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:47 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 39 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}


2025-01-30 13:47:50 -0500   21200 execution.bulk     INFO     Finished 17 / 29 lines.
2025-01-30 13:47:50 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 2.0 seconds. Estimated time for incomplete lines: 24.0 seconds.
2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Finished 19 / 29 lines.
2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.84 seconds. Estimated time for incomplete lines: 18.4 seconds.
2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Finished 20 / 29 lines.
2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.75 seconds. Estimated time for incomplete lines: 15.75 seconds.


[2025-01-30 13:47:51 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 34 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:51 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 34 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:51 -0500][promptflow.core._prompty_utils][ERROR] - Exception occur

2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Finished 19 / 29 lines.
2025-01-30 13:47:51 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 1.86 seconds. Estimated time for incomplete lines: 18.6 seconds.


[2025-01-30 13:47:52 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 34 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:52 -0500][promptflow.core._prompty_utils][ERROR] - Exception occurs: RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-10-21 have exceeded token rate limit of your current AIServices S0 pricing tier. Please retry after 34 seconds. Please contact Azure support service if you would like to further increase the default rate limit.'}}
[2025-01-30 13:47:52 -0500][promptflow.core._prompty_utils][ERROR] - Exception occur

2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Finished 22 / 29 lines.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.6 seconds. Estimated time for incomplete lines: 25.2 seconds.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Finished 24 / 29 lines.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.3 seconds. Estimated time for incomplete lines: 16.5 seconds.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Finished 26 / 29 lines.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.05 seconds. Estimated time for incomplete lines: 9.15 seconds.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Finished 22 / 29 lines.
2025-01-30 13:48:35 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.61 seconds. Estimated time for incomp

 Please check out C:/Users/jomedin/.promptflow/.runs/azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_ob7128f9_20250130_134715_699238 for more details.


2025-01-30 13:48:46 -0500   21200 execution.bulk     INFO     Finished 26 / 29 lines.
2025-01-30 13:48:46 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.48 seconds. Estimated time for incomplete lines: 10.44 seconds.
2025-01-30 13:48:47 -0500   21200 execution.bulk     INFO     Finished 28 / 29 lines.
2025-01-30 13:48:47 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.24 seconds. Estimated time for incomplete lines: 3.24 seconds.
2025-01-30 13:48:48 -0500   21200 execution.bulk     INFO     Finished 29 / 29 lines.
2025-01-30 13:48:48 -0500   21200 execution.bulk     INFO     Average execution time for completed lines: 3.16 seconds. Estimated time for incomplete lines: 0.0 seconds.


In [34]:
import json

# Read the combined evaluation results back. The encoding is set explicitly
# because JSON text is UTF-8 by specification, while the platform default used
# by open() is not UTF-8 on every system.
with open("./combined_results.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Aggregated metrics only; the per-row results are under data['rows'].
data['metrics']

{'coherence.coherence': 4.275862068965517,
 'coherence.gpt_coherence': 4.275862068965517,
 'relevance.relevance': 4.517241379310345,
 'relevance.gpt_relevance': 4.517241379310345,
 'groundedness.groundedness': 4.137931034482759,
 'groundedness.gpt_groundedness': 4.137931034482759,
 'fluency.fluency': 4.103448275862069,
 'fluency.gpt_fluency': 4.103448275862069,
 'similarity.similarity': 3.7586206896551726,
 'similarity.gpt_similarity': 3.7586206896551726,
 'retrieval.retrieval': 2.6206896551724137,
 'retrieval.gpt_retrieval': 2.6206896551724137}

In [35]:
# Tabulate the per-row evaluation results and show the first ten rows.
pd.DataFrame(data["rows"]).iloc[:10]

Unnamed: 0,inputs.context,inputs.ground_truth,inputs.query,inputs.response,outputs.coherence.coherence,outputs.coherence.gpt_coherence,outputs.coherence.coherence_reason,outputs.relevance.relevance,outputs.relevance.gpt_relevance,outputs.relevance.relevance_reason,...,outputs.groundedness.gpt_groundedness,outputs.groundedness.groundedness_reason,outputs.fluency.fluency,outputs.fluency.gpt_fluency,outputs.fluency.fluency_reason,outputs.similarity.similarity,outputs.similarity.gpt_similarity,outputs.retrieval.retrieval,outputs.retrieval.gpt_retrieval,outputs.retrieval.retrieval_reason
0,"[{'content': 'Table of Contents\nreasons, incl...",['Our International Operations Expose Us to a ...,What are some of the risks associated with int...,International operations expose companies to a...,5,5,"The RESPONSE is coherent, well-organized, and ...",4,4,The RESPONSE fully addresses the QUERY with ac...,...,3,The RESPONSE is accurate in its use of informa...,4,4,"The RESPONSE is well-articulated, grammaticall...",4,4,5,5,The first two chunks are highly relevant and w...
1,"[{'content': ""Table of Contents\nOur digital c...",['Title: Operating Risks Content: Table of Con...,Wht are the risks assoociated with holding cas...,Holding cash equivalents in British Pounds (GB...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE fully addresses the QUERY with ac...,...,5,"The RESPONSE is entirely based on the CONTEXT,...",4,4,"The RESPONSE is well-articulated, with good gr...",3,3,3,3,The context contains relevant information abou...
2,"[{'content': ""Alphabet Inc.\nTable of Contents...",['The Loss of Key Senior Management Personnel ...,Why is it challenging for businesses to hire a...,Businesses face challenges in hiring and retai...,4,4,The RESPONSE is coherent and effectively addre...,4,4,The RESPONSE fully addresses the QUERY with ac...,...,5,"The RESPONSE is entirely based on the CONTEXT,...",4,4,"The RESPONSE is well-articulated, with good gr...",3,3,2,2,The context is partially relevant but poorly r...
3,[{'content': 'Table of Contents\nWe Face Risks...,"[""Title: Risks Related to Data Security, Syste...",How can supplier relationships impact the oper...,Supplier relationships can significantly impac...,5,5,"The RESPONSE is coherent, well-structured, and...",5,5,The RESPONSE fully addresses the QUERY with ac...,...,5,"The RESPONSE is fully grounded and complete, a...",4,4,"The RESPONSE is well-articulated, with good gr...",3,3,2,2,The context contains one relevant chunk at the...
4,[{'content': 'Exhibit A ALPHABET INC. CLAWBACK...,"['Title: Claims, Litigation, Government Invest...",How do governments and regulatory authorities ...,Governments and regulatory authorities impact ...,4,4,"The RESPONSE is coherent, well-organized, and ...",5,5,The RESPONSE fully addresses the QUERY with ac...,...,5,"The RESPONSE is entirely based on the CONTEXT,...",4,4,"The RESPONSE is well-written, with strong gram...",3,3,2,2,The context contains relevant information abou...
5,[{'content': 'Table of Contents\nAlphabet Inc....,"['Risk Factors."" Overview Our primary source o...",What be free cash flows and how they help busi...,Free cash flow represents the cash a company g...,5,5,"The RESPONSE is coherent, logically structured...",4,4,The RESPONSE fully addresses the QUERY with ac...,...,3,The RESPONSE is accurate in its description of...,4,4,"The RESPONSE is well-articulated, with good gr...",3,3,2,2,The input data should get this score because w...
6,"[{'content': ""Alphabet Inc.\nTable of Contents...",['<1-hop>\n\nNote 10 - SEGMENT INFORMATION Con...,How do tax contingencies and segment performan...,Tax contingencies and segment performance eval...,4,4,"The RESPONSE is coherent, logically organized,...",4,4,The RESPONSE fully addresses the QUERY with ac...,...,3,The RESPONSE is accurate in discussing tax con...,4,4,"The RESPONSE is well-articulated, with good co...",3,3,2,2,The contexts are partially relevant as they di...
7,[{'content': '3\nCONTRACT OR TORT OR OTHERWISE...,"[""<1-hop>\n\nTitle: Risk Factors Content: Tabl...",How do the risks associated with artificial in...,The risks associated with artificial intellige...,4,4,"The RESPONSE is coherent, well-organized, and ...",5,5,The RESPONSE fully addresses the QUERY with ac...,...,3,The RESPONSE is accurate and aligns with the C...,5,5,The RESPONSE deserves a high score because it ...,4,4,2,2,The context contains partially relevant inform...
8,"[{'content': '(stating that it is a ""notice of...",['<1-hop>\n\nTitle: Notice of Default Content:...,What happens when the Administrative Agent res...,"When the Administrative Agent resigns, the fol...",4,4,"The RESPONSE is coherent, well-organized, and ...",5,5,The RESPONSE fully addresses the QUERY with ac...,...,3,The RESPONSE is mostly grounded in the CONTEXT...,4,4,"The RESPONSE is well-written, coherent, and de...",4,4,3,3,The context contains relevant information but ...
9,[{'content': 'Table of Contents\nand related t...,['<1-hop>\n\nTitle: Cost of Sales and Operatin...,How do fulfillment costs and marketing costs i...,Fulfillment costs and marketing costs signific...,5,5,"The RESPONSE is coherent, well-structured, and...",5,5,The RESPONSE fully addresses the QUERY with ac...,...,5,"The RESPONSE is entirely based on the CONTEXT,...",4,4,"The RESPONSE is well-articulated, grammaticall...",5,5,4,4,The input data should get this score because t...
