In [1]:
import itertools
import logging
from functools import partial

import pandas as pd
import lastmile_utils.lib.jupyter as jupyter_utils

# Disable column-width truncation so long text cells (e.g. full model outputs)
# are shown in their entirety when DataFrames are rendered.
pd.set_option("display.max_colwidth", None)

# Set the log level to WARNING through the lastmile notebook helper
# (presumably to keep INFO/DEBUG chatter out of cell outputs — TODO confirm).
jupyter_utils.set_log_level(logging.WARNING)



In [2]:
from aiconfig.eval.lib import (
    TestSuiteWithInputsSettings,
    UserTestSuiteWithInputs,
    brevity,
    contains_substring,
    run_test_suite_with_inputs,
)

jupyter_utils.set_log_level(logging.WARNING)

tsconfig = TestSuiteWithInputsSettings(
    {
        "prompt_name": "get_activities",
        "aiconfig_path": "./travel_parametrized.aiconfig.json",
    }
)


substring_case_insensitive = partial(contains_substring, case_sensitive=False)


test_inputs = [
    "Empire State Building is on fifth avenue. What is the cross street?",
    "What is the best borough?",
]
test_suite_with_inputs = UserTestSuiteWithInputs(
    [
        (test_inputs[0], partial(substring_case_insensitive, substring="34th street")),
        (test_inputs[1], partial(substring_case_insensitive, substring="Brooklyn")),
    ]
    + list(itertools.product(test_inputs, [brevity]))
)


df_result = await run_test_suite_with_inputs(
    test_suite=test_suite_with_inputs,
    settings=tsconfig,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show the long-format results of the with-inputs run: one row per
# (input, metric) evaluation, with the metric's value and metadata columns.
df_result

Unnamed: 0,input,aiconfig_output,value,metric_name,metric_description,best_value,worst_value
0,Empire State Building is on fifth avenue. What is the cross street?,The cross street of Empire State Building is West 34th Street.,1.0,contains_substring,1.0 (pass) if contains given substring,1.0,0.0
1,Empire State Building is on fifth avenue. What is the cross street?,The cross street of Empire State Building is West 34th Street.,62.0,brevity,Absolute text length,1.0,inf
2,What is the best borough?,"As an AI language model, I don't have personal opinions. The choice of the best borough is subjective and depends on individual preferences. Each borough of a city has its own unique qualities and attractions. For example, in New York City, some might prefer Manhattan for its iconic landmarks and vibrant city life, while others might prefer Brooklyn for its trendy neighborhoods and diverse culture. It ultimately depends on what you value and what you are looking for in a borough.",1.0,contains_substring,1.0 (pass) if contains given substring,1.0,0.0
3,What is the best borough?,"As an AI language model, I don't have personal opinions. The choice of the best borough is subjective and depends on individual preferences. Each borough of a city has its own unique qualities and attractions. For example, in New York City, some might prefer Manhattan for its iconic landmarks and vibrant city life, while others might prefer Brooklyn for its trendy neighborhoods and diverse culture. It ultimately depends on what you value and what you are looking for in a borough.",484.0,brevity,Absolute text length,1.0,inf


In [4]:
# Reshape long -> wide: keep one row per (input, aiconfig_output) pair and
# spread each metric's value into its own column.
(
    df_result
    .set_index(["input", "aiconfig_output", "metric_name"])["value"]
    .unstack("metric_name")
)

Unnamed: 0_level_0,metric_name,brevity,contains_substring
input,aiconfig_output,Unnamed: 2_level_1,Unnamed: 3_level_1
Empire State Building is on fifth avenue. What is the cross street?,The cross street of Empire State Building is West 34th Street.,62.0,1.0
What is the best borough?,"As an AI language model, I don't have personal opinions. The choice of the best borough is subjective and depends on individual preferences. Each borough of a city has its own unique qualities and attractions. For example, in New York City, some might prefer Manhattan for its iconic landmarks and vibrant city life, while others might prefer Brooklyn for its trendy neighborhoods and diverse culture. It ultimately depends on what you value and what you are looking for in a borough.",484.0,1.0


In [5]:
import itertools
from functools import partial


from aiconfig.eval.lib import (
    UserTestSuiteOutputsOnly,
    brevity,
    contains_substring,
    run_test_suite_outputs_only,
)


substring_case_insensitive = partial(contains_substring, case_sensitive=False)


test_outputs = [
    "The cross street for the Empire State Building is West 34th Street.",
    'The answer to this question is subjective and depends on individual preferences. All the boroughs of a city have their own unique features, attractions, and advantages. Therefore, it is impossible to determine the "best" borough as it varies depending on personal interests, lifestyle, and priorities.',
]
test_suite_outputs_only = UserTestSuiteOutputsOnly(
    [
        (test_outputs[0], partial(substring_case_insensitive, substring="34th street")),
        (test_outputs[1], partial(substring_case_insensitive, substring="Brooklyn")),
    ]
    + list(itertools.product(test_outputs, [brevity]))
)


df_result = await run_test_suite_outputs_only(
    test_suite=test_suite_outputs_only,
)

In [6]:
# Show the long-format results of the outputs-only run: one row per
# (output, metric) evaluation. The `input` column reads "Missing" for every
# row of this run.
df_result

Unnamed: 0,input,aiconfig_output,value,metric_name,metric_description,best_value,worst_value
0,Missing,The cross street for the Empire State Building is West 34th Street.,1.0,contains_substring,1.0 (pass) if contains given substring,1.0,0.0
1,Missing,"The answer to this question is subjective and depends on individual preferences. All the boroughs of a city have their own unique features, attractions, and advantages. Therefore, it is impossible to determine the ""best"" borough as it varies depending on personal interests, lifestyle, and priorities.",0.0,contains_substring,1.0 (pass) if contains given substring,1.0,0.0
2,Missing,The cross street for the Empire State Building is West 34th Street.,67.0,brevity,Absolute text length,1.0,inf
3,Missing,"The answer to this question is subjective and depends on individual preferences. All the boroughs of a city have their own unique features, attractions, and advantages. Therefore, it is impossible to determine the ""best"" borough as it varies depending on personal interests, lifestyle, and priorities.",301.0,brevity,Absolute text length,1.0,inf


In [7]:
# Reshape long -> wide: keep one row per aiconfig_output and spread each
# metric's value into its own column.
(
    df_result
    .set_index(["aiconfig_output", "metric_name"])["value"]
    .unstack("metric_name")
)

metric_name,brevity,contains_substring
aiconfig_output,Unnamed: 1_level_1,Unnamed: 2_level_1
"The answer to this question is subjective and depends on individual preferences. All the boroughs of a city have their own unique features, attractions, and advantages. Therefore, it is impossible to determine the ""best"" borough as it varies depending on personal interests, lifestyle, and priorities.",301.0,0.0
The cross street for the Empire State Building is West 34th Street.,67.0,1.0
