In [None]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

from benchmark import Benchmark

workload_names = [
    "archeology.json",
    "astronomy.json",
    "biomedical.json" "environment.json",
    "legal.json",
    "wildfire.json",
]

sys_names = {
    "BaselineLLMSystemGPTo3FewShot_5Tries": "5 Tries",
    "BaselineLLMSystemGPTo3FewShot_10Tries": "10 Tries",
    "BaselineLLMSystemGPTo3FewShot_15Tries": "15 Tries",
    "BaselineLLMSystemGPTo3FewShot_10Rows": "10 Rows",
    "BaselineLLMSystemGPTo3FewShot_50Rows": "50 Rows",
    # 'BaselineLLMSystemGPTo3FewShot_100Rows' : '100 Rows',
    "BaselineLLMSystemGPTo3FewShot_150Rows": "150 Rows",
}

In [None]:
sut_metrics = {}
for sut_name in sys_names:
    aggregated_result_filepath = "./results/aggregated_results.csv"

    df = pd.read_csv(aggregated_result_filepath)
    metric_aggregation_dict = {}
    for (sut, metric), group in df.groupby(["sut", "metric"]):
        if sut != sut_name:
            continue
        group_dropped_na = group.dropna()
        metric_aggregation_dict[metric] = group["value_mean"].mean()
    # print(f"Aggregated results for {sut_name}:")
    # print(metric_aggregation_dict)
    sut_metrics[sut_name] = metric_aggregation_dict

metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics = [
    "bleu",
    "llm_code_eval",
    "f1",
    "mean_absolute_error",
    "precision",
    "recall",
    "rouge",
    "success",
    "runtime",
]
for m in metrics:
    if m != "runtime":
        metrics_df[m] = metrics_df[m] * 100

display(metrics_df)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)

for sut_name in sys_names:
    ltx_table = ltx_table.replace(sut_name, "& " + sys_names[sut_name])

print(ltx_table)

Unnamed: 0,bleu,f1,f1_approximate,llm_code_eval,llm_paraphrase,mean_absolute_error,mean_relative_absolute_error,mean_squared_error,precision,rae_score,recall,rouge,runtime,string_bootstrap,success
BaselineLLMSystemGPTo3FewShot_5Tries,30.71168,20.07378,0.0,45.596624,0.25,661.014706,7618.392877,211.107162,24.484083,3809.315169,28.978529,35.795162,15.960792,0.301339,20.209677
BaselineLLMSystemGPTo3FewShot_10Tries,32.5525,20.596518,0.0,48.828131,0.194444,665.324706,8213.540965,214.669116,24.336115,4106.862907,29.65048,37.59842,8.401137,0.277084,19.463408
BaselineLLMSystemGPTo3FewShot_15Tries,25.978303,19.747259,0.0,42.170862,0.222222,349.084706,7618.370206,57.087045,21.512256,3809.32268,28.199353,32.712843,8.558599,0.287698,18.376126
BaselineLLMSystemGPTo3FewShot_10Rows,26.05957,20.264073,0.0,46.956207,0.222222,22101.003362,6666.142298,327406.640222,26.00346,3333.241756,32.237575,33.536638,8.026405,0.274522,19.319282
BaselineLLMSystemGPTo3FewShot_50Rows,27.915855,20.637763,0.0,41.722773,0.194444,661.014,7618.392709,211.107162,24.276971,3809.315294,28.61083,31.437094,7.063004,0.286464,22.469963
BaselineLLMSystemGPTo3FewShot_150Rows,27.91379,18.773706,0.0,42.968399,0.25,141.189345,8094.491005,7.330263,22.569637,4047.362158,28.735452,35.622115,7.984223,0.29224,19.99266


\begin{table}
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & bleu & f1 & f1_approximate & llm_code_eval & llm_paraphrase & mean_absolute_error & mean_relative_absolute_error & mean_squared_error & precision & rae_score & recall & rouge & runtime & string_bootstrap & success \\
\midrule
& 5 Tries & 30.71 & 20.07 & 0.00 & 45.60 & 0.25 & 661.01 & 7618.39 & 211.11 & 24.48 & 3809.32 & 28.98 & 35.80 & 15.96 & 0.30 & 20.21 \\
& 10 Tries & 32.55 & 20.60 & 0.00 & 48.83 & 0.19 & 665.32 & 8213.54 & 214.67 & 24.34 & 4106.86 & 29.65 & 37.60 & 8.40 & 0.28 & 19.46 \\
& 15 Tries & 25.98 & 19.75 & 0.00 & 42.17 & 0.22 & 349.08 & 7618.37 & 57.09 & 21.51 & 3809.32 & 28.20 & 32.71 & 8.56 & 0.29 & 18.38 \\
& 10 Rows & 26.06 & 20.26 & 0.00 & 46.96 & 0.22 & 22101.00 & 6666.14 & 327406.64 & 26.00 & 3333.24 & 32.24 & 33.54 & 8.03 & 0.27 & 19.32 \\
& 50 Rows & 27.92 & 20.64 & 0.00 & 41.72 & 0.19 & 661.01 & 7618.39 & 211.11 & 24.28 & 3809.32 & 28.61 & 31

In [3]:
[k for k in sut_metrics.keys()]

['BaselineLLMSystemGPTo3FewShot_5Tries',
 'BaselineLLMSystemGPTo3FewShot_10Tries',
 'BaselineLLMSystemGPTo3FewShot_15Tries',
 'BaselineLLMSystemGPTo3FewShot_10Rows',
 'BaselineLLMSystemGPTo3FewShot_50Rows',
 'BaselineLLMSystemGPTo3FewShot_100Rows',
 'BaselineLLMSystemGPTo3FewShot_150Rows']

In [None]:
print("Per-domain aggregation:")
# Calculate the weighted mean of the following metrics per domain
domains = ["archeology", "astronomy", "biomedical", "environment", "legal", "wildfire"]
metrics = ["success", "llm_paraphrase", "rae_score", "f1"]
suts = list(sys_names.keys())
# measures = {}
# for sut in df['sut'].unique():
#     for workload_name in workload_names:
#         df['weighted_metric'] = df['value_support'] + df['value_mean']
#         x = df.groupby(['sut', 'workload'])['value_support'].sum()
#         supports = dict(x)
#         measures = []
#         for key in supports.keys():
#             sut, workload = key
#             sys_domain_measure = df[df['sut'] == sut][df['workload'] == workload]['value_mean'] * df[df['sut'] == sut][df['workload'] == workload]['value_support'] / supports[key]

df.loc[df["metric"] == "rae_score", "value_mean"] = 1 / (1 + df["value_mean"])
df["meansupp"] = df["value_mean"] * df["value_support"]
# change value_mean of rae_score to 1/(1+ value_mean)

results = {}
for domain in domains + ["overall", "runtime"]:
    if domain not in ["overall", "runtime", "rae_score"]:
        sut_df = df.query(
            f'sut in {suts} and workload == "{domain}.json" and metric in {metrics}'
        )
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y

    elif domain == "overall":
        sut_df = df.query(f"sut in {suts} and metric in {metrics}")
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y

    elif domain == "runtime":
        sut_df = df.query(f'sut in {suts} and metric == "runtime"')
        x = sut_df.groupby(["sut"]).sum()["meansupp"]
        y = sut_df.groupby(["sut"]).sum()["value_support"]
        results[domain] = x / y


domain_df = pd.DataFrame(results) * 100
domain_df = domain_df.reindex(suts)

display(domain_df)
ltx_table = domain_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)
for sys_name in sys_names:
    ltx_table = ltx_table.replace(sys_name, "& " + sys_names[sys_name])

print(ltx_table)

Per-domain aggregation:


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall,runtime
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BaselineLLMSystemGPTo3FewShot_5Tries,20.607638,11.945766,8.534435,34.840872,12.204257,40.59699,23.36241,1374.815624
BaselineLLMSystemGPTo3FewShot_10Tries,19.861545,11.603604,8.713293,36.655777,10.794672,37.864245,22.832362,575.879581
BaselineLLMSystemGPTo3FewShot_15Tries,,10.655849,8.718492,36.840115,10.147417,38.42531,22.923179,738.448902
BaselineLLMSystemGPTo3FewShot_10Rows,18.748753,12.79606,8.629019,34.516399,13.321824,37.419575,22.890776,732.445713
BaselineLLMSystemGPTo3FewShot_50Rows,23.475182,10.54666,7.869968,37.60401,14.075001,40.630148,24.681686,655.610235
BaselineLLMSystemGPTo3FewShot_150Rows,21.076572,10.579296,8.637668,31.677181,13.088347,39.224599,22.583519,802.900424


\begin{table}
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & archeology & astronomy & biomedical & environment & legal & wildfire & overall & runtime \\
sut &  &  &  &  &  &  &  &  \\
\midrule
& 5 Tries & 20.61 & 11.95 & 8.53 & 34.84 & 12.20 & 40.60 & 23.36 & 1374.82 \\
& 10 Tries & 19.86 & 11.60 & 8.71 & 36.66 & 10.79 & 37.86 & 22.83 & 575.88 \\
& 15 Tries & NaN & 10.66 & 8.72 & 36.84 & 10.15 & 38.43 & 22.92 & 738.45 \\
& 10 Rows & 18.75 & 12.80 & 8.63 & 34.52 & 13.32 & 37.42 & 22.89 & 732.45 \\
& 50 Rows & 23.48 & 10.55 & 7.87 & 37.60 & 14.08 & 40.63 & 24.68 & 655.61 \\
& 150 Rows & 21.08 & 10.58 & 8.64 & 31.68 & 13.09 & 39.22 & 22.58 & 802.90 \\
\bottomrule
\end{tabular}
\end{table}



In [35]:
mkdwn_table = domain_df.to_markdown()
for sys_name in sys_names:
    mkdwn_table = mkdwn_table.replace(sys_name, sys_names[sys_name])

print(mkdwn_table)

| sut                                   |   archeology |   astronomy |   biomedical |   environment |   legal |   wildfire |   overall |   runtime |
|:--------------------------------------|-------------:|------------:|-------------:|--------------:|--------:|-----------:|----------:|----------:|
| 5 Tries  |      20.6076 |     11.9458 |      8.53444 |       34.8409 | 12.2043 |    40.597  |   23.3624 |  1374.82  |
| 10 Tries |      19.8615 |     11.6036 |      8.71329 |       36.6558 | 10.7947 |    37.8642 |   22.8324 |   575.88  |
| 15 Tries |     nan      |     10.6558 |      8.71849 |       36.8401 | 10.1474 |    38.4253 |   22.9232 |   738.449 |
| 10 Rows  |      18.7488 |     12.7961 |      8.62902 |       34.5164 | 13.3218 |    37.4196 |   22.8908 |   732.446 |
| 50 Rows  |      23.4752 |     10.5467 |      7.86997 |       37.604  | 14.075  |    40.6301 |   24.6817 |   655.61  |
| 150 Rows |      21.0766 |     10.5793 |      8.63767 |       31.6772 | 13.0883 |    39.2246 |   22.5