In [1]:
import argparse
import csv
import json
import os

import numpy as np
import pandas as pd

from benchmark import Benchmark

workload_names = [
    "archeology.json",
    "astronomy.json",
    "biomedical.json" "environment.json",
    "legal.json",
    "wildfire.json",
]


sys_names = {
    'BaselineLLMSystemLlama3_3InstructNaive': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1Naive': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderNaive': 'Qwen2-5Coder',
    'BaselineLLMSystemLlama3_3InstructOneShot': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1OneShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderOneShot': 'Qwen2-5Coder',
    'BaselineLLMSystemLlama3_3InstructFewShot': 'Llama3-3Intruct',
    'BaselineLLMSystemDeepseekR1FewShot': 'DeepSeek-R1',
    'BaselineLLMSystemQwen2_5CoderFewShot': 'Qwen2-5Coder',
}

In [2]:
sut_metrics = {}
for sut_name in sys_names:

    aggregated_result_filepath = "./results/aggregated_results.csv"

    df = pd.read_csv(aggregated_result_filepath)
    metric_aggregation_dict = {}
    for (sut, metric), group in df.groupby(["sut", "metric"]):
        if sut != sut_name:
            continue
        group_dropped_na = group.dropna()
        metric_aggregation_dict[metric] = group["value_mean"].mean()
    # print(f"Aggregated results for {sut_name}:")
    # print(metric_aggregation_dict)
    sut_metrics[sut_name] = metric_aggregation_dict

metrics_df = pd.DataFrame.from_dict(sut_metrics, orient="index")
metrics = ['bleu', 'llm_code_eval', 'f1', 'mean_absolute_error', 'precision', 'recall', 'rouge', 'success', 'runtime']
for m in metrics:
    if m != 'runtime':
        metrics_df[m] = metrics_df[m]*100

display(metrics_df)
ltx_table = metrics_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different systems.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)

for sut_name in sys_names:
    ltx_table = ltx_table.replace(sut_name, "& "+sys_names[sut_name])

print(ltx_table)

Unnamed: 0,bleu,f1,f1_approximate,llm_code_eval,llm_paraphrase,mean_absolute_error,mean_relative_absolute_error,mean_squared_error,precision,rae_score,recall,rouge,runtime,string_bootstrap,success
BaselineLLMSystemLlama3_3InstructNaive,10.90184,15.381865,0.0,31.840467,0.083333,,1.0,,6.578313,0.5,4.515189,10.377817,15.825707,0.083333,3.466307
BaselineLLMSystemDeepseekR1Naive,13.857792,16.5114,0.0,15.283604,0.111111,,1.0,,9.613201,0.5,9.467789,15.349299,15.051404,0.134921,5.211261
BaselineLLMSystemQwen2_5CoderNaive,7.95299,13.73625,0.0,36.098564,0.055556,,1.0,,4.557502,0.5,4.439509,6.905594,16.910971,0.055556,1.155263
BaselineLLMSystemLlama3_3InstructOneShot,5.625398,17.353705,0.0,23.46638,0.027778,151352.868769,0.9673,2290769.0,7.612289,0.51245,6.963376,4.822261,27.957608,0.027778,5.340485
BaselineLLMSystemDeepseekR1OneShot,18.033475,16.126906,0.0,13.323198,0.111111,,1.0,,6.930969,0.5,6.871505,19.811936,13.910909,0.162698,5.773668
BaselineLLMSystemQwen2_5CoderOneShot,3.119757,17.882218,0.0,23.181768,0.0,,1.0,,3.600756,0.5,3.998638,1.706353,30.329862,0.0,2.245146
BaselineLLMSystemLlama3_3InstructFewShot,4.305823,19.580461,0.0,19.049272,0.027778,38657.533545,0.973022,572960.6,9.90549,0.526258,9.253332,3.400488,23.63146,0.027778,8.402118
BaselineLLMSystemDeepseekR1FewShot,22.825883,18.641354,0.0,10.784832,0.138889,151353.0,0.9673,2290773.0,16.459765,0.51245,21.157349,26.189374,15.374898,0.217615,15.600217
BaselineLLMSystemQwen2_5CoderFewShot,1.889522,20.751702,0.0,13.731349,0.0,1116.28,0.962014,124.6081,4.87975,0.510277,5.589343,0.0,39.383741,0.0,3.75


\begin{table}
\caption{Metrics for different systems.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & bleu & f1 & f1_approximate & llm_code_eval & llm_paraphrase & mean_absolute_error & mean_relative_absolute_error & mean_squared_error & precision & rae_score & recall & rouge & runtime & string_bootstrap & success \\
\midrule
& Llama3-3Intruct & 10.90 & 15.38 & 0.00 & 31.84 & 0.08 & NaN & 1.00 & NaN & 6.58 & 0.50 & 4.52 & 10.38 & 15.83 & 0.08 & 3.47 \\
& DeepSeek-R1 & 13.86 & 16.51 & 0.00 & 15.28 & 0.11 & NaN & 1.00 & NaN & 9.61 & 0.50 & 9.47 & 15.35 & 15.05 & 0.13 & 5.21 \\
& Qwen2-5Coder & 7.95 & 13.74 & 0.00 & 36.10 & 0.06 & NaN & 1.00 & NaN & 4.56 & 0.50 & 4.44 & 6.91 & 16.91 & 0.06 & 1.16 \\
& Llama3-3Intruct & 5.63 & 17.35 & 0.00 & 23.47 & 0.03 & 151352.87 & 0.97 & 2290769.09 & 7.61 & 0.51 & 6.96 & 4.82 & 27.96 & 0.03 & 5.34 \\
& DeepSeek-R1 & 18.03 & 16.13 & 0.00 & 13.32 & 0.11 & NaN & 1.00 & NaN & 6.93 & 0.50 & 6.87 & 19.81 & 13.91 & 0.16 & 5.77 \\
& Qwen2-5C

In [8]:
[k for k in sut_metrics.keys()]

['BaselineLLMSystemLlama3_3InstructNaive',
 'BaselineLLMSystemDeepseekR1Naive',
 'BaselineLLMSystemQwen2_5CoderNaive',
 'BaselineLLMSystemLlama3_3InstructOneShot',
 'BaselineLLMSystemDeepseekR1OneShot',
 'BaselineLLMSystemQwen2_5CoderOneShot',
 'BaselineLLMSystemLlama3_3InstructFewShot',
 'BaselineLLMSystemDeepseekR1FewShot',
 'BaselineLLMSystemQwen2_5CoderFewShot']

In [None]:
print("Per-domain aggregation:")
# Calculate the weighted mean of the following metrics per domain
domains = ['archeology', 'astronomy', 'biomedical', 'environment', 'legal', 'wildfire']
metrics = ['success', 'llm_paraphrase', 'mean_relative_absolute_error', 'f1']
suts = list(sys_names.keys())
# measures = {}
# for sut in df['sut'].unique():
#     for workload_name in workload_names:
#         df['weighted_metric'] = df['value_support'] + df['value_mean']
#         x = df.groupby(['sut', 'workload'])['value_support'].sum()
#         supports = dict(x)
#         measures = []
#         for key in supports.keys():
#             sut, workload = key
#             sys_domain_measure = df[df['sut'] == sut][df['workload'] == workload]['value_mean'] * df[df['sut'] == sut][df['workload'] == workload]['value_support'] / supports[key]
df['meansupp'] = df['value_mean'] * df['value_support']
results = {}
for domain in domains+['overall', 'runtime']:
    if domain not in ['overall', 'runtime']:
        sut_df = df.query(f'sut in {suts} and workload == "{domain}.json" and metric in {metrics}')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y

    elif domain == 'overall':
        sut_df = df.query(f'sut in {suts} and metric in {metrics}')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y

    elif domain == 'runtime':
        sut_df = df.query(f'sut in {suts} and metric == "runtime"')
        x = sut_df.groupby(['sut']).sum()['meansupp']
        y = sut_df.groupby(['sut']).sum()['value_support']
        results[domain] = x/y


domain_df = pd.DataFrame(results)*100
domain_df = domain_df.reindex(suts)

display(domain_df)
ltx_table = domain_df.to_latex(
    index=True,
    label="tab:metrics",
    caption="Metrics for different domains.",
    float_format="%.2f",
    column_format="l" + "c" * len(metrics_df.columns),
)
for sys_name in sys_names:
    ltx_table = ltx_table.replace(sys_name, "& "+sys_names[sys_name])

print(ltx_table)


Per-domain aggregation:


Unnamed: 0_level_0,archeology,astronomy,biomedical,environment,legal,wildfire,overall,runtime
sut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BaselineLLMSystemLlama3_3InstructNaive,15.246627,11.609031,7.973637,9.419268,8.093859,12.345299,10.382376,1215.770496
BaselineLLMSystemDeepseekR1Naive,18.678396,10.082315,5.393772,12.419821,9.516699,17.344537,12.480153,1172.260353
BaselineLLMSystemQwen2_5CoderNaive,,8.196569,5.14405,9.684149,8.288214,12.39129,9.24825,1243.923781
BaselineLLMSystemLlama3_3InstructOneShot,13.536643,11.43035,5.5714,12.399345,8.254262,20.656305,12.24549,2156.37474
BaselineLLMSystemDeepseekR1OneShot,18.046874,10.228354,5.762125,14.730483,7.864745,16.859829,12.375831,1077.230056
BaselineLLMSystemQwen2_5CoderOneShot,14.422837,8.982532,5.326595,12.399345,9.298903,13.203366,11.020322,2378.712947
BaselineLLMSystemLlama3_3InstructFewShot,17.351052,12.369454,8.990052,12.399345,10.798513,25.82371,14.749931,1776.363712
BaselineLLMSystemDeepseekR1FewShot,20.589219,12.740051,9.170564,28.851411,11.000227,26.581641,19.069635,1254.73283
BaselineLLMSystemQwen2_5CoderFewShot,19.090076,,6.997515,12.399345,9.988703,13.203366,12.233805,2972.830028


\begin{table}
\caption{Metrics for different domains.}
\label{tab:metrics}
\begin{tabular}{lccccccccccccccc}
\toprule
 & archeology & astronomy & biomedical & environment & legal & wildfire & overall & runtime \\
sut &  &  &  &  &  &  &  &  \\
\midrule
& Llama3-3Intruct & 15.25 & 11.61 & 7.97 & 9.42 & 8.09 & 12.35 & 10.38 & 1215.77 \\
& DeepSeek-R1 & 18.68 & 10.08 & 5.39 & 12.42 & 9.52 & 17.34 & 12.48 & 1172.26 \\
& Qwen2-5Coder & NaN & 8.20 & 5.14 & 9.68 & 8.29 & 12.39 & 9.25 & 1243.92 \\
& Llama3-3Intruct & 13.54 & 11.43 & 5.57 & 12.40 & 8.25 & 20.66 & 12.25 & 2156.37 \\
& DeepSeek-R1 & 18.05 & 10.23 & 5.76 & 14.73 & 7.86 & 16.86 & 12.38 & 1077.23 \\
& Qwen2-5Coder & 14.42 & 8.98 & 5.33 & 12.40 & 9.30 & 13.20 & 11.02 & 2378.71 \\
& Llama3-3Intruct & 17.35 & 12.37 & 8.99 & 12.40 & 10.80 & 25.82 & 14.75 & 1776.36 \\
& DeepSeek-R1 & 20.59 & 12.74 & 9.17 & 28.85 & 11.00 & 26.58 & 19.07 & 1254.73 \\
& Qwen2-5Coder & 19.09 & NaN & 7.00 & 12.40 & 9.99 & 13.20 & 12.23 & 2972.83 \\
\bottomrul