In [18]:
import pandas as pd
import numpy as np
import json

In [24]:
# Configuration read by get_results() and by later cells when building paths.
model = 'gpt-3.5'
benchmark = 'HumanEval' #'APPS-codewars'

def get_results(task_id):
    """Collect per-iteration accuracies for one task across all strategies.

    For every strategy, the evaluation file
    ``./results/code-generation/{benchmark}/{task_id}/{strategy}/{model}/iter_{run_seed}/eval_program_correctness.json``
    is read for run seeds 0, 1 and 2 (runs whose file is missing are skipped).
    Each file is indexed as ``results[str(eval_seed)][str(iteration)]`` for
    eval seeds 0-2 and iterations 0-4, and is expected to hold a list of
    outcome strings at that position.

    Parameters
    ----------
    task_id : str
        Task identifier used as a path component.

    Returns
    -------
    dict
        ``{strategy: {iteration: [acc, ...]}}``. Each ``acc`` is the fraction
        of entries equal to the string ``'True'`` (a float in [0, 1]), or
        ``None`` when the (eval_seed, iteration) entry is absent or empty.
        Within an iteration the list is ordered by run seed, then eval seed;
        skipped runs contribute no entries, so positions are not stable run
        identifiers when a run is missing.
    """
    strategies = ['baseline', 'active-reasoning', 'baseline-binary', 'active-reasoning-binary']
    acc_dict = {}
    for strategy in strategies:
        # Parse every available run file exactly once; the per-iteration loop
        # below only indexes into the already-loaded dictionaries.
        loaded_runs = []
        for run_seed in [0, 1, 2]:
            path = f'./results/code-generation/{benchmark}/{task_id}/{strategy}/{model}/iter_{run_seed}/eval_program_correctness.json'
            try:
                # Context manager guarantees the handle is closed after parsing.
                with open(path, 'r') as results_file:
                    loaded_runs.append(json.load(results_file))
            except FileNotFoundError:
                continue

        acc_dict[strategy] = {}
        for iteration in range(5):
            accs = []
            for results_dict in loaded_runs:
                for eval_seed in range(3):
                    try:
                        r_ls = results_dict[str(eval_seed)][str(iteration)]
                    except KeyError:
                        # This eval seed / iteration was never recorded.
                        accs.append(None)
                        continue
                    n_total = len(r_ls)
                    if n_total == 0:
                        # No outcomes recorded: accuracy is undefined.
                        accs.append(None)
                    else:
                        n_correct = sum(1 for r in r_ls if r == 'True')
                        accs.append(n_correct / n_total)
            acc_dict[strategy][iteration] = accs
    return acc_dict

In [25]:
# Task ids evaluated for HumanEval.
id_ls = [
    1, 5, 6, 17, 26, 33, 36, 
    39, 41, 54, 55, 64, 70, 81, 96, 106, 109, 
    147, 93, 118, 101, 143, 121, 134, 139, 141,
    122, 82, 115, 77, 98, 90, 138, 74, 95, 110, 123,
    111, 154, 114, 91, 103, 107, 76, 159, 73
] # ids for HumanEval

# id_ls = [1614, 2664, 2671, 2681, 2717, 2728, 2881, 2927, 2939, 2991, 3016,
#        3068, 3072, 3079, 3220, 3248, 3278, 3366, 3443, 3452, 3477, 3536,
#        3589, 3594, 3598, 3689, 3706, 3715, 3786, 3822, 3856, 3958, 4084,
#        4128, 4182, 4190, 4240, 4293, 4315, 4317, 4326, 4353, 4360, 4414,
#        4438, 4513, 4546] # ids for APPS

# Build one small frame per (task, strategy) and concatenate once at the end;
# concatenating inside the loop copies the accumulated frame on every step.
per_task_frames = []
for i in id_ls:
    task_id = f'{i}'
    acc_dict = get_results(task_id)
    for key, results in acc_dict.items():
        # After the transpose: rows are the 5 iterations, columns are the
        # positions of the accuracy values in each iteration's list.
        results_df = pd.DataFrame(results).T
        results_df.index = pd.MultiIndex.from_product([[key], [task_id], range(5)])
        per_task_frames.append(results_df)

# Wide -> long: one row per (strategy, task, iteration, position).
all_results_df = pd.concat(per_task_frames).stack().reset_index()
all_results_df.columns = ['strategy', 'task_id', 'iter', 'seed', 'acc']

# Missing accuracies (None/NaN) must not enter the statistics. Dropping them
# explicitly keeps the result independent of stack()'s own NaN handling.
all_results_df = all_results_df.dropna(subset=['acc']).reset_index(drop=True)

# Frames that contained only None come through as object dtype; force a
# numeric column before converting fractions to percentages.
all_results_df['acc'] = all_results_df['acc'].astype(float) * 100

all_results_df.head()

Unnamed: 0,strategy,task_id,iter,seed,acc
0,baseline,5,0,0,25.0
1,baseline,5,0,1,10.0
2,baseline,5,0,2,10.0
3,baseline,5,0,3,25.0
4,baseline,5,0,4,10.0


In [26]:
def std_err(x, ddof=0):
    """Standard error of the mean of ``x``.

    Computed as ``std(x, ddof=ddof) / sqrt(len(x))``.

    Parameters
    ----------
    x : sequence, numpy array or pandas Series of numbers
    ddof : int, default 0
        Delta degrees of freedom forwarded to ``np.std``. The default of 0
        (population standard deviation) reproduces the values this notebook
        has always reported; pass ``ddof=1`` for the sample-based estimate.

    Returns
    -------
    float
        The standard error; ``nan`` for empty input.
    """
    return np.std(x, ddof=ddof) / np.sqrt(len(x))

# LaTeX table: mean accuracy per iteration and strategy, with the standard
# error in a smaller font in parentheses.
strategy_order = ['baseline-binary', 'active-reasoning-binary', 'baseline', 'active-reasoning']

def _one_decimal(table):
    """Round every cell to one decimal place and render it as a string."""
    return table.map(lambda value: np.round(value, 1)).astype(str)

# pivot_table's default aggregation is the mean.
mean_table = all_results_df.pivot_table(index='iter', columns='strategy', values='acc')
sem_table = all_results_df.pivot_table(index='iter', columns='strategy', values='acc', aggfunc=std_err)

# Raw string: '\s' is not a valid Python escape sequence, so the backslash
# of the LaTeX command has to be written literally.
latex_cells = (
    _one_decimal(mean_table)
    + r' \scriptsize{('
    + _one_decimal(sem_table)
    + ')}'
)

print(latex_cells[strategy_order].to_latex())

\begin{tabular}{lllll}
\toprule
strategy & baseline-binary & active-reasoning-binary & baseline & active-reasoning \\
iter &  &  &  &  \\
\midrule
0 & 44.1 \scriptsize{(1.9)} & 44.6 \scriptsize{(1.8)} & 47.1 \scriptsize{(1.4)} & 47.0 \scriptsize{(1.4)} \\
1 & 55.3 \scriptsize{(2.5)} & 66.8 \scriptsize{(2.5)} & 67.5 \scriptsize{(1.7)} & 74.4 \scriptsize{(1.6)} \\
2 & 65.2 \scriptsize{(2.5)} & 78.4 \scriptsize{(2.1)} & 71.6 \scriptsize{(1.7)} & 81.4 \scriptsize{(1.6)} \\
3 & 70.7 \scriptsize{(2.6)} & 82.2 \scriptsize{(2.1)} & 75.5 \scriptsize{(1.7)} & 84.5 \scriptsize{(1.5)} \\
4 & 70.8 \scriptsize{(2.6)} & 85.6 \scriptsize{(2.0)} & 75.9 \scriptsize{(1.7)} & 85.0 \scriptsize{(1.5)} \\
\bottomrule
\end{tabular}



In [27]:
# Per-strategy, per-iteration summary written to CSV.
# Named aggregation yields the final column names directly, and the string
# 'mean' selects pandas' own groupby mean; passing the np.mean callable
# instead triggers a FutureWarning about how callables will be applied.
llm_df = (
    all_results_df
    .groupby(['strategy', 'iter'])['acc']
    .agg(mean='mean', std=std_err)
    .reset_index()
)
llm_df['model'] = model
# Short method labels used in the exported file.
llm_df['method'] = llm_df['strategy'].map({
    'active-reasoning': 'EIG (O)',
    'baseline': 'base (O)',
    'active-reasoning-binary': 'EIG (B)',
    'baseline-binary': 'base (B)'
})
llm_df[['model', 'method', 'iter', 'mean', 'std']].to_csv(f'./results/code-generation/{benchmark}/{model}.csv', index=False)

  llm_df = all_results_df.groupby(['strategy', 'iter'])['acc'].agg([np.mean, std_err]).reset_index()
