In [None]:
from analysis import load_raw_data, log_scale_integers, px, compute_performance_by_gen_en
from analysis import plot_performance_vs_LLMCalls, compute_dVx, compute_dFx, ScaleLaws
from analysis import plot_performance_vs_LLMCalls_onecurve, simulate_answer
from analysis import plot_scatter_plotly_combined, plot_answer_distribution_pie
import numpy
from tqdm import tqdm

# A Case Study on AVeriTeC

In [None]:
# --- Experiment configuration ------------------------------------------------
# Dataset key and split handed to `load_raw_data`.
dataname = 'averitec'
cate = 'dev'

# Values forwarded as `gen_n_list=` / `samples=` to
# `compute_performance_by_gen_en` in the cells below.
gen_n_list = log_scale_integers(2, 1000, 20)
samples = 1000

# Plot styling: the first three entries of Plotly's qualitative palette.
color_filter, color_vote, color_line = px.colors.qualitative.Plotly[:3]
colors = [color_vote, color_filter]  # passed as `colors=` to the two-curve plots
width, height = 750, 600

### 1. Overall Performance

In [None]:
# Load the same dataset/split twice: once with `use_filter=True` and once
# with `use_filter=False`.
data_filter = load_raw_data(
    dataname=dataname,
    cate=cate,
    use_filter=True,
)

# Use the shared configuration values instead of repeating the literals, so
# that changing `dataname` / `cate` keeps both frames pointing at the same data.
data_nofilter = load_raw_data(
    dataname=dataname,
    cate=cate,
    use_filter=False,
)

# Overall performance over every value in `gen_n_list`, one result per variant.
results_overall_nofilter = compute_performance_by_gen_en(
    data_nofilter,
    gen_n_list=gen_n_list,
    samples=samples,
    use_filter=False,
)

results_overall_filter = compute_performance_by_gen_en(
    data_filter,
    gen_n_list=gen_n_list,
    samples=samples,
    use_filter=True,
)


In [None]:
# Two-curve comparison of the unfiltered and filtered overall results
# (1000x700 canvas, `h_legend=True`).
fig = plot_performance_vs_LLMCalls(
    results_overall_nofilter,
    results_overall_filter,
    colors=colors,
    width=1000,
    height=700,
    fontsize=60,
    marker_size=20,
    line_width=8,
    show_legend=True,
    h_legend=True,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.01,
)

In [None]:
# Same two-curve comparison on an 800x700 canvas with a smaller font and
# without the `h_legend` option.
fig = plot_performance_vs_LLMCalls(
    results_overall_nofilter,
    results_overall_filter,
    colors=colors,
    width=800,
    height=700,
    fontsize=40,
    marker_size=20,
    line_width=8,
    show_legend=True,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.01,
)

### 2. Performance breakdown

In [None]:
# compute_dVx / compute_dFx are expected to provide the 'dVx' and 'dFx'
# columns that the splits below read.
data_filter = compute_dFx(compute_dVx(data_filter))

# Split by sign: negative -> "easy", positive -> "hard".
# Rows whose value is exactly 0 end up in neither subset.
data_vote_easy = data_filter[data_filter['dVx'].lt(0)]
data_vote_hard = data_filter[data_filter['dVx'].gt(0)]
data_filter_easy = data_filter[data_filter['dFx'].lt(0)]
data_filter_hard = data_filter[data_filter['dFx'].gt(0)]

data_all = [
    data_vote_easy,
    data_vote_hard,
    data_filter_easy,
    data_filter_hard,
]

In [None]:
# Performance per subset. The calls run in the same order as before:
# vote-easy, vote-hard, filter-easy, filter-hard.
# dVx-based subsets are evaluated with use_filter=False ...
perf_vote_easy, perf_vote_hard = [
    compute_performance_by_gen_en(
        subset,
        gen_n_list=gen_n_list,
        samples=samples,
        use_filter=False,
    )
    for subset in (data_vote_easy, data_vote_hard)
]

# ... and dFx-based subsets with use_filter=True.
perf_filter_easy, perf_filter_hard = [
    compute_performance_by_gen_en(
        subset,
        gen_n_list=gen_n_list,
        samples=samples,
        use_filter=True,
    )
    for subset in (data_filter_easy, data_filter_hard)
]


In [None]:
# Single-curve plot for the dVx < 0 subset, drawn in the vote colour.
fig_vote_easy = plot_performance_vs_LLMCalls_onecurve(
    perf_vote_easy,
    color=color_vote,
    width=800,
    height=600,
    fontsize=80,
    marker_size=15,
    line_width=8,
    show_legend=False,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.05,
)

In [None]:
# Single-curve plot for the dVx > 0 subset, drawn in the vote colour.
fig_vote_hard = plot_performance_vs_LLMCalls_onecurve(
    perf_vote_hard,
    color=color_vote,
    width=800,
    height=600,
    fontsize=80,
    marker_size=15,
    line_width=8,
    show_legend=False,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.05,
)
    

In [None]:
# Single-curve plot for the dFx < 0 subset, drawn in the filter colour.
fig_filter_easy = plot_performance_vs_LLMCalls_onecurve(
    perf_filter_easy,
    color=color_filter,
    width=800,
    height=600,
    fontsize=80,
    marker_size=15,
    line_width=8,
    show_legend=False,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.1,
)

In [None]:
# Single-curve plot for the dFx > 0 subset, drawn in the filter colour.
fig_filter_hard = plot_performance_vs_LLMCalls_onecurve(
    perf_filter_hard,
    color=color_filter,
    width=800,
    height=600,
    fontsize=80,
    marker_size=15,
    line_width=8,
    show_legend=False,
    # Upper bound of the x range equals log10(1100).
    xaxis_range=[-0.05, numpy.log(1100) / numpy.log(10)],
    dtick=0.04,
)

### 3. Prediction

In [None]:
# `M` values shared by both fits.
M = [2, 5, 10, 20, 50, 100]

# One ScaleLaws instance per variant (unfiltered first, then filtered).
MyScaleNoFilter, MyScaleFilter = ScaleLaws(), ScaleLaws()

MyScaleNoFilter.fitscalelaws(
    data_nofilter,
    M=M,
    trial=100,
    use_filter=False,
)
MyScaleFilter.fitscalelaws(
    data_filter,
    M=M,
    trial=100,
    use_filter=True,
)

In [None]:
# Values of k at which the fitted prediction is compared with simulation.
LLM_calls = log_scale_integers(2, 1000, 100)
number_of_trials = 100

# Unfiltered variant: fitted prediction vs. the mean of `simulate_answer`
# over every row of `data_nofilter`, for each k.
acc_predict = MyScaleNoFilter.predict(LLM_calls)
acc_simulate = [
    numpy.mean([
        simulate_answer(
            data_nofilter.iloc[i],
            k=k,
            number_of_trials=number_of_trials,
        )
        for i in tqdm(range(len(data_nofilter)))
    ])
    for k in LLM_calls
]

# Filtered variant. The row range must come from `data_filter` itself:
# iterating over len(data_nofilter) while indexing `data_filter` would raise
# IndexError or silently drop rows if the two frames differ in length.
acc_predict_filter = MyScaleFilter.predict(LLM_calls)
acc_simulate_filter = [
    numpy.mean([
        simulate_answer(
            data_filter.iloc[i],
            k=k,
            number_of_trials=number_of_trials,
            use_filter=True,
        )
        for i in tqdm(range(len(data_filter)))
    ])
    for k in LLM_calls
]

In [None]:
# Simulated vs. predicted accuracy for both variants on an 800x700 figure.
fig = plot_scatter_plotly_combined(
    numpy.array(acc_simulate),
    numpy.array(acc_predict),
    numpy.array(acc_simulate_filter),
    numpy.array(acc_predict_filter),
    figure_size=(800, 700),
    linewidth=2,
    markersize=20,
    fontsize=35,
)

# Two-decimal tick labels on both axes; `nticks=5` is set on the y axis only.
fig.update_xaxes(tickformat=".2f")
fig.update_yaxes(tickformat=".2f", nticks=5)
fig.show()

In [None]:
# Same scatter on an 800x600 figure, without the extra axis formatting.
fig = plot_scatter_plotly_combined(
    numpy.array(acc_simulate),
    numpy.array(acc_predict),
    numpy.array(acc_simulate_filter),
    numpy.array(acc_predict_filter),
    figure_size=(800, 600),
    linewidth=2,
    markersize=20,
    fontsize=35,
)


### 4. Find examples

In [None]:
# "Both easy": dVx < 0 and dFx < 0, additionally restricted to rows whose
# true_answer is 'b'.
data_both_easy = data_filter[
    data_filter['dVx'].lt(0)
    & data_filter['dFx'].lt(0)
    & data_filter['true_answer'].eq('b')
]

# "Both hard": dVx > 0 and dFx > 0 (no restriction on true_answer).
data_both_hard = data_filter[
    data_filter['dVx'].gt(0)
    & data_filter['dFx'].gt(0)
]

In [None]:
# Position (within `data_both_easy`) of the example to inspect.
index = 10

# Look the row up once and reuse it instead of repeating `.iloc[index]`.
example_row = data_both_easy.iloc[index]

query = example_row['query']

# NOTE(review): `eval` executes whatever expression is stored in these cells.
# If the stored values are plain Python literals, `ast.literal_eval` would be
# a safer parser — confirm the stored format before switching.
easy_example_answer = eval(example_row['full_possible_answer_count_before_filter'])
easy_example_answer_filter = eval(example_row['full_possible_answer_count'])
true_answer = example_row['true_answer']

print(true_answer)
print(query)
print(example_row['full_possible_answer_count_before_filter'])  # raw, unparsed value
print(easy_example_answer_filter)

# Last expression: display the full row.
example_row

In [None]:
# Answer distribution of the selected example parsed from the
# 'full_possible_answer_count_before_filter' column ...
plot_answer_distribution_pie(
    easy_example_answer,
    true_answer,
    show_name=False,
)

# ... and the one parsed from the 'full_possible_answer_count' column.
plot_answer_distribution_pie(
    easy_example_answer_filter,
    true_answer,
    show_name=False,
)