# Analyze Metrics and Conduct Inference.ipynb

In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
#import matplotlib.pyplot as plt
#import seaborn as sns
import scipy.stats as stats


In [2]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']

# Load direct_prompting_comparison.xlsx
direct_prompting_comparison = pd.read_excel('direct_prompting_comparison.xlsx')


## Perform McNemar's Test on GSM8k data

In [3]:
# Get in terms of number of questions answered correctly: sum accuracy_quality column by model, method, task
gsm8k_num_correct= gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).sum()['correct'].reset_index()
# Rename column to num_correct
gsm8k_num_correct = gsm8k_num_correct.rename(columns={'correct': 'num_correct'})

# Make another table with counts of questions missed (number of observations minus accuracy_quality sum) by model, method, task
gsm8k_num_obs = gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).count()['correct'].reset_index()
# Rename column to num_obs
gsm8k_num_obs = gsm8k_num_obs.rename(columns={'correct': 'num_obs'})
# Join together by model, method, task
gsm8k_correct_incorrect_obs = gsm8k_num_correct.merge(gsm8k_num_obs, on=['model', 'method', 'task'], how='left')
# Table should have model, method, task, num_correct, num_incorrect, num_obs
gsm8k_correct_incorrect_obs['num_incorrect'] = gsm8k_correct_incorrect_obs['num_obs'] - gsm8k_correct_incorrect_obs['num_correct']

gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


In [4]:
# Function to perform McNemar's test
# Accepts argument of name of model, method, task

def perform_mcnemar(model, method, task, variable, dataframe):
    
    # Get direct prompting data
    dp_data = dataframe[(dataframe['model'] == model) & (dataframe['method'] == 'direct_prompting') & (dataframe['task'] == task)][['conversation_number', variable]]
    # Rename correct to dp_correct
    dp_data = dp_data.rename(columns={variable: 'dp_' + variable})

    # Get method data
    method_data = dataframe[(dataframe['model'] == model) & (dataframe['method'] == method) & (dataframe['task'] == task)][['conversation_number', variable]]
    # Rename correct to method_correct
    method_data = method_data.rename(columns={variable: 'method_' + variable})

    # Join together by conversation_number
    joined_data = dp_data.merge(method_data, on='conversation_number', how='left')

    # Fill contingency table cells
    dp_var_value_1_method_var_value_1 = joined_data[(joined_data['dp_' + variable] == True) & (joined_data['method_' + variable] == True)].shape[0]
    dp_var_value_1_method_var_value_2 = joined_data[(joined_data['dp_' + variable] == True) & (joined_data['method_' + variable] == False)].shape[0]
    dp_var_value_2_method_var_value_1 = joined_data[(joined_data['dp_' + variable] == False) & (joined_data['method_' + variable] == True)].shape[0]
    dp_var_value_2_method_var_value_2 = joined_data[(joined_data['dp_' + variable] == False) & (joined_data['method_' + variable] == False)].shape[0]
    
    # Create a dataset
    # Rows for dp correct and incorrect
    # Columns for method correct and incorrect
    data = [[dp_var_value_1_method_var_value_1, dp_var_value_1_method_var_value_2], 
            [dp_var_value_2_method_var_value_1, dp_var_value_2_method_var_value_2]]
  
    print('McNemar\'s Test (Exact) for ' + model + ' ' + method + ' ' + task)
    print(data)

    # McNemar's Test, exact, without any continuity correction 
    print('No continuity correction')
    print(mcnemar(data, exact=True, correction=False)) 
    ncc_result = mcnemar(data, exact=True, correction=False)
    ncc_statistic = ncc_result.statistic
    ncc_p_value = ncc_result.pvalue

    # McNemar's Test with the continuity correction 
    print('With continuity correction')
    print(mcnemar(data, exact=True, correction=True)) 
    cc_result = mcnemar(data, exact=True, correction=True)
    cc_statistic = cc_result.statistic
    cc_p_value = cc_result.pvalue

    # Return data
    return ncc_statistic, ncc_p_value, cc_statistic, cc_p_value


In [5]:
# Add McNemar's Test results to table
# Iterate over all combinations of model, method, task
# Except for method = "direct_prompting" and task = "cw"
# For each combination, perform McNemar's Test and add to table

# Create combos of model, method, task - all unique combinations of these three in combined_data
# Get unique values of model, method, task
models = combined_data['model'].unique()
methods = combined_data['method'].unique()
tasks = combined_data['task'].unique()
# Create list of all combinations of model, method, task
combinations = [(model, method, task) for model in models for method in methods for task in tasks]

# Add as rows to a dataframe
# Create empty dataframe
mcnemars_results = pd.DataFrame(columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])

# Iterate over combinations
for model, method, task in combinations:
    # Skip if method = "direct_prompting" or task = "cw"
    if method == 'direct_prompting' or task == 'cw':
        continue
    # Perform McNemar's Test
    statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction = perform_mcnemar(model, method, task, variable = 'correct', dataframe = combined_data)
    # Add to table
    mcnemars_results = pd.concat([mcnemars_results, pd.DataFrame([[model, method, task, statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction]], columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])])

mcnemars_results


McNemar's Test (Exact) for td3 zero_shot_cot gsm8k
[[13, 10], [49, 28]]
No continuity correction
pvalue      2.706279703008907e-07
statistic   10.0
With continuity correction
pvalue      2.706279703008907e-07
statistic   10.0
McNemar's Test (Exact) for td3 ape_zero_shot_cot gsm8k
[[11, 12], [38, 39]]
No continuity correction
pvalue      0.00030586400160359517
statistic   12.0
With continuity correction
pvalue      0.00030586400160359517
statistic   12.0
McNemar's Test (Exact) for td3 least_to_most gsm8k
[[15, 8], [43, 34]]
No continuity correction
pvalue      6.867117736675254e-07
statistic   8.0
With continuity correction
pvalue      6.867117736675254e-07
statistic   8.0
McNemar's Test (Exact) for td3 manual_few_shot gsm8k
[[11, 12], [7, 70]]
No continuity correction
pvalue      0.359283447265625
statistic   7.0
With continuity correction
pvalue      0.359283447265625
statistic   7.0
McNemar's Test (Exact) for td3 manual_cot gsm8k
[[16, 7], [44, 33]]
No continuity correction
pvalue   

Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,10.0,2.70628e-07,10.0,2.70628e-07
0,td3,ape_zero_shot_cot,gsm8k,12.0,0.000305864,12.0,0.000305864
0,td3,least_to_most,gsm8k,8.0,6.867118e-07,8.0,6.867118e-07
0,td3,manual_few_shot,gsm8k,7.0,0.3592834,7.0,0.3592834
0,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07
0,td3,tree_of_thought,gsm8k,13.0,1.0,13.0,1.0
0,td3,self_refine,gsm8k,4.0,0.5488281,4.0,0.5488281
0,gpt4,zero_shot_cot,gsm8k,1.0,2.980232e-06,1.0,2.980232e-06
0,gpt4,ape_zero_shot_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,least_to_most,gsm8k,0.0,4.768372e-07,0.0,4.768372e-07


In [6]:
# Rows that are significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] < 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,10.0,2.70628e-07,10.0,2.70628e-07
0,td3,ape_zero_shot_cot,gsm8k,12.0,0.000305864,12.0,0.000305864
0,td3,least_to_most,gsm8k,8.0,6.867118e-07,8.0,6.867118e-07
0,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07
0,gpt4,zero_shot_cot,gsm8k,1.0,2.980232e-06,1.0,2.980232e-06
0,gpt4,ape_zero_shot_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,least_to_most,gsm8k,0.0,4.768372e-07,0.0,4.768372e-07
0,gpt4,manual_few_shot,gsm8k,9.0,0.0002715392,9.0,0.0002715392
0,gpt4,manual_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,tree_of_thought,gsm8k,8.0,1.964654e-06,8.0,1.964654e-06


In [7]:
# Rows that are not significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] >= 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,manual_few_shot,gsm8k,7.0,0.359283,7.0,0.359283
0,td3,tree_of_thought,gsm8k,13.0,1.0,13.0,1.0
0,td3,self_refine,gsm8k,4.0,0.548828,4.0,0.548828


In [8]:
# Printing gsm8k_correct_incorrect_obs again for reference
gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


In [9]:
# Save mcnemars_results to Excel
mcnemars_results.to_excel('mcnemars_results.xlsx', index=False)


## We can also run McNemar's test on the compliance variable

In [10]:
# Add as rows to a dataframe
# Create empty dataframe
mcnemars_results_compliance = pd.DataFrame(columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])

# Iterate over combinations
for model, method, task in combinations:
    # Skip if method = "direct_prompting" or task = "gsm8k"
    if method == 'direct_prompting' or task == 'gsm8k':
        continue
    # Perform McNemar's Test
    statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction = perform_mcnemar(model, method, task, variable = 'compliance', dataframe=combined_data)
    # Add to table
    mcnemars_results_compliance = pd.concat([mcnemars_results_compliance, pd.DataFrame([[model, method, task, statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction]], columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])])

mcnemars_results_compliance


McNemar's Test (Exact) for td3 zero_shot_cot cw
[[27, 23], [16, 34]]
No continuity correction
pvalue      0.3367836351899315
statistic   16.0
With continuity correction
pvalue      0.3367836351899315
statistic   16.0
McNemar's Test (Exact) for td3 ape_zero_shot_cot cw
[[22, 28], [22, 28]]
No continuity correction
pvalue      0.47988766169832786
statistic   22.0
With continuity correction
pvalue      0.47988766169832786
statistic   22.0
McNemar's Test (Exact) for td3 least_to_most cw
[[16, 34], [9, 41]]
No continuity correction
pvalue      0.00017015517460094998
statistic   9.0
With continuity correction
pvalue      0.00017015517460094998
statistic   9.0
McNemar's Test (Exact) for td3 manual_few_shot cw
[[27, 23], [16, 34]]
No continuity correction
pvalue      0.3367836351899315
statistic   16.0
With continuity correction
pvalue      0.3367836351899315
statistic   16.0
McNemar's Test (Exact) for td3 manual_cot cw
[[10, 40], [9, 41]]
No continuity correction
pvalue      9.263546409954415

Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,cw,16.0,0.3367836,16.0,0.3367836
0,td3,ape_zero_shot_cot,cw,22.0,0.4798877,22.0,0.4798877
0,td3,least_to_most,cw,9.0,0.0001701552,9.0,0.0001701552
0,td3,manual_few_shot,cw,16.0,0.3367836,16.0,0.3367836
0,td3,manual_cot,cw,9.0,9.263546e-06,9.0,9.263546e-06
0,td3,tree_of_thought,cw,2.0,2.266631e-12,2.0,2.266631e-12
0,td3,self_refine,cw,13.0,0.009559879,13.0,0.009559879
0,gpt4,zero_shot_cot,cw,17.0,1.0,17.0,1.0
0,gpt4,ape_zero_shot_cot,cw,22.0,1.0,22.0,1.0
0,gpt4,least_to_most,cw,19.0,0.643969,19.0,0.643969


In [11]:
# Rows that are significant at p < 0.05
mcnemars_results_compliance[mcnemars_results_compliance['pvalue'] < 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,least_to_most,cw,9.0,0.0001701552,9.0,0.0001701552
0,td3,manual_cot,cw,9.0,9.263546e-06,9.0,9.263546e-06
0,td3,tree_of_thought,cw,2.0,2.266631e-12,2.0,2.266631e-12
0,td3,self_refine,cw,13.0,0.009559879,13.0,0.009559879
0,gpt4,tree_of_thought,cw,9.0,1.522183e-05,9.0,1.522183e-05


In [12]:
# Rows that are not significant at p < 0.05
mcnemars_results_compliance[mcnemars_results_compliance['pvalue'] >= 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,cw,16.0,0.336784,16.0,0.336784
0,td3,ape_zero_shot_cot,cw,22.0,0.479888,22.0,0.479888
0,td3,manual_few_shot,cw,16.0,0.336784,16.0,0.336784
0,gpt4,zero_shot_cot,cw,17.0,1.0,17.0,1.0
0,gpt4,ape_zero_shot_cot,cw,22.0,1.0,22.0,1.0
0,gpt4,least_to_most,cw,19.0,0.643969,19.0,0.643969
0,gpt4,manual_few_shot,cw,14.0,0.310505,14.0,0.310505
0,gpt4,manual_cot,cw,18.0,0.532709,18.0,0.532709
0,gpt4,self_refine,cw,15.0,0.255875,15.0,0.255875


In [13]:
# Save mcnemars_results_compliance to Excel
mcnemars_results_compliance.to_excel('mcnemars_results_compliance.xlsx', index=False)


## Paired t-test for quality, means of other metrics

In [14]:
print(list(direct_prompting_comparison.columns))


['model_task_method', 'conversation_number', 'coherence_1_incoherent_10_very_coherent', 'compliance_OLD', 'ease_of_review_1_easy_10_hard', 'correct', 'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10', 'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP', 'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP', 'response_Based_On_Last_10_LP', 'response_LP', 'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP', 'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP', 'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP', 'Prediction_Based_On_random_50_LP_1', 'response_Based_On_random_50_LP_1', 'Prediction_Based_On_random_50_LP_2', 'response_Based_On_random_50_LP_2', 'Aggregated_Prediction_random_50_LP', 'Unnamed: 0_x', 'response_x', 'replace_slash_n_slash_n_with_newline_x', 'replace_slash_n_slash_n_with_newline_values_x', 'replace_slash_n_with_newline_x', 'replace_slash_n_with_newline_values_x', 'avg_cosine_sim', 'num_sentences_x', '

In [15]:
# Perform paired t-test on creative writing data and all other metrics

# Metrics to t-test: 'coherence_1_incoherent_10_very_coherent', 'coherence_1_incoherent_10_very_coherent_compliance_adjusted, 'ease_of_evaluation_score', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts'
metrics_to_t_test = ['coherence_1_incoherent_10_very_coherent', 'coherence_1_incoherent_10_very_coherent_compliance_adjusted', 'ease_of_review_1_easy_10_hard', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts']
# Additional metrics with cosine similarity added
additional_metrics_to_t_test = ['avg_cosine_sim', 'avg_cosine_sim_compliance_adjusted', 
'avg_inter_paragraph_cosine_sim',
'avg_inter_paragraph_cosine_sim_compliance_adjusted']
metrics_to_t_test = metrics_to_t_test + additional_metrics_to_t_test

# Create table to hold results
# Columns of model, task, method, mean metric, mean dp_metric, statistic, pvalue
t_test_results = pd.DataFrame(columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue'])
# Iterate over model, task, method
for model, task, method in direct_prompting_comparison[['model', 'task', 'method']].drop_duplicates().values:
    # Iterate over metrics_to_t_test
    for metric in metrics_to_t_test:
        # Get data
        # Sort by conversation_number
        data = direct_prompting_comparison[(direct_prompting_comparison['model'] == model) & (direct_prompting_comparison['task'] == task) & (direct_prompting_comparison['method'] == method)].sort_values(by=['conversation_number'])
        # direct_prompting holds the metric when using direct_prompting
        direct_prompting = data['dp_' + metric]
        # using_method holds the metric when using the method
        using_method = data[metric]
        # Perform the paired sample t-test 
        statistic, pvalue = stats.ttest_rel(direct_prompting, using_method, nan_policy='omit')
        # Add to table
        t_test_results = pd.concat([t_test_results, pd.DataFrame([[model, task, method, metric, direct_prompting.mean(), using_method.mean(), statistic, pvalue]], columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue'])])

# Significance if pvalue < 0.05
t_test_results['Significant at 95%'] = t_test_results['pvalue'].apply(lambda x: 'Yes' if x < 0.05 else 'No')

# Sort by metric, model, task, method
t_test_results = t_test_results.sort_values(by=['metric', 'model', 'task', 'method']).reset_index().drop(columns=['index'])

t_test_results


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,ape_zero_shot_cot,avg_cosine_sim,0.332656,0.349642,-2.443167,0.016330,Yes
1,gpt4,cw,least_to_most,avg_cosine_sim,0.332656,0.341144,-1.181267,0.240326,No
2,gpt4,cw,manual_cot,avg_cosine_sim,0.332656,0.350436,-2.580169,0.011343,Yes
3,gpt4,cw,manual_few_shot,avg_cosine_sim,0.332656,0.346035,-1.885171,0.062339,No
4,gpt4,cw,self_refine,avg_cosine_sim,0.332656,0.356204,-3.476822,0.000755,Yes
...,...,...,...,...,...,...,...,...,...
695,td3,gsm8k,manual_cot,sentence_length_prompts,,,,,No
696,td3,gsm8k,manual_few_shot,sentence_length_prompts,,,,,No
697,td3,gsm8k,self_refine,sentence_length_prompts,,,,,No
698,td3,gsm8k,tree_of_thought,sentence_length_prompts,,,,,No


In [16]:
# Sort by metric, model, task, Significant at 95%
t_test_results = t_test_results.sort_values(by=['metric', 'model', 'task', 'Significant at 95%']).reset_index().drop(columns=['index'])

t_test_results


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,least_to_most,avg_cosine_sim,0.332656,0.341144,-1.181267,0.240326,No
1,gpt4,cw,manual_few_shot,avg_cosine_sim,0.332656,0.346035,-1.885171,0.062339,No
2,gpt4,cw,ape_zero_shot_cot,avg_cosine_sim,0.332656,0.349642,-2.443167,0.016330,Yes
3,gpt4,cw,manual_cot,avg_cosine_sim,0.332656,0.350436,-2.580169,0.011343,Yes
4,gpt4,cw,self_refine,avg_cosine_sim,0.332656,0.356204,-3.476822,0.000755,Yes
...,...,...,...,...,...,...,...,...,...
695,td3,gsm8k,manual_cot,sentence_length_prompts,,,,,No
696,td3,gsm8k,manual_few_shot,sentence_length_prompts,,,,,No
697,td3,gsm8k,self_refine,sentence_length_prompts,,,,,No
698,td3,gsm8k,tree_of_thought,sentence_length_prompts,,,,,No


In [17]:
# Rows that are not significant at p < 0.05
t_test_results[t_test_results['pvalue'] >= 0.05]


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,least_to_most,avg_cosine_sim,0.332656,0.341144,-1.181267,0.240326,No
1,gpt4,cw,manual_few_shot,avg_cosine_sim,0.332656,0.346035,-1.885171,0.062339,No
14,td3,cw,least_to_most,avg_cosine_sim,0.362868,0.356757,0.828458,0.409404,No
15,td3,cw,manual_cot,avg_cosine_sim,0.362868,0.345318,1.954777,0.053430,No
16,td3,cw,manual_few_shot,avg_cosine_sim,0.362868,0.364338,-0.186210,0.852661,No
...,...,...,...,...,...,...,...,...,...
637,td3,gsm8k,self_refine,output_length,26.580000,25.510000,0.373048,0.709910,No
644,gpt4,cw,ape_zero_shot_cot,sentence_length,19.105867,19.755939,-1.162755,0.247724,No
645,gpt4,cw,zero_shot_cot,sentence_length,19.105867,18.644043,1.000982,0.319276,No
658,td3,cw,ape_zero_shot_cot,sentence_length,16.413378,16.083255,0.828445,0.409411,No


In [18]:
# Save t_test_results to Excel
t_test_results.to_excel('t_test_results.xlsx', index=False)


## Create combined significance table

In [19]:
# accuracy mcnemars
# Create column Significant at 95% for pvalue < 0.05
mcnemars_results['Significant at 95%'] = mcnemars_results['pvalue_with_correction'].apply(lambda x: 'Yes' if x < 0.05 else 'No')
# Add column metric
mcnemars_results['metric'] = 'correct'

mcnemars_results


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric
0,td3,zero_shot_cot,gsm8k,10.0,2.70628e-07,10.0,2.70628e-07,Yes,correct
0,td3,ape_zero_shot_cot,gsm8k,12.0,0.000305864,12.0,0.000305864,Yes,correct
0,td3,least_to_most,gsm8k,8.0,6.867118e-07,8.0,6.867118e-07,Yes,correct
0,td3,manual_few_shot,gsm8k,7.0,0.3592834,7.0,0.3592834,No,correct
0,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07,Yes,correct
0,td3,tree_of_thought,gsm8k,13.0,1.0,13.0,1.0,No,correct
0,td3,self_refine,gsm8k,4.0,0.5488281,4.0,0.5488281,No,correct
0,gpt4,zero_shot_cot,gsm8k,1.0,2.980232e-06,1.0,2.980232e-06,Yes,correct
0,gpt4,ape_zero_shot_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05,Yes,correct
0,gpt4,least_to_most,gsm8k,0.0,4.768372e-07,0.0,4.768372e-07,Yes,correct


In [20]:
# Compliance mcnemars results
# Create column Significant at 95% for pvalue < 0.05
mcnemars_results_compliance['Significant at 95%'] = mcnemars_results_compliance['pvalue_with_correction'].apply(lambda x: 'Yes' if x < 0.05 else 'No')
# Add column metric
mcnemars_results_compliance['metric'] = 'compliance'

mcnemars_results_compliance


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric
0,td3,zero_shot_cot,cw,16.0,0.3367836,16.0,0.3367836,No,compliance
0,td3,ape_zero_shot_cot,cw,22.0,0.4798877,22.0,0.4798877,No,compliance
0,td3,least_to_most,cw,9.0,0.0001701552,9.0,0.0001701552,Yes,compliance
0,td3,manual_few_shot,cw,16.0,0.3367836,16.0,0.3367836,No,compliance
0,td3,manual_cot,cw,9.0,9.263546e-06,9.0,9.263546e-06,Yes,compliance
0,td3,tree_of_thought,cw,2.0,2.266631e-12,2.0,2.266631e-12,Yes,compliance
0,td3,self_refine,cw,13.0,0.009559879,13.0,0.009559879,Yes,compliance
0,gpt4,zero_shot_cot,cw,17.0,1.0,17.0,1.0,No,compliance
0,gpt4,ape_zero_shot_cot,cw,22.0,1.0,22.0,1.0,No,compliance
0,gpt4,least_to_most,cw,19.0,0.643969,19.0,0.643969,No,compliance


In [21]:
# All inference
all_inference = pd.concat([mcnemars_results, mcnemars_results_compliance, t_test_results], ignore_index=True)

all_inference


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric,dp_mean,using_method_mean
0,td3,zero_shot_cot,gsm8k,10.0,2.706280e-07,10.0,2.706280e-07,Yes,correct,,
1,td3,ape_zero_shot_cot,gsm8k,12.0,3.058640e-04,12.0,3.058640e-04,Yes,correct,,
2,td3,least_to_most,gsm8k,8.0,6.867118e-07,8.0,6.867118e-07,Yes,correct,,
3,td3,manual_few_shot,gsm8k,7.0,3.592834e-01,7.0,3.592834e-01,No,correct,,
4,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07,Yes,correct,,
...,...,...,...,...,...,...,...,...,...,...,...
723,td3,manual_cot,gsm8k,,,,,No,sentence_length_prompts,,
724,td3,manual_few_shot,gsm8k,,,,,No,sentence_length_prompts,,
725,td3,self_refine,gsm8k,,,,,No,sentence_length_prompts,,
726,td3,tree_of_thought,gsm8k,,,,,No,sentence_length_prompts,,


In [22]:
# Check for null values of Significant at 95%
all_inference[all_inference['Significant at 95%'].isnull()]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric,dp_mean,using_method_mean


In [23]:
# Check for cases where pvalue and pvalue_with_correction are both null, but dp_mean and using_method_mean are both not null
all_inference[(all_inference['pvalue'].isnull()) & (all_inference['pvalue_with_correction'].isnull()) & (all_inference['dp_mean'].notnull()) & (all_inference['using_method_mean'].notnull())]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric,dp_mean,using_method_mean
420,gpt4,manual_few_shot,cw,,,,,No,num_1_dot_etc,0.00,0.00
448,gpt4,ape_zero_shot_cot,cw,,,,,No,num_1_dot_etc_prompts,2.00,2.00
449,gpt4,tree_of_thought,cw,,,,,No,num_1_dot_etc_prompts,2.00,2.00
450,gpt4,zero_shot_cot,cw,,,,,No,num_1_dot_etc_prompts,2.00,2.00
455,gpt4,ape_zero_shot_cot,gsm8k,,,,,No,num_1_dot_etc_prompts,0.38,0.38
...,...,...,...,...,...,...,...,...,...,...,...
639,td3,manual_cot,gsm8k,,,,,No,num_step_i_prompts,0.00,0.00
640,td3,manual_few_shot,gsm8k,,,,,No,num_step_i_prompts,0.00,0.00
641,td3,self_refine,gsm8k,,,,,No,num_step_i_prompts,0.00,0.00
642,td3,tree_of_thought,gsm8k,,,,,No,num_step_i_prompts,0.00,0.00


In [24]:
# Check the above but also where dp_mean and using_method_mean are not equal
all_inference[(all_inference['pvalue'].isnull()) & (all_inference['pvalue_with_correction'].isnull()) & (all_inference['dp_mean'].notnull()) & (all_inference['using_method_mean'].notnull()) & (all_inference['dp_mean'] != all_inference['using_method_mean'])]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,metric,dp_mean,using_method_mean


In [25]:
# Output to Excel
all_inference.to_excel('all_inference.xlsx', index=False)
