# Analyze Metrics and Conduct Inference.ipynb

In [19]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats


In [20]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']

# Load direct_prompting_comparison.xlsx
direct_prompting_comparison = pd.read_excel('direct_prompting_comparison.xlsx')


## Perform McNemar's Test on GSM8k data

In [24]:
# Get in terms of number of questions answered correctly: sum accuracy_quality column by model, method, task
gsm8k_num_correct= gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).sum()['correct'].reset_index()
# Rename column to num_correct
gsm8k_num_correct = gsm8k_num_correct.rename(columns={'correct': 'num_correct'})

# Make another table with counts of questions missed (number of observations minus accuracy_quality sum) by model, method, task
gsm8k_num_obs = gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).count()['correct'].reset_index()
# Rename column to num_obs
gsm8k_num_obs = gsm8k_num_obs.rename(columns={'correct': 'num_obs'})
# Join together by model, method, task
gsm8k_correct_incorrect_obs = gsm8k_num_correct.merge(gsm8k_num_obs, on=['model', 'method', 'task'], how='left')
# Table should have model, method, task, num_correct, num_incorrect, num_obs
gsm8k_correct_incorrect_obs['num_incorrect'] = gsm8k_correct_incorrect_obs['num_obs'] - gsm8k_correct_incorrect_obs['num_correct']

gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


In [25]:
# Function to perform McNemar's test
# Accepts argument of name of model, method, task

def perform_mcnemar(model, method, task):
    
    # Get direct prompting data
    dp_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == 'direct_prompting') & (combined_data['task'] == task)][['conversation_number', 'correct']]
    # Rename correct to dp_correct
    dp_data = dp_data.rename(columns={'correct': 'dp_correct'})

    # Get method data
    method_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == method) & (combined_data['task'] == task)][['conversation_number', 'correct']]
    # Rename correct to method_correct
    method_data = method_data.rename(columns={'correct': 'method_correct'})

    # Join together by conversation_number
    joined_data = dp_data.merge(method_data, on='conversation_number', how='left')

    # Fill contingency table cells
    dp_correct_method_correct = joined_data[(joined_data['dp_correct'] == True) & (joined_data['method_correct'] == True)].shape[0]
    dp_correct_method_incorrect = joined_data[(joined_data['dp_correct'] == True) & (joined_data['method_correct'] == False)].shape[0]
    dp_incorrect_method_correct = joined_data[(joined_data['dp_correct'] == False) & (joined_data['method_correct'] == True)].shape[0]
    dp_incorrect_method_incorrect = joined_data[(joined_data['dp_correct'] == False) & (joined_data['method_correct'] == False)].shape[0]
    
    # Create a dataset
    # Rows for dp correct and incorrect
    # Columns for method correct and incorrect
    data = [[dp_correct_method_correct, dp_correct_method_incorrect], 
            [dp_incorrect_method_correct, dp_incorrect_method_incorrect]]
  
    print('McNemar\'s Test (Exact) for ' + model + ' ' + method + ' ' + task)
    print(data)

    # McNemar's Test, exact, without any continuity correction 
    print('No continuity correction')
    print(mcnemar(data, exact=True, correction=False)) 
    ncc_result = mcnemar(data, exact=True, correction=False)
    ncc_statistic = ncc_result.statistic
    ncc_p_value = ncc_result.pvalue

    # McNemar's Test with the continuity correction 
    print('With continuity correction')
    print(mcnemar(data, exact=True, correction=True)) 
    cc_result = mcnemar(data, exact=True, correction=True)
    cc_statistic = cc_result.statistic
    cc_p_value = cc_result.pvalue

    # Return data
    return ncc_statistic, ncc_p_value, cc_statistic, cc_p_value


In [26]:
# Add McNemar's Test results to table
# Iterate over all combinations of model, method, task
# Except for method = "direct_prompting" and task = "cw"
# For each combination, perform McNemar's Test and add to table

# Create combos of model, method, task - all unique combinations of these three in combined_data
# Get unique values of model, method, task
models = combined_data['model'].unique()
methods = combined_data['method'].unique()
tasks = combined_data['task'].unique()
# Create list of all combinations of model, method, task
combinations = [(model, method, task) for model in models for method in methods for task in tasks]

# Add as rows to a dataframe
# Create empty dataframe
mcnemars_results = pd.DataFrame(columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])

# Iterate over combinations
for model, method, task in combinations:
    # Skip if method = "direct_prompting" or task = "cw"
    if method == 'direct_prompting' or task == 'cw':
        continue
    # Perform McNemar's Test
    statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction = perform_mcnemar(model, method, task)
    # Add to table
    mcnemars_results = pd.concat([mcnemars_results, pd.DataFrame([[model, method, task, statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction]], columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])])

mcnemars_results


McNemar's Test (Exact) for td3 zero_shot_cot gsm8k
[[13, 10], [49, 28]]
No continuity correction
pvalue      2.706279703008907e-07
statistic   10.0
With continuity correction
pvalue      2.706279703008907e-07
statistic   10.0
McNemar's Test (Exact) for td3 ape_zero_shot_cot gsm8k
[[11, 12], [38, 39]]
No continuity correction
pvalue      0.00030586400160359517
statistic   12.0
With continuity correction
pvalue      0.00030586400160359517
statistic   12.0
McNemar's Test (Exact) for td3 least_to_most gsm8k
[[15, 7], [43, 34]]
No continuity correction
pvalue      2.098677427397888e-07
statistic   7.0
With continuity correction
pvalue      2.098677427397888e-07
statistic   7.0
McNemar's Test (Exact) for td3 manual_few_shot gsm8k
[[11, 12], [7, 70]]
No continuity correction
pvalue      0.359283447265625
statistic   7.0
With continuity correction
pvalue      0.359283447265625
statistic   7.0
McNemar's Test (Exact) for td3 manual_cot gsm8k
[[16, 7], [44, 33]]
No continuity correction
pvalue   

  dp_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == 'direct_prompting') & (combined_data['task'] == task)][['conversation_number', 'correct']]
  method_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == method) & (combined_data['task'] == task)][['conversation_number', 'correct']]
  dp_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == 'direct_prompting') & (combined_data['task'] == task)][['conversation_number', 'correct']]
  method_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == method) & (combined_data['task'] == task)][['conversation_number', 'correct']]
  dp_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == 'direct_prompting') & (combined_data['task'] == task)][['conversation_number', 'correct']]
  method_data = gsm8k_data[(combined_data['model'] == model) & (combined_data['method'] == method) & (combined_data['task'] ==

Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,10.0,2.70628e-07,10.0,2.70628e-07
0,td3,ape_zero_shot_cot,gsm8k,12.0,0.000305864,12.0,0.000305864
0,td3,least_to_most,gsm8k,7.0,2.098677e-07,7.0,2.098677e-07
0,td3,manual_few_shot,gsm8k,7.0,0.3592834,7.0,0.3592834
0,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07
0,td3,tree_of_thought,gsm8k,13.0,1.0,13.0,1.0
0,td3,self_refine,gsm8k,4.0,0.5488281,4.0,0.5488281
0,gpt4,zero_shot_cot,gsm8k,1.0,2.980232e-06,1.0,2.980232e-06
0,gpt4,ape_zero_shot_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,least_to_most,gsm8k,0.0,4.768372e-07,0.0,4.768372e-07


In [27]:
# Rows that are significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] < 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,10.0,2.70628e-07,10.0,2.70628e-07
0,td3,ape_zero_shot_cot,gsm8k,12.0,0.000305864,12.0,0.000305864
0,td3,least_to_most,gsm8k,7.0,2.098677e-07,7.0,2.098677e-07
0,td3,manual_cot,gsm8k,7.0,1.211526e-07,7.0,1.211526e-07
0,gpt4,zero_shot_cot,gsm8k,1.0,2.980232e-06,1.0,2.980232e-06
0,gpt4,ape_zero_shot_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,least_to_most,gsm8k,0.0,4.768372e-07,0.0,4.768372e-07
0,gpt4,manual_few_shot,gsm8k,9.0,0.0002715392,9.0,0.0002715392
0,gpt4,manual_cot,gsm8k,1.0,1.096725e-05,1.0,1.096725e-05
0,gpt4,tree_of_thought,gsm8k,8.0,1.964654e-06,8.0,1.964654e-06


In [28]:
# Rows that are not significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] >= 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,manual_few_shot,gsm8k,7.0,0.359283,7.0,0.359283
0,td3,tree_of_thought,gsm8k,13.0,1.0,13.0,1.0
0,td3,self_refine,gsm8k,4.0,0.548828,4.0,0.548828


In [29]:
# Printing gsm8k_correct_incorrect_obs again for reference
gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


In [None]:
# Save mcnemars_results to Excel
mcnemars_results.to_excel('mcnemars_results.xlsx', index=False)


## We can also run McNemar's test on the compliance variable

## Paired t-test for quality, means of other metrics

In [30]:
# Perform paired t-test on creative writing data and all other metrics

# Metrics to t-test: 'coherence_1_incoherent_10_very_coherent', 'coherence_1_incoherent_10_very_coherent_compliance_adjusted, 'ease_of_evaluation_score', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts'
metrics_to_t_test = ['coherence_1_incoherent_10_very_coherent', 'coherence_1_incoherent_10_very_coherent_compliance_adjusted', 'ease_of_review_1_easy_10_hard', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts']

# Create table to hold results
# Columns of model, task, method, mean metric, mean dp_metric, statistic, pvalue
t_test_results = pd.DataFrame(columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue'])
# Iterate over model, task, method
for model, task, method in direct_prompting_comparison[['model', 'task', 'method']].drop_duplicates().values:
    # Iterate over metrics_to_t_test
    for metric in metrics_to_t_test:
        # Get data
        # Sort by conversation_number
        data = direct_prompting_comparison[(direct_prompting_comparison['model'] == model) & (direct_prompting_comparison['task'] == task) & (direct_prompting_comparison['method'] == method)].sort_values(by=['conversation_number'])
        # direct_prompting holds the metric when using direct_prompting
        direct_prompting = data['dp_' + metric]
        # using_method holds the metric when using the method
        using_method = data[metric]
        # Perform the paired sample t-test 
        statistic, pvalue = stats.ttest_rel(direct_prompting, using_method)
        # Add to table
        t_test_results = pd.concat([t_test_results, pd.DataFrame([[model, task, method, metric, direct_prompting.mean(), using_method.mean(), statistic, pvalue]], columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue'])])

# Significance if pvalue < 0.05
t_test_results['Significant at 95%'] = t_test_results['pvalue'].apply(lambda x: 'Yes' if x < 0.05 else 'No')

# Sort by metric, model, task, method
t_test_results = t_test_results.sort_values(by=['metric', 'model', 'task', 'method']).reset_index().drop(columns=['index'])

t_test_results


  res = hypotest_fun_out(*samples, **kwds)


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,ape_zero_shot_cot,coherence_1_incoherent_10_very_coherent,5.71,5.46,0.810999,0.419311,No
1,gpt4,cw,least_to_most,coherence_1_incoherent_10_very_coherent,5.71,5.54,0.607946,0.544615,No
2,gpt4,cw,manual_cot,coherence_1_incoherent_10_very_coherent,5.71,6.33,-2.094400,0.038778,Yes
3,gpt4,cw,manual_few_shot,coherence_1_incoherent_10_very_coherent,5.71,6.11,-1.271348,0.206584,No
4,gpt4,cw,self_refine,coherence_1_incoherent_10_very_coherent,5.71,6.04,-1.062998,0.290369,No
...,...,...,...,...,...,...,...,...,...
555,td3,gsm8k,manual_cot,sentence_length_prompts,,,,,No
556,td3,gsm8k,manual_few_shot,sentence_length_prompts,,,,,No
557,td3,gsm8k,self_refine,sentence_length_prompts,,,,,No
558,td3,gsm8k,tree_of_thought,sentence_length_prompts,,,,,No


In [31]:
# Sort by metric, model, task, Significant at 95%
t_test_results = t_test_results.sort_values(by=['metric', 'model', 'task', 'Significant at 95%']).reset_index().drop(columns=['index'])

t_test_results


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,ape_zero_shot_cot,coherence_1_incoherent_10_very_coherent,5.71,5.46,0.810999,0.419311,No
1,gpt4,cw,least_to_most,coherence_1_incoherent_10_very_coherent,5.71,5.54,0.607946,0.544615,No
2,gpt4,cw,manual_few_shot,coherence_1_incoherent_10_very_coherent,5.71,6.11,-1.271348,0.206584,No
3,gpt4,cw,self_refine,coherence_1_incoherent_10_very_coherent,5.71,6.04,-1.062998,0.290369,No
4,gpt4,cw,manual_cot,coherence_1_incoherent_10_very_coherent,5.71,6.33,-2.094400,0.038778,Yes
...,...,...,...,...,...,...,...,...,...
555,td3,gsm8k,manual_cot,sentence_length_prompts,,,,,No
556,td3,gsm8k,manual_few_shot,sentence_length_prompts,,,,,No
557,td3,gsm8k,self_refine,sentence_length_prompts,,,,,No
558,td3,gsm8k,tree_of_thought,sentence_length_prompts,,,,,No


In [32]:
# Rows that are not significant at p < 0.05
t_test_results[t_test_results['pvalue'] >= 0.05]


Unnamed: 0,model,task,method,metric,dp_mean,using_method_mean,statistic,pvalue,Significant at 95%
0,gpt4,cw,ape_zero_shot_cot,coherence_1_incoherent_10_very_coherent,5.71,5.46,0.810999,0.419311,No
1,gpt4,cw,least_to_most,coherence_1_incoherent_10_very_coherent,5.71,5.54,0.607946,0.544615,No
2,gpt4,cw,manual_few_shot,coherence_1_incoherent_10_very_coherent,5.71,6.11,-1.271348,0.206584,No
3,gpt4,cw,self_refine,coherence_1_incoherent_10_very_coherent,5.71,6.04,-1.062998,0.290369,No
14,td3,cw,least_to_most,coherence_1_incoherent_10_very_coherent,4.46,4.45,0.030069,0.976073,No
15,td3,cw,manual_cot,coherence_1_incoherent_10_very_coherent,4.46,4.14,0.928556,0.355378,No
16,td3,cw,manual_few_shot,coherence_1_incoherent_10_very_coherent,4.46,4.97,-1.630152,0.106247,No
17,td3,cw,self_refine,coherence_1_incoherent_10_very_coherent,4.46,4.2,0.877447,0.382368,No
18,td3,cw,zero_shot_cot,coherence_1_incoherent_10_very_coherent,4.46,3.86,1.884092,0.062487,No
112,gpt4,cw,self_refine,fres,63.7767,62.369,1.616096,0.109256,No


In [33]:
# Save t_test_results to Excel
t_test_results.to_excel('t_test_results.xlsx', index=False)
