# Analyze Metrics and Conduct Inference.ipynb

In [70]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats


In [71]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']


## Accuracy/Quality

In [72]:
# Produce table - one column for each method, one row for each model by task type
# Average values of correct_or_incorrect and creative_writing_score
# Also get variance

# Column for combined accuracy_quality - correct_or_incorrect if task = "gsm8k", creative_writing_score if task = "cw"
combined_data['accuracy_quality'] = combined_data.apply(lambda row: row['correct'] if row['task'] == 'gsm8k' else row['coherence_1_incoherent_10_very_coherent'], axis=1)

# Average accuracy_quality by model, method, task
# Also get variance
avg_accuracy_quality_with_variance = combined_data[['model', 'method', 'task', 'accuracy_quality']].groupby(['model', 'method', 'task']).agg(['mean', 'var'])['accuracy_quality'].reset_index()

# Combine mean and variance into one column that is a string with the mean and then the variance in parentheses
avg_accuracy_quality_with_variance['accuracy_quality'] = avg_accuracy_quality_with_variance.apply(lambda row: str(round(row['mean'], 2)) + ' (' + str(round(row['var'], 2)) + ')', axis=1)

avg_accuracy_quality_with_variance


Unnamed: 0,model,method,task,mean,var,accuracy_quality
0,gpt4,ape_zero_shot_cot,cw,5.46,6.210505,5.46 (6.21)
1,gpt4,ape_zero_shot_cot,gsm8k,0.93,0.065758,0.93 (0.07)
2,gpt4,direct_prompting,cw,5.71,7.298889,5.71 (7.3)
3,gpt4,direct_prompting,gsm8k,0.73,0.199091,0.73 (0.2)
4,gpt4,least_to_most,cw,5.54,5.907475,5.54 (5.91)
5,gpt4,least_to_most,gsm8k,0.95,0.04798,0.95 (0.05)
6,gpt4,manual_cot,cw,6.33,5.19303,6.33 (5.19)
7,gpt4,manual_cot,gsm8k,0.93,0.065758,0.93 (0.07)
8,gpt4,manual_few_shot,cw,6.11,6.220101,6.11 (6.22)
9,gpt4,manual_few_shot,gsm8k,0.49,0.252424,0.49 (0.25)


In [73]:
# Pivot table - column method should go wide
avg_accuracy_quality_pivot = avg_accuracy_quality_with_variance.drop(columns=['mean', 'var']).set_index(['model', 'task', 'method']).unstack()
#pivot_table(index=['model', 'task'], columns='method', values='accuracy_quality').reset_index()

# Fix axis
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.rename_axis([None, None], axis=1).reset_index()

# Sort rows by task - gsm8k task first, then cw
# Sort by model - text-davinci-003 first, then gpt4
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.sort_values(by=['task', 'model'], ascending=[True, True]).reset_index().drop(columns=['index'])

print(avg_accuracy_quality_pivot.columns)

flattened_cols = [''.join(col).strip().replace('accuracy_quality', '') for col in avg_accuracy_quality_pivot.columns.values]
avg_accuracy_quality_pivot.columns = flattened_cols

print(avg_accuracy_quality_pivot)

# Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot[['model', 'task', 'direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']]

# Output to LaTeX
avg_accuracy_quality_pivot.to_latex('../Output/avg_accuracy_quality_pivot.tex', index=False)


MultiIndex([(           'model',                  ''),
            (            'task',                  ''),
            ('accuracy_quality', 'ape_zero_shot_cot'),
            ('accuracy_quality',  'direct_prompting'),
            ('accuracy_quality',     'least_to_most'),
            ('accuracy_quality',        'manual_cot'),
            ('accuracy_quality',   'manual_few_shot'),
            ('accuracy_quality',       'self_refine'),
            ('accuracy_quality',   'tree_of_thought'),
            ('accuracy_quality',     'zero_shot_cot')],
           )
  model   task ape_zero_shot_cot direct_prompting least_to_most   manual_cot  \
0  gpt4     cw       5.46 (6.21)       5.71 (7.3)   5.54 (5.91)  6.33 (5.19)   
1   td3     cw        3.9 (5.36)      4.46 (7.79)   4.45 (5.28)  4.14 (4.89)   
2  gpt4  gsm8k       0.93 (0.07)       0.73 (0.2)   0.95 (0.05)  0.93 (0.07)   
3   td3  gsm8k       0.49 (0.25)      0.23 (0.18)   0.67 (0.95)   0.6 (0.24)   

  manual_few_shot  self_refine tree

  avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.sort_values(by=['task', 'model'], ascending=[True, True]).reset_index().drop(columns=['index'])


In [74]:
# Perform McNemar's Test on GSM8k data

# Get in terms of number of questions answered correctly: sum accuracy_quality column by model, method, task
gsm8k_num_correct= gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).sum()['correct'].reset_index()
# Rename column to num_correct
gsm8k_num_correct = gsm8k_num_correct.rename(columns={'correct': 'num_correct'})

# Make another table with counts of questions missed (number of observations minus accuracy_quality sum) by model, method, task
gsm8k_num_obs = gsm8k_data[['model', 'method', 'task', 'correct']].groupby(['model', 'method', 'task']).count()['correct'].reset_index()
# Rename column to num_obs
gsm8k_num_obs = gsm8k_num_obs.rename(columns={'correct': 'num_obs'})
# Join together by model, method, task
gsm8k_correct_incorrect_obs = gsm8k_num_correct.merge(gsm8k_num_obs, on=['model', 'method', 'task'], how='left')
# Table should have model, method, task, num_correct, num_incorrect, num_obs
gsm8k_correct_incorrect_obs['num_incorrect'] = gsm8k_correct_incorrect_obs['num_obs'] - gsm8k_correct_incorrect_obs['num_correct']

gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


### Testing Accuracy

In [82]:
# Function to perform McNemar's test
# Accepts argument of name of model, method, task

def perform_mcnemar(model, method, task):

    # Create variables for data of interest
    # direct_prompting_correct
    direct_prompting_correct = gsm8k_correct_incorrect_obs[(gsm8k_correct_incorrect_obs['model'] == model) & (gsm8k_correct_incorrect_obs['method'] == "direct_prompting") & (gsm8k_correct_incorrect_obs['task'] == task)]['num_correct'].values[0]
    # direct_prompting_incorrect
    direct_prompting_incorrect = gsm8k_correct_incorrect_obs[(gsm8k_correct_incorrect_obs['model'] == model) & (gsm8k_correct_incorrect_obs['method'] == "direct_prompting") & (gsm8k_correct_incorrect_obs['task'] == task)]['num_incorrect'].values[0]
    # method_correct
    method_correct = gsm8k_correct_incorrect_obs[(gsm8k_correct_incorrect_obs['model'] == model) & (gsm8k_correct_incorrect_obs['method'] == method) & (gsm8k_correct_incorrect_obs['task'] == task)]['num_correct'].values[0]
    # method_incorrect
    method_incorrect = gsm8k_correct_incorrect_obs[(gsm8k_correct_incorrect_obs['model'] == model) & (gsm8k_correct_incorrect_obs['method'] == method) & (gsm8k_correct_incorrect_obs['task'] == task)]['num_incorrect'].values[0]

    # Create a dataset 
    # Row for direct prompting then method
    # Column for correct then incorrect
    data = [[direct_prompting_correct, direct_prompting_incorrect], 
            [method_correct, method_incorrect]] 
    
    print('McNemar\'s Test (Exact) for ' + model + ' ' + method + ' ' + task)
    print(data)

    # McNemar's Test, exact, without any continuity correction 
    print('No continuity correction')
    print(mcnemar(data, exact=True, correction=False)) 
    ncc_result = mcnemar(data, exact=True, correction=False)
    ncc_statistic = ncc_result.statistic
    ncc_p_value = ncc_result.pvalue

    #print(type(mcnemar(data, exact=True, correction=False)))
    #print(str(mcnemar(data, exact=True, correction=False)[0]) + ' ' + str(mcnemar(data, exact=True, correction=False)[1]))
    
    # McNemar's Test with the continuity correction 
    print('With continuity correction')
    print(mcnemar(data, exact=True, correction=True)) 
    cc_result = mcnemar(data, exact=True, correction=True)
    cc_statistic = cc_result.statistic
    cc_p_value = cc_result.pvalue

    # Return data
    return ncc_statistic, ncc_p_value, cc_statistic, cc_p_value


In [83]:
# Add McNemar's Test results to table
# Iterate over all combinations of model, method, task
# Except for method = "direct_prompting" and task = "cw"
# For each combination, perform McNemar's Test and add to table

# Create combos of model, method, task - all unique combinations of these three in combined_data
# Get unique values of model, method, task
models = combined_data['model'].unique()
methods = combined_data['method'].unique()
tasks = combined_data['task'].unique()
# Create list of all combinations of model, method, task
combinations = [(model, method, task) for model in models for method in methods for task in tasks]

# Add as rows to a dataframe
# Create empty dataframe
mcnemars_results = pd.DataFrame(columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])

# Iterate over combinations
for model, method, task in combinations:
    # Skip if method = "direct_prompting" or task = "cw"
    if method == 'direct_prompting' or task == 'cw':
        continue
    # Perform McNemar's Test
    statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction = perform_mcnemar(model, method, task)
    # Add to table
    mcnemars_results = pd.concat([mcnemars_results, pd.DataFrame([[model, method, task, statistic_without_correction, pvalue_without_correction, statistic_with_correction, pvalue_with_correction]], columns=['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction'])])

mcnemars_results


McNemar's Test (Exact) for td3 zero_shot_cot gsm8k
[[23.0, 77.0], [62.0, 38.0]]
No continuity correction
pvalue      0.23493150329059798
statistic   62.0
With continuity correction
pvalue      0.23493150329059798
statistic   62.0
McNemar's Test (Exact) for td3 ape_zero_shot_cot gsm8k
[[23.0, 77.0], [49.0, 51.0]]
No continuity correction
pvalue      0.01581820400805208
statistic   49.0
With continuity correction
pvalue      0.01581820400805208
statistic   49.0
McNemar's Test (Exact) for td3 least_to_most gsm8k
[[23.0, 77.0], [67.0, 33.0]]
No continuity correction
pvalue      0.4533716994309739
statistic   67.0
With continuity correction
pvalue      0.4533716994309739
statistic   67.0
McNemar's Test (Exact) for td3 manual_few_shot gsm8k
[[23.0, 77.0], [18.0, 82.0]]
No continuity correction
pvalue      7.252322160518573e-10
statistic   18.0
With continuity correction
pvalue      7.252322160518573e-10
statistic   18.0
McNemar's Test (Exact) for td3 manual_cot gsm8k
[[23.0, 77.0], [60.0, 40

Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,62.0,0.2349315,62.0,0.2349315
0,td3,ape_zero_shot_cot,gsm8k,49.0,0.0158182,49.0,0.0158182
0,td3,least_to_most,gsm8k,67.0,0.4533717,67.0,0.4533717
0,td3,manual_few_shot,gsm8k,18.0,7.252322e-10,18.0,7.252322e-10
0,td3,manual_cot,gsm8k,60.0,0.1714089,60.0,0.1714089
0,td3,tree_of_thought,gsm8k,23.0,5.513581e-08,23.0,5.513581e-08
0,td3,self_refine,gsm8k,20.0,4.59076e-09,20.0,4.59076e-09
0,gpt4,zero_shot_cot,gsm8k,27.0,4.563547e-10,27.0,4.563547e-10
0,gpt4,ape_zero_shot_cot,gsm8k,27.0,1.113051e-09,27.0,1.113051e-09
0,gpt4,least_to_most,gsm8k,27.0,4.563547e-10,27.0,4.563547e-10


In [84]:
# Rows that are significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] < 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,ape_zero_shot_cot,gsm8k,49.0,0.0158182,49.0,0.0158182
0,td3,manual_few_shot,gsm8k,18.0,7.252322e-10,18.0,7.252322e-10
0,td3,tree_of_thought,gsm8k,23.0,5.513581e-08,23.0,5.513581e-08
0,td3,self_refine,gsm8k,20.0,4.59076e-09,20.0,4.59076e-09
0,gpt4,zero_shot_cot,gsm8k,27.0,4.563547e-10,27.0,4.563547e-10
0,gpt4,ape_zero_shot_cot,gsm8k,27.0,1.113051e-09,27.0,1.113051e-09
0,gpt4,least_to_most,gsm8k,27.0,4.563547e-10,27.0,4.563547e-10
0,gpt4,manual_few_shot,gsm8k,27.0,0.01544026,27.0,0.01544026
0,gpt4,manual_cot,gsm8k,27.0,1.113051e-09,27.0,1.113051e-09
0,gpt4,self_refine,gsm8k,27.0,6.438995e-09,27.0,6.438995e-09


In [85]:
# Rows that are not significant at p < 0.05
mcnemars_results[mcnemars_results['pvalue'] >= 0.05]


Unnamed: 0,model,method,task,statistic,pvalue,statistic_with_correction,pvalue_with_correction
0,td3,zero_shot_cot,gsm8k,62.0,0.234932,62.0,0.234932
0,td3,least_to_most,gsm8k,67.0,0.453372,67.0,0.453372
0,td3,manual_cot,gsm8k,60.0,0.171409,60.0,0.171409
0,gpt4,tree_of_thought,gsm8k,27.0,0.142071,27.0,0.142071


In [86]:
# Printing gsm8k_correct_incorrect_obs again for reference
gsm8k_correct_incorrect_obs


Unnamed: 0,model,method,task,num_correct,num_obs,num_incorrect
0,gpt4,ape_zero_shot_cot,gsm8k,93.0,100,7.0
1,gpt4,direct_prompting,gsm8k,73.0,100,27.0
2,gpt4,least_to_most,gsm8k,95.0,100,5.0
3,gpt4,manual_cot,gsm8k,93.0,100,7.0
4,gpt4,manual_few_shot,gsm8k,49.0,100,51.0
5,gpt4,self_refine,gsm8k,89.0,100,11.0
6,gpt4,tree_of_thought,gsm8k,40.0,100,60.0
7,gpt4,zero_shot_cot,gsm8k,95.0,100,5.0
8,td3,ape_zero_shot_cot,gsm8k,49.0,100,51.0
9,td3,direct_prompting,gsm8k,23.0,100,77.0


## Create table for comparing each model/task/method with the appropriate direct prompting data

In [87]:
# Split dataset
no_direct_prompting_data = combined_data[combined_data['method'] != 'direct_prompting']
direct_prompting_data = combined_data[combined_data['method'] == 'direct_prompting']

# Add prefix dp_ to columns in direct_prompting_data
direct_prompting_data = direct_prompting_data.add_prefix('dp_')

# Left join datasets on model = dp_model, task = dp_task
direct_prompting_comparison = no_direct_prompting_data.merge(direct_prompting_data, left_on=['model', 'task'], right_on=['dp_model', 'dp_task'], how='left')

direct_prompting_comparison


Unnamed: 0,model_task_method,conversation_number,coherence_1_incoherent_10_very_coherent,task_constraints_followed_0_not_followed_1_followed,ease_of_review_1_easy_10_hard,correct,conversation_length,input_length,output_length,conversation_cost,...,dp_sentence_length_prompts,dp_fres_prompts,dp_num_linebreaks_provided,dp_num_sentences_provided,dp_num_step_i_provided,dp_num_1_dot_etc_provided,dp_model,dp_task,dp_method,dp_accuracy_quality
0,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,278,63,214,0.00556,...,10.0,94.35,,,,,td3,cw,direct_prompting,1.0
1,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,278,63,214,0.00556,...,11.4,86.20,,,,,td3,cw,direct_prompting,7.0
2,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,278,63,214,0.00556,...,12.2,74.69,,,,,td3,cw,direct_prompting,1.0
3,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,278,63,214,0.00556,...,9.0,69.99,,,,,td3,cw,direct_prompting,10.0
4,td3_cw_zero_shot_cot_responses,1,2.0,1.0,2.0,,278,63,214,0.00556,...,12.0,74.90,,,,,td3,cw,direct_prompting,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279995,gpt4_gsm8k_manual_cot_responses,100,,,,1.0,860,765,95,0.02865,...,,,4.0,5.0,0.0,1.0,gpt4,gsm8k,direct_prompting,1.0
279996,gpt4_gsm8k_manual_cot_responses,100,,,,1.0,860,765,95,0.02865,...,,,2.0,3.0,0.0,0.0,gpt4,gsm8k,direct_prompting,0.0
279997,gpt4_gsm8k_manual_cot_responses,100,,,,1.0,860,765,95,0.02865,...,,,3.0,4.0,0.0,0.0,gpt4,gsm8k,direct_prompting,0.0
279998,gpt4_gsm8k_manual_cot_responses,100,,,,1.0,860,765,95,0.02865,...,,,2.0,3.0,0.0,0.0,gpt4,gsm8k,direct_prompting,1.0


## Paired t-test for quality, means of other metrics

In [88]:
# Perform paired t-test on creative writing data and all other metrics

# Metrics to t-test: 'creative_writing_score', 'ease_of_evaluation_score', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts'
metrics_to_t_test = ['creative_writing_score', 'ease_of_evaluation_score', 'conversation_length', 'input_length', 'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres', 'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts']

# Create table to hold results
# Columns of model, task, method, mean metric, mean dp_metric, statistic, pvalue
t_test_results = pd.DataFrame(columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue'])
# Iterate over model, task, method
for model, task, method in direct_prompting_comparison[['model', 'task', 'method']].values:
    # Iterate over metrics_to_t_test
    for metric in metrics_to_t_test:
        # Get data
        # Sort by conversation_number
        data = direct_prompting_comparison[(direct_prompting_comparison['model'] == model) & (direct_prompting_comparison['task'] == task) & (direct_prompting_comparison['method'] == method)].sort_values(by=['conversation_number'])
        # direct_prompting holds the metric when using direct_prompting
        direct_prompting = data['dp_' + metric]
        # using_method holds the metric when using the method
        using_method = data[metric]
        # Perform the paired sample t-test 
        statistic, pvalue = stats.ttest_rel(direct_prompting, using_method)
        # Add to table
        t_test_results = t_test_results.concat(pd.DataFrame([[model, task, method, metric, direct_prompting.mean(), using_method.mean(), statistic, pvalue]], columns=['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue']))

t_test_results


KeyError: 'dp_creative_writing_score'

In [None]:
# Rows that are not significant at p < 0.05
t_test_results[t_test_results['pvalue'] >= 0.05]


## Adding release date to accuracy quality table (optional version)

In [None]:
# Release date and gains versus direct prompting

# Load Excel file "..\Selection of Prompt Engineering Methods\Hand-Labeled Method and Implementation Considerations.xlsx"
hand_labeled_data = pd.read_excel('..\Selection of Prompt Engineering Methods\Hand-Labeled Method and Implementation Considerations.xlsx')

# Keep columns ss_publication_date, "Prompt Engineering Method"
hand_labeled_data = hand_labeled_data[['ss_publication_date', 'Prompt Engineering Method']]

# Mapping names from this data to technqiue names
# "Few-Shot Learing" -> "manual_few_shot"
# "Chain-of-Thought Prompting" -> "manual_cot"
# "Zero-Shot Chain-of_Thought" -> "zero_shot_cot"
# "Automatic Prompt Engineer" -> "ape_zero_shot_cot"
# "Self-Refine" -> "self_refine"
# "Least-to-Most Prompting" -> "least_to_most"
# "Tree-of-Thought" -> "tree_of_thought"
# Set names in hand_labeled_data to these names
hand_labeled_data['technique_name'] = hand_labeled_data['Prompt Engineering Method'].replace({'Few-Shot Learning': 'manual_few_shot', 'Chain-of-Thought Prompting': 'manual_cot', 'Zero-Shot Chain-of-Thought': 'zero_shot_cot', 'Automatic Prompt Engineer': 'ape_zero_shot_cot', 'Self-Refine': 'self_refine', 'Least-to-Most Prompting': 'least_to_most', 'Tree-of-Thought': 'tree_of_thought'})

# Convert ss_publication_date to "YYYY-MM-DD"
hand_labeled_data['ss_publication_date'] = hand_labeled_data['ss_publication_date'].dt.strftime('%Y-%m-%d')

# Add ss_publication_date to avg_accuracy_quality_pivot
# The value should be for each column, right below the column name
# Transpose accuracy_quality_pivot
avg_accuracy_quality_pivot_transposed = avg_accuracy_quality_pivot.transpose().reset_index()

# Set name of first column to "technique_name"
avg_accuracy_quality_pivot_transposed = avg_accuracy_quality_pivot_transposed.rename(columns={'index': 'technique_name'})

# Left join hand_labeled_data to avg_accuracy_quality_pivot_transposed on technique_name
avg_accuracy_quality_pivot_transposed = avg_accuracy_quality_pivot_transposed.merge(hand_labeled_data, on='technique_name', how='left')

# Transpose back
avg_accuracy_quality_pivot_with_date = avg_accuracy_quality_pivot_transposed.transpose().reset_index()

# Output to LaTeX
avg_accuracy_quality_pivot_with_date.to_latex('../Output/avg_accuracy_quality_pivot_with_date.tex', index=False)


## Generic Function for a table of means for a metric

In [None]:
def means_table(metric):

    # Average combined_data metric by model, method, task
    avg_combined_data_metric = combined_data[['model', 'method', 'task', metric]].groupby(['model', 'method', 'task']).agg(['mean'])[metric].reset_index()

    # Pivot table - column method should go wide
    avg_combined_data_metric_pivot = avg_combined_data_metric.pivot_table(index=['model', 'task'], columns='method', values=metric).reset_index()

    # Sort rows by task - gsm8k task first, then cw
    # Sort by model - text-davinci-003 first, then gpt4
    avg_combined_data_metric_pivot = avg_combined_data_metric_pivot.sort_values(by=['task', 'model'], ascending=[True, True])

    # Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
    avg_combined_data_metric_pivot = avg_combined_data_metric_pivot[['model', 'task', 'direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']]

    # Output to LaTeX
    avg_combined_data_metric_pivot.to_latex('../Output/avg_' + metric + '_pivot.tex', index=False)



## Length

Run generic function for length of entire interaction, length of all prompts, financial cost

Baseline comparison ratios, change in accuracy/quality plots

In [None]:
means_table('conversation_length')
means_table('input_length')
means_table('conversation_cost')


In [None]:
# Plot gsm8k_length_vs_provided in a bar chart
# Limit to gsm8k data
# Get means by model, method
gsm8k_length_vs_provided_means = gsm8k_data[['model', 'method', 'gsm8k_length_vs_provided']].groupby(['model', 'method']).agg(['mean'])['gsm8k_length_vs_provided'].reset_index()

# Bar chart by model, method
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 5))
sns.barplot(x='model', y='mean', hue='method', data=gsm8k_length_vs_provided_means)
plt.xlabel('Model')
plt.ylabel('Average GSM8k Length vs. Provided Length')
plt.title('Average GSM8k Length vs. Provided Length by Model and Method')
plt.savefig('../Output/gsm8k_length_vs_provided_means.png')
plt.show()


In [None]:
# Plot length_vs_direct_prompting in a bar chart
# Get means by model, method, task
length_vs_direct_prompting_means = combined_data[['model', 'method', 'task', 'length_vs_direct_prompting']].groupby(['model', 'method', 'task']).agg(['mean'])['length_vs_direct_prompting'].reset_index()

# Bar chart by model, method, task

# GSM8k plot
plt.figure(figsize=(10, 5))
sns.barplot(x='model', y='mean', hue='method', data=length_vs_direct_prompting_means[length_vs_direct_prompting_means['task'] == 'gsm8k'])
plt.xlabel('Model')
plt.ylabel('Average GSM8k Length vs. Direct Prompting Length')
plt.title('Average GSM8k Length vs. Direct Prompting Length by Model and Method')
plt.savefig('../Output/gsm8k_length_vs_direct_prompting_means.png')
plt.show()

# Creative writing plot
plt.figure(figsize=(10, 5))
sns.barplot(x='model', y='mean', hue='method', data=length_vs_direct_prompting_means[length_vs_direct_prompting_means['task'] == 'cw'])
plt.xlabel('Model')
plt.ylabel('Average Creative Writing Length vs. Direct Prompting Length')
plt.title('Average Creative Writing Length vs. Direct Prompting Length by Model and Method')
plt.savefig('../Output/cw_length_vs_direct_prompting_means.png')
plt.show()


In [None]:
# Compute change in accuracy/quality per change in length (versus direct prompting)

# In the direct_prompting_comparison table, get averages of conversation_length, dp_conversation_length, accuracy_quality, dp_accuracy_quality by model, method, task
averages_for_changes = direct_prompting_comparison[['model', 'method', 'task', 'conversation_length', 'dp_conversation_length', 'accuracy_quality', 'dp_accuracy_quality']].groupby(['model', 'method', 'task']).agg(['mean']).reset_index()

# Compute change in accuracy_quality per change in conversation_length
# dp_accuracy_quality - accuracy_quality
# divided by
# dp_conversation_length - conversation_length
averages_for_changes['change_in_accuracy_quality_per_change_in_conversation_length'] = (averages_for_changes['dp_accuracy_quality']['mean'] - averages_for_changes['accuracy_quality']['mean']) / (averages_for_changes['dp_conversation_length']['mean'] - averages_for_changes['conversation_length']['mean'])


In [None]:
# Plot change in accuracy_quality per change in conversation_length
# Bar chart by model, method, task
# GSM8k plot
plt.figure(figsize=(10, 5))
sns.barplot(x='model', y='change_in_accuracy_quality_per_change_in_conversation_length', hue='method', data=averages_for_changes[averages_for_changes['task'] == 'gsm8k'])
plt.xlabel('Model')
plt.ylabel('Change in Accuracy/Quality per Change in Conversation Length')
plt.title('Change in Accuracy/Quality per Change in Conversation Length by Model and Method')
plt.savefig('../Output/gsm8k_change_in_accuracy_quality_per_change_in_conversation_length.png')
plt.show()

# Creative writing plot
plt.figure(figsize=(10, 5))
sns.barplot(x='model', y='change_in_accuracy_quality_per_change_in_conversation_length', hue='method', data=averages_for_changes[averages_for_changes['task'] == 'cw'])
plt.xlabel('Model')
plt.ylabel('Change in Accuracy/Quality per Change in Conversation Length')
plt.title('Change in Accuracy/Quality per Change in Conversation Length by Model and Method')
plt.savefig('../Output/cw_change_in_accuracy_quality_per_change_in_conversation_length.png')
plt.show()


In [None]:
# Also create a table with the change in accuracy_quality per change in conversation_length

# Sort and output table

# Sort rows by task - gsm8k task first, then cw
# Sort by model - text-davinci-003 first, then gpt4
change_per_table = averages_for_changes.sort_values(by=['task', 'model'], ascending=[True, True])

# Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
change_per_table = change_per_table[['model', 'task', 'direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']]

# Output to LaTeX
change_per_table.to_latex('../Output/change_per_table.tex', index=False)


## Complexity

Run generic function for number of reasoning steps, sentence length, FRE

Bar charts of differences

Generic function for assessment of ease of review too

In [None]:
means_table('num_linebreaks')
means_table('num_sentences')
means_table('num_step_i')
means_table('num_1_dot_etc')
means_table('sentence_length')
means_table('fres')


In [None]:
# Differences of scores 
# Responses versus prompts
# Responses versus provided answer for GSM8k

# Variables: 
# 'num_linebreaks_prompts': num_linebreaks_prompts,
# 'num_sentences_prompts': num_sentences_prompts,
# 'num_step_i_prompts': num_step_i_prompts,
# 'num_1_dot_etc_prompts': num_1_dot_etc_prompts,
# 'sentence_length_prompts': sentence_length_prompts,
# 'fres_prompts': fres_prompts,
# 'num_linebreaks_provided': num_linebreaks_provided,
# 'num_sentences_provided': num_sentences_provided,
# 'num_step_i_provided': num_step_i_provided,
# 'num_1_dot_etc_provided': num_1_dot_etc_provided
# Loop over variables and create differences variables
comparison_vars = ['num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts', 'num_linebreaks_provided', 'num_sentences_provided', 'num_step_i_provided', 'num_1_dot_etc_provided']
for var in comparison_vars:
    # Create a variable that is the difference between the prompts and responses
    # var + '_diff'
    combined_data[var + '_diff'] = combined_data[var] - combined_data[var.replace('_prompts', '').replace('_provided', '')]

# Aggregate "_diff" variables by model, task, method
differences = combined_data[['model', 'task', 'method', 'num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'sentence_length_prompts_diff', 'fres_prompts_diff', 'num_linebreaks_provided_diff', 'num_sentences_provided_diff', 'num_step_i_provided_diff', 'num_1_dot_etc_provided_diff']].groupby(['model', 'task', 'method']).agg(['mean']).reset_index()


In [None]:
# Create plots of _diff variables by model, task, method

# GSM8k plots
# Loop over variables
comparison_vars_gsm8k = ['num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'num_linebreaks_provided_diff', 'num_sentences_provided_diff', 'num_step_i_provided_diff', 'num_1_dot_etc_provided_diff']
for var in comparison_vars_gsm8k:
    # Plot
    plt.figure(figsize=(10, 5))
    sns.barplot(x='model', y=var, hue='method', data=differences[differences['task'] == 'gsm8k'])
    plt.xlabel('Model')
    plt.ylabel('Average ' + var + ' Difference')
    plt.title('Average ' + var + ' Difference by Model and Method')
    plt.savefig('../Output/gsm8k_' + var + '.png')
    plt.show()

# Creative writing plots
# Loop over variables
comparison_vars_cw = ['num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'sentence_length_prompts_diff', 'fres_prompts_diff']
for var in comparison_vars_cw:
    # Plot
    plt.figure(figsize=(10, 5))
    sns.barplot(x='model', y=var, hue='method', data=differences[differences['task'] == 'cw'])
    plt.xlabel('Model')
    plt.ylabel('Average ' + var + ' Difference')
    plt.title('Average ' + var + ' Difference by Model and Method')
    plt.savefig('../Output/cw_' + var + '.png')
    plt.show()


In [None]:
means_table('ease_of_evaluation_score')
