# Analyze Metrics and Conduct Inference.ipynb

In [None]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats


In [None]:
# Read the combined results workbook into one DataFrame
combined_data = pd.read_excel('Combined_Data.xlsx')

# Row masks for the two task types, keyed on the 'task' column
is_gsm8k = combined_data['task'] == 'gsm8k'
is_cw = combined_data['task'] == 'cw'

# GSM8k rows only (task == "gsm8k")
gsm8k_data = combined_data.loc[is_gsm8k]

# Creative writing rows only (task == "cw")
cw_data = combined_data.loc[is_cw]


## Accuracy/Quality

In [None]:
# Produce table - one column for each method, one row for each model by task type
# Each cell is "mean (variance)" of the accuracy/quality measure, rounded to 2 decimals

group_keys = ['model', 'method', 'task']

# Combined accuracy_quality column: correct_or_incorrect for gsm8k rows,
# creative_writing_score for every other row
combined_data['accuracy_quality'] = combined_data['correct_or_incorrect'].where(
    combined_data['task'] == 'gsm8k',
    combined_data['creative_writing_score'],
)

# Mean and variance of accuracy_quality by model, method, task
# (resulting columns: model, method, task, mean, var)
avg_accuracy_quality_with_variance = (
    combined_data.groupby(group_keys)['accuracy_quality']
    .agg(['mean', 'var'])
    .reset_index()
)

# Combine mean and variance into one string column: "<mean> (<var>)"
avg_accuracy_quality_with_variance['accuracy_quality'] = [
    f"{round(mean, 2)} ({round(var, 2)})"
    for mean, var in zip(
        avg_accuracy_quality_with_variance['mean'],
        avg_accuracy_quality_with_variance['var'],
    )
]

# Pivot table - column method should go wide.
# The cell values are strings, so the default aggfunc ('mean') cannot be used; each
# (model, task, method) combination has exactly one row here, so 'first' passes it through.
avg_accuracy_quality_pivot = avg_accuracy_quality_with_variance.pivot_table(
    index=['model', 'task'],
    columns='method',
    values='accuracy_quality',
    aggfunc='first',
).reset_index()

# Sort rows by task - gsm8k task first, then cw
# Sort by model - text-davinci-003 first, then gpt4
# Both orders are reverse-alphabetical, hence descending sorts.
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.sort_values(by=['task', 'model'], ascending=[False, False])

# Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot[['model', 'task', 'direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']]

# Output to LaTeX
avg_accuracy_quality_pivot.to_latex('../Output/avg_accuracy_quality_pivot.tex', index=False)


In [None]:
# Build the per-group counts used by McNemar's Test on GSM8k data

group_keys = ['model', 'method', 'task']

# gsm8k_data is sliced out of combined_data before the accuracy_quality column is created,
# so it does not carry that column. For gsm8k rows accuracy_quality is defined as
# correct_or_incorrect, so count on that column directly.
# num_correct = sum of correct_or_incorrect, num_obs = number of non-null observations
gsm8k_counts = (
    gsm8k_data.groupby(group_keys)['correct_or_incorrect']
    .agg(num_correct='sum', num_obs='count')
    .reset_index()
)

# Keep the two intermediate tables available under their original names
gsm8k_num_correct = gsm8k_counts[group_keys + ['num_correct']]
gsm8k_num_obs = gsm8k_counts[group_keys + ['num_obs']]

# Table has model, method, task, num_correct, num_obs, num_incorrect
gsm8k_correct_incorrect_obs = gsm8k_counts.copy()
gsm8k_correct_incorrect_obs['num_incorrect'] = gsm8k_correct_incorrect_obs['num_obs'] - gsm8k_correct_incorrect_obs['num_correct']

gsm8k_correct_incorrect_obs


### Testing Accuracy

In [None]:
# Function to perform McNemar's test
# Accepts argument of name of model, method, task

def perform_mcnemar(model, method, task):
    """Run McNemar's exact test for one method against direct prompting.

    Counts are read from the notebook-level ``gsm8k_correct_incorrect_obs`` table.

    Args:
        model: value of the 'model' column to select.
        method: value of the 'method' column to compare with "direct_prompting".
        task: value of the 'task' column to select.

    Returns:
        Tuple ``(statistic, pvalue, statistic_with_correction, pvalue_with_correction)``.

    Raises:
        IndexError: if the counts table has no row for the requested combination.
    """

    def counts_for(method_name):
        # Single lookup of (num_correct, num_incorrect) for this model/task and the given method
        rows = gsm8k_correct_incorrect_obs[
            (gsm8k_correct_incorrect_obs['model'] == model)
            & (gsm8k_correct_incorrect_obs['method'] == method_name)
            & (gsm8k_correct_incorrect_obs['task'] == task)
        ]
        return rows['num_correct'].values[0], rows['num_incorrect'].values[0]

    direct_prompting_correct, direct_prompting_incorrect = counts_for("direct_prompting")
    method_correct, method_incorrect = counts_for(method)

    # Create a dataset
    # Row for direct prompting then method
    # Column for correct then incorrect
    # NOTE(review): McNemar's test is defined on a paired 2x2 table (outcome under one
    # condition crossed with outcome under the other, per question). This table holds each
    # condition's marginal correct/incorrect counts instead - confirm this is intended.
    data = [[direct_prompting_correct, direct_prompting_incorrect],
            [method_correct, method_incorrect]]

    print('McNemar\'s Test (Exact) for ' + model + ' ' + method + ' ' + task)

    # McNemar's Test, exact, without any continuity correction
    # The result is an object exposing .statistic and .pvalue; it does not support indexing.
    without_correction = mcnemar(data, exact=True, correction=False)
    print('No continuity correction')
    print(without_correction)

    # McNemar's Test with the continuity correction
    # NOTE(review): per the statsmodels documentation the correction applies only to the
    # chi-square approximation (exact=False), so with exact=True this likely repeats the
    # result above - confirm whether exact=False was intended here.
    with_correction = mcnemar(data, exact=True, correction=True)
    print('With continuity correction')
    print(with_correction)

    # Return data
    return (
        without_correction.statistic,
        without_correction.pvalue,
        with_correction.statistic,
        with_correction.pvalue,
    )


In [None]:
# Add McNemar's Test results to table
# Iterate over all combinations of model, method, task
# Except for method = "direct_prompting" and task = "cw"
# For each combination, perform McNemar's Test and add to table

# Create combos of model, method, task - all unique combinations of these three in combined_data
# Get unique values of model, method, task
models = combined_data['model'].unique()
methods = combined_data['method'].unique()
tasks = combined_data['task'].unique()
# Create list of all combinations of model, method, task
combinations = [(model, method, task) for model in models for method in methods for task in tasks]

# Collect one row per tested combination, then build the dataframe once.
# (DataFrame has no .concat method; building from a list also avoids re-copying the table per row.)
mcnemars_columns = ['model', 'method', 'task', 'statistic', 'pvalue', 'statistic_with_correction', 'pvalue_with_correction']
mcnemars_rows = []

# Iterate over combinations
for model, method, task in combinations:
    # Skip if method = "direct_prompting" or task = "cw"
    if method == 'direct_prompting' or task == 'cw':
        continue
    # Perform McNemar's Test; returns (statistic, pvalue, statistic_with_correction, pvalue_with_correction)
    mcnemars_rows.append([model, method, task, *perform_mcnemar(model, method, task)])

mcnemars_results = pd.DataFrame(mcnemars_rows, columns=mcnemars_columns)

mcnemars_results


In [None]:
# McNemar results whose p-value is at or above the 0.05 threshold (not significant at p < 0.05)
not_significant = mcnemars_results['pvalue'] >= 0.05
mcnemars_results[not_significant]


## Create table for comparing each model/task/method with the appropriate direct prompting data

In [None]:
# Partition combined_data into the direct-prompting baseline rows and all remaining rows
is_direct_prompting = combined_data['method'] == 'direct_prompting'
no_direct_prompting_data = combined_data[~is_direct_prompting]

# Baseline rows, with every column renamed to carry a dp_ prefix
direct_prompting_data = combined_data[is_direct_prompting].add_prefix('dp_')

# Left join: each non-baseline row gets the baseline rows with the same model and task
# NOTE(review): the join keys are model and task only, so if several baseline rows share a
# model/task every non-baseline row is matched to all of them - confirm whether a
# per-question key (e.g. conversation_number) should also be part of the join.
direct_prompting_comparison = pd.merge(
    no_direct_prompting_data,
    direct_prompting_data,
    how='left',
    left_on=['model', 'task'],
    right_on=['dp_model', 'dp_task'],
)

direct_prompting_comparison


## Paired t-test for quality, means of other metrics

In [None]:
# Perform paired t-test on creative writing data and all other metrics

# Metrics to t-test; each is compared against its dp_-prefixed direct prompting counterpart
metrics_to_t_test = [
    'creative_writing_score', 'ease_of_evaluation_score', 'conversation_length', 'input_length',
    'output_length', 'conversation_cost', 'gsm8k_length_vs_provided', 'length_vs_direct_prompting',
    'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres',
    'num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts',
    'sentence_length_prompts', 'fres_prompts',
]

# Collect one result row per (model, task, method, metric), then build the table once.
# (DataFrame has no .concat method; building from a list also avoids re-copying the table per row.)
t_test_columns = ['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue']
t_test_rows = []

# Iterate over each distinct model, task, method once (not once per data row);
# sort=False keeps groups in order of first appearance
for (model, task, method), group in direct_prompting_comparison.groupby(['model', 'task', 'method'], sort=False):
    # Sort by conversation_number
    data = group.sort_values(by=['conversation_number'])
    # Iterate over metrics_to_t_test
    for metric in metrics_to_t_test:
        # direct_prompting holds the metric when using direct_prompting
        direct_prompting = data['dp_' + metric]
        # using_method holds the metric when using the method
        using_method = data[metric]
        # Perform the paired sample t-test (observations are paired row by row)
        # NOTE(review): whether each row is a genuine pair depends on how
        # direct_prompting_comparison was joined - verify before relying on the p-values.
        statistic, pvalue = stats.ttest_rel(direct_prompting, using_method)
        # Add to table
        t_test_rows.append([model, task, method, metric, direct_prompting.mean(), using_method.mean(), statistic, pvalue])

t_test_results = pd.DataFrame(t_test_rows, columns=t_test_columns)

t_test_results


In [None]:
# t-test results whose p-value is at or above the 0.05 threshold (not significant at p < 0.05)
not_significant = t_test_results['pvalue'] >= 0.05
t_test_results[not_significant]


## Adding release date to accuracy quality table (optional version)

In [None]:
# Release date and gains versus direct prompting

# Load the hand-labeled method workbook.
# Forward slashes are used because the backslash form contains the invalid escape
# sequences "\S" and "\H"; forward slashes also resolve on Windows.
hand_labeled_data = pd.read_excel('../Selection of Prompt Engineering Methods/Hand-Labeled Method and Implementation Considerations.xlsx')

# Keep columns ss_publication_date, "Prompt Engineering Method"
# (copy so the column assignments below act on an independent frame)
hand_labeled_data = hand_labeled_data[['ss_publication_date', 'Prompt Engineering Method']].copy()

# Mapping names from this data to technique names; names not listed are left unchanged
technique_names = {
    'Few-Shot Learning': 'manual_few_shot',
    'Chain-of-Thought Prompting': 'manual_cot',
    'Zero-Shot Chain-of-Thought': 'zero_shot_cot',
    'Automatic Prompt Engineer': 'ape_zero_shot_cot',
    'Self-Refine': 'self_refine',
    'Least-to-Most Prompting': 'least_to_most',
    'Tree-of-Thought': 'tree_of_thought',
}
hand_labeled_data['technique_name'] = hand_labeled_data['Prompt Engineering Method'].replace(technique_names)

# Convert ss_publication_date to "YYYY-MM-DD"
hand_labeled_data['ss_publication_date'] = hand_labeled_data['ss_publication_date'].dt.strftime('%Y-%m-%d')

# Add ss_publication_date to avg_accuracy_quality_pivot
# The value should be for each column, right below the column name
# Transpose accuracy_quality_pivot so each original column becomes a row.
# The pivot's column index is named 'method', so reset_index() alone would produce a
# 'method' column rather than 'index'; name the axis explicitly to get the merge key.
avg_accuracy_quality_pivot_transposed = (
    avg_accuracy_quality_pivot.transpose()
    .rename_axis('technique_name')
    .reset_index()
)

# Left join hand_labeled_data to avg_accuracy_quality_pivot_transposed on technique_name
avg_accuracy_quality_pivot_transposed = avg_accuracy_quality_pivot_transposed.merge(hand_labeled_data, on='technique_name', how='left')

# Transpose back
avg_accuracy_quality_pivot_with_date = avg_accuracy_quality_pivot_transposed.transpose().reset_index()

# Output to LaTeX
avg_accuracy_quality_pivot_with_date.to_latex('../Output/avg_accuracy_quality_pivot_with_date.tex', index=False)


## Generic Function for a table of means for a metric

In [None]:
def means_table(metric):
    """Write a LaTeX table of the mean of `metric` per model/task row and method column.

    Reads the notebook-level ``combined_data`` frame and writes
    ``../Output/avg_<metric>_pivot.tex``.

    Args:
        metric: name of the ``combined_data`` column to average.
    """

    # Average combined_data metric by model, method, task.
    # Selecting the column before .mean() keeps it named `metric`, which the pivot below
    # refers to (aggregating with ['mean'] would leave it named 'mean' instead).
    avg_combined_data_metric = (
        combined_data.groupby(['model', 'method', 'task'])[metric]
        .mean()
        .reset_index()
    )

    # Pivot table - column method should go wide
    avg_combined_data_metric_pivot = avg_combined_data_metric.pivot_table(index=['model', 'task'], columns='method', values=metric).reset_index()

    # Sort rows by task - gsm8k task first, then cw
    # Sort by model - text-davinci-003 first, then gpt4
    # Both orders are reverse-alphabetical, hence descending sorts.
    avg_combined_data_metric_pivot = avg_combined_data_metric_pivot.sort_values(by=['task', 'model'], ascending=[False, False])

    # Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
    avg_combined_data_metric_pivot = avg_combined_data_metric_pivot[['model', 'task', 'direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']]

    # Output to LaTeX
    avg_combined_data_metric_pivot.to_latex('../Output/avg_' + metric + '_pivot.tex', index=False)



## Length

Run generic function for length of entire interaction, length of all prompts, financial cost

Baseline comparison ratios, change in accuracy/quality plots

In [None]:
# One means table per length/cost metric, written in this order
for length_metric in ('conversation_length', 'input_length', 'conversation_cost'):
    means_table(length_metric)


In [None]:
# Bar chart of mean gsm8k_length_vs_provided (GSM8k rows only)

# Means by model, method; the aggregated value ends up in a column named 'mean'
grouped_by_model_method = gsm8k_data[['model', 'method', 'gsm8k_length_vs_provided']].groupby(['model', 'method'])
gsm8k_length_vs_provided_means = grouped_by_model_method.agg(['mean'])['gsm8k_length_vs_provided'].reset_index()

# One group of bars per model, one bar per method
sns.set_theme(style="whitegrid")
plt.figure(figsize=(10, 5))
ax = sns.barplot(data=gsm8k_length_vs_provided_means, x='model', y='mean', hue='method')
ax.set_xlabel('Model')
ax.set_ylabel('Average GSM8k Length vs. Provided Length')
ax.set_title('Average GSM8k Length vs. Provided Length by Model and Method')
plt.savefig('../Output/gsm8k_length_vs_provided_means.png')
plt.show()


In [None]:
# Bar charts of mean length_vs_direct_prompting, one chart per task

# Means by model, method, task; the aggregated value ends up in a column named 'mean'
grouped_by_model_method_task = combined_data[['model', 'method', 'task', 'length_vs_direct_prompting']].groupby(['model', 'method', 'task'])
length_vs_direct_prompting_means = grouped_by_model_method_task.agg(['mean'])['length_vs_direct_prompting'].reset_index()

# (task value, y-axis label, title, output file) for each chart: GSM8k first, then creative writing
length_plots = [
    (
        'gsm8k',
        'Average GSM8k Length vs. Direct Prompting Length',
        'Average GSM8k Length vs. Direct Prompting Length by Model and Method',
        '../Output/gsm8k_length_vs_direct_prompting_means.png',
    ),
    (
        'cw',
        'Average Creative Writing Length vs. Direct Prompting Length',
        'Average Creative Writing Length vs. Direct Prompting Length by Model and Method',
        '../Output/cw_length_vs_direct_prompting_means.png',
    ),
]

for plot_task, y_label, plot_title, output_path in length_plots:
    # Keep only the rows for this task
    task_means = length_vs_direct_prompting_means[length_vs_direct_prompting_means['task'] == plot_task]
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=task_means, x='model', y='mean', hue='method')
    ax.set_xlabel('Model')
    ax.set_ylabel(y_label)
    ax.set_title(plot_title)
    plt.savefig(output_path)
    plt.show()


In [None]:
# Change in accuracy/quality per change in conversation length, relative to direct prompting

# Group means of the method and direct prompting (dp_) columns by model, method, task.
# Aggregating with ['mean'] yields two-level columns: (column name, 'mean').
columns_for_changes = ['model', 'method', 'task', 'conversation_length', 'dp_conversation_length', 'accuracy_quality', 'dp_accuracy_quality']
grouped_for_changes = direct_prompting_comparison[columns_for_changes].groupby(['model', 'method', 'task'])
averages_for_changes = grouped_for_changes.agg(['mean']).reset_index()

# Numerator: dp_accuracy_quality - accuracy_quality
accuracy_quality_change = averages_for_changes[('dp_accuracy_quality', 'mean')] - averages_for_changes[('accuracy_quality', 'mean')]
# Denominator: dp_conversation_length - conversation_length
conversation_length_change = averages_for_changes[('dp_conversation_length', 'mean')] - averages_for_changes[('conversation_length', 'mean')]

averages_for_changes['change_in_accuracy_quality_per_change_in_conversation_length'] = accuracy_quality_change / conversation_length_change


In [None]:
# Bar charts of change in accuracy_quality per change in conversation_length, one chart per task

# (task value, output file) for each chart: GSM8k first, then creative writing
change_plots = [
    ('gsm8k', '../Output/gsm8k_change_in_accuracy_quality_per_change_in_conversation_length.png'),
    ('cw', '../Output/cw_change_in_accuracy_quality_per_change_in_conversation_length.png'),
]

for plot_task, output_path in change_plots:
    # Keep only the rows for this task
    task_changes = averages_for_changes[averages_for_changes['task'] == plot_task]
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(data=task_changes, x='model', y='change_in_accuracy_quality_per_change_in_conversation_length', hue='method')
    ax.set_xlabel('Model')
    ax.set_ylabel('Change in Accuracy/Quality per Change in Conversation Length')
    ax.set_title('Change in Accuracy/Quality per Change in Conversation Length by Model and Method')
    plt.savefig(output_path)
    plt.show()


In [None]:
# Also create a table with the change in accuracy_quality per change in conversation_length

change_column = 'change_in_accuracy_quality_per_change_in_conversation_length'

# averages_for_changes is in long format (one row per model, method, task), so it has no
# per-method columns yet. Pull out the four needed columns as plain Series first; this
# works whether its column labels are single- or two-level.
change_long = pd.DataFrame({
    'model': averages_for_changes['model'],
    'task': averages_for_changes['task'],
    'method': averages_for_changes['method'],
    change_column: averages_for_changes[change_column],
})

# Pivot table - column method should go wide
change_per_table = change_long.pivot_table(index=['model', 'task'], columns='method', values=change_column).reset_index()

# Sort rows by task - gsm8k task first, then cw
# Sort by model - text-davinci-003 first, then gpt4
# Both orders are reverse-alphabetical, hence descending sorts.
change_per_table = change_per_table.sort_values(by=['task', 'model'], ascending=[False, False])

# Order columns: direct_prompting, zero_shot_cot, ape_zero_shot_cot, tree_of_thought, self_refine, least_to_most, manual_few_shot, manual_cot
# Only methods present in the data are kept; direct_prompting is the baseline of the
# comparison table, so it normally has no column of its own here.
method_order = ['direct_prompting', 'zero_shot_cot', 'ape_zero_shot_cot', 'tree_of_thought', 'self_refine', 'least_to_most', 'manual_few_shot', 'manual_cot']
present_methods = [method_name for method_name in method_order if method_name in change_per_table.columns]
change_per_table = change_per_table[['model', 'task'] + present_methods]

# Output to LaTeX
change_per_table.to_latex('../Output/change_per_table.tex', index=False)


## Complexity

Run generic function for number of reasoning steps, sentence length, FRE

Bar charts of differences

Generic function for assessment of ease of review too

In [None]:
# One means table per complexity metric, written in this order
complexity_metrics = (
    'num_linebreaks',
    'num_sentences',
    'num_step_i',
    'num_1_dot_etc',
    'sentence_length',
    'fres',
)
for complexity_metric in complexity_metrics:
    means_table(complexity_metric)


In [None]:
# Differences of scores
# Responses versus prompts
# Responses versus provided answer for GSM8k

# Variables measured on the prompts (_prompts) or on the provided answer (_provided);
# each has a response counterpart named without that suffix
comparison_vars = ['num_linebreaks_prompts', 'num_sentences_prompts', 'num_step_i_prompts', 'num_1_dot_etc_prompts', 'sentence_length_prompts', 'fres_prompts', 'num_linebreaks_provided', 'num_sentences_provided', 'num_step_i_provided', 'num_1_dot_etc_provided']

# Loop over variables and create differences variables
for var in comparison_vars:
    # Name of the matching response column, e.g. 'num_sentences_prompts' -> 'num_sentences'
    response_var = var.replace('_prompts', '').replace('_provided', '')
    # var + '_diff' = prompts/provided value minus the response value
    combined_data[var + '_diff'] = combined_data[var] - combined_data[response_var]

# Aggregate "_diff" variables by model, task, method
# .mean() (rather than .agg(['mean'])) keeps single-level column names, so each _diff
# column can be looked up by its plain name when the table is plotted.
diff_vars = [var + '_diff' for var in comparison_vars]
differences = combined_data.groupby(['model', 'task', 'method'])[diff_vars].mean().reset_index()


In [None]:
# Bar charts of the _diff variables by model and method, one chart per variable and task

# Variables plotted for GSM8k
comparison_vars_gsm8k = ['num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'num_linebreaks_provided_diff', 'num_sentences_provided_diff', 'num_step_i_provided_diff', 'num_1_dot_etc_provided_diff']
# Variables plotted for creative writing
comparison_vars_cw = ['num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'sentence_length_prompts_diff', 'fres_prompts_diff']

# (task value, output file prefix, variables to plot): GSM8k plots first, then creative writing
difference_plots = [
    ('gsm8k', '../Output/gsm8k_', comparison_vars_gsm8k),
    ('cw', '../Output/cw_', comparison_vars_cw),
]

for plot_task, output_prefix, task_vars in difference_plots:
    for var in task_vars:
        # Keep only the rows for this task
        task_differences = differences[differences['task'] == plot_task]
        plt.figure(figsize=(10, 5))
        ax = sns.barplot(data=task_differences, x='model', y=var, hue='method')
        ax.set_xlabel('Model')
        ax.set_ylabel('Average ' + var + ' Difference')
        ax.set_title('Average ' + var + ' Difference by Model and Method')
        plt.savefig(output_prefix + var + '.png')
        plt.show()


In [None]:
# Means table for the ease of evaluation score
means_table(metric='ease_of_evaluation_score')
