# Analyze Metrics and Conduct Inference.ipynb

Produce the large accuracy/quality table.

Metrics - GSM8K accuracy, CW Inter-Sentence Cosine Similarity, CW Inter-Paragraph Cosine Similarity, Compliance-Adjusted CW Inter-Sentence Cosine Similarity, Compliance-Adjusted CW Inter-Paragraph Cosine Similarity, Task Compliance

Row is a model by metric, column is a method.

Include ss publication date under method.

Include mean and variance (parentheses).

In [43]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import simpledorff
import math


In [44]:
# Load plot_settings.pickle
import pickle

with open('order_list.pkl', 'rb') as f:
    order_list = pickle.load(f)
print(order_list)

with open('hue_order_pub_date.pkl', 'rb') as f:
    hue_order_pub_date = pickle.load(f)
print(hue_order_pub_date)

with open('hue_order_introduced.pkl', 'rb') as f:
    hue_order_introduced = pickle.load(f)
print(hue_order_introduced)

# Set the font to Times New Roman or a similar serif font
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = 'Times New Roman'
# Optionally, set the font size
mpl.rcParams['font.size'] = 12

# Figure size
f_size = (16 / 2, 9 / 2)

# Function necessary to flip the legend
import itertools
def flip(items, ncol):
    return itertools.chain(*[items[i::ncol] for i in range(ncol)])


['Text-Davinci-003', 'GPT-4']
['Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought', 'Direct Prompting']
['Direct Prompting', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Tree-of-Thought', 'Self-Refine', 'Least-to-Most', 'Manual Few-Shot', 'Manual CoT']


In [45]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']

# Load direct_prompting_comparison.xlsx
#direct_prompting_comparison = pd.read_excel('direct_prompting_comparison.xlsx')


### Plotting CW Scores for Exploration

In [46]:
# Loop over cases where task = "cw" and output histograms of accuracy quality by model and method
# Loop over models
for model in cw_data['model'].unique():
    # Loop over methods
    for method in cw_data['method'].unique():
        # Get data for this model and method
        data = cw_data[(cw_data['model'] == model) & (cw_data['method'] == method)]
        # Plot histogram
        plt.hist(data['coherence_1_incoherent_10_very_coherent'], bins = 10)
        # Ticks for values of 1, 2, ... 10
        plt.xticks(range(1, 11))
        plt.title(model + ' - ' + method)
        plt.xlabel('Accuracy Quality')
        plt.ylabel('Count')
        # Save histogram
        plt.savefig('../Output/cw_acc_qual_' + model + '_' + method + '.png')
        plt.close()


## Accuracy/Quality

In [47]:
# In combined_data, keep columns Model, Method, Task, correct, avg_cosine_sim, avg_inter_paragraph_cosine_sim, avg_cosine_sim_compliance_adjusted, avg_inter_paragraph_cosine_sim_compliance_adjusted, compliance
limited_combined_data = combined_data[['Model', 'Method', 'Task', 'correct', 'avg_cosine_sim', 'avg_inter_paragraph_cosine_sim', 'avg_cosine_sim_compliance_adjusted', 'avg_inter_paragraph_cosine_sim_compliance_adjusted', 'compliance']]

# Stack by Model and Method
stacked_combined_data = limited_combined_data.set_index(['Model', 'Method', 'Task']).stack().reset_index().rename(columns = {'level_3': 'Metric', 0: 'Value'})

stacked_combined_data


Unnamed: 0,Model,Method,Task,Metric,Value
0,Text-Davinci-003,Direct Prompting,Creative Writing,avg_cosine_sim,0.405294
1,Text-Davinci-003,Direct Prompting,Creative Writing,avg_inter_paragraph_cosine_sim,0.009602
2,Text-Davinci-003,Direct Prompting,Creative Writing,avg_cosine_sim_compliance_adjusted,0.405294
3,Text-Davinci-003,Direct Prompting,Creative Writing,avg_inter_paragraph_cosine_sim_compliance_adju...,0.009602
4,Text-Davinci-003,Direct Prompting,Creative Writing,compliance,1.000000
...,...,...,...,...,...
7567,GPT-4,Manual CoT,GSM8K,correct,1.000000
7568,GPT-4,Manual CoT,GSM8K,correct,1.000000
7569,GPT-4,Manual CoT,GSM8K,correct,1.000000
7570,GPT-4,Manual CoT,GSM8K,correct,1.000000


In [48]:
# Average Value by model, method, metric
# Also get variance
avg_accuracy_quality_with_variance = stacked_combined_data.groupby(['Model', 'Method', 'Task', 'Metric']).agg(['mean', 'var'])['Value'].reset_index()

# Combine mean and variance into one column that is a string with the mean and then the variance in parentheses
avg_accuracy_quality_with_variance['Value'] = avg_accuracy_quality_with_variance.apply(lambda row: str(round(row['mean'], 2)) + ' (' + str(round(row['var'], 2)) + ')', axis=1)
# But set Value to just be mean if metric is compliance or correct
avg_accuracy_quality_with_variance.loc[avg_accuracy_quality_with_variance['Metric'].isin(['compliance', 'correct']), 'Value'] = avg_accuracy_quality_with_variance['mean'].round(2).astype(str)

# Drop mean and variance columns
avg_accuracy_quality_with_variance = avg_accuracy_quality_with_variance.drop(columns = ['mean', 'var'])

avg_accuracy_quality_with_variance


Unnamed: 0,Model,Method,Task,Metric,Value
0,GPT-4,APE Zero-Shot CoT,Creative Writing,avg_cosine_sim,0.35 (0.0)
1,GPT-4,APE Zero-Shot CoT,Creative Writing,avg_cosine_sim_compliance_adjusted,0.36 (0.0)
2,GPT-4,APE Zero-Shot CoT,Creative Writing,avg_inter_paragraph_cosine_sim,0.46 (0.02)
3,GPT-4,APE Zero-Shot CoT,Creative Writing,avg_inter_paragraph_cosine_sim_compliance_adju...,0.45 (0.03)
4,GPT-4,APE Zero-Shot CoT,Creative Writing,compliance,0.56
...,...,...,...,...,...
91,Text-Davinci-003,Zero-Shot CoT,Creative Writing,avg_cosine_sim_compliance_adjusted,0.39 (0.01)
92,Text-Davinci-003,Zero-Shot CoT,Creative Writing,avg_inter_paragraph_cosine_sim,0.41 (0.04)
93,Text-Davinci-003,Zero-Shot CoT,Creative Writing,avg_inter_paragraph_cosine_sim_compliance_adju...,0.36 (0.03)
94,Text-Davinci-003,Zero-Shot CoT,Creative Writing,compliance,0.43


In [49]:
# Pivot table - column method should go wide
avg_accuracy_quality_pivot = avg_accuracy_quality_with_variance.set_index(['Task', 'Metric', 'Model', 'Method']).unstack()
#pivot_table(index=['model', 'task'], columns='method', values='accuracy_quality').reset_index()

# Fix axis
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.rename_axis([None, None], axis=1).reset_index()

avg_accuracy_quality_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,avg_cosine_sim,GPT-4,0.35 (0.0),0.33 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0)
1,Creative Writing,avg_cosine_sim,Text-Davinci-003,0.38 (0.01),0.36 (0.0),0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.0),0.36 (0.01),0.37 (0.01)
2,Creative Writing,avg_cosine_sim_compliance_adjusted,GPT-4,0.36 (0.0),0.33 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.35 (0.0),0.34 (0.0)
3,Creative Writing,avg_cosine_sim_compliance_adjusted,Text-Davinci-003,0.39 (0.01),0.37 (0.01),0.37 (0.0),0.35 (0.0),0.36 (0.0),0.39 (0.0),0.4 (0.01),0.39 (0.01)
4,Creative Writing,avg_inter_paragraph_cosine_sim,GPT-4,0.46 (0.02),0.42 (0.03),0.42 (0.02),0.42 (0.02),0.39 (0.02),0.41 (0.03),0.45 (0.02),0.46 (0.02)
5,Creative Writing,avg_inter_paragraph_cosine_sim,Text-Davinci-003,0.42 (0.04),0.36 (0.03),0.48 (0.03),0.48 (0.03),0.48 (0.03),0.37 (0.03),0.43 (0.05),0.41 (0.04)
6,Creative Writing,avg_inter_paragraph_cosine_sim_compliance_adju...,GPT-4,0.45 (0.03),0.42 (0.02),0.4 (0.03),0.42 (0.02),0.37 (0.02),0.4 (0.02),0.46 (0.02),0.46 (0.02)
7,Creative Writing,avg_inter_paragraph_cosine_sim_compliance_adju...,Text-Davinci-003,0.39 (0.03),0.37 (0.03),0.41 (0.03),0.4 (0.04),0.43 (0.03),0.34 (0.04),0.22 (0.02),0.36 (0.03)
8,Creative Writing,compliance,GPT-4,0.56,0.56,0.52,0.51,0.63,0.48,0.26,0.57
9,Creative Writing,compliance,Text-Davinci-003,0.44,0.5,0.25,0.19,0.43,0.32,0.04,0.43


In [50]:
# Recode Metric column
avg_accuracy_quality_pivot['Metric'] = avg_accuracy_quality_pivot['Metric'].replace({'avg_cosine_sim': 'Average Inter-Sentence Cosine Similarity', 'avg_inter_paragraph_cosine_sim': 'Average Inter-Paragraph Cosine Similarity', 'avg_cosine_sim_compliance_adjusted': 'Average Inter-Sentence Cosine Similarity (Compliance Adjusted)', 'avg_inter_paragraph_cosine_sim_compliance_adjusted': 'Average Inter-Paragraph Cosine Similarity (Compliance Adjusted)', 'compliance': 'Compliance', 'correct': 'Accuracy'})

avg_accuracy_quality_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Average Inter-Sentence Cosine Similarity,GPT-4,0.35 (0.0),0.33 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0)
1,Creative Writing,Average Inter-Sentence Cosine Similarity,Text-Davinci-003,0.38 (0.01),0.36 (0.0),0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.0),0.36 (0.01),0.37 (0.01)
2,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,GPT-4,0.36 (0.0),0.33 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.35 (0.0),0.34 (0.0)
3,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,Text-Davinci-003,0.39 (0.01),0.37 (0.01),0.37 (0.0),0.35 (0.0),0.36 (0.0),0.39 (0.0),0.4 (0.01),0.39 (0.01)
4,Creative Writing,Average Inter-Paragraph Cosine Similarity,GPT-4,0.46 (0.02),0.42 (0.03),0.42 (0.02),0.42 (0.02),0.39 (0.02),0.41 (0.03),0.45 (0.02),0.46 (0.02)
5,Creative Writing,Average Inter-Paragraph Cosine Similarity,Text-Davinci-003,0.42 (0.04),0.36 (0.03),0.48 (0.03),0.48 (0.03),0.48 (0.03),0.37 (0.03),0.43 (0.05),0.41 (0.04)
6,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,GPT-4,0.45 (0.03),0.42 (0.02),0.4 (0.03),0.42 (0.02),0.37 (0.02),0.4 (0.02),0.46 (0.02),0.46 (0.02)
7,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,Text-Davinci-003,0.39 (0.03),0.37 (0.03),0.41 (0.03),0.4 (0.04),0.43 (0.03),0.34 (0.04),0.22 (0.02),0.36 (0.03)
8,Creative Writing,Compliance,GPT-4,0.56,0.56,0.52,0.51,0.63,0.48,0.26,0.57
9,Creative Writing,Compliance,Text-Davinci-003,0.44,0.5,0.25,0.19,0.43,0.32,0.04,0.43


In [51]:
# Flatten columns
flattened_cols = [''.join(col).strip().replace('Value', '') for col in avg_accuracy_quality_pivot.columns.values]
avg_accuracy_quality_pivot.columns = flattened_cols

avg_accuracy_quality_pivot


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Average Inter-Sentence Cosine Similarity,GPT-4,0.35 (0.0),0.33 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0)
1,Creative Writing,Average Inter-Sentence Cosine Similarity,Text-Davinci-003,0.38 (0.01),0.36 (0.0),0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.0),0.36 (0.01),0.37 (0.01)
2,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,GPT-4,0.36 (0.0),0.33 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.35 (0.0),0.34 (0.0)
3,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,Text-Davinci-003,0.39 (0.01),0.37 (0.01),0.37 (0.0),0.35 (0.0),0.36 (0.0),0.39 (0.0),0.4 (0.01),0.39 (0.01)
4,Creative Writing,Average Inter-Paragraph Cosine Similarity,GPT-4,0.46 (0.02),0.42 (0.03),0.42 (0.02),0.42 (0.02),0.39 (0.02),0.41 (0.03),0.45 (0.02),0.46 (0.02)
5,Creative Writing,Average Inter-Paragraph Cosine Similarity,Text-Davinci-003,0.42 (0.04),0.36 (0.03),0.48 (0.03),0.48 (0.03),0.48 (0.03),0.37 (0.03),0.43 (0.05),0.41 (0.04)
6,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,GPT-4,0.45 (0.03),0.42 (0.02),0.4 (0.03),0.42 (0.02),0.37 (0.02),0.4 (0.02),0.46 (0.02),0.46 (0.02)
7,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,Text-Davinci-003,0.39 (0.03),0.37 (0.03),0.41 (0.03),0.4 (0.04),0.43 (0.03),0.34 (0.04),0.22 (0.02),0.36 (0.03)
8,Creative Writing,Compliance,GPT-4,0.56,0.56,0.52,0.51,0.63,0.48,0.26,0.57
9,Creative Writing,Compliance,Text-Davinci-003,0.44,0.5,0.25,0.19,0.43,0.32,0.04,0.43


In [52]:
# Print values of Metric
avg_accuracy_quality_pivot['Metric'].unique()


array(['Average Inter-Sentence Cosine Similarity',
       'Average Inter-Sentence Cosine Similarity (Compliance Adjusted)',
       'Average Inter-Paragraph Cosine Similarity',
       'Average Inter-Paragraph Cosine Similarity (Compliance Adjusted)',
       'Compliance', 'Accuracy'], dtype=object)

In [53]:
# Sort rows
# Task GSM8K and then Creative Writing
# Then Metric Accuracy, then Average Inter-Sentence Cosine Similarity, then Average Inter-Paragraph Cosine Similarity, then Average Inter-Sentence Cosine Similarity (Compliance Adjusted), then Average Inter-Paragraph Cosine Similarity (Compliance Adjusted), then Compliance
# Then Model Text-Davinci-003 and then GPT-4
avg_accuracy_quality_pivot['Task'] = pd.Categorical(avg_accuracy_quality_pivot['Task'], ["GSM8K", "Creative Writing"])
avg_accuracy_quality_pivot['Metric'] = pd.Categorical(avg_accuracy_quality_pivot['Metric'], ["Accuracy", "Average Inter-Sentence Cosine Similarity", "Average Inter-Paragraph Cosine Similarity", "Average Inter-Sentence Cosine Similarity (Compliance Adjusted)", "Average Inter-Paragraph Cosine Similarity (Compliance Adjusted)", "Compliance"])
avg_accuracy_quality_pivot['Model'] = pd.Categorical(avg_accuracy_quality_pivot['Model'], ["Text-Davinci-003", "GPT-4"])
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.sort_values(by=['Task', 'Metric', 'Model'])

print(avg_accuracy_quality_pivot.columns)
avg_accuracy_quality_pivot


Index(['Task', 'Metric', 'Model', 'APE Zero-Shot CoT', 'Direct Prompting',
       'Least-to-Most', 'Manual CoT', 'Manual Few-Shot', 'Self-Refine',
       'Tree-of-Thought', 'Zero-Shot CoT'],
      dtype='object')


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
11,GSM8K,Accuracy,Text-Davinci-003,0.49,0.23,0.67,0.6,0.18,0.2,0.23,0.62
10,GSM8K,Accuracy,GPT-4,0.93,0.73,0.95,0.93,0.49,0.89,0.4,0.95
1,Creative Writing,Average Inter-Sentence Cosine Similarity,Text-Davinci-003,0.38 (0.01),0.36 (0.0),0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.0),0.36 (0.01),0.37 (0.01)
0,Creative Writing,Average Inter-Sentence Cosine Similarity,GPT-4,0.35 (0.0),0.33 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0)
5,Creative Writing,Average Inter-Paragraph Cosine Similarity,Text-Davinci-003,0.42 (0.04),0.36 (0.03),0.48 (0.03),0.48 (0.03),0.48 (0.03),0.37 (0.03),0.43 (0.05),0.41 (0.04)
4,Creative Writing,Average Inter-Paragraph Cosine Similarity,GPT-4,0.46 (0.02),0.42 (0.03),0.42 (0.02),0.42 (0.02),0.39 (0.02),0.41 (0.03),0.45 (0.02),0.46 (0.02)
3,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,Text-Davinci-003,0.39 (0.01),0.37 (0.01),0.37 (0.0),0.35 (0.0),0.36 (0.0),0.39 (0.0),0.4 (0.01),0.39 (0.01)
2,Creative Writing,Average Inter-Sentence Cosine Similarity (Comp...,GPT-4,0.36 (0.0),0.33 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.35 (0.0),0.34 (0.0)
7,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,Text-Davinci-003,0.39 (0.03),0.37 (0.03),0.41 (0.03),0.4 (0.04),0.43 (0.03),0.34 (0.04),0.22 (0.02),0.36 (0.03)
6,Creative Writing,Average Inter-Paragraph Cosine Similarity (Com...,GPT-4,0.45 (0.03),0.42 (0.02),0.4 (0.03),0.42 (0.02),0.37 (0.02),0.4 (0.02),0.46 (0.02),0.46 (0.02)


In [54]:
# Order columns
starter_col_order = ['Task', 'Metric', 'Model']
col_order = starter_col_order + hue_order_pub_date
print(col_order)
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot[col_order]

# Convert Model, Task, Metric back to string from Categorical
avg_accuracy_quality_pivot['Model'] = avg_accuracy_quality_pivot['Model'].astype(str)
avg_accuracy_quality_pivot['Task'] = avg_accuracy_quality_pivot['Task'].astype(str)
avg_accuracy_quality_pivot['Metric'] = avg_accuracy_quality_pivot['Metric'].astype(str)

# Delete values of Task and Metric when they are not the first instance in the dataframe
output_table = avg_accuracy_quality_pivot.copy()
output_table['Model'] = output_table['Model'].where(output_table['Model'] != output_table['Model'].shift(), '')
output_table['Task'] = output_table['Task'].where(output_table['Task'] != output_table['Task'].shift(), '')
output_table['Metric'] = output_table['Metric'].where(output_table['Metric'] != output_table['Metric'].shift(), '')

# Output to LaTeX
output_table.to_latex('../Output/avg_accuracy_quality_pivot.tex', index=False)

output_table


['Task', 'Metric', 'Model', 'Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought', 'Direct Prompting']


Unnamed: 0,Task,Metric,Model,Manual Few-Shot,Manual CoT,Least-to-Most,Zero-Shot CoT,APE Zero-Shot CoT,Self-Refine,Tree-of-Thought,Direct Prompting
11,GSM8K,Accuracy,Text-Davinci-003,0.18,0.6,0.67,0.62,0.49,0.2,0.23,0.23
10,,,GPT-4,0.49,0.93,0.95,0.95,0.93,0.89,0.4,0.73
1,Creative Writing,Average Inter-Sentence Cosine Similarity,Text-Davinci-003,0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.01),0.38 (0.01),0.37 (0.0),0.36 (0.01),0.36 (0.0)
0,,,GPT-4,0.35 (0.0),0.35 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.33 (0.0)
5,,Average Inter-Paragraph Cosine Similarity,Text-Davinci-003,0.48 (0.03),0.48 (0.03),0.48 (0.03),0.41 (0.04),0.42 (0.04),0.37 (0.03),0.43 (0.05),0.36 (0.03)
4,,,GPT-4,0.39 (0.02),0.42 (0.02),0.42 (0.02),0.46 (0.02),0.46 (0.02),0.41 (0.03),0.45 (0.02),0.42 (0.03)
3,,Average Inter-Sentence Cosine Similarity (Comp...,Text-Davinci-003,0.36 (0.0),0.35 (0.0),0.37 (0.0),0.39 (0.01),0.39 (0.01),0.39 (0.0),0.4 (0.01),0.37 (0.01)
2,,,GPT-4,0.35 (0.0),0.36 (0.0),0.35 (0.0),0.34 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.33 (0.0)
7,,Average Inter-Paragraph Cosine Similarity (Com...,Text-Davinci-003,0.43 (0.03),0.4 (0.04),0.41 (0.03),0.36 (0.03),0.39 (0.03),0.34 (0.04),0.22 (0.02),0.37 (0.03)
6,,,GPT-4,0.37 (0.02),0.42 (0.02),0.4 (0.03),0.46 (0.02),0.45 (0.03),0.4 (0.02),0.46 (0.02),0.42 (0.02)


## Adding release date to accuracy quality table (optional version)

In [55]:
# Keep unique values of Method and Method + Publication Date combined_data
print(combined_data.columns)
unique_method = combined_data[['Method', 'Method + Publication Date']].drop_duplicates()

# Drop direct prompting
unique_method = unique_method[unique_method['Method'] != 'Direct Prompting']

# Convert to dictionary
unique_method = unique_method.set_index('Method').to_dict()['Method + Publication Date']

unique_method


Index(['model_task_method', 'conversation_number',
       'coherence_1_incoherent_10_very_coherent', 'compliance_OLD',
       'ease_of_review_1_easy_10_hard', 'correct',
       'Prediction_Based_On_First_10', 'Prediction_Based_On_Last_10',
       'Aggregated_Prediction', 'Prediction_Based_On_First_10_LP',
       'response_Based_On_First_10_LP', 'Prediction_Based_On_Last_10_LP',
       'response_Based_On_Last_10_LP', 'response_LP',
       'Aggregated_Prediction_LP', 'Prediction_Based_On_First_50_LP',
       'response_Based_On_First_50_LP', 'Prediction_Based_On_Last_50_LP',
       'response_Based_On_Last_50_LP', 'Aggregated_Prediction_50_LP',
       'Prediction_Based_On_random_50_LP_1',
       'response_Based_On_random_50_LP_1',
       'Prediction_Based_On_random_50_LP_2',
       'response_Based_On_random_50_LP_2',
       'Aggregated_Prediction_random_50_LP', 'Unnamed: 0_x', 'response_x',
       'replace_slash_n_slash_n_with_newline_x',
       'replace_slash_n_slash_n_with_newline_values

{'Zero-Shot CoT': 'Zero-Shot CoT (May 2022)',
 'APE Zero-Shot CoT': 'APE Zero-Shot CoT (Nov 2022)',
 'Least-to-Most': 'Least-to-Most (May 2022)',
 'Manual Few-Shot': 'Manual Few-Shot (May 2020)',
 'Manual CoT': 'Manual CoT (Jan 2022)',
 'Tree-of-Thought': 'Tree-of-Thought (May 2023)',
 'Self-Refine': 'Self-Refine (Mar 2023)'}

In [63]:
# Use dictionary to rename columns in avg_accuracy_quality_pivot
avg_accuracy_quality_pivot_with_date = avg_accuracy_quality_pivot.rename(columns = unique_method)

print(avg_accuracy_quality_pivot_with_date.columns)

# Output to LaTeX

# Delete values of Task and Metric when they are not the first instance in the dataframe
output_table = avg_accuracy_quality_pivot_with_date.copy()
output_table['Model'] = output_table['Model'].where(output_table['Model'] != output_table['Model'].shift(), '')
output_table['Task'] = output_table['Task'].where(output_table['Task'] != output_table['Task'].shift(), '')
output_table['Metric'] = output_table['Metric'].where(output_table['Metric'] != output_table['Metric'].shift(), '')

# Output to latex. center columns, wrap text, and remove index
latex_string = output_table.to_latex(index=False, 
                      column_format='p{0.75cm}|p{1.5cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}|p{0.75cm}', 
                      #booktabs = True
                      #longtable = True
                      )

# Add lines between rows
lines = latex_string.split('\n')
new_lines = []
for line in lines:
    new_lines.append(line)
    if '\\' in line and '&' in line:  # Identifies a row of the table
        new_lines.append('\\hline')

# Rejoin the modified lines
modified_latex_table = '\n'.join(new_lines)

# Save string to file
with open('../Output/avg_accuracy_quality_pivot_with_date.tex', 'w') as f:
    f.write(modified_latex_table)

output_table


Index(['Task', 'Metric', 'Model', 'Manual Few-Shot (May 2020)',
       'Manual CoT (Jan 2022)', 'Least-to-Most (May 2022)',
       'Zero-Shot CoT (May 2022)', 'APE Zero-Shot CoT (Nov 2022)',
       'Self-Refine (Mar 2023)', 'Tree-of-Thought (May 2023)',
       'Direct Prompting'],
      dtype='object')


Unnamed: 0,Task,Metric,Model,Manual Few-Shot (May 2020),Manual CoT (Jan 2022),Least-to-Most (May 2022),Zero-Shot CoT (May 2022),APE Zero-Shot CoT (Nov 2022),Self-Refine (Mar 2023),Tree-of-Thought (May 2023),Direct Prompting
11,GSM8K,Accuracy,Text-Davinci-003,0.18,0.6,0.67,0.62,0.49,0.2,0.23,0.23
10,,,GPT-4,0.49,0.93,0.95,0.95,0.93,0.89,0.4,0.73
1,Creative Writing,Average Inter-Sentence Cosine Similarity,Text-Davinci-003,0.36 (0.0),0.35 (0.0),0.36 (0.0),0.37 (0.01),0.38 (0.01),0.37 (0.0),0.36 (0.01),0.36 (0.0)
0,,,GPT-4,0.35 (0.0),0.35 (0.0),0.34 (0.0),0.35 (0.0),0.35 (0.0),0.36 (0.0),0.35 (0.0),0.33 (0.0)
5,,Average Inter-Paragraph Cosine Similarity,Text-Davinci-003,0.48 (0.03),0.48 (0.03),0.48 (0.03),0.41 (0.04),0.42 (0.04),0.37 (0.03),0.43 (0.05),0.36 (0.03)
4,,,GPT-4,0.39 (0.02),0.42 (0.02),0.42 (0.02),0.46 (0.02),0.46 (0.02),0.41 (0.03),0.45 (0.02),0.42 (0.03)
3,,Average Inter-Sentence Cosine Similarity (Comp...,Text-Davinci-003,0.36 (0.0),0.35 (0.0),0.37 (0.0),0.39 (0.01),0.39 (0.01),0.39 (0.0),0.4 (0.01),0.37 (0.01)
2,,,GPT-4,0.35 (0.0),0.36 (0.0),0.35 (0.0),0.34 (0.0),0.36 (0.0),0.35 (0.0),0.35 (0.0),0.33 (0.0)
7,,Average Inter-Paragraph Cosine Similarity (Com...,Text-Davinci-003,0.43 (0.03),0.4 (0.04),0.41 (0.03),0.36 (0.03),0.39 (0.03),0.34 (0.04),0.22 (0.02),0.37 (0.03)
6,,,GPT-4,0.37 (0.02),0.42 (0.02),0.4 (0.03),0.46 (0.02),0.45 (0.03),0.4 (0.02),0.46 (0.02),0.42 (0.02)
