# Analyze Metrics and Conduct Inference.ipynb

In [1]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from Analysis_Functions import *


In [2]:
# Load Combined_Data.xlsx
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k data - limit to task = "gsm8k"
gsm8k_data = combined_data[combined_data['task'] == 'gsm8k']

# Creative writing data - limit to task = "cw"
cw_data = combined_data[combined_data['task'] == 'cw']


In [3]:
# Load plot_settings.pickle
import pickle

with open('order_list.pkl', 'rb') as f:
    order_list = pickle.load(f)
print(order_list)

with open('hue_order_pub_date_no_dp.pkl', 'rb') as f:
    hue_order_pub_date_no_dp = pickle.load(f)
print(hue_order_pub_date_no_dp)

with open('hue_order_pub_date.pkl', 'rb') as f:
    hue_order_pub_date = pickle.load(f)
print(hue_order_pub_date)

with open('hue_order_introduced.pkl', 'rb') as f:
    hue_order_introduced = pickle.load(f)
print(hue_order_introduced)

# Set the font to Times New Roman or a similar serif font
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams['font.serif'] = 'Times New Roman'
# Optionally, set the font size
mpl.rcParams['font.size'] = 12

# Figure size
f_size = (16 / 2, 9 / 2)

# Function necessary to flip the legend
import itertools
def flip(items, ncol):
    return itertools.chain(*[items[i::ncol] for i in range(ncol)])


['Text-Davinci-003', 'GPT-4']
['Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought']
['Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought', 'Direct Prompting']
['Direct Prompting', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Tree-of-Thought', 'Self-Refine', 'Least-to-Most', 'Manual Few-Shot', 'Manual CoT']


## Complexity

Big table of complexity metrics

Possibly plots for differences or another table

Possibly a separate table for ease of review scores

### Large table for num_linebreaks, num_sentences, num_step_i, num_1_dot_etc, sentence_length, fres, ease_of_review_1_easy_10_hard (possibly)

Modified from length code

In [4]:
# In combined_data, keep columns
limited_combined_data = combined_data[['Model', 'Method', 'Task', 'model', 'method', 'task', 'num_linebreaks', 'num_sentences', 'num_step_i', 'num_1_dot_etc', 'sentence_length', 'fres'
                                       #, 'ease_of_review_1_easy_10_hard'
                                       ]]

# Stack by Model and Method
stacked_combined_data = limited_combined_data.set_index(['Model', 'Method', 'Task', 'model', 'method', 'task']).stack().reset_index().rename(columns = {'level_6': 'metric', 0: 'Value'})

stacked_combined_data


Unnamed: 0,Model,Method,Task,model,method,task,metric,Value
0,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_linebreaks,1.000000
1,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_sentences,7.000000
2,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_step_i,0.000000
3,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_1_dot_etc,0.000000
4,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,sentence_length,14.571429
...,...,...,...,...,...,...,...,...
15995,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_1_dot_etc,1.000000
15996,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_linebreaks,0.000000
15997,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_sentences,4.000000
15998,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_step_i,0.000000


In [5]:
# Average Value by model, method, metric
# Also get variance
avg_metric_with_variance = stacked_combined_data.groupby(['Model', 'Method', 'Task', 'model', 'method', 'task', 'metric']).agg(['mean', 'var'])['Value'].reset_index()

avg_metric_with_variance


Unnamed: 0,Model,Method,Task,model,method,task,metric,mean,var
0,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,fres,60.737800,44.725714
1,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_1_dot_etc,2.810000,7.205960
2,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_linebreaks,10.770000,8.845556
3,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_sentences,15.430000,15.176869
4,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_step_i,0.900000,3.949495
...,...,...,...,...,...,...,...,...,...
155,Text-Davinci-003,Zero-Shot CoT,Creative Writing,td3,zero_shot_cot,cw,sentence_length,15.375624,10.525553
156,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_1_dot_etc,1.840000,6.842828
157,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_linebreaks,3.250000,3.017677
158,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_sentences,4.660000,5.317576


In [6]:
# Add significance
# Load all_inference.xlsx
all_inference = pd.read_excel('all_inference.xlsx')

# Merge with avg_metric_with_variance
avg_metric_with_variance = pd.merge(avg_metric_with_variance, all_inference, how = 'left', on = ['model', 'method', 'task', 'metric'])

# Create column stars if Significant at 95% == "Yes"
avg_metric_with_variance['stars'] = avg_metric_with_variance['Significant at 95%'].apply(lambda x: '*' if x == 'Yes' else '')

avg_metric_with_variance


Unnamed: 0,Model,Method,Task,model,method,task,metric,mean,var,statistic,pvalue,statistic_with_correction,pvalue_with_correction,Significant at 95%,dp_mean,using_method_mean,stars
0,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,fres,60.737800,44.725714,3.970500,1.360541e-04,,,Yes,63.776700,60.737800,*
1,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_1_dot_etc,2.810000,7.205960,-10.467920,1.045234e-17,,,Yes,0.000000,2.810000,*
2,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_linebreaks,10.770000,8.845556,-28.831242,5.826759e-50,,,Yes,2.080000,10.770000,*
3,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_sentences,15.430000,15.176869,-9.181456,6.672394e-15,,,Yes,11.320000,15.430000,*
4,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_step_i,0.900000,3.949495,-4.528681,1.658137e-05,,,Yes,0.000000,0.900000,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,Text-Davinci-003,Zero-Shot CoT,Creative Writing,td3,zero_shot_cot,cw,sentence_length,15.375624,10.525553,2.738352,7.323065e-03,,,Yes,16.413378,15.375624,*
156,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_1_dot_etc,1.840000,6.842828,-3.843716,2.142389e-04,,,Yes,0.720000,1.840000,*
157,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_linebreaks,3.250000,3.017677,-16.669221,1.700380e-30,,,Yes,0.160000,3.250000,*
158,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_sentences,4.660000,5.317576,-12.313699,1.100106e-21,,,Yes,1.490000,4.660000,*


In [7]:
# Combine mean and variance into one column that is a string with the mean and then the variance in parentheses
avg_metric_with_variance['Value'] = avg_metric_with_variance.apply(lambda row: str(round(row['mean'], 2)) + row['stars'] + ' (' + str(round(row['var'], 2)) + ')', axis=1)
# But set Value to just be mean if metric is compliance or correct
avg_metric_with_variance.loc[avg_metric_with_variance['metric'].isin(['compliance', 'correct']), 'Value'] = avg_metric_with_variance['mean'].round(2).astype(str) + avg_metric_with_variance['stars']

# Drop mean and variance columns, other columns
avg_metric_with_variance = avg_metric_with_variance[['Model', 'Method', 'Task', 'metric', 'Value']].rename(columns = {'metric': 'Metric'})

avg_metric_with_variance


Unnamed: 0,Model,Method,Task,Metric,Value
0,GPT-4,APE Zero-Shot CoT,Creative Writing,fres,60.74* (44.73)
1,GPT-4,APE Zero-Shot CoT,Creative Writing,num_1_dot_etc,2.81* (7.21)
2,GPT-4,APE Zero-Shot CoT,Creative Writing,num_linebreaks,10.77* (8.85)
3,GPT-4,APE Zero-Shot CoT,Creative Writing,num_sentences,15.43* (15.18)
4,GPT-4,APE Zero-Shot CoT,Creative Writing,num_step_i,0.9* (3.95)
...,...,...,...,...,...
155,Text-Davinci-003,Zero-Shot CoT,Creative Writing,sentence_length,15.38* (10.53)
156,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_1_dot_etc,1.84* (6.84)
157,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_linebreaks,3.25* (3.02)
158,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_sentences,4.66* (5.32)


In [8]:
# Pivot table - column method should go wide
avg_metric_pivot = avg_metric_with_variance.set_index(['Task', 'Metric', 'Model', 'Method']).unstack()
#pivot_table(index=['model', 'task'], columns='method', values='accuracy_quality').reset_index()

# Fix axis
avg_metric_pivot = avg_metric_pivot.rename_axis([None, None], axis=1).reset_index()

avg_metric_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,fres,GPT-4,60.74* (44.73),63.78 (51.29),67.37* (35.66),67.76* (51.02),67.84* (55.25),62.37 (51.28),57.57* (43.82),59.95* (50.18)
1,Creative Writing,fres,Text-Davinci-003,71.93* (78.95),74.73 (67.91),75.95 (32.47),74.1 (38.72),76.66* (57.45),73.69 (61.79),66.68* (66.79),72.39* (72.85)
2,Creative Writing,num_1_dot_etc,GPT-4,2.81* (7.21),0.0 (0.0),2.49* (0.49),0.94* (1.81),0.0 (0.0),0.8* (1.03),3.93* (2.19),3.58* (5.58)
3,Creative Writing,num_1_dot_etc,Text-Davinci-003,0.51* (0.8),0.0 (0.0),2.99* (0.01),2.9* (0.21),0.01 (0.01),0.63* (0.62),2.84* (0.84),0.85* (1.4)
4,Creative Writing,num_linebreaks,GPT-4,10.77* (8.85),2.08 (0.07),7.5* (2.41),3.69* (6.26),2.01* (0.03),4.37* (5.57),18.54* (26.94),10.93* (5.66)
5,Creative Writing,num_linebreaks,Text-Davinci-003,4.37* (8.84),0.98 (0.06),7.03* (0.09),6.01* (0.25),1.07* (0.07),3.1* (2.37),11.29* (7.56),4.67* (6.08)
6,Creative Writing,num_sentences,GPT-4,15.43* (15.18),11.32 (5.17),14.76* (6.0),11.93 (7.28),10.25* (3.56),15.97* (59.79),39.42* (38.21),15.92* (10.64)
7,Creative Writing,num_sentences,Text-Davinci-003,10.12* (6.17),7.6 (1.78),17.82* (3.54),15.8* (2.97),10.08* (3.47),13.54* (28.8),31.37* (47.55),10.03* (5.69)
8,Creative Writing,num_step_i,GPT-4,0.9* (3.95),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.66* (2.97)
9,Creative Writing,num_step_i,Text-Davinci-003,1.62* (3.19),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),1.71* (2.94)


In [9]:
# Unique values of metric
avg_metric_pivot['Metric'].unique()


array(['fres', 'num_1_dot_etc', 'num_linebreaks', 'num_sentences',
       'num_step_i', 'sentence_length'], dtype=object)

In [10]:
# Recode Metric column
avg_metric_pivot['Metric'] = avg_metric_pivot['Metric'].replace({
    'ease_of_review_1_easy_10_hard' : 'Ease of Review (1 = Easy, 10 = Hard)',
    'fres' : 'Flesch Reading Ease Score',
    'num_1_dot_etc' : 'Number of 1., 2., etc.',
    'num_linebreaks' : 'Number of Linebreaks',
    'num_sentences' : 'Number of Sentences',
    'num_step_i' : 'Number of Step 1, Step 2, etc.',
    'sentence_length' : 'Sentence Length'
    })

avg_metric_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Flesch Reading Ease Score,GPT-4,60.74* (44.73),63.78 (51.29),67.37* (35.66),67.76* (51.02),67.84* (55.25),62.37 (51.28),57.57* (43.82),59.95* (50.18)
1,Creative Writing,Flesch Reading Ease Score,Text-Davinci-003,71.93* (78.95),74.73 (67.91),75.95 (32.47),74.1 (38.72),76.66* (57.45),73.69 (61.79),66.68* (66.79),72.39* (72.85)
2,Creative Writing,"Number of 1., 2., etc.",GPT-4,2.81* (7.21),0.0 (0.0),2.49* (0.49),0.94* (1.81),0.0 (0.0),0.8* (1.03),3.93* (2.19),3.58* (5.58)
3,Creative Writing,"Number of 1., 2., etc.",Text-Davinci-003,0.51* (0.8),0.0 (0.0),2.99* (0.01),2.9* (0.21),0.01 (0.01),0.63* (0.62),2.84* (0.84),0.85* (1.4)
4,Creative Writing,Number of Linebreaks,GPT-4,10.77* (8.85),2.08 (0.07),7.5* (2.41),3.69* (6.26),2.01* (0.03),4.37* (5.57),18.54* (26.94),10.93* (5.66)
5,Creative Writing,Number of Linebreaks,Text-Davinci-003,4.37* (8.84),0.98 (0.06),7.03* (0.09),6.01* (0.25),1.07* (0.07),3.1* (2.37),11.29* (7.56),4.67* (6.08)
6,Creative Writing,Number of Sentences,GPT-4,15.43* (15.18),11.32 (5.17),14.76* (6.0),11.93 (7.28),10.25* (3.56),15.97* (59.79),39.42* (38.21),15.92* (10.64)
7,Creative Writing,Number of Sentences,Text-Davinci-003,10.12* (6.17),7.6 (1.78),17.82* (3.54),15.8* (2.97),10.08* (3.47),13.54* (28.8),31.37* (47.55),10.03* (5.69)
8,Creative Writing,"Number of Step 1, Step 2, etc.",GPT-4,0.9* (3.95),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.66* (2.97)
9,Creative Writing,"Number of Step 1, Step 2, etc.",Text-Davinci-003,1.62* (3.19),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),1.71* (2.94)


In [11]:
# Flatten columns
flattened_cols = [''.join(col).strip().replace('Value', '') for col in avg_metric_pivot.columns.values]
avg_metric_pivot.columns = flattened_cols

avg_metric_pivot


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Flesch Reading Ease Score,GPT-4,60.74* (44.73),63.78 (51.29),67.37* (35.66),67.76* (51.02),67.84* (55.25),62.37 (51.28),57.57* (43.82),59.95* (50.18)
1,Creative Writing,Flesch Reading Ease Score,Text-Davinci-003,71.93* (78.95),74.73 (67.91),75.95 (32.47),74.1 (38.72),76.66* (57.45),73.69 (61.79),66.68* (66.79),72.39* (72.85)
2,Creative Writing,"Number of 1., 2., etc.",GPT-4,2.81* (7.21),0.0 (0.0),2.49* (0.49),0.94* (1.81),0.0 (0.0),0.8* (1.03),3.93* (2.19),3.58* (5.58)
3,Creative Writing,"Number of 1., 2., etc.",Text-Davinci-003,0.51* (0.8),0.0 (0.0),2.99* (0.01),2.9* (0.21),0.01 (0.01),0.63* (0.62),2.84* (0.84),0.85* (1.4)
4,Creative Writing,Number of Linebreaks,GPT-4,10.77* (8.85),2.08 (0.07),7.5* (2.41),3.69* (6.26),2.01* (0.03),4.37* (5.57),18.54* (26.94),10.93* (5.66)
5,Creative Writing,Number of Linebreaks,Text-Davinci-003,4.37* (8.84),0.98 (0.06),7.03* (0.09),6.01* (0.25),1.07* (0.07),3.1* (2.37),11.29* (7.56),4.67* (6.08)
6,Creative Writing,Number of Sentences,GPT-4,15.43* (15.18),11.32 (5.17),14.76* (6.0),11.93 (7.28),10.25* (3.56),15.97* (59.79),39.42* (38.21),15.92* (10.64)
7,Creative Writing,Number of Sentences,Text-Davinci-003,10.12* (6.17),7.6 (1.78),17.82* (3.54),15.8* (2.97),10.08* (3.47),13.54* (28.8),31.37* (47.55),10.03* (5.69)
8,Creative Writing,"Number of Step 1, Step 2, etc.",GPT-4,0.9* (3.95),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.66* (2.97)
9,Creative Writing,"Number of Step 1, Step 2, etc.",Text-Davinci-003,1.62* (3.19),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),1.71* (2.94)


In [12]:
# Print values of Metric
avg_metric_pivot['Metric'].unique()


array(['Flesch Reading Ease Score', 'Number of 1., 2., etc.',
       'Number of Linebreaks', 'Number of Sentences',
       'Number of Step 1, Step 2, etc.', 'Sentence Length'], dtype=object)

In [13]:
# Sort rows
avg_metric_pivot['Task'] = pd.Categorical(avg_metric_pivot['Task'], ["GSM8K", "Creative Writing"])
avg_metric_pivot['Metric'] = pd.Categorical(avg_metric_pivot['Metric'], [
    "Number of Linebreaks",
    "Number of Sentences",
    "Number of Step 1, Step 2, etc.",
    "Number of 1., 2., etc.",
    "Sentence Length",
    "Flesch Reading Ease Score"
    ])
avg_metric_pivot['Model'] = pd.Categorical(avg_metric_pivot['Model'], ["Text-Davinci-003", "GPT-4"])
avg_metric_pivot = avg_metric_pivot.sort_values(by=['Task', 'Metric', 'Model'])

print(avg_metric_pivot.columns)
avg_metric_pivot


Index(['Task', 'Metric', 'Model', 'APE Zero-Shot CoT', 'Direct Prompting',
       'Least-to-Most', 'Manual CoT', 'Manual Few-Shot', 'Self-Refine',
       'Tree-of-Thought', 'Zero-Shot CoT'],
      dtype='object')


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
15,GSM8K,Number of Linebreaks,Text-Davinci-003,4.44* (5.6),0.16 (0.52),4.16* (3.41),0.0* (0.0),0.0* (0.0),1.16* (0.44),1.39* (0.87),3.25* (3.02)
14,GSM8K,Number of Linebreaks,GPT-4,4.64* (9.97),1.34 (2.99),5.64* (3.95),1.17 (2.85),0.0* (0.0),5.37* (12.62),12.26* (28.86),3.83* (10.73)
17,GSM8K,Number of Sentences,Text-Davinci-003,4.38* (9.05),1.49 (0.94),12.31* (9.37),4.96* (1.49),1.0* (0.0),2.25* (1.1),8.41* (11.4),4.66* (5.32)
16,GSM8K,Number of Sentences,GPT-4,3.15* (3.46),1.51 (0.33),8.59* (6.63),3.5* (2.37),1.0* (0.0),5.12* (5.32),8.03* (18.23),2.84* (2.58)
19,GSM8K,"Number of Step 1, Step 2, etc.",Text-Davinci-003,0.56* (1.56),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),2.0* (0.0),0.32* (1.05)
18,GSM8K,"Number of Step 1, Step 2, etc.",GPT-4,0.39* (1.31),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.22 (1.75),0.0 (0.0)
13,GSM8K,"Number of 1., 2., etc.",Text-Davinci-003,2.33* (10.49),0.72 (3.13),6.96* (9.37),2.54* (6.92),0.04* (0.04),0.5 (0.98),1.0 (0.0),1.84* (6.84)
12,GSM8K,"Number of 1., 2., etc.",GPT-4,1.95* (10.29),1.28 (8.89),6.95* (11.26),2.8* (13.72),0.04* (0.04),2.36* (19.28),3.5* (17.91),1.99 (13.2)
5,Creative Writing,Number of Linebreaks,Text-Davinci-003,4.37* (8.84),0.98 (0.06),7.03* (0.09),6.01* (0.25),1.07* (0.07),3.1* (2.37),11.29* (7.56),4.67* (6.08)
4,Creative Writing,Number of Linebreaks,GPT-4,10.77* (8.85),2.08 (0.07),7.5* (2.41),3.69* (6.26),2.01* (0.03),4.37* (5.57),18.54* (26.94),10.93* (5.66)


In [14]:
# Order columns
starter_col_order = ['Task', 'Metric', 'Model']
col_order = starter_col_order + hue_order_pub_date
print(col_order)
avg_metric_pivot = avg_metric_pivot[col_order]

# Convert Model, Task, Metric back to string from Categorical
avg_metric_pivot['Model'] = avg_metric_pivot['Model'].astype(str)
avg_metric_pivot['Task'] = avg_metric_pivot['Task'].astype(str)
avg_metric_pivot['Metric'] = avg_metric_pivot['Metric'].astype(str)

# Delete values of Task and Metric when they are not the first instance in the dataframe
output_table = avg_metric_pivot.copy()
output_table['Model'] = output_table['Model'].where(output_table['Model'] != output_table['Model'].shift(), '')
output_table['Task'] = output_table['Task'].where(output_table['Task'] != output_table['Task'].shift(), '')
output_table['Metric'] = output_table['Metric'].where(output_table['Metric'] != output_table['Metric'].shift(), '')

# Output to LaTeX
#output_table.to_latex('../Output/avg_accuracy_quality_pivot.tex', index=False)

# Output to latex. center columns, wrap text, and remove index
latex_string = output_table.to_latex(index=False, 
                      column_format='x{0.75cm}|x{1.5cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}', 
                      #booktabs = True
                      #longtable = True
                      )

# Add lines between rows
lines = latex_string.split('\n')
new_lines = []
for line in lines:
    new_lines.append(line)
    if '\\' in line and '&' in line:  # Identifies a row of the table
        new_lines.append('\\hline')
# Insert \\hline after \toprule
new_lines.insert(2, '\\hline')

# Rejoin the modified lines
modified_latex_table = '\n'.join(new_lines)

print(modified_latex_table)

# Save string to file
with open('../Output/avg_complexity_metrics_pivot.tex', 'w') as f:
    f.write(modified_latex_table)

output_table


['Task', 'Metric', 'Model', 'Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought', 'Direct Prompting']
\begin{tabular}{x{0.75cm}|x{1.5cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}}
\toprule
\hline
Task & Metric & Model & Manual Few-Shot & Manual CoT & Least-to-Most & Zero-Shot CoT & APE Zero-Shot CoT & Self-Refine & Tree-of-Thought & Direct Prompting \\
\hline
\midrule
GSM8K & Number of Linebreaks & Text-Davinci-003 & 0.0* (0.0) & 0.0* (0.0) & 4.16* (3.41) & 3.25* (3.02) & 4.44* (5.6) & 1.16* (0.44) & 1.39* (0.87) & 0.16 (0.52) \\
\hline
 &  & GPT-4 & 0.0* (0.0) & 1.17 (2.85) & 5.64* (3.95) & 3.83* (10.73) & 4.64* (9.97) & 5.37* (12.62) & 12.26* (28.86) & 1.34 (2.99) \\
\hline
 & Number of Sentences & Text-Davinci-003 & 1.0* (0.0) & 4.96* (1.49) & 12.31* (9.37) & 4.66* (5.32) & 4.38* (9.05) & 2.25* (1.1) & 8.41* (11.4) & 1.49 (0.94) \\
\hline
 &  & GPT-4 & 1.0* (0.0) & 3.5

Unnamed: 0,Task,Metric,Model,Manual Few-Shot,Manual CoT,Least-to-Most,Zero-Shot CoT,APE Zero-Shot CoT,Self-Refine,Tree-of-Thought,Direct Prompting
15,GSM8K,Number of Linebreaks,Text-Davinci-003,0.0* (0.0),0.0* (0.0),4.16* (3.41),3.25* (3.02),4.44* (5.6),1.16* (0.44),1.39* (0.87),0.16 (0.52)
14,,,GPT-4,0.0* (0.0),1.17 (2.85),5.64* (3.95),3.83* (10.73),4.64* (9.97),5.37* (12.62),12.26* (28.86),1.34 (2.99)
17,,Number of Sentences,Text-Davinci-003,1.0* (0.0),4.96* (1.49),12.31* (9.37),4.66* (5.32),4.38* (9.05),2.25* (1.1),8.41* (11.4),1.49 (0.94)
16,,,GPT-4,1.0* (0.0),3.5* (2.37),8.59* (6.63),2.84* (2.58),3.15* (3.46),5.12* (5.32),8.03* (18.23),1.51 (0.33)
19,,"Number of Step 1, Step 2, etc.",Text-Davinci-003,0.0 (0.0),0.0 (0.0),0.0 (0.0),0.32* (1.05),0.56* (1.56),0.0 (0.0),2.0* (0.0),0.0 (0.0)
18,,,GPT-4,0.0 (0.0),0.0 (0.0),0.0 (0.0),0.0 (0.0),0.39* (1.31),0.0 (0.0),0.22 (1.75),0.0 (0.0)
13,,"Number of 1., 2., etc.",Text-Davinci-003,0.04* (0.04),2.54* (6.92),6.96* (9.37),1.84* (6.84),2.33* (10.49),0.5 (0.98),1.0 (0.0),0.72 (3.13)
12,,,GPT-4,0.04* (0.04),2.8* (13.72),6.95* (11.26),1.99 (13.2),1.95* (10.29),2.36* (19.28),3.5* (17.91),1.28 (8.89)
5,Creative Writing,Number of Linebreaks,Text-Davinci-003,1.07* (0.07),6.01* (0.25),7.03* (0.09),4.67* (6.08),4.37* (8.84),3.1* (2.37),11.29* (7.56),0.98 (0.06)
4,,,GPT-4,2.01* (0.03),3.69* (6.26),7.5* (2.41),10.93* (5.66),10.77* (8.85),4.37* (5.57),18.54* (26.94),2.08 (0.07)


### Differences

In [15]:
# Differences of scores 
# Responses versus prompts
# Responses versus provided answer for GSM8k

# Aggregate "_diff" variables by model, task, method
differences = combined_data[['model', 'task', 'method', 'Model', 'Task', 'Method', 'num_linebreaks_prompts_diff', 'num_sentences_prompts_diff', 'num_step_i_prompts_diff', 'num_1_dot_etc_prompts_diff', 'sentence_length_prompts_diff', 'fres_prompts_diff', 'num_linebreaks_provided_diff', 'num_sentences_provided_diff', 'num_step_i_provided_diff', 'num_1_dot_etc_provided_diff']]

# Stack by Model and Method
stacked_differences = differences.set_index(['Model', 'Method', 'Task', 'model', 'method', 'task']).stack().reset_index().rename(columns = {'level_6': 'metric', 0: 'Value'})

stacked_differences


Unnamed: 0,Model,Method,Task,model,method,task,metric,Value
0,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_linebreaks_prompts_diff,-1.000000
1,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_sentences_prompts_diff,-2.000000
2,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_step_i_prompts_diff,0.000000
3,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,num_1_dot_etc_prompts_diff,2.000000
4,Text-Davinci-003,Direct Prompting,Creative Writing,td3,direct_prompting,cw,sentence_length_prompts_diff,-5.971429
...,...,...,...,...,...,...,...,...
22395,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_1_dot_etc_prompts_diff,19.000000
22396,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_linebreaks_provided_diff,4.000000
22397,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_sentences_provided_diff,-1.000000
22398,GPT-4,Manual CoT,GSM8K,gpt4,manual_cot,gsm8k,num_step_i_provided_diff,0.000000


In [16]:
# Average Value by model, method, metric
# No variance here (hard to interpret)
avg_diff_metric = stacked_differences.groupby(['Model', 'Method', 'Task', 'model', 'method', 'task', 'metric']).agg(['mean'])['Value'].reset_index()

avg_diff_metric


Unnamed: 0,Model,Method,Task,model,method,task,metric,mean
0,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,fres_prompts_diff,14.9564
1,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_1_dot_etc_prompts_diff,-0.8100
2,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_linebreaks_prompts_diff,-10.7700
3,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_sentences_prompts_diff,-9.4500
4,GPT-4,APE Zero-Shot CoT,Creative Writing,gpt4,ape_zero_shot_cot,cw,num_step_i_prompts_diff,-0.9000
...,...,...,...,...,...,...,...,...
219,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_linebreaks_provided_diff,0.1700
220,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_sentences_prompts_diff,-0.3000
221,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_sentences_provided_diff,-1.9800
222,Text-Davinci-003,Zero-Shot CoT,GSM8K,td3,zero_shot_cot,gsm8k,num_step_i_prompts_diff,-0.3200


In [17]:
# Create string column with rounding
avg_diff_metric['Value'] = avg_diff_metric.apply(lambda row: str(round(row['mean'], 2)), axis=1)

# Drop mean and variance columns, other columns
avg_diff_metric = avg_diff_metric[['Model', 'Method', 'Task', 'metric', 'Value']].rename(columns = {'metric': 'Metric'})

avg_diff_metric


Unnamed: 0,Model,Method,Task,Metric,Value
0,GPT-4,APE Zero-Shot CoT,Creative Writing,fres_prompts_diff,14.96
1,GPT-4,APE Zero-Shot CoT,Creative Writing,num_1_dot_etc_prompts_diff,-0.81
2,GPT-4,APE Zero-Shot CoT,Creative Writing,num_linebreaks_prompts_diff,-10.77
3,GPT-4,APE Zero-Shot CoT,Creative Writing,num_sentences_prompts_diff,-9.45
4,GPT-4,APE Zero-Shot CoT,Creative Writing,num_step_i_prompts_diff,-0.9
...,...,...,...,...,...
219,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_linebreaks_provided_diff,0.17
220,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_sentences_prompts_diff,-0.3
221,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_sentences_provided_diff,-1.98
222,Text-Davinci-003,Zero-Shot CoT,GSM8K,num_step_i_prompts_diff,-0.32


In [18]:
# Pivot table - column method should go wide
avg_diff_metric_pivot = avg_diff_metric.set_index(['Task', 'Metric', 'Model', 'Method']).unstack()

# Fix axis
avg_diff_metric_pivot = avg_diff_metric_pivot.rename_axis([None, None], axis=1).reset_index()

avg_diff_metric_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,fres_prompts_diff,GPT-4,14.96,15.42,6.87,5.56,7.29,8.64,15.83,17.69
1,Creative Writing,fres_prompts_diff,Text-Davinci-003,5.48,7.51,-2.07,0.14,-1.66,3.16,11.72,7.05
2,Creative Writing,num_1_dot_etc_prompts_diff,GPT-4,-0.81,2.0,9.51,11.06,6.0,2.48,-1.93,-1.58
3,Creative Writing,num_1_dot_etc_prompts_diff,Text-Davinci-003,1.49,2.0,9.01,9.1,5.99,2.77,-0.84,1.15
4,Creative Writing,num_linebreaks_prompts_diff,GPT-4,-10.77,-2.08,11.5,15.31,6.99,-0.25,-13.54,-10.93
5,Creative Writing,num_linebreaks_prompts_diff,Text-Davinci-003,-4.37,-0.98,11.97,12.99,7.93,1.52,-6.29,-4.67
6,Creative Writing,num_sentences_prompts_diff,GPT-4,-9.45,-6.33,22.23,23.06,17.74,-9.7,-27.43,-9.94
7,Creative Writing,num_sentences_prompts_diff,Text-Davinci-003,-4.14,-2.61,33.17,31.19,24.91,-4.34,-18.38,-4.05
8,Creative Writing,num_step_i_prompts_diff,GPT-4,-0.9,0.0,0.0,0.0,0.0,0.0,0.0,-0.66
9,Creative Writing,num_step_i_prompts_diff,Text-Davinci-003,-1.62,0.0,0.0,0.0,0.0,0.0,0.0,-1.71


In [19]:
# Print values of Metric
avg_diff_metric_pivot['Metric'].unique()


array(['fres_prompts_diff', 'num_1_dot_etc_prompts_diff',
       'num_linebreaks_prompts_diff', 'num_sentences_prompts_diff',
       'num_step_i_prompts_diff', 'sentence_length_prompts_diff',
       'num_1_dot_etc_provided_diff', 'num_linebreaks_provided_diff',
       'num_sentences_provided_diff', 'num_step_i_provided_diff'],
      dtype=object)

In [20]:
# Recode Metric column
avg_diff_metric_pivot['Metric'] = avg_diff_metric_pivot['Metric'].replace({
    'fres_prompts_diff' : 'Difference in Flesch Reading Ease Score (Responses - Prompts)',
    'num_1_dot_etc_prompts_diff' : 'Difference in Number of 1., 2., etc. (Responses - Prompts)',
    'num_linebreaks_prompts_diff' : 'Difference in Number of Linebreaks (Responses - Prompts)',
    'num_sentences_prompts_diff' : 'Difference in Number of Sentences (Responses - Prompts)',
    'num_step_i_prompts_diff' : 'Difference in Number of Step 1, Step 2, etc. (Responses - Prompts)',
    'sentence_length_prompts_diff' : 'Difference in Sentence Length (Responses - Prompts)',
    'num_linebreaks_provided_diff' : 'Difference in Number of Linebreaks (Responses - Provided Answer)',
    'num_sentences_provided_diff' : 'Difference in Number of Sentences (Responses - Provided Answer)',
    'num_step_i_provided_diff' : 'Difference in Number of Step 1, Step 2, etc. (Responses - Provided Answer)',
    'num_1_dot_etc_provided_diff' : 'Difference in Number of 1., 2., etc. (Responses - Provided Answer)'
    })

avg_diff_metric_pivot


Unnamed: 0_level_0,Task,Metric,Model,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Difference in Flesch Reading Ease Score (Respo...,GPT-4,14.96,15.42,6.87,5.56,7.29,8.64,15.83,17.69
1,Creative Writing,Difference in Flesch Reading Ease Score (Respo...,Text-Davinci-003,5.48,7.51,-2.07,0.14,-1.66,3.16,11.72,7.05
2,Creative Writing,"Difference in Number of 1., 2., etc. (Response...",GPT-4,-0.81,2.0,9.51,11.06,6.0,2.48,-1.93,-1.58
3,Creative Writing,"Difference in Number of 1., 2., etc. (Response...",Text-Davinci-003,1.49,2.0,9.01,9.1,5.99,2.77,-0.84,1.15
4,Creative Writing,Difference in Number of Linebreaks (Responses ...,GPT-4,-10.77,-2.08,11.5,15.31,6.99,-0.25,-13.54,-10.93
5,Creative Writing,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,-4.37,-0.98,11.97,12.99,7.93,1.52,-6.29,-4.67
6,Creative Writing,Difference in Number of Sentences (Responses -...,GPT-4,-9.45,-6.33,22.23,23.06,17.74,-9.7,-27.43,-9.94
7,Creative Writing,Difference in Number of Sentences (Responses -...,Text-Davinci-003,-4.14,-2.61,33.17,31.19,24.91,-4.34,-18.38,-4.05
8,Creative Writing,"Difference in Number of Step 1, Step 2, etc. (...",GPT-4,-0.9,0.0,0.0,0.0,0.0,0.0,0.0,-0.66
9,Creative Writing,"Difference in Number of Step 1, Step 2, etc. (...",Text-Davinci-003,-1.62,0.0,0.0,0.0,0.0,0.0,0.0,-1.71


In [21]:
# Flatten columns
flattened_cols = [''.join(col).strip().replace('Value', '') for col in avg_diff_metric_pivot.columns.values]
avg_diff_metric_pivot.columns = flattened_cols

avg_diff_metric_pivot


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
0,Creative Writing,Difference in Flesch Reading Ease Score (Respo...,GPT-4,14.96,15.42,6.87,5.56,7.29,8.64,15.83,17.69
1,Creative Writing,Difference in Flesch Reading Ease Score (Respo...,Text-Davinci-003,5.48,7.51,-2.07,0.14,-1.66,3.16,11.72,7.05
2,Creative Writing,"Difference in Number of 1., 2., etc. (Response...",GPT-4,-0.81,2.0,9.51,11.06,6.0,2.48,-1.93,-1.58
3,Creative Writing,"Difference in Number of 1., 2., etc. (Response...",Text-Davinci-003,1.49,2.0,9.01,9.1,5.99,2.77,-0.84,1.15
4,Creative Writing,Difference in Number of Linebreaks (Responses ...,GPT-4,-10.77,-2.08,11.5,15.31,6.99,-0.25,-13.54,-10.93
5,Creative Writing,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,-4.37,-0.98,11.97,12.99,7.93,1.52,-6.29,-4.67
6,Creative Writing,Difference in Number of Sentences (Responses -...,GPT-4,-9.45,-6.33,22.23,23.06,17.74,-9.7,-27.43,-9.94
7,Creative Writing,Difference in Number of Sentences (Responses -...,Text-Davinci-003,-4.14,-2.61,33.17,31.19,24.91,-4.34,-18.38,-4.05
8,Creative Writing,"Difference in Number of Step 1, Step 2, etc. (...",GPT-4,-0.9,0.0,0.0,0.0,0.0,0.0,0.0,-0.66
9,Creative Writing,"Difference in Number of Step 1, Step 2, etc. (...",Text-Davinci-003,-1.62,0.0,0.0,0.0,0.0,0.0,0.0,-1.71


In [22]:
# Print values of Metric
avg_diff_metric_pivot['Metric'].unique()


array(['Difference in Flesch Reading Ease Score (Responses - Prompts)',
       'Difference in Number of 1., 2., etc. (Responses - Prompts)',
       'Difference in Number of Linebreaks (Responses - Prompts)',
       'Difference in Number of Sentences (Responses - Prompts)',
       'Difference in Number of Step 1, Step 2, etc. (Responses - Prompts)',
       'Difference in Sentence Length (Responses - Prompts)',
       'Difference in Number of 1., 2., etc. (Responses - Provided Answer)',
       'Difference in Number of Linebreaks (Responses - Provided Answer)',
       'Difference in Number of Sentences (Responses - Provided Answer)',
       'Difference in Number of Step 1, Step 2, etc. (Responses - Provided Answer)'],
      dtype=object)

In [23]:
# Sort rows
avg_diff_metric_pivot['Task'] = pd.Categorical(avg_diff_metric_pivot['Task'], ["GSM8K", "Creative Writing"])
avg_diff_metric_pivot['Metric'] = pd.Categorical(avg_diff_metric_pivot['Metric'], [
    'Difference in Number of Linebreaks (Responses - Prompts)',
    'Difference in Number of Sentences (Responses - Prompts)',
    'Difference in Number of Step 1, Step 2, etc. (Responses - Prompts)',
    'Difference in Number of 1., 2., etc. (Responses - Prompts)',
    'Difference in Sentence Length (Responses - Prompts)',
    'Difference in Flesch Reading Ease Score (Responses - Prompts)',
    'Difference in Number of Linebreaks (Responses - Provided Answer)',
    'Difference in Number of Sentences (Responses - Provided Answer)',
    'Difference in Number of Step 1, Step 2, etc. (Responses - Provided Answer)',
    'Difference in Number of 1., 2., etc. (Responses - Provided Answer)'
    ])
avg_diff_metric_pivot['Model'] = pd.Categorical(avg_diff_metric_pivot['Model'], ["Text-Davinci-003", "GPT-4"])
avg_diff_metric_pivot = avg_diff_metric_pivot.sort_values(by=['Task', 'Metric', 'Model'])

print(avg_diff_metric_pivot.columns)
avg_diff_metric_pivot


Index(['Task', 'Metric', 'Model', 'APE Zero-Shot CoT', 'Direct Prompting',
       'Least-to-Most', 'Manual CoT', 'Manual Few-Shot', 'Self-Refine',
       'Tree-of-Thought', 'Zero-Shot CoT'],
      dtype='object')


Unnamed: 0,Task,Metric,Model,APE Zero-Shot CoT,Direct Prompting,Least-to-Most,Manual CoT,Manual Few-Shot,Self-Refine,Tree-of-Thought,Zero-Shot CoT
17,GSM8K,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,-2.44,0.84,1.84,16.0,16.0,1.84,4.0,-1.25
16,GSM8K,Difference in Number of Linebreaks (Responses ...,GPT-4,-2.64,-0.34,-0.64,14.83,16.0,0.03,-6.15,-1.83
21,GSM8K,Difference in Number of Sentences (Responses -...,Text-Davinci-003,-0.02,2.87,4.05,57.4,28.36,5.11,-2.18,-0.3
20,GSM8K,Difference in Number of Sentences (Responses -...,GPT-4,0.21,1.85,2.77,42.86,19.36,1.44,15.75,0.52
25,GSM8K,"Difference in Number of Step 1, Step 2, etc. (...",Text-Davinci-003,-0.56,0.0,0.0,0.0,0.0,0.0,-2.0,-0.32
24,GSM8K,"Difference in Number of Step 1, Step 2, etc. (...",GPT-4,-0.39,0.0,0.0,0.0,0.0,0.0,1.78,0.0
13,GSM8K,"Difference in Number of 1., 2., etc. (Response...",Text-Davinci-003,-1.95,-0.34,-2.58,17.84,6.34,-0.12,1.83,-1.46
12,GSM8K,"Difference in Number of 1., 2., etc. (Response...",GPT-4,-1.57,-0.9,-2.57,17.58,6.34,-1.98,-2.12,-1.61
19,GSM8K,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,-1.02,3.26,-0.74,3.42,3.42,2.26,2.03,0.17
18,GSM8K,Difference in Number of Linebreaks (Responses ...,GPT-4,-1.22,2.08,-2.22,2.25,3.42,-1.95,-8.84,-0.41


In [24]:
# Order columns
starter_col_order = ['Task', 'Metric', 'Model']
col_order = starter_col_order + hue_order_pub_date
print(col_order)
avg_diff_metric_pivot = avg_diff_metric_pivot[col_order]

# Convert Model, Task, Metric back to string from Categorical
avg_diff_metric_pivot['Model'] = avg_diff_metric_pivot['Model'].astype(str)
avg_diff_metric_pivot['Task'] = avg_diff_metric_pivot['Task'].astype(str)
avg_diff_metric_pivot['Metric'] = avg_diff_metric_pivot['Metric'].astype(str)

# Delete values of Task and Metric when they are not the first instance in the dataframe
output_table = avg_diff_metric_pivot.copy()
output_table['Model'] = output_table['Model'].where(output_table['Model'] != output_table['Model'].shift(), '')
output_table['Task'] = output_table['Task'].where(output_table['Task'] != output_table['Task'].shift(), '')
output_table['Metric'] = output_table['Metric'].where(output_table['Metric'] != output_table['Metric'].shift(), '')

# Output to LaTeX
#output_table.to_latex('../Output/avg_accuracy_quality_pivot.tex', index=False)

# Number of columns
#print('output num cols')
#print(len(output_table.columns))

# Output to latex. center columns, wrap text, and remove index
latex_string = output_table.to_latex(index=False, 
                      column_format='x{0.75cm}|x{1.5cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}', 
                      #booktabs = True
                      #longtable = True
                      )

# Add lines between rows
lines = latex_string.split('\n')
new_lines = []
for line in lines:
    new_lines.append(line)
    if '\\' in line and '&' in line:  # Identifies a row of the table
        new_lines.append('\\hline')
# Insert \\hline after \toprule
new_lines.insert(2, '\\hline')

# Rejoin the modified lines
modified_latex_table = '\n'.join(new_lines)

print(modified_latex_table)

# Save string to file
with open('../Output/avg_complexity_diff_metrics_pivot.tex', 'w') as f:
    f.write(modified_latex_table)

output_table


['Task', 'Metric', 'Model', 'Manual Few-Shot', 'Manual CoT', 'Least-to-Most', 'Zero-Shot CoT', 'APE Zero-Shot CoT', 'Self-Refine', 'Tree-of-Thought', 'Direct Prompting']
\begin{tabular}{x{0.75cm}|x{1.5cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}|x{0.75cm}}
\toprule
\hline
Task & Metric & Model & Manual Few-Shot & Manual CoT & Least-to-Most & Zero-Shot CoT & APE Zero-Shot CoT & Self-Refine & Tree-of-Thought & Direct Prompting \\
\hline
\midrule
GSM8K & Difference in Number of Linebreaks (Responses - Prompts) & Text-Davinci-003 & 16.0 & 16.0 & 1.84 & -1.25 & -2.44 & 1.84 & 4.0 & 0.84 \\
\hline
 &  & GPT-4 & 16.0 & 14.83 & -0.64 & -1.83 & -2.64 & 0.03 & -6.15 & -0.34 \\
\hline
 & Difference in Number of Sentences (Responses - Prompts) & Text-Davinci-003 & 28.36 & 57.4 & 4.05 & -0.3 & -0.02 & 5.11 & -2.18 & 2.87 \\
\hline
 &  & GPT-4 & 19.36 & 42.86 & 2.77 & 0.52 & 0.21 & 1.44 & 15.75 & 1.85 \\
\hline
 & Difference in Number of Step 1, Step 2, etc. (R

Unnamed: 0,Task,Metric,Model,Manual Few-Shot,Manual CoT,Least-to-Most,Zero-Shot CoT,APE Zero-Shot CoT,Self-Refine,Tree-of-Thought,Direct Prompting
17,GSM8K,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,16.0,16.0,1.84,-1.25,-2.44,1.84,4.0,0.84
16,,,GPT-4,16.0,14.83,-0.64,-1.83,-2.64,0.03,-6.15,-0.34
21,,Difference in Number of Sentences (Responses -...,Text-Davinci-003,28.36,57.4,4.05,-0.3,-0.02,5.11,-2.18,2.87
20,,,GPT-4,19.36,42.86,2.77,0.52,0.21,1.44,15.75,1.85
25,,"Difference in Number of Step 1, Step 2, etc. (...",Text-Davinci-003,0.0,0.0,0.0,-0.32,-0.56,0.0,-2.0,0.0
24,,,GPT-4,0.0,0.0,0.0,0.0,-0.39,0.0,1.78,0.0
13,,"Difference in Number of 1., 2., etc. (Response...",Text-Davinci-003,6.34,17.84,-2.58,-1.46,-1.95,-0.12,1.83,-0.34
12,,,GPT-4,6.34,17.58,-2.57,-1.61,-1.57,-1.98,-2.12,-0.9
19,,Difference in Number of Linebreaks (Responses ...,Text-Davinci-003,3.42,3.42,-0.74,0.17,-1.02,2.26,2.03,3.26
18,,,GPT-4,3.42,2.25,-2.22,-0.41,-1.22,-1.95,-8.84,2.08
