# Analyze Metrics and Conduct Inference.ipynb

In [None]:
import pandas as pd
from statsmodels.stats.contingency_tables import mcnemar 


In [None]:
# Read the combined results workbook into a single DataFrame
combined_data = pd.read_excel('Combined_Data.xlsx')

# GSM8k subset - rows whose task is "gsm8k"
gsm8k_data = combined_data.loc[combined_data['task'].eq('gsm8k')]

# Creative writing subset - rows whose task is "cw"
cw_data = combined_data.loc[combined_data['task'].eq('cw')]


## Accuracy/Quality

In [None]:
# Produce table - one column for each method, one row for each model by task type.
# Cell values are the mean of correct_or_incorrect (gsm8k rows) or
# creative_writing_score (all other rows).

# Combined accuracy_quality column: correct_or_incorrect where task == "gsm8k",
# creative_writing_score otherwise. Series.where keeps the first series where the
# condition holds and takes the second elsewhere, avoiding a slow row-wise apply.
combined_data['accuracy_quality'] = combined_data['correct_or_incorrect'].where(
    combined_data['task'] == 'gsm8k',
    combined_data['creative_writing_score'],
)

# Average accuracy_quality by model, method, task (long format)
avg_accuracy_quality = (
    combined_data.groupby(['model', 'method', 'task'])['accuracy_quality']
    .mean()
    .reset_index()
)

# Pivot table - the method column goes wide, one row per (model, task)
avg_accuracy_quality_pivot = avg_accuracy_quality.pivot_table(
    index=['model', 'task'],
    columns='method',
    values='accuracy_quality',
).reset_index()

# Sort rows by task - gsm8k first, then cw
# Sort by model - text-davinci-003 first, then gpt4
# Both desired orders are reverse-alphabetical ('gsm8k' > 'cw', 'text-davinci-003' > 'gpt4'),
# so the sort must be descending on both keys; an ascending sort produced the opposite order.
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot.sort_values(
    by=['task', 'model'],
    ascending=[False, False],
)

# Order columns: identifiers first, then the methods in presentation order.
# Raises KeyError if any listed method is absent from the data.
column_order = [
    'model',
    'task',
    'direct_prompting',
    'zero_shot_cot',
    'ape_zero_shot_cot',
    'tree_of_thought',
    'self_refine',
    'least_to_most',
    'manual_few_shot',
    'manual_cot',
]
avg_accuracy_quality_pivot = avg_accuracy_quality_pivot[column_order]


In [None]:
# Perform McNemar's Test on GSM8k data

# Counts are taken from correct_or_incorrect rather than accuracy_quality:
# gsm8k_data is filtered out of combined_data before the accuracy_quality column
# is added there, so gsm8k_data has no such column and selecting it raises KeyError.
# For gsm8k rows accuracy_quality is defined as correct_or_incorrect, so the
# resulting numbers are the ones originally intended.
gsm8k_group_keys = ['model', 'method', 'task']
gsm8k_grouped_correct = gsm8k_data.groupby(gsm8k_group_keys)['correct_or_incorrect']

# Number of questions answered correctly by model, method, task
gsm8k_num_correct = gsm8k_grouped_correct.sum().reset_index()
gsm8k_num_correct = gsm8k_num_correct.rename(columns={'correct_or_incorrect': 'num_correct'})

# Number of (non-missing) observations by model, method, task
gsm8k_num_obs = gsm8k_grouped_correct.count().reset_index()
gsm8k_num_obs = gsm8k_num_obs.rename(columns={'correct_or_incorrect': 'num_obs'})

# Join together by model, method, task
gsm8k_correct_incorrect_obs = gsm8k_num_correct.merge(gsm8k_num_obs, on=gsm8k_group_keys, how='left')
# Table has model, method, task, num_correct, num_obs, num_incorrect
gsm8k_correct_incorrect_obs['num_incorrect'] = (
    gsm8k_correct_incorrect_obs['num_obs'] - gsm8k_correct_incorrect_obs['num_correct']
)

# TODO: compare each method against direct_prompting (per model) with McNemar's test.
# NOTE(review): mcnemar takes a square table of paired outcomes and is driven by the
# off-diagonal (discordant) cells, so the per-method totals in
# gsm8k_correct_incorrect_obs are not sufficient on their own - the per-question
# results of the two methods need to be cross-tabulated first. Confirm which column
# identifies a question before implementing.

# Placeholder 2x2 table used only to demonstrate the mcnemar call
data = [[30, 20],
        [10, 40]]

# McNemar's Test with the continuity correction (correction=True is the default)
print(mcnemar(data, exact=False, correction=True))

# McNemar's Test without any continuity correction
print(mcnemar(data, exact=False, correction=False))


In [None]:
# Perform paired t-test on creative writing data and all other metrics


In [None]:
# Find variance of correct_or_incorrect column by model, method, task

# Find variance of creative_writing_score column by model, method, task
