## Paired t-test for quality, means of other metrics

In [None]:
import pandas as pd
import scipy.stats as stats


In [None]:
# Read the direct-prompting comparison workbook into a DataFrame
direct_prompting_comparison = pd.read_excel(io='direct_prompting_comparison.xlsx')


In [None]:
# Paired t-test of direct prompting vs. each method, for every metric,
# computed once per unique (model, task, method) combination.

# Metrics to t-test. For each metric the frame must also contain a
# 'dp_' + metric column holding the same metric under direct prompting.
metrics_to_t_test = [
    'coherence_1_incoherent_10_very_coherent',
    'ease_of_review_1_easy_10_hard',
    'conversation_length',
    'input_length',
    'output_length',
    'conversation_cost',
    'gsm8k_length_vs_provided',
    'length_vs_direct_prompting',
    'num_linebreaks',
    'num_sentences',
    'num_step_i',
    'num_1_dot_etc',
    'sentence_length',
    'fres',
    'num_linebreaks_prompts',
    'num_sentences_prompts',
    'num_step_i_prompts',
    'num_1_dot_etc_prompts',
    'sentence_length_prompts',
    'fres_prompts',
]

# Columns of the results table: group keys, metric name, the two means,
# and the t-test statistic / p-value.
result_columns = ['model', 'task', 'method', 'metric', 'dp_mean', 'using_method_mean', 'statistic', 'pvalue']

# Collect rows in a plain list and build the DataFrame once at the end;
# concatenating inside the loop is quadratic and leaves a duplicated index.
result_rows = []

# groupby visits each (model, task, method) combination exactly once.
# Iterating over the raw rows would repeat every test once per conversation
# and fill the table with duplicate results. sort=False keeps the groups in
# order of first appearance in the data.
for (model, task, method), group in direct_prompting_comparison.groupby(['model', 'task', 'method'], sort=False):
    # Sort by conversation_number (done once per group, not once per metric)
    data = group.sort_values(by=['conversation_number'])
    for metric in metrics_to_t_test:
        # direct_prompting holds the metric when using direct_prompting
        direct_prompting = data['dp_' + metric]
        # using_method holds the metric when using the method
        using_method = data[metric]
        # Perform the paired sample t-test; both series come from the same
        # rows, so observations are paired row by row.
        statistic, pvalue = stats.ttest_rel(direct_prompting, using_method)
        result_rows.append([model, task, method, metric, direct_prompting.mean(), using_method.mean(), statistic, pvalue])

# One row per (model, task, method, metric)
t_test_results = pd.DataFrame(result_rows, columns=result_columns)

t_test_results


In [None]:
# Show the results whose p-value is at least 0.05 (not significant at p < 0.05)
t_test_results.loc[lambda results: results['pvalue'] >= 0.05]
