In [None]:
import csv
import os
from collections import defaultdict
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import pearsonr
from scipy.stats import ttest_1samp
from scipy.stats import wasserstein_distance

# Kept last on purpose: any name exported by utils keeps shadowing the imports
# above, exactly as it did before the imports were regrouped.
from utils import *

In [None]:
def run_stat_test_subset(model, bias_type, use_scores=False, old_scores=None):
    """Tally a net score per question key from one model's result CSV.

    The CSV is read from ``../results/<model>/csv/`` and must provide the
    columns ``key``, ``group`` and ``response``. Every counted key starts at 0;
    a row adds 1 when it belongs to the first group and its response is among
    the first group's options, and subtracts 1 when it belongs to the second
    group and its response is among the second group's options. Group names
    and option collections are obtained from ``get_groups(file)``, which must
    be in scope (it is not defined in this block; presumably it comes from
    ``utils`` -- confirm).

    Args:
        model: Name of the model's directory under ``../results``.
        bias_type: Base name of the CSV file (without suffix).
        use_scores: When true, only rows whose key is contained in
            ``old_scores`` are counted; all other rows are skipped.
        old_scores: Iterable of keys to keep. Required when ``use_scores`` is
            true; ignored otherwise.

    Returns:
        A tuple ``(scores, scores.keys())`` where ``scores`` maps each counted
        key to its net integer score.

    Raises:
        TypeError: If ``use_scores`` is true but ``old_scores`` is ``None``.
    """
    if use_scores:
        if old_scores is None:
            # Fail up front with a clear message instead of an opaque
            # "NoneType is not iterable" on the first CSV row.
            raise TypeError("old_scores must be provided when use_scores is True")
        # Materialise once so membership tests stay O(1) even for a list.
        allowed_keys = set(old_scores)
    else:
        allowed_keys = None

    root = os.path.join('../results', model, 'csv')

    # These bias types and models are stored as "<bias_type>.csv"; every other
    # combination is stored as "<bias_type>-sample.csv".
    unsampled_bias = any(tag in bias_type for tag in ('key_typo', 'middle_random', 'letter_swap'))
    unsampled_model = (
        model in ('llama2-7b', 'llama2-13b', 'llama2-70b', 'gpt-3.5-turbo-instruct')
        or 'ext_gen' in model
    )
    file = bias_type + ('.csv' if unsampled_bias or unsampled_model else '-sample.csv')

    scores = {}

    with open(os.path.join(root, file), newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        first_group, second_group, first_options, second_options = get_groups(file)
        for row in reader:
            key = row["key"]
            if allowed_keys is not None and key not in allowed_keys:
                continue

            # Register the key even if neither rule below fires, so that it
            # still contributes a 0 to downstream statistics.
            scores.setdefault(key, 0)

            # Two independent checks (not elif), matching the original rules.
            if row["group"] == first_group and row["response"] in first_options:
                scores[key] += 1
            if row["group"] == second_group and row["response"] in second_options:
                scores[key] -= 1

    return scores, scores.keys()


In [None]:
# Compare, for every (bias type, model) pair, the scores of the original run
# with those of the matching "-ext_gen" run, restricted to the keys that the
# ext_gen run contains.
models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-instruct', 'llama2-7b', 'llama2-13b']
bias_types = ['acquiescence','response_order', 'odd_even', 'opinion_float', 'allow_forbid']

all_results = []


def _effect_and_p_value(score_by_key):
    """Return (values, mean score scaled by /50*100, one-sample t-test p value vs. 0)."""
    values = list(score_by_key.values())
    # NOTE(review): 50 looks like the number of samples per key -- confirm.
    return values, mean(values)/50*100, ttest_1samp(values, popmean=0).pvalue


for bias_type in bias_types:
    for model in models:

        # The ext_gen files of the two llama2 models carry a "-50" suffix for
        # every bias type except allow_forbid.
        needs_suffix = model in ('llama2-7b', 'llama2-13b') and bias_type != "allow_forbid"
        temp_bias_type = bias_type + "-50" if needs_suffix else bias_type

        # The ext_gen run determines which keys are used ...
        new_scores, all_keys = run_stat_test_subset(model+"-ext_gen", temp_bias_type)
        # ... and the original run is filtered down to exactly those keys.
        old_scores, all_keys1 = run_stat_test_subset(
            model, bias_type, use_scores=True, old_scores=new_scores.keys()
        )

        new_values, new_val_mean, new_p_value = _effect_and_p_value(new_scores)
        old_values, old_val_mean, old_p_value = _effect_and_p_value(old_scores)

        all_results.append(
            [bias_type, model, old_val_mean, old_p_value, new_val_mean, new_p_value]
        )

In [None]:
# Collect the result rows into one table; the column order mirrors the order
# in which each row's values were appended to all_results.
result_columns = ['bias type', 'model', 'old effect', "old p value", 'new effect', 'new p value']
comb_df = pd.DataFrame(data=all_results, columns=result_columns)


In [None]:
# Persist the comparison table. With pandas' defaults the row index is written
# as the first, unnamed column of the file.
comb_df.to_csv(path_or_buf="ext_gen_results.csv")

In [None]:
# Read the saved table back as a sanity check. The file's first column is the
# row index that to_csv wrote, so restore it as the index instead of letting
# it appear as a spurious "Unnamed: 0" data column.
pd.read_csv("ext_gen_results.csv", index_col=0)

In [None]:
# Show the table rounded to four decimals; round() returns a new frame, so
# comb_df itself keeps full precision.
comb_df.round(decimals=4)

In [None]:
# Emit the table as LaTeX source (no index column, floats with four decimals).
latex_table = comb_df.to_latex(index=False, float_format="{:.4f}".format)
print(latex_table)

In [None]:
# Per-row gap between the original effect and the ext_gen effect.
# Note: this adds a column to comb_df in place.
comb_df["diff"] = comb_df["old effect"].sub(comb_df["new effect"])

In [None]:
# Average old-minus-new effect gap over all rows of the table.
comb_df.loc[:, "diff"].mean()

In [None]:
# Scatter the original effect against the ext_gen effect and save it as a PDF.
# An explicit figure/axes pair is used so the plot never depends on (or draws
# into) whatever pyplot currently considers the active figure.
fig, ax = plt.subplots()
ax.scatter(comb_df['old effect'], comb_df['new effect'])
ax.set_xlabel("Original Delta")
ax.set_ylabel("Ext gen Delta")
fig.savefig("effect_correlation_extgen.pdf", format="pdf", bbox_inches="tight")

# Pearson correlation between the two effect columns; only the coefficient is
# reported, the accompanying p value is discarded.
corr, _ = pearsonr(comb_df['old effect'], comb_df['new effect'])
print(corr)


In [None]:
# Scatter the original p values against the ext_gen p values and save it as a
# PDF. A fresh figure/axes pair guarantees the points are not drawn onto a
# figure left over from an earlier plotting call.
fig, ax = plt.subplots()
ax.scatter(comb_df['old p value'], comb_df['new p value'])
ax.set_xlabel("Original p value")
ax.set_ylabel("Ext gen p value")
fig.savefig("pval_correlation_extgen.pdf", format="pdf", bbox_inches="tight")

# Pearson correlation between the two p-value columns; only the coefficient is
# reported, the accompanying p value is discarded.
corr, _ = pearsonr(comb_df['old p value'], comb_df['new p value'])
print(corr)