In [None]:
import pandas as pd
import scipy.stats as stats
from scipy.stats import wasserstein_distance
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
from collections import defaultdict
from scipy.stats import ttest_1samp
from statistics import mean
import random
from utils import *

In [None]:
# --- Experiment configuration ------------------------------------------------
models = [
    'gpt-3.5-turbo', 'gpt-3.5-turbo-instruct',
    'llama2-7b', 'llama2-13b', 'llama2-70b',
    'llama2-7b-chat', 'llama2-13b-chat', 'llama2-70b-chat',
    'llama2-70b-ift',
]
# The three lists below are parallel: position i refers to the same bias.
bias_types = ['acquiescence', 'response_order', 'odd_even', 'opinion_float', 'allow_forbid']
subset_bias_types = ['acquiescence-50', 'response_order-50', 'odd_even-50', 'opinion_float-50', 'allow_forbid']
clean_bias_labels = ['Acquiescence', 'Response Order', 'Odd/even', 'Opinion Float', 'Allow/forbid']
# Suffixes appended to a bias name to select its perturbed variant.
perturbations = ['-key_typo', '-middle_random', '-letter_swap']

# --- Collect one summary row per (model, bias type) --------------------------
# Row layout: model, bias label, then (mean of values, p-value) for the
# unperturbed run followed by the same pair for each perturbation, in order.
all_results = []

for model in models:
    print(model)

    for bias_name, subset_name, bias_label in zip(bias_types, subset_bias_types, clean_bias_labels):
        print(bias_name)

        # run_stat_test comes from utils; only the first two of its three
        # return values are used here.
        base_values, base_p_value, _base_keys = run_stat_test(model, bias_name)
        row = [model, bias_label, mean(base_values), base_p_value]

        # opinion_float has no perturbed runs of its own: the questions are the
        # same as odd_even, so the odd_even perturbed runs are used instead.
        perturbed_base = 'odd_even' if subset_name == 'opinion_float-50' else bias_name

        for suffix in perturbations:
            pert_values, pert_p_value, _pert_keys = run_stat_test(model, perturbed_base + suffix)
            row.extend([mean(pert_values), pert_p_value])

        all_results.append(row)

# --- Assemble the summary table ----------------------------------------------
result_columns = [
    'model', 'bias type',
    'modified', 'bias p value',
    'key typo', 'key typo p value',
    'middle random', 'middle random p value',
    'letter swap', 'letter swap p value',
]
# Every numeric column (effects and p-values alike) is rounded to 4 decimals.
df = pd.DataFrame(all_results, columns=result_columns).round(4)

In [None]:
# Persist the (already rounded) summary table as a pickle, using a path
# relative to the current working directory.
df.to_pickle("full_results.pkl")  

In [None]:
# Display the summary table built above: one row per (model, bias type) pair.
df

In [None]:
# Plot the full results: one heatmap panel per model (rows = bias types,
# columns = experimental settings), saved to perturbation.pdf.

import seaborn as sns
import numpy as np
# NumPy 1.24 removed the deprecated `np.bool` alias; restore it, presumably for
# a seaborn release that still references it -- TODO confirm it is still needed.
np.bool = np.bool_

# Panel order. 'ideal' is a synthetic reference panel that is never looked up
# in `df`; every other entry must match a value in df['model'].
models = ['ideal', 'llama2-7b', 'llama2-13b', 'llama2-70b', 'llama2-70b-ift', 'llama2-7b-chat', 'llama2-13b-chat', 'llama2-70b-chat', 'gpt-3.5-turbo', 'gpt-3.5-turbo-instruct']
bias_types = ['acquiescence','allow_forbid', 'response_order', 'opinion_float', 'odd_even']
perturbations = ['-key_typo', '-middle_random', '-letter_swap']
# Effect-size columns of `df`, in the left-to-right order they are drawn.
exp_settings = ['modified', 'key typo', 'middle random', 'letter swap']
clean_labels = ['bias', 'key typo', 'middle random', 'letter swap']
# Parallel to bias_types; these are the values matched against df['bias type'].
clean_bias_labels = ['Acquiescence', 'Allow/forbid', 'Response Order', 'Opinion Float', 'Odd/even']
# Parallel to models; used as panel titles.
clean_model_labels = ['Most Human-like', 'Llama2-7b', 'Llama2-13b', 'Llama2-70b', 'Solar', 'Llama-7b-chat', 'Llama-13b-chat','Llama-70b-chat', '3.5 Turbo', '3.5 Turbo Instruct']

# Which p-value column of `df` belongs to each effect-size column.
p_value_columns = {
    'modified': 'bias p value',
    'key typo': 'key typo p value',
    'middle random': 'middle random p value',
    'letter swap': 'letter swap p value',
}

ALPHA = 0.05                      # significance threshold
COLOR_POSITIVE_EFFECT = -0.7      # colormap position: significant, effect > 0
COLOR_NONPOSITIVE_EFFECT = -0.3   # colormap position: significant, effect <= 0


def _significance_panel(results, model):
    """Build the two matrices that drive one model's heatmap panel.

    Parameters
    ----------
    results : pandas.DataFrame
        Summary table holding 'model', 'bias type', and the effect / p-value
        columns named in `exp_settings` and `p_value_columns`.
    model : str
        Entry of `models`. 'ideal' yields a synthetic panel in which only the
        first setting is significant.

    Returns
    -------
    (effect_data, p_values) : tuple of numpy.ndarray
        Both have shape (len(bias_types), len(exp_settings)). `effect_data`
        holds a colormap position for significant cells and NaN elsewhere;
        `p_values` holds the p-value of every cell.

    Raises
    ------
    ValueError
        From `Series.item()` when `results` does not contain exactly one row
        for a (model, bias type) pair.
    """
    shape = (len(bias_types), len(exp_settings))
    effect_data = np.full(shape, np.nan)  # NaN = not significant (left blank)
    p_values = np.zeros(shape)

    for j in range(len(bias_types)):
        if model == 'ideal':
            # Synthetic reference: the unperturbed bias is significant, every
            # perturbed setting is not.
            p_values[j, 0] = 0.01
            p_values[j, 1:] = 1
            effect_data[j, 0] = COLOR_POSITIVE_EFFECT
            continue

        row = results[(results['bias type'] == clean_bias_labels[j])
                      & (results['model'] == model)]

        for k, exp_setting in enumerate(exp_settings):
            p_value = row[p_value_columns[exp_setting]].item()
            p_values[j, k] = p_value
            if p_value < ALPHA:
                if row[exp_setting].item() > 0:
                    effect_data[j, k] = COLOR_POSITIVE_EFFECT
                else:
                    effect_data[j, k] = COLOR_NONPOSITIVE_EFFECT

    return effect_data, p_values


# Two rows of panels; the column count is derived from the model list so the
# grid shape and the panel placement below can never disagree.
n_rows = 2
n_cols = (len(models) + n_rows - 1) // n_rows
# squeeze=False keeps `axs` two-dimensional even for a single column.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15,6), squeeze=False)

cmap_name = 'tab20c'

# Heatmap cells are one unit wide, so cell centres sit at index + 0.5.
x_tick_positions = [num + 0.5 for num in range(len(exp_settings))]
y_tick_positions = [num + 0.5 for num in range(len(bias_types))]

# NOTE(review): `df` is rounded to 4 decimals where it is built in this
# notebook, so the ALPHA and sign checks see rounded values (e.g. a p-value of
# 0.04996 is stored as 0.05 and drawn as not significant) -- confirm intended.
for i, model in enumerate(models):
    effect_data, p_values = _significance_panel(df, model)

    r, c = divmod(i, n_cols)
    ax = axs[r, c]
    on_bottom_row = (r == n_rows - 1)
    on_left_col = (c == 0)

    # Only the bottom row carries x tick labels and only the left column
    # carries y tick labels; 'auto' is seaborn's default for both arguments.
    sns.heatmap(effect_data,
                xticklabels='auto' if on_bottom_row else False,
                yticklabels='auto' if on_left_col else False,
                cbar=False, ax=ax, cmap=cmap_name, vmin=-1, vmax=1,
                linewidths=1, linecolor='gray')
    ax.set_title(clean_model_labels[i])

    if on_bottom_row:
        ax.set_xticks(x_tick_positions)
        ax.set_xticklabels(clean_labels, rotation=90)

    if on_left_col:
        ax.set_yticks(y_tick_positions)
        ax.set_yticklabels(clean_bias_labels, rotation=0)

    # Hatch the non-significant cells: masking p < ALPHA leaves only cells
    # with p >= ALPHA to be drawn; alpha=0 keeps the fill itself transparent.
    zm = np.ma.masked_less(p_values, ALPHA)

    x = np.arange(effect_data.shape[1] + 1)
    y = np.arange(effect_data.shape[0] + 1)

    ax.pcolor(x, y, zm, hatch='//', alpha=0.)

# Hide any grid slot left over when the number of models is odd.
for spare_ax in axs.flat[len(models):]:
    spare_ax.axis('off')

plt.savefig("perturbation.pdf", format="pdf", bbox_inches="tight")
