In [1]:
# Imports
import pandas as pd
from utils import pandas_utils, stats_utils, config_utils
from constants import * 
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests
import re
import numpy as np
import os
from functools import reduce
from tabulate import tabulate

In [2]:
# Availability flag reported by the project config.
# NOTE(review): the flag is not used to gate the DG load below — confirm
# whether loading the DG file should be conditional on it.
HAVE_ALL_DG_DATASETS = config_utils.have_all_datasets()

# Score tables, one CSV per dataset.
rb_df = pd.read_csv(REWARD_BENCH_PATH)
dg_df = pd.read_csv(DEAS_GROENWOLD_PATH)

# Lookup from dataset name to its score table; later cells index into this.
DF_DICT = {}
DF_DICT[RB_DATASET_NAME] = rb_df
DF_DICT[DG_DATASET_NAME] = dg_df

In [3]:
def get_chos_acc(df, chos_score_col, rej_score_col):
    """Return the fraction of rows where the chosen score strictly exceeds the rejected score.

    Args:
        df: DataFrame holding both score columns.
        chos_score_col: Name of the column with scores for the chosen response.
        rej_score_col: Name of the column with scores for the rejected response.

    Returns:
        Mean of the row-wise boolean comparison ``chosen > rejected``.
    """
    chosen_scores = df[chos_score_col]
    rejected_scores = df[rej_score_col]
    chosen_wins = chosen_scores > rejected_scores
    return chosen_wins.mean()

def mark_significant(val, p_val, alpha=0.05):
    """Format `val` with two decimals and append '*' when `p_val` is below `alpha`.

    The value is always rendered with exactly two decimal places (e.g. 0.5 ->
    "0.50") so that table cells line up; ``round(val, 2)`` would drop trailing
    zeros and yield cells of varying width such as "0.5*".

    Args:
        val: Numeric value to display.
        p_val: p-value associated with `val`.
        alpha: Significance threshold; the comparison is strict (``p_val < alpha``).

    Returns:
        The formatted value as a string, with a trailing '*' if significant.
        A NaN `p_val` never compares below `alpha`, so it yields no marker.
    """
    formatted = f"{val:.2f}"
    return f"{formatted}*" if p_val < alpha else formatted

def escape_latex(s: str) -> str:
    """Return `s` with LaTeX special characters replaced by their escaped forms.

    Replacement happens in a single pass over the input, so the backslashes
    and braces introduced by one replacement are never escaped again.
    """
    replacements = {
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
        "\\": r"\textbackslash{}",
    }

    def _escaped(match):
        # Each match is exactly one special character; look up its escape.
        return replacements[match.group()]

    special_chars = re.compile(r"([&%$#_{}~^\\])")
    return special_chars.sub(_escaped, s)

## RQ 1

In [4]:
p_col_name = 'p'
p_holm_col_name = "$p_{Holm}$"

# For every non-DG dataset and every modified-text variant: compare each
# model's accuracy on base vs. modified texts with McNemar's test, apply a
# Holm correction across models, and write one LaTeX table per
# (dataset, base suffix, mod suffix) combination.
for rb_execution_config in RB_EXECUTION_CONFIGS:
  dataset_name, _, base_suffix, mod_suffixes, _, _ = config_utils.parse_rb_execution_config(rb_execution_config)
  if dataset_name == DG_DATASET_NAME:
    continue
  df = DF_DICT[dataset_name]
  prompt_strategy = W_PROMPT_KEY
  for mod_suffix in mod_suffixes:
    results = {}

    # Braces around the subscript are required: without them LaTeX subscripts
    # only the first character that follows the underscore.
    _base_acc_latex_col = f"Acc_{{{dataset_name.upper()}-{base_suffix.upper()}}}"
    base_acc_latex_col = f"${_base_acc_latex_col}$"
    mod_acc_drop_latex_col = f"$Acc_{{{dataset_name.upper()}-{mod_suffix.upper()}}} - {_base_acc_latex_col}$"

    for model in MODELS_DICT.keys():
      # get relevant score col names
      base_chos_score_col = pandas_utils.get_score_col_name(model, prompt_strategy, base_suffix, CHOSEN_KEY)
      base_rej_score_col = pandas_utils.get_score_col_name(model, prompt_strategy, base_suffix, REJECTED_KEY)
      mod_chos_score_col = pandas_utils.get_score_col_name(model, prompt_strategy, mod_suffix, CHOSEN_KEY)
      mod_rej_score_col = pandas_utils.get_score_col_name(model, prompt_strategy, mod_suffix, REJECTED_KEY)

      # an item is "correct" when the chosen text outscores the rejected one
      base_correct = df[base_chos_score_col] > df[base_rej_score_col]
      mod_correct = df[mod_chos_score_col] > df[mod_rej_score_col]

      # construct the 2x2 contingency table for McNemar's test
      base_y_mod_y = (base_correct & mod_correct).sum()
      base_y_mod_n = (base_correct & ~mod_correct).sum()
      base_n_mod_y = (~base_correct & mod_correct).sum()
      base_n_mod_n = (~base_correct & ~mod_correct).sum()  # both incorrect

      contingency_table = [[base_y_mod_y, base_y_mod_n],
                           [base_n_mod_y, base_n_mod_n]]

      # run McNemar's test
      mcnemar_result = mcnemar(contingency_table, exact=False, correction=False)

      # compute accuracies for base and mod texts
      base_acc = get_chos_acc(df, base_chos_score_col, base_rej_score_col)
      mod_acc = get_chos_acc(df, mod_chos_score_col, mod_rej_score_col)

      # store the results
      results[escape_latex(model)] = {
          base_acc_latex_col: base_acc,
          mod_acc_drop_latex_col: mod_acc - base_acc,
          p_col_name: mcnemar_result.pvalue
      }

    # Everything below runs once per mod_suffix: the output file name depends
    # on mod_suffix, and `results` is reset at the top of each iteration, so
    # building the table outside this loop would only keep the last suffix.
    results_df = pd.DataFrame.from_dict(results, orient='index')
    results_df.index.name = "Reward Model"

    # apply Holm correction to p-values (across models)
    _, pvals_corrected, _, _ = multipletests(pvals=results_df[p_col_name].values,
                                             alpha=0.05,
                                             method='holm')
    results_df[p_holm_col_name] = pvals_corrected

    # Sort while the drop column is still numeric; sorting after the values
    # become strings like "-0.05*" would order them lexicographically.
    results_df = results_df.sort_values(by=mod_acc_drop_latex_col, ascending=False)

    # fold the significance marker directly into the accuracy-drop column
    results_df[mod_acc_drop_latex_col] = [
        mark_significant(drop, p_holm)
        for drop, p_holm in zip(results_df[mod_acc_drop_latex_col], results_df[p_holm_col_name])
    ]

    # drop the p-value columns, which are not shown in the table
    results_df = results_df.drop([p_col_name, p_holm_col_name], axis=1)
    results_df = results_df.round(2)

    # Construct the LaTeX table and save it to the file
    with open(f'{RESULTS_DIR}/rq1_acc_comparison_{dataset_name}_{base_suffix}_{mod_suffix}.tex', 'w') as f:
      f.write('\\begin{table*}[htp]\n')
      f.write('\\begin{center}\n')
      f.write('\\small\n')
      f.write('\\begin{tabular}{lrr}\n')
      f.write('\\toprule\n')
      f.write(f'Reward Model & {base_acc_latex_col} & {mod_acc_drop_latex_col} \\\\\n')
      f.write('\\midrule\n')

      # Loop through each row and write the table contents
      for idx, row in results_df.iterrows():
        model_name = idx  # Model name (already LaTeX-escaped)
        acc_orig = row[base_acc_latex_col]
        acc_drop = row[mod_acc_drop_latex_col]
        acc_orig_formatted = f"{acc_orig:.2f}"  # Ensure two decimal places
        f.write(f"{model_name} & {acc_orig_formatted} & {acc_drop} \\\\\n")

      # Finish the table
      f.write('\\bottomrule\n')
      f.write('\\end{tabular}\n')
      f.write('\\normalsize\n')
      f.write('\\end{center}\n')
      f.write('\\end{table*}\n')

## RQ 2

In [5]:
def compute_effect_size_results(rb_execution_config):
    """Build a long-format base-vs-mod score table and summarise it per model.

    The chosen-response scores for the base suffix and for the first modified
    suffix are stacked into one frame with a `condition` column ('base' /
    'mod') and a `pair_id` column linking the two rows of each item. The frame
    is handed to `stats_utils.ttestSummaries` with one measure column per model.

    Args:
        rb_execution_config: Execution config understood by
            `config_utils.parse_rb_execution_config`.

    Returns:
        Tuple ``(dataset_name, effect_size_df)`` where `effect_size_df` is
        whatever `stats_utils.ttestSummaries` returns.
    """
    dataset_name, _, base_suffix, mod_suffixes, _, _ = config_utils.parse_rb_execution_config(rb_execution_config)
    scores = DF_DICT[dataset_name]
    first_mod_suffix = mod_suffixes[0]    # Assuming only one mod suffix for now

    # only look at the most relevant prompt strategy for each dataset
    if dataset_name == DG_DATASET_NAME:
        strategy = WO_PROMPT_KEY
    else:
        strategy = W_PROMPT_KEY

    measure_cols = list(MODELS_DICT.keys())
    pair_ids = np.arange(scores.shape[0])

    # One frame per condition, each with a column of chosen scores per model.
    condition_frames = []
    for condition, suffix in (('base', base_suffix), ('mod', first_mod_suffix)):
        frame = pd.DataFrame({'pair_id': pair_ids, 'condition': condition})
        for model in measure_cols:
            score_col = pandas_utils.get_score_col_name(model, strategy, suffix, CHOSEN_KEY)
            frame[model] = scores[score_col].values
        condition_frames.append(frame)

    stacked = pd.concat(condition_frames, ignore_index=True)

    summary = stats_utils.ttestSummaries(
        df=stacked,
        condition_col='condition',
        measure_cols=measure_cols,
        paired='pair_id',
        fillna=None
    )
    return dataset_name, summary

def compute_correlation_results(rb_execution_config):
    """Compute, per model, the correlation summary between 'score' and 'aalScore'.

    For each model the CSV at
    ``ALIGNMENTS_DOC_DIR/<dataset>/<prompt strategy>/<first mod suffix>/<model>.csv``
    is read and passed to `stats_utils.correlSummary`.

    Args:
        rb_execution_config: Execution config understood by
            `config_utils.parse_rb_execution_config`.

    Returns:
        Tuple ``(dataset_name, model_correlations)`` where `model_correlations`
        maps each model name to its `correlSummary` result.
    """
    dataset_name, _, _, mod_suffixes, _, _ = config_utils.parse_rb_execution_config(rb_execution_config)
    mod_suffix = mod_suffixes[0]    # Assuming only one mod suffix for now

    # only look at the most relevant prompt strategy for each dataset
    if dataset_name == DG_DATASET_NAME:
        prompt_strategy = WO_PROMPT_KEY
    else:
        prompt_strategy = W_PROMPT_KEY

    def _correlate(model):
        # Read this model's alignment file and summarise the correlation.
        path = f"{ALIGNMENTS_DOC_DIR}/{dataset_name}/{prompt_strategy}/{mod_suffix}/{model}.csv"
        alignments_df = pd.read_csv(path)
        return stats_utils.correlSummary(alignments_df, 'score', 'aalScore')

    return dataset_name, {model: _correlate(model) for model in MODELS_DICT.keys()}

def apply_holm_correction(correl_results_df, dataset_names):
    """Holm-correct each dataset's p-values and mark significant correlations.

    For every name in `dataset_names`, the ``<name>_pvalue`` column is
    Holm-corrected, the ``<name>_correl`` column is replaced by strings from
    `mark_significant`, and the p-value column is dropped.

    Args:
        correl_results_df: Frame with ``<name>_correl`` / ``<name>_pvalue``
            columns. It is modified in place.
        dataset_names: Column-name prefixes to process.

    Returns:
        The same (mutated) `correl_results_df`.
    """
    for dataset_name in dataset_names:
        pvalue_col, correl_col = f'{dataset_name}_pvalue', f'{dataset_name}_correl'
        raw_pvals = correl_results_df[pvalue_col].values
        # multipletests returns (reject, corrected p-values, ...); keep the p-values.
        holm_pvals = multipletests(pvals=raw_pvals, alpha=0.05, method='holm')[1]

        marked = []
        for corr, p_holm in zip(correl_results_df[correl_col], holm_pvals):
            marked.append(mark_significant(corr, p_holm))
        correl_results_df[correl_col] = marked

        correl_results_df.drop(columns=[pvalue_col], inplace=True)
    return correl_results_df

def prepare_effect_size_dataframe(effect_size_results):
    """Combine per-dataset effect sizes into one frame with ('d', DATASET) columns.

    Each dataset's `d` values are converted to display strings with
    `mark_significant`, using the `p_holm` column for the significance marker.
    The conversion is done on a copy: the caller's frames keep their numeric
    `d` column, so the function can be called repeatedly on the same input.

    Args:
        effect_size_results: Mapping of dataset name -> DataFrame with at
            least `d` and `p_holm` columns.

    Returns:
        DataFrame indexed by 'model' with one MultiIndex column
        ``('d', <DATASET NAME UPPERCASED>)`` per dataset, outer-joined on the index.
    """
    d_dfs = []
    for dataset_name, effect_size_df in effect_size_results.items():
        # Work on a copy so the input frame is never overwritten with strings.
        d_results = effect_size_df[['d']].copy()
        d_results['d'] = [
            mark_significant(d, p_holm)
            for d, p_holm in zip(effect_size_df['d'], effect_size_df['p_holm'])
        ]
        d_results.columns = pd.MultiIndex.from_tuples([('d', dataset_name.upper())])
        d_results.index.name = 'model'
        d_dfs.append(d_results)
    combined_d_df = reduce(lambda left, right: left.join(right, how='outer'), d_dfs)
    return combined_d_df

def prepare_correlation_dataframe(correl_results_df, dataset_names):
    """Reshape ``<name>_correl`` columns into ('correl', <name>) MultiIndex columns.

    Args:
        correl_results_df: Frame containing one ``<name>_correl`` column per
            entry of `dataset_names`. It is not modified.
        dataset_names: Names used both to find the source columns and as the
            second level of the resulting column MultiIndex.

    Returns:
        DataFrame whose index is named 'model' and whose columns are
        ``('correl', <name>)`` in the order of `dataset_names`.
    """
    def _single_dataset_frame(dataset_name):
        # Copy the one relevant column and relabel it under the 'correl' group.
        frame = correl_results_df.loc[:, [f'{dataset_name}_correl']].copy()
        frame.columns = pd.MultiIndex.from_tuples([('correl', dataset_name)])
        return frame.rename_axis('model')

    per_dataset = [_single_dataset_frame(dataset_name) for dataset_name in dataset_names]
    return pd.concat(per_dataset, axis=1)

def generate_latex_table(combined_results, output_file_path):
    """Write the combined effect-size / correlation table as a LaTeX `table*`.

    Args:
        combined_results: DataFrame indexed by model with MultiIndex columns
            ``('d', 'RB')``, ``('d', 'DG')``, ``('correl', 'RB')`` and
            ``('correl', 'DG')``. Any of these may be absent or contain NaN.
        output_file_path: Path of the .tex file to (over)write.

    Cells whose column is absent or whose value is NaN are rendered as '--'.
    """
    def _cell(row, key):
        # `row.get` only falls back when the column is absent; a NaN produced
        # by the outer join must be replaced explicitly or it prints as "nan".
        value = row.get(key, '--')
        return '--' if pd.isna(value) else value

    with open(output_file_path, 'w') as f:
        f.write('\\begin{table*}[htp]\n')
        f.write('\\centering\n')
        f.write('\\small\n')
        f.write('\\begin{tabular}{lcc|cc}\n')
        f.write('\\toprule\n')
        # First header row
        f.write(' & \\multicolumn{2}{c}{\\textbf{Effect Size (d)}} & \\multicolumn{2}{c}{\\textbf{Correlation (r)}} \\\\\n')
        f.write('\\cmidrule(lr){2-3} \\cmidrule(lr){4-5}\n')
        # Second header row
        f.write(f'\\textbf{{Model}} & \\textbf{{RB}} & \\textbf{{DG}} & \\textbf{{RB}} & \\textbf{{DG}} \\\\\n')
        f.write('\\midrule\n')
        # Loop through each row and write table contents
        for idx, row in combined_results.iterrows():
            model_name = escape_latex(str(idx))
            d_RB = _cell(row, ('d', 'RB'))
            d_DG = _cell(row, ('d', 'DG'))
            correl_RB = _cell(row, ('correl', 'RB'))
            correl_DG = _cell(row, ('correl', 'DG'))
            # Write the row to the LaTeX file with 'RB' before 'DG'
            f.write(f"{model_name} & {d_RB} & {d_DG} & {correl_RB} & {correl_DG} \\\\\n")
        # Finish the table
        f.write('\\bottomrule\n')
        f.write('\\end{tabular}\n')
        f.write('\\normalsize\n')
        f.write('\\caption{Combined Effect Size and Correlation Results by Model and Dataset}\n')
        f.write('\\label{tab:combined_results}\n')
        f.write('\\end{table*}\n')

# Effect sizes per dataset (configs are processed in the order [1], [0]).
effect_size_results = {}
for rb_execution_config in [RB_EXECUTION_CONFIGS[1], RB_EXECUTION_CONFIGS[0]]:
    dataset_name, effect_size_df = compute_effect_size_results(rb_execution_config)
    effect_size_results[dataset_name] = effect_size_df

# Per-model correlation summaries per dataset.
correlations = {}
for rb_execution_config in RB_EXECUTION_CONFIGS:
    dataset_name, model_correlations = compute_correlation_results(rb_execution_config)
    correlations[dataset_name] = model_correlations

# Build a dictionary for correlation results
correl_results_dict = {model: {} for model in MODELS_DICT.keys()}
for dataset_name, model_correlations in correlations.items():
    for model, correl_result in model_correlations.items():
        correl_results_dict[model][f'{dataset_name.upper()}_correl'] = correl_result['r']
        correl_results_dict[model][f'{dataset_name.upper()}_pvalue'] = correl_result['p']
correl_results_df = pd.DataFrame.from_dict(correl_results_dict, orient='index')

dataset_names = ['DG', 'RB']

correl_results_df = apply_holm_correction(correl_results_df, dataset_names)

combined_d_df = prepare_effect_size_dataframe(effect_size_results)
combined_correl_df = prepare_correlation_dataframe(correl_results_df, dataset_names)

combined_results = combined_d_df.join(combined_correl_df, how='outer')

# The ('d', 'RB') cells are display strings such as "-0.12*" by this point.
# Sort on their numeric value (marker stripped) so that negative effect sizes
# are ordered numerically rather than lexicographically; unparsable or
# missing cells become NaN and are placed last.
combined_results = combined_results.sort_values(
    by=('d', 'RB'),
    ascending=False,
    key=lambda col: pd.to_numeric(col.astype(str).str.rstrip('*'), errors='coerce'),
)

output_file_path = f"{RESULTS_DIR}/rq2_combined_results.tex"
generate_latex_table(combined_results, output_file_path)

print(f"LaTeX table has been written to {output_file_path}")

LaTeX table has been written to /home/jmire/rm-dialect-biases/results/rq2_combined_results.tex


In [6]:
# Token-level Analysis (not in paper)

# One (initially empty) result record per model; filled with
# '<dataset>_correl' and '<dataset>_pvalue' entries below.
results_dict = {model: {} for model in MODELS_DICT.keys()}

for rb_execution_config in RB_EXECUTION_CONFIGS:
    dataset_name, _, base_suffix, mod_suffixes, _, _ = config_utils.parse_rb_execution_config(rb_execution_config)
    df = DF_DICT[dataset_name]
    mod_suffix = mod_suffixes[0]

    # only look at the most relevant prompt strategy for each dataset
    if dataset_name == DG_DATASET_NAME:
        prompt_strategy = WO_PROMPT_KEY
    else:
        prompt_strategy = W_PROMPT_KEY

    for model, model_results in results_dict.items():
        path = f"{ALIGNMENTS_TOK_DIR}/{dataset_name}/{prompt_strategy}/{mod_suffix}/{model}.csv"
        alignments_tok_df = pd.read_csv(path)
        correl_result = stats_utils.correlSummary(alignments_tok_df, 'chos', 'aalScore')

        prefix = dataset_name.lower()  # 'dg' or 'rb'
        model_results[f'{prefix}_correl'] = correl_result['r']
        model_results[f'{prefix}_pvalue'] = correl_result['p']

results_df = pd.DataFrame.from_dict(results_dict, orient='index')

# Holm-correct each dataset's p-values across models (index 1 of the
# multipletests result holds the corrected p-values).
dg_pvals_corrected = multipletests(pvals=results_df['dg_pvalue'].values, alpha=0.05, method='holm')[1]
rb_pvals_corrected = multipletests(pvals=results_df['rb_pvalue'].values, alpha=0.05, method='holm')[1]

# Replace the numeric correlations by display strings with a significance marker.
for correl_col, holm_pvals in (('dg_correl', dg_pvals_corrected), ('rb_correl', rb_pvals_corrected)):
    marked = []
    for corr, p_holm in zip(results_df[correl_col], holm_pvals):
        marked.append(mark_significant(corr, p_holm))
    results_df[correl_col] = marked

results_df = results_df[['dg_correl', 'rb_correl']]

results_df.to_latex(f"{RESULTS_DIR}/rq2_token_level_results.tex", escape=True)

## RQ 3

In [7]:
# RQ3: compare "mirror" vs. "non-mirror" chosen-response scores on the RB
# data, separately for the base prompt and for the modified prompt, and write
# the per-model effect sizes as a LaTeX table.
df = rb_df

measure_cols = list(MODELS_DICT.keys())

n = df.shape[0]
pair_id = np.arange(n)

data_mirror_base_prompt = pd.DataFrame({'pair_id': pair_id, 'condition': 'mirror'})
data_non_mirror_base_prompt = pd.DataFrame({'pair_id': pair_id, 'condition': 'non_mirror'})

data_mirror_mod_prompt = pd.DataFrame({'pair_id': pair_id, 'condition': 'mirror'})
data_non_mirror_mod_prompt = pd.DataFrame({'pair_id': pair_id, 'condition': 'non_mirror'})

base_suffix = 'orig'
mod_suffix = 'vpAAL'

for model in MODELS_DICT.keys():
    # Base prompt: the base candidate is the mirror condition, the modified candidate the non-mirror one
    base_prompt_mod_candidate_score_col = pandas_utils.get_score_col_name(model, W_BASE_PROMPT_MOD_CANDIDATE_KEY, mod_suffix, CHOSEN_KEY)
    base_prompt_base_candidate_score_col = pandas_utils.get_score_col_name(model, W_PROMPT_KEY, base_suffix, CHOSEN_KEY)

    data_mirror_base_prompt[model] = df[base_prompt_base_candidate_score_col].values
    data_non_mirror_base_prompt[model] = df[base_prompt_mod_candidate_score_col].values

    # Modified prompt: the modified candidate is the mirror condition, the base candidate the non-mirror one
    mod_prompt_base_candidate_score_col = pandas_utils.get_score_col_name(model, W_MOD_PROMPT_BASE_CANDIDATE_KEY, mod_suffix, CHOSEN_KEY)
    mod_prompt_mod_candidate_score_col = pandas_utils.get_score_col_name(model, W_PROMPT_KEY, mod_suffix, CHOSEN_KEY)

    data_mirror_mod_prompt[model] = df[mod_prompt_mod_candidate_score_col].values
    data_non_mirror_mod_prompt[model] = df[mod_prompt_base_candidate_score_col].values

# Base prompt
base_prompt_combined_df = pd.concat([data_non_mirror_base_prompt, data_mirror_base_prompt], ignore_index=True)
base_ttest_df = stats_utils.ttestSummaries(
    base_prompt_combined_df,
    condition_col='condition',
    measure_cols=measure_cols,
    paired='pair_id'
)

# Modified prompt
mod_prompt_combined_df = pd.concat([data_non_mirror_mod_prompt, data_mirror_mod_prompt], ignore_index=True)
mod_ttest_df = stats_utils.ttestSummaries(
    mod_prompt_combined_df,
    condition_col='condition',
    measure_cols=measure_cols,
    paired='pair_id'
)

final_display_df = pd.DataFrame(index=mod_ttest_df.index)
final_display_df['Cohen\'s d (Mod Prompt)'] = [
    mark_significant(d, p_val)
    for d, p_val in zip(mod_ttest_df['d'], mod_ttest_df['p_holm'])
]
# Assign as an indexed Series so the base-prompt values are matched to rows by
# label rather than by position, even if the two summaries are ordered differently.
final_display_df['Cohen\'s d (Base Prompt)'] = pd.Series(
    [
        mark_significant(d, p_val)
        for d, p_val in zip(base_ttest_df['d'], base_ttest_df['p_holm'])
    ],
    index=base_ttest_df.index,
)

# sort by the numeric effect sizes of the mod prompt column (temporary helper column)
final_display_df['d_mod_prompt'] = mod_ttest_df['d']
final_display_df = (
    final_display_df
    .sort_values(by='d_mod_prompt', ascending=True)
    .drop(columns=['d_mod_prompt'])
)

# Name the index before resetting it so the first column is always 'Model',
# whether or not the summary's index already carried a name.
final_display_df = final_display_df.rename_axis('Model').reset_index()

output_file = os.path.join(RESULTS_DIR, 'rq3_mirroring_table.tex')
with open(output_file, 'w') as f:
    f.write('\\begin{table*}[htp]\n')
    f.write('\\centering\n')
    f.write('\\small\n')

    # Derive the column spec from the actual number of columns: one
    # left-aligned model column followed by centred value columns.
    num_columns = len(final_display_df.columns)
    column_alignment = 'l' + 'c' * (num_columns - 1)
    f.write(f'\\begin{{tabular}}{{{column_alignment}}}\n')
    f.write('\\toprule\n')

    headers = final_display_df.columns.tolist()
    escaped_headers = [escape_latex(h) for h in headers]
    header_line = ' & '.join(escaped_headers) + ' \\\\\n'
    f.write(header_line)
    f.write('\\midrule\n')

    for idx, row in final_display_df.iterrows():
        escaped_row = [escape_latex(str(value)) for value in row]
        row_line = ' & '.join(escaped_row) + ' \\\\\n'
        f.write(row_line)

    f.write('\\bottomrule\n')
    f.write('\\end{tabular}\n')
    f.write('\\normalsize\n')

    f.write('\\caption{Mirroring comparison}\n')
    # Distinct label: 'tab:combined_results' is already used by the RQ2 table,
    # and duplicate labels are reported as multiply defined by LaTeX.
    f.write('\\label{tab:mirroring_comparison}\n')
    f.write('\\end{table*}\n')

print(f"LaTeX table has been written to {output_file}")

LaTeX table has been written to /home/jmire/rm-dialect-biases/results/rq3_mirroring_table.tex


## Appendix

#### Dialect Status of the RB (+DG) Dataset

In [8]:
def generate_latex_table(results, output_path):
    """Render the dialect-analysis results as a booktabs LaTeX table file.

    Note: this definition replaces the `generate_latex_table` defined earlier
    in the notebook for the RQ2 table; the two take different inputs.

    Args:
        results: DataFrame with columns 'Dataset', 'Suffix', 'Column' and the
            lowercase score columns 'aal', 'hispanic', 'white', 'other'.
        output_path: Path of the .tex file to (over)write.
    """
    # Displayed headers; the score headers differ from the source columns only by case.
    headers = ['Dataset', 'Suffix', 'Column', 'AAL', 'Hispanic', 'White', 'Other']
    case_map = {'AAL': 'aal', 'Hispanic': 'hispanic', 'White': 'white', 'Other': 'other'}

    label_headers = headers[:3]   # Dataset, Suffix, Column: copied as-is
    score_headers = headers[3:]   # looked up via case_map and shown with two decimals

    table_data = [
        [row[header] for header in label_headers]
        + [f"{row[case_map[header]]:.2f}" for header in score_headers]
        for _, row in results.iterrows()
    ]

    tabular = tabulate(table_data, headers=headers, tablefmt='latex_booktabs', floatfmt='.2f')

    # Wrap the tabular in a table environment with caption and label.
    table_open = (
        "\\begin{table}[htbp]\n"
        "\\centering\n"
        "\\caption{Dialect Analysis Results}\n"
        "\\label{tab:dialect-analysis}\n"
    )
    table_close = "\n\\end{table}"

    with open(output_path, 'w') as f:
        f.write(table_open + tabular + table_close)

# Load the dialect-analysis CSV and write it out as a LaTeX table.
our_dataset_dialect_analysis_df = pd.read_csv(OUR_DATASETS_DIALECT_ANALYSIS_PATH)
generate_latex_table(
    results=our_dataset_dialect_analysis_df,
    output_path=f"{RESULTS_DIR}/app_our_datasets_dialect_analysis.tex",
)