In [1]:
# Imports
import pandas as pd
import datasets
from tqdm import tqdm
from collections import defaultdict
import yaml
import os
from constants import *
from utils.pandas_utils import save_df
from utils import pandas_utils
from code_detection_strategy.GPT4oCodeDetectionStrategy import GPT4oCodeDetectionStrategy
from translation_strategy.ValueTranslationStrategy import ValueTranslationStrategy
from translation_strategy.PhonateTranslationStrategy import PhonateTranslationStrategy
import subprocess
from functools import lru_cache
from utils import idp_utils, langid_utils, config_utils
import json
import random
from datasets import load_dataset, Dataset
from evaluate import load
import torch
import gc
import concurrent.futures
import math
from langDialect.getDialect import detectDialect
from tabulate import tabulate

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


## Dataset Preparation

#### Load and Assemble Raw Data

In [2]:
# NOTE(review): presumably True only when all raw Deas/Groenwold (DG) files are available locally;
# every DG-specific step in this notebook is gated on this flag.
HAVE_ALL_DG_DATASETS = config_utils.have_all_datasets()

if HAVE_ALL_DG_DATASETS:
    # The Groenwold WME and AAL files are JSON-lines files read into separate frames.
    raw_groenwold_wme_df = pd.read_json(path_or_buf=RAW_GROENWOLD_WME_PATH, 
                                        orient='records',
                                        lines=True)
    raw_groenwold_aal_df = pd.read_json(path_or_buf=RAW_GROENWOLD_AAL_PATH, 
                                        orient='records', 
                                        lines=True)
    raw_deas_prompts_df = pd.read_csv(RAW_DEAS_PROMPTS_PATH)
    raw_deas_eval_df = pd.read_csv(RAW_DEAS_EVAL_PATH)

    # Build DeasGroenwold (DG) df
    # The WME and AAL Groenwold texts are paired purely by row position (`.values`), so this
    # assumes both files have the same length and ordering -- TODO confirm.
    groenwold_data = pd.DataFrame({
        'source': ['groenwold'] * len(raw_groenwold_wme_df),
        'txt_wme': raw_groenwold_wme_df['text'].values,
        'txt_aal': raw_groenwold_aal_df['text'].values,
    })
    deas_prompts_data = pd.DataFrame({
        'source': ['deas'] * len(raw_deas_prompts_df),
        'txt_wme': raw_deas_prompts_df['wme_text'].values,
        'txt_aal': raw_deas_prompts_df['aal_text'].values
    })
    deas_eval_data = pd.DataFrame({
        'source': ['deas'] * len(raw_deas_eval_df),
        'txt_wme': raw_deas_eval_df['wme_text'].values,
        'txt_aal': raw_deas_eval_df['aal_text'].values
    })
    # Stack the three sources into one frame with a fresh 0..n-1 index.
    dg_df = pd.concat([groenwold_data, deas_prompts_data, deas_eval_data], ignore_index=True)
    # Empty-string column; the scoring cell uses "txt_empty" as the prompt for the no-prompt strategy.
    dg_df['txt_empty'] = [''] * len(dg_df)
    save_df(dg_df, DEAS_GROENWOLD_PATH)
    print(dg_df.shape)

# RewardBench is loaded via `datasets.load_dataset`; only its 'filtered' split is used.
raw_rb_dataset_dict = datasets.load_dataset(REWARD_BENCH_DATASET_NAME)
raw_rb_df = pd.DataFrame.from_dict(data=raw_rb_dataset_dict['filtered'].to_dict())

# Build RewardBench (RB) df
# Columns are renamed to the notebook's `txt_<role>_orig` convention; 'orig' marks untranslated text.
rb_df = pd.DataFrame({
    'subset': raw_rb_df['subset'],
    'txt_prompt_orig': raw_rb_df[PROMPT_KEY],
    'txt_chos_orig': raw_rb_df[CHOSEN_KEY],
    'txt_rej_orig': raw_rb_df[REJECTED_KEY],
    'txt_empty': [''] * len(raw_rb_df)
})
save_df(rb_df, REWARD_BENCH_PATH)
print(rb_df.shape)

(2985, 5)


#### Filter out examples containing code

In [3]:
# Filter out datapoints containing code
# `detect` is expected to return the frame with a GPT4O_CODE_PREDICTION_COL column (it is read
# immediately below); rows where that column equals False are kept.
# NOTE(review): `results_path` looks like a cache of predictions -- confirm in GPT4oCodeDetectionStrategy.
code_detection_strategy = GPT4oCodeDetectionStrategy()

if HAVE_ALL_DG_DATASETS:
  dg_code_predictions_path = f'{CODE_PREDICTIONS_DIR}/dg.csv'
  dg_df = code_detection_strategy.detect(df=dg_df, 
                                        results_path=dg_code_predictions_path,
                                        cols=['txt_wme'])
  print("unfiltered dg length", len(dg_df))
  dg_df = dg_df[dg_df[GPT4O_CODE_PREDICTION_COL] == False]
  # reset_index() without drop=True keeps the pre-filter row labels in a new 'index' column and
  # gives the frame a fresh 0..n-1 index, which the scoring cell relies on for positional alignment.
  dg_df = dg_df.reset_index()
  save_df(dg_df, DEAS_GROENWOLD_PATH)
  print("filtered dg length", len(dg_df))

# Same procedure for RewardBench; the predictions path and the inspected columns differ.
rb_code_predictions_path = f'{CODE_PREDICTIONS_DIR}/rb.csv'
rb_df = code_detection_strategy.detect(df=rb_df, 
                                       results_path=rb_code_predictions_path,
                                       cols=['txt_prompt_orig', 'txt_chos_orig', 'txt_rej_orig'])
print("unfiltered rb length", len(rb_df))
rb_df = rb_df[rb_df[GPT4O_CODE_PREDICTION_COL] == False]
# As above: old row labels are preserved in an 'index' column, new index is 0..n-1.
rb_df = rb_df.reset_index()
save_df(rb_df, REWARD_BENCH_PATH)
print("filtered rb length", len(rb_df))

unfiltered rb length 2985
filtered rb length 1843


#### Add VALUE AAL Translations

In [4]:
# Translate the WME / original texts with the VALUE strategy.
# NOTE(review): the positional arguments appear to be (df, dataset name, input columns,
# output column suffixes, force-rebuild flag), by analogy with the keyword names used for
# PhonateTranslationStrategy.translate in the next cell -- confirm against ValueTranslationStrategy.
value_translation_strategy = ValueTranslationStrategy()

# DG is only translated when the full set of results is requested.
if HAVE_ALL_DG_DATASETS and GET_ALL_RESULTS:
  dg_df = value_translation_strategy.translate(dg_df, DG_DATASET_NAME, ['txt_wme'], [VALUE_AAL_SUFFIX], FORCE_REBUILD_VALUE)
  save_df(dg_df, DEAS_GROENWOLD_PATH)

# The next cell reads `txt_prompt_{VALUE_AAL_SUFFIX}` etc., so this call is expected to add those columns.
rb_df = value_translation_strategy.translate(rb_df, RB_DATASET_NAME, ['txt_prompt_orig', 'txt_chos_orig', 'txt_rej_orig'], [VALUE_AAL_SUFFIX], FORCE_REBUILD_VALUE)
save_df(rb_df, REWARD_BENCH_PATH)

#### Add PhonATe AAL Translations

In [5]:
# Apply the PhonATe strategy on top of the VALUE output: the input columns are the
# VALUE-translated columns and the outputs carry VALUE_AAL_PHONATE_SUFFIX.
phonate_translation_strategy = PhonateTranslationStrategy()

# DG is only translated when the full set of results is requested.
if HAVE_ALL_DG_DATASETS and GET_ALL_RESULTS:
  dg_df = phonate_translation_strategy.translate(dg_df,
                                                 DG_DATASET_NAME, 
                                                 input_cols=[f'txt_{VALUE_AAL_SUFFIX}'],
                                                 output_col_suffixes=[f'{VALUE_AAL_PHONATE_SUFFIX}'],
                                                 force_retranslate=FORCE_REBUILD_PHONATE)
  save_df(dg_df, DEAS_GROENWOLD_PATH)

# RewardBench: prompt, chosen and rejected texts are each translated from their VALUE column.
rb_df = phonate_translation_strategy.translate(rb_df, 
                                               RB_DATASET_NAME, 
                                               input_cols=[f'txt_prompt_{VALUE_AAL_SUFFIX}', f'txt_chos_{VALUE_AAL_SUFFIX}', f'txt_rej_{VALUE_AAL_SUFFIX}'], 
                                               output_col_suffixes=[f'{VALUE_AAL_PHONATE_SUFFIX}'],
                                               force_retranslate=FORCE_REBUILD_PHONATE)
save_df(rb_df, REWARD_BENCH_PATH)

Using cached results for translation from txt_prompt_vAAL -> txt_prompt_vpAAL
Using cached results for translation from txt_chos_vAAL -> txt_chos_vpAAL
Using cached results for translation from txt_rej_vAAL -> txt_rej_vpAAL


## Reward Model Scoring

#### Helper Functions

In [6]:
def should_skip_execution(prompt_strategy, base_suffix, cur_suffix):
  """Tell whether a (strategy, suffix) combination should not be scored.

  Only the two strategies that mix a base-variant text with a modified-variant
  text can be skipped, and only when the current suffix is the base suffix.
  Every other strategy is never skipped.
  """
  mixes_base_and_mod = prompt_strategy in (
    W_BASE_PROMPT_MOD_CANDIDATE_KEY,
    W_MOD_PROMPT_BASE_CANDIDATE_KEY,
  )
  return mixes_base_and_mod and base_suffix == cur_suffix

def get_prompt_and_candidate_suffixes(prompt_strategy, base_suffix, cur_suffix):
  """Pick which text variant is used for the prompt and for the candidates.

  Args:
    prompt_strategy: one of WO_PROMPT_KEY, W_PROMPT_KEY,
      W_BASE_PROMPT_MOD_CANDIDATE_KEY or W_MOD_PROMPT_BASE_CANDIDATE_KEY.
    base_suffix: column suffix of the base text variant.
    cur_suffix: column suffix of the variant currently being scored.

  Returns:
    A `(prompt_suffix, candidate_suffix)` tuple.

  Raises:
    ValueError: if `prompt_strategy` is not one of the four known strategies.
      (Previously this fell through to the `return` with unbound locals and
      surfaced as an UnboundLocalError.)
  """
  if prompt_strategy == WO_PROMPT_KEY:
    # No prompt: the prompt side uses the "empty" variant.
    return "empty", cur_suffix
  if prompt_strategy == W_PROMPT_KEY:
    return cur_suffix, cur_suffix
  if prompt_strategy == W_BASE_PROMPT_MOD_CANDIDATE_KEY:
    return base_suffix, cur_suffix
  if prompt_strategy == W_MOD_PROMPT_BASE_CANDIDATE_KEY:
    return cur_suffix, base_suffix
  raise ValueError(f"Unknown prompt strategy: {prompt_strategy!r}")

def get_score_col_name(model, prompt_strategy, suffix, candidate_type):
   """Build the score column name: suffix, prompt strategy, model and candidate type joined by '$'."""
   return "{}${}${}${}".format(suffix, prompt_strategy, model, candidate_type)

def get_rb_scores(model_name, input_path, output_dir, force_rerun=False):
    """Return RewardBench scores for one model, running the CLI only when needed.

    The scores are read from `{output_dir}{model_name}_all.jsonl`. The
    `rewardbench` command is invoked when that file is missing or when
    `force_rerun` is True; otherwise the existing file is reused.

    Args:
        model_name: key into the RewardBench eval configs; also the output file stem.
        input_path: path of the JSON-lines input passed to RewardBench as `dataset`.
        output_dir: directory prefix for outputs (concatenated as-is, so it
            must end with a path separator).
        force_rerun: rerun RewardBench even if the output file already exists.

    Returns:
        A DataFrame read from the JSON-lines output file.

    Raises:
        FileNotFoundError: if the output file still does not exist after the
            run; the message includes the tail of the command's stderr so the
            underlying failure is visible.
    """
    output_path = f'{output_dir}{model_name}_all.jsonl'
    if force_rerun or not os.path.exists(output_path):
        param_dict = build_rb_param_dict(model_name, input_path, output_dir)
        _stdout, stderr = invoke_rewardbench(param_dict)
        if not os.path.exists(output_path):
            # Surface the subprocess error instead of an opaque read failure below.
            stderr_tail = (stderr or '')[-2000:]
            raise FileNotFoundError(
                f"rewardbench did not produce {output_path}. stderr (tail):\n{stderr_tail}"
            )
    return pd.read_json(output_path, orient='records', lines=True)

# Load the RewardBench eval settings; build_rb_param_dict looks them up by model name.
# NOTE(review): yaml.load with FullLoader can construct more than plain data types; if this
# file could ever come from an untrusted source, yaml.safe_load would be the safer choice.
with open(REWARD_BENCH_EVAL_CONFIGS_PATH, "r") as f:
  reward_bench_eval_configs = yaml.load(f.read(), Loader=yaml.FullLoader)
  
def build_rb_param_dict(model_name, input_path, output_dir):
  """Assemble the RewardBench CLI parameters for one model.

  Model-specific settings come from `reward_bench_eval_configs[model_name]`;
  `input_path` becomes the `dataset` parameter and `save_all` is always True.
  Keys are inserted in a fixed order, with `ref_model` (when configured)
  placed right after `model`.
  """
  cfg = reward_bench_eval_configs[model_name]
  params = {'model': model_name}
  # add ref_model for DPO models
  if 'ref_model' in cfg:
    params['ref_model'] = cfg['ref_model']
  params.update({
    'tokenizer': cfg['tokenizer'],
    'dataset': input_path,
    'chat_template': cfg['chat_template'],
    'batch_size': cfg['batch_size'],
    'trust_remote_code': cfg['trust_remote_code'],
    'output_dir': output_dir,
    'save_all': True,
  })
  return params

def invoke_rewardbench(param_dict):
  """Run the `rewardbench` command-line tool with the given parameters.

  Each entry of `param_dict` becomes one command-line argument:
  `None` values are omitted, `True` becomes a bare `--key` flag, `False` is
  omitted, and anything else becomes `--key=value`. `--load_json` is always
  appended. The command is printed before it is run.

  The child process inherits the current environment with
  CUDA_VISIBLE_DEVICES overridden by the module-level constant.

  Args:
    param_dict: mapping of option name to value, e.g. from build_rb_param_dict.

  Returns:
    `(stdout, stderr)` of the finished process as text. A non-zero exit code
    does not raise; it is reported with a printed message so the failure is
    not silently ignored.
  """
  command_parts = ['rewardbench']
  for key, val in param_dict.items():
    if val is None:
      continue
    if isinstance(val, bool):
      # Booleans are bare flags: present when True, absent when False.
      if val:
        command_parts.append(f'--{key}')
      continue
    command_parts.append(f'--{key}={val}')
  command_parts.append('--load_json')
  print(" ".join(command_parts))

  env = os.environ.copy()
  # Environment values must be strings; coerce in case the constant is an int.
  env['CUDA_VISIBLE_DEVICES'] = str(CUDA_VISIBLE_DEVICES)
  # The command is passed as an argument list (no shell), so values are not shell-interpreted.
  result = subprocess.run(command_parts, capture_output=True, text=True, env=env)
  if result.returncode != 0:
    print(f"rewardbench exited with code {result.returncode}:\n{result.stderr}")
  return result.stdout, result.stderr

#### Document-level Reward Model Scoring

In [7]:
# Frames keyed by dataset name; updated in place as score columns are added.
DF_DICT = {
   RB_DATASET_NAME: rb_df,
}
if HAVE_ALL_DG_DATASETS:
   DF_DICT[DG_DATASET_NAME] = dg_df

for rb_execution_config in RB_EXECUTION_CONFIGS:
  dataset_name, col_prefix_dict, base_suffix, mod_suffixes, all_suffixes, prompt_strategies = config_utils.parse_rb_execution_config(rb_execution_config)
  if dataset_name == DG_DATASET_NAME and not HAVE_ALL_DG_DATASETS:
    continue
  df = DF_DICT[dataset_name]

  # New score columns for this dataset, keyed by column name.
  aligned_scores_dict = {}
  for suffix in all_suffixes:
    for prompt_strategy in prompt_strategies:
      if should_skip_execution(prompt_strategy, base_suffix, suffix):
        continue
      print("Starting execution for", dataset_name, prompt_strategy, suffix)
      prompt_suffix, candidate_suffix = get_prompt_and_candidate_suffixes(prompt_strategy, base_suffix, suffix)

      # Build the RewardBench input: one row per datapoint, with `idx` carrying the row's
      # index label so scores can be put back in the right position afterwards.
      rb_input_dict = {}
      rb_input_dict['idx'] = df.index
      prompt_col_name = "txt_empty" if prompt_strategy == WO_PROMPT_KEY else f'{col_prefix_dict[PROMPT_KEY]}_{prompt_suffix}'
      rb_input_dict[PROMPT_KEY] = df[prompt_col_name].apply(lambda x: [{"role": "user", "content": x}])
      chos_col_name = f'{col_prefix_dict[CHOSEN_KEY]}_{candidate_suffix}'
      rb_input_dict[CHOSEN_KEY] = df[chos_col_name]
      rej_col_name = f'{col_prefix_dict[REJECTED_KEY]}_{candidate_suffix}'
      rb_input_dict[REJECTED_KEY] = df[rej_col_name]
      rb_input_df = pd.DataFrame.from_dict(rb_input_dict)

      base_dir = f'{RB_SCORING_DIR}/{dataset_name}/{prompt_strategy}/{suffix}/'
      input_path = f'{base_dir}input.jsonl'
      if not os.path.exists(input_path) or FORCE_RB_RESCORING:
        os.makedirs(base_dir, exist_ok=True)
        rb_input_df.to_json(input_path, lines=True, orient='records')

      for model in MODELS_DICT:
        output_dir = f'{base_dir}{model}/'
        os.makedirs(output_dir, exist_ok=True)

        score_col_names = {
          candidate_type: get_score_col_name(model, prompt_strategy, suffix, candidate_type)
          for candidate_type in [CHOSEN_KEY, REJECTED_KEY]
        }
        already_scored = all(name in df.columns for name in score_col_names.values())
        if already_scored and not FORCE_RB_RESCORING:
          # Both score columns are already present: keep them. Previously execution fell
          # through to the alignment loop with `rb_scores_df` either undefined or left over
          # from a different model/suffix.
          continue

        rb_scores_df = get_rb_scores(
          model_name=model,
          input_path=input_path,
          output_dir=output_dir,
          force_rerun=FORCE_RB_RESCORING
        )

        for candidate_type in [CHOSEN_KEY, REJECTED_KEY]:
            # Scores are matched to inputs by row order and written at list position `idx`,
            # which requires df to have a 0..n-1 index.
            aligned_scores = [None] * len(df)
            for idx, rb_score in zip(rb_input_df['idx'], rb_scores_df[candidate_type]):
                # Some outputs wrap the score in a list; take its first element.
                aligned_scores[idx] = rb_score[0] if isinstance(rb_score, list) else rb_score

            col_name = score_col_names[candidate_type]
            aligned_scores_dict[col_name] = aligned_scores

  new_cols_df = pd.DataFrame(aligned_scores_dict)

  # On a forced rescore the columns may already exist; drop the stale copies so the concat
  # below does not produce duplicate column names.
  stale_cols = [col for col in new_cols_df.columns if col in df.columns]
  if stale_cols:
    df = df.drop(columns=stale_cols)
  df = pd.concat([df, new_cols_df], axis=1)
  DF_DICT[dataset_name] = df
  if dataset_name == DG_DATASET_NAME:
      dg_df = df
  elif dataset_name == RB_DATASET_NAME:
      rb_df = df

if HAVE_ALL_DG_DATASETS:
  save_df(dg_df, DEAS_GROENWOLD_PATH)
save_df(rb_df, REWARD_BENCH_PATH)

Starting execution for rb w_prompt orig
Starting execution for rb w_prompt vpAAL
Starting execution for rb w_mod_prompt_base_candidate vpAAL
Starting execution for rb w_base_prompt_mod_candidate vpAAL


## Correlation: token chosenness x token AALness

In [None]:
# For each dataset / modified suffix / model: split the texts into those the reward model
# scored higher ('chos') and lower ('rej'), compute token alignments from them, attach an
# 'aav' dialect score per alignment index entry, and write one CSV per model.
if GET_ALL_RESULTS:
    for rb_execution_config in RB_EXECUTION_CONFIGS:
        dataset_name, col_prefix_dict, base_suffix, mod_suffixes, all_suffixes, _ = config_utils.parse_rb_execution_config(rb_execution_config)
        # DG is analysed without a prompt and only has the CHOSEN_KEY text column;
        # RewardBench is analysed with a prompt over both chosen and rejected texts.
        prompt_strategy = WO_PROMPT_KEY if dataset_name == DG_DATASET_NAME else W_PROMPT_KEY
        col_prefixes = [CHOSEN_KEY] if dataset_name == DG_DATASET_NAME else [CHOSEN_KEY, REJECTED_KEY]
        if dataset_name == DG_DATASET_NAME and not HAVE_ALL_DG_DATASETS:
            continue
        df = DF_DICT[dataset_name]

        # Passing base_suffix as the current suffix yields the base variant's candidate suffix.
        _, base_candidate_suffix = get_prompt_and_candidate_suffixes(prompt_strategy, base_suffix, base_suffix)
        base_txt_cols = [f'{col_prefix_dict[col_prefix]}_{base_candidate_suffix}' for col_prefix in col_prefixes]

        for suffix in mod_suffixes:
            _, mod_candidate_suffix = get_prompt_and_candidate_suffixes(prompt_strategy, base_suffix, suffix)
            mod_txt_cols = [f'{col_prefix_dict[col_prefix]}_{mod_candidate_suffix}' for col_prefix in col_prefixes]
            for model in MODELS_DICT:
                # 'chos' / 'rej' here mean "scored higher / lower by the model", not the dataset's labels.
                model_txt_dict = defaultdict(list)
                for base_txt_col, mod_txt_col in zip(base_txt_cols, mod_txt_cols):
                    base_txt_score_col = pandas_utils.get_corresponding_score_col(base_txt_col, prompt_strategy, model, dataset_name)
                    mod_txt_score_col = pandas_utils.get_corresponding_score_col(mod_txt_col, prompt_strategy, model, dataset_name)

                    # Vectorized operations to compare scores
                    base_txt_scores = df[base_txt_score_col].values
                    mod_txt_scores = df[mod_txt_score_col].values
                    base_texts = df[base_txt_col].values
                    mod_texts = df[mod_txt_col].values

                    # Create boolean masks
                    # Both comparisons are strict, so rows with equal scores appear in neither mask.
                    model_chos_base_mask = base_txt_scores > mod_txt_scores
                    model_chos_mod_mask = base_txt_scores < mod_txt_scores
                    # rej_mask = base_txt_scores < mod_txt_scores

                    # Extract texts based on masks
                    # The two lists stay parallel: entry i of 'chos' and entry i of 'rej' come from the same row.
                    model_txt_dict['chos'].extend(base_texts[model_chos_base_mask])
                    model_txt_dict['rej'].extend(mod_texts[model_chos_base_mask])
                    model_txt_dict['chos'].extend(mod_texts[model_chos_mod_mask])
                    model_txt_dict['rej'].extend(base_texts[model_chos_mod_mask])

                # Get alignments and vectorizer
                # NOTE(review): `vectorizer` is not used afterwards in this cell.
                alignments, vectorizer = idp_utils.get_alignments(model_txt_dict, alignment_category='chos', return_vectorizer=True)

                # One dialect score per index entry of `alignments` (presumably tokens -- TODO confirm).
                alignments['aalScore'] = [langid_utils.detectDialect(t)['aav'] for t in alignments.index]
                path = f"{ALIGNMENTS_TOK_DIR}/{dataset_name}/{prompt_strategy}/{suffix}/{model}.csv"
                os.makedirs(os.path.dirname(path), exist_ok=True)
                # NOTE(review): index=False omits the index that the aalScore values were computed
                # from; confirm the same values also exist as a regular column.
                alignments.to_csv(path, index=False)

## Correlation: document RM score x document AALness

In [9]:
@lru_cache(maxsize=None)
def get_aal_score(text):
    """Return the 'aav' entry of `langid_utils.detectDialect(text)`; results are memoized per text."""
    dialect_scores = langid_utils.detectDialect(text)
    return dialect_scores['aav']

# For each dataset / modified suffix / model: write a CSV with one row per text (base and
# modified variants stacked), its 'aav' dialect score, and the model's reward score.
for rb_execution_config in RB_EXECUTION_CONFIGS:
    dataset_name, col_prefix_dict, base_suffix, mod_suffixes, all_suffixes, _ = config_utils.parse_rb_execution_config(rb_execution_config)
    # DG is analysed without a prompt and only has the CHOSEN_KEY text column;
    # RewardBench is analysed with a prompt over both chosen and rejected texts.
    prompt_strategy = WO_PROMPT_KEY if dataset_name == DG_DATASET_NAME else W_PROMPT_KEY
    col_prefixes = [CHOSEN_KEY] if dataset_name == DG_DATASET_NAME else [CHOSEN_KEY, REJECTED_KEY]
    if dataset_name == DG_DATASET_NAME and not HAVE_ALL_DG_DATASETS:
        continue
    df = DF_DICT[dataset_name]
    
    # Prepare base text columns
    _, base_candidate_suffix = get_prompt_and_candidate_suffixes(
        prompt_strategy, base_suffix, base_suffix)
    base_txt_cols = [f'{col_prefix_dict[col_prefix]}_{base_candidate_suffix}'
                     for col_prefix in col_prefixes]

    # Precompute and store 'aalScore's for all texts
    # A set is used so each distinct text is scored only once.
    all_texts = set()
    for suffix in mod_suffixes:
        _, mod_candidate_suffix = get_prompt_and_candidate_suffixes(
            prompt_strategy, base_suffix, suffix)
        mod_txt_cols = [f'{col_prefix_dict[col_prefix]}_{mod_candidate_suffix}'
                        for col_prefix in col_prefixes]
        # Collect all texts from base and modified columns
        for txt_col in base_txt_cols + mod_txt_cols:
            all_texts.update(df[txt_col].values)

    # Cache 'aalScore's for all unique texts
    # get_aal_score is itself memoized; this dict is what `.map` consumes below.
    aal_score_cache = {}
    for text in all_texts:
        aal_score_cache[text] = get_aal_score(text)

    for suffix in mod_suffixes:
        _, mod_candidate_suffix = get_prompt_and_candidate_suffixes(
            prompt_strategy, base_suffix, suffix)
        mod_txt_cols = [f'{col_prefix_dict[col_prefix]}_{mod_candidate_suffix}'
                        for col_prefix in col_prefixes]

        # Prepare a DataFrame once per suffix
        model_txt_df = pd.DataFrame()

        # Collect texts
        # Order per column pair: all base texts, then all modified texts. The score list
        # below is built in the same order so rows line up.
        txt_list = []
        for base_txt_col, mod_txt_col in zip(base_txt_cols, mod_txt_cols):
            txt_list.extend(df[base_txt_col].values)
            txt_list.extend(df[mod_txt_col].values)
        model_txt_df['txt'] = txt_list

        # Map 'aalScore's from the cache
        model_txt_df['aalScore'] = model_txt_df['txt'].map(aal_score_cache)

        for model in MODELS_DICT:
            # Collect scores for the current model
            score_list = []
            for base_txt_col, mod_txt_col in zip(base_txt_cols, mod_txt_cols):
                base_txt_score_col = pandas_utils.get_corresponding_score_col(
                    base_txt_col, prompt_strategy, model, dataset_name)
                mod_txt_score_col = pandas_utils.get_corresponding_score_col(
                    mod_txt_col, prompt_strategy, model, dataset_name)
                score_list.extend(df[base_txt_score_col].values)
                score_list.extend(df[mod_txt_score_col].values)
            # The same frame is reused across models; 'score' is overwritten on each iteration.
            model_txt_df['score'] = score_list


            path = f"{ALIGNMENTS_DOC_DIR}/{dataset_name}/{prompt_strategy}/{suffix}/{model}.csv"
            os.makedirs(os.path.dirname(path), exist_ok=True)
            model_txt_df.to_csv(path, index=False)

### RB (+DG) Datasets Dialect Analysis

In [10]:
def detect_dialect_scores(text):
    """Run `detectDialect` on the stringified text and return four of its entries as a Series.

    The returned Series is indexed by 'aal_score', 'hispanic_score',
    'white_score' and 'other_score', taken from the detector's 'aav',
    'hispanic', 'white' and 'other' entries respectively.
    """
    dialect_result = detectDialect(str(text))
    output_to_source_key = (
        ('aal_score', 'aav'),
        ('hispanic_score', 'hispanic'),
        ('white_score', 'white'),
        ('other_score', 'other'),
    )
    return pd.Series({
        output_name: dialect_result[source_key]
        for output_name, source_key in output_to_source_key
    })

def process_text_column(df, txt_col):
    """Score every text in `df[txt_col]` and return the scores with prefixed column names.

    Each resulting column is named `<txt_col>$<score name>`. Uses
    `progress_apply`, so `tqdm.pandas()` must have been called beforehand.
    """
    per_row_scores = df[txt_col].progress_apply(detect_dialect_scores)
    prefixed_names = ['{}${}'.format(txt_col, score_name) for score_name in per_row_scores.columns]
    per_row_scores.columns = prefixed_names
    return per_row_scores

def analyze_dialect_scores(df, dataset_name, execution_config):
    """Compute mean dialect scores for every text column of a dataset.

    Args:
        df: frame holding the text columns to analyse.
        dataset_name: name used to label the results and to decide which
            column roles exist (DG only has the CHOSEN_KEY column; other
            datasets have prompt, chosen and rejected columns).
        execution_config: the execution config for this dataset; its column
            prefixes and suffixes determine which columns are analysed.

    Returns:
        A list of dicts, one per (suffix, text column), with keys 'Dataset',
        'Suffix', 'Column' and the mean 'aal', 'hispanic', 'white' and
        'other' scores of that column.
    """
    # Parse the config that was passed in. Previously this read the notebook-level loop
    # variable `rb_execution_config` and re-fetched the frame from DF_DICT, silently
    # ignoring all three arguments.
    _, col_prefix_dict, _base_suffix, _mod_suffixes, all_suffixes, _ = config_utils.parse_rb_execution_config(execution_config)
    col_prefixes = [CHOSEN_KEY] if dataset_name == DG_DATASET_NAME else [PROMPT_KEY, CHOSEN_KEY, REJECTED_KEY]

    results = []
    for suffix in all_suffixes:
        for col_prefix in col_prefixes:
            txt_col = f'{col_prefix_dict[col_prefix]}_{suffix}'
            # Only the column means are needed, so the per-row scores are not joined onto df.
            scores = process_text_column(df, txt_col)
            results.append({
                'Dataset': dataset_name,
                'Suffix': suffix,
                'Column': txt_col,
                'aal': scores[f'{txt_col}$aal_score'].mean(),
                'hispanic': scores[f'{txt_col}$hispanic_score'].mean(),
                'white': scores[f'{txt_col}$white_score'].mean(),
                'other': scores[f'{txt_col}$other_score'].mean()
            })

    return results

# Run the dialect analysis over every configured dataset, or reuse the cached CSV.
if not os.path.exists(OUR_DATASETS_DIALECT_ANALYSIS_PATH) or FORCE_RERUN_OUR_DATASETS_DIALECT_ANALYSIS:
    all_results = []
    for rb_execution_config in RB_EXECUTION_CONFIGS:
        dataset_name = rb_execution_config[DATASET_NAME_KEY]
        if dataset_name == DG_DATASET_NAME and not HAVE_ALL_DG_DATASETS:
            continue
        df = DF_DICT[dataset_name]
        results = analyze_dialect_scores(df, dataset_name, rb_execution_config)
        all_results.extend(results)

    our_datasets_dialect_analysis_df = pd.DataFrame(all_results)
    our_datasets_dialect_analysis_df.to_csv(OUR_DATASETS_DIALECT_ANALYSIS_PATH, index=False)
else:
    # Cache hit: load the saved results so `our_datasets_dialect_analysis_df` is defined on
    # both paths (previously it only existed after a fresh run).
    our_datasets_dialect_analysis_df = pd.read_csv(OUR_DATASETS_DIALECT_ANALYSIS_PATH)

### RM Training Datasets Dialect Analysis

In [11]:
# Load existing results from JSON if available
# When a rerun is forced (or no file exists) start from an empty dict, so every
# dataset/split/column below is recomputed.
if os.path.exists(AVG_AAV_SCORES_JSON_PATH) and not FORCE_RERUN_RM_TRAINING_DATASETS_DIALECT_ANALYSIS:
    with open(AVG_AAV_SCORES_JSON_PATH, "r") as f:
        avg_aav_scores_dict = json.load(f)
else:
    avg_aav_scores_dict = {}

def get_dataset(dataset_name, split, save_path, config_name=None):
    """Return a dataset split, using `save_path` as an on-disk cache.

    The parent directory of `save_path` is created if needed. If `save_path`
    already exists the dataset is loaded from it; otherwise it is fetched with
    `load_dataset` (using `config_name` as the config) and saved there first.
    """
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # Ensure directory exists
    if os.path.exists(save_path):
        return Dataset.load_from_disk(save_path)
    fetched = load_dataset(dataset_name, split=split, name=config_name)
    fetched.save_to_disk(save_path)
    return fetched

def get_aav_score(text):
    """Return the 'aav' entry of `detectDialect` applied to the stringified text."""
    return detectDialect(str(text))['aav']

# Process each dataset
# Each entry of RM_TRAIN_DATASETS unpacks to (dataset name, settings dict); the settings
# provide 'splits', 'text_cols' and optionally 'config'.
for d_name, d_vals in RM_TRAIN_DATASETS:
    config_name = d_vals['config'] if 'config' in d_vals else None
    for split in d_vals['splits']:
        for col in d_vals['text_cols']:
            score_key = f"{d_name}/{split}/{col}_aav_score"
            
            # Skip if the result already exists
            if score_key in avg_aav_scores_dict:
                print(f"Skipping {score_key}, result already exists.")
                continue

            # Load dataset
            # '/' in the dataset name is replaced so the name maps to a single directory.
            dataset_path = os.path.join(RM_TRAINING_DATASETS_DIR, d_name.replace('/', '_'), split)
            dataset =  get_dataset(d_name, split, dataset_path, config_name=config_name)

            # Limit dataset to a random sample of min(full size, 30k)
            # NOTE(review): no random seed is set in this cell, so the sample (and the
            # resulting average) can differ between runs.
            max_samples = min(len(dataset), 30000)
            random_indices = random.sample(range(len(dataset)), max_samples)
            dataset = dataset.select(random_indices)

            # Compute AAV scores
            score_col = f"{col}_aav_score"
            dataset = dataset.map(lambda row: {**row, score_col: get_aav_score(row[col])})
            aav_scores = [row[score_col] for row in dataset if score_col in row]

            # Calculate and store average AAV
            # An empty sample yields an average of 0 rather than a division error.
            avg_aav = sum(aav_scores) / len(aav_scores) if aav_scores else 0
            avg_aav_scores_dict[score_key] = avg_aav

            # Print result
            print(f"Average 'aav' for {col} in {d_name}/{split}: {avg_aav}")

    # Save all results to JSON
    # Written once per dataset (inside the outer loop), so finished datasets are kept
    # even if a later one fails.
    with open(AVG_AAV_SCORES_JSON_PATH, "w") as json_file:
        json.dump(avg_aav_scores_dict, json_file, indent=4)

# Convert the dictionary to a DataFrame
avg_aav_df = pd.DataFrame(list(avg_aav_scores_dict.items()), columns=['Dataset_Score_Column', 'Avg_AAV'])

# Start from previously saved per-model averages, if any. Every model found in
# RM_TRAIN_DATASETS is recomputed and overwritten below; saved entries for other
# models are carried over unchanged.
if os.path.exists(MODEL_AVG_AAV_SCORE_JSON_PATH):
    with open(MODEL_AVG_AAV_SCORE_JSON_PATH, "r") as f:
        model_avg_aav_score_dict = json.load(f)
else:
    model_avg_aav_score_dict = {}

# Process each dataset
# First collect every model named by any dataset's 'models' list.
models_set = set()
for _, vals in RM_TRAIN_DATASETS:
    for model in vals['models']:
        models_set.add(model)
for model in models_set:
    # Parallel lists: one average score and one dataset length per (dataset, split, column).
    scores = []
    lengths = []
    for d_name, d_vals in RM_TRAIN_DATASETS:
        if model not in d_vals['models']:
            continue
        config_name = d_vals['config'] if 'config' in d_vals else None
        for split in d_vals['splits']:
            for col in d_vals['text_cols']:
                score_key = f"{d_name}/{split}/{col}_aav_score"
                # Raises KeyError if the per-dataset average was not computed above.
                scores.append(avg_aav_scores_dict[score_key])
                dataset_path = os.path.join(RM_TRAINING_DATASETS_DIR, d_name.replace('/', '_'), split)
                dataset = get_dataset(d_name, split, dataset_path, config_name=config_name)
                # The weight is the full dataset length, even though the average itself
                # was computed on a sample of at most 30000 rows.
                lengths.append(len(dataset))
    # Length-weighted mean of the per-column averages.
    result = 0
    for score, length in zip(scores, lengths):
        result += score * length
    
    model_avg_aav_score_dict[model] = result / sum(lengths)

    print(model, model_avg_aav_score_dict[model])
    print(scores)
    print(lengths)
    print()

    # Save all results to JSON
    # Rewritten after each model.
    with open(MODEL_AVG_AAV_SCORE_JSON_PATH, "w") as json_file:
        json.dump(model_avg_aav_score_dict, json_file, indent=4)

Skipping Anthropic/hh-rlhf/train/chosen_aav_score, result already exists.
Skipping Anthropic/hh-rlhf/train/rejected_aav_score, result already exists.
Skipping argilla/dpo-mix-7k/train/chosen_aav_score, result already exists.
Skipping argilla/dpo-mix-7k/train/rejected_aav_score, result already exists.
Skipping NCSOFT/offsetbias/train/output_1_aav_score, result already exists.
Skipping NCSOFT/offsetbias/train/output_2_aav_score, result already exists.
Skipping RLHFlow/UltraFeedback-preference-standard/train/chosen_aav_score, result already exists.
Skipping RLHFlow/UltraFeedback-preference-standard/train/rejected_aav_score, result already exists.
Skipping RLHFlow/Helpsteer-preference-standard/train/chosen_aav_score, result already exists.
Skipping RLHFlow/Helpsteer-preference-standard/train/rejected_aav_score, result already exists.
Skipping RLHFlow/HH-RLHF-Helpful-standard/train/chosen_aav_score, result already exists.
Skipping RLHFlow/HH-RLHF-Helpful-standard/train/rejected_aav_score, r