In [None]:
import os
import glob
import csv
import pandas as pd 

# Root folder for every file this notebook reads or writes: the "data"
# directory inside the current working directory of the kernel.
data_root_folder = os.path.join(os.getcwd(), "data")

Requests to Phi sometimes fail with an error. The processing notebook was therefore written to save the current results to a file and, when restarted, to skip the questions that were already processed. The remaining questions are saved to a new file until another error occurs or all the questions have been processed.

This piece of code merges the incremental result files from interrupted runs into a single file per category.

In [None]:
# Merge the incremental result files left in data/temp by interrupted runs
# into one CSV per (category, model) pair under data/merged.
#
# File names are parsed as "<category>--<model>[--<anything>].csv": only the
# first two "--"-separated parts of the base name are used.
filenames = os.path.join(data_root_folder, "temp", "*.csv")
merged_folder = os.path.join(data_root_folder, "merged")
os.makedirs(merged_folder, exist_ok=True)

# Partial frames are grouped by (category, model) so that rows coming from a
# different category or model never end up in the same merged file.
partial_frames = {}
# sorted() makes the row order of the merged file reproducible across runs.
for f in sorted(glob.glob(filenames)):
    # Parse the base name only: this works with both "/" and "\" separators
    # and is not confused by "--" appearing in a parent directory name.
    parts = os.path.basename(f).removesuffix(".csv").split("--")
    if len(parts) < 2:
        print(f"Skipping {f}: expected '<category>--<model>' in the file name")
        continue
    cat, model = parts[0], parts[1]

    try:
        df = pd.read_csv(f)
    except Exception as e:
        print(f"Error reading {f}: {e}")
        continue

    partial_frames.setdefault((cat, model), []).append(df)

# Concatenate and write each group exactly once.
for (cat, model), frames in partial_frames.items():
    output_df = pd.concat(frames, ignore_index=True)
    output_file = os.path.join(merged_folder, f"{cat}--{model}.csv")
    output_df.to_csv(
        output_file,
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding='utf-8')

    print(f"Merged {len(frames)} files into {output_file}")

This code combines the results files obtained from my experiment with the original BBQ file, so that all the metadata is available in the same file for data analysis.

It also extracts 'additional_metadata' from BBQ into separate columns in the CSV, including one-hot encoding for 'stereotyped_groups'.

Interesting refs from pandas:
- https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
- https://pandas.pydata.org/docs/reference/api/pandas.concat.html

In [None]:
from bbq_dataset import BBQ_CATEGORIES, fetch_bbq_dataframe

# Enrich every merged results file (data/merged/<category>--<model>.csv) with
# the metadata of the original BBQ file of the same category and write the
# combined table to data/final/<category>--<model>.csv.
filenames = os.path.join(data_root_folder, "merged", "*.csv")
bbq_folder = os.path.join(data_root_folder, "bbq")
final_folder = os.path.join(data_root_folder, "final")
os.makedirs(final_folder, exist_ok=True)

for f in sorted(glob.glob(filenames)):
    # Parse the base name only so the code works with "/" and "\" separators.
    parts = os.path.basename(f).removesuffix(".csv").split("--")
    if len(parts) < 2:
        print(f"Skipping {f}: expected '<category>--<model>' in the file name")
        continue
    cat, model = parts[0], parts[1]

    # BBQ_CATEGORIES is used as a mapping from category to BBQ file name;
    # an unknown category would otherwise make os.path.join fail on None.
    bbq_file = BBQ_CATEGORIES.get(cat)
    if bbq_file is None:
        print(f"Skipping {f}: unknown BBQ category '{cat}'")
        continue
    bbq_filename = os.path.join(bbq_folder, bbq_file)

    # Reads the original BBQ json file (JSON lines), indexed by example_id
    try:
        json_df = pd.read_json(bbq_filename, lines=True).set_index("example_id", drop=False)
    except Exception as e:
        print(f"Error reading {bbq_filename}: {e}")
        continue

    # Extract additional metadata to single columns (indexed by example_id)
    meta = (
        json_df["additional_metadata"]
        .apply(lambda x: {} if pd.isna(x) else x)
        .apply(pd.Series)[["subcategory", "stereotyped_groups", "version", "source"]]
    )

    # One-hot encode 'stereotyped_groups' (joined with '|', lower-cased) and
    # keep the scalar metadata columns next to the indicator columns instead
    # of discarding them.
    groups_onehot = (
        meta["stereotyped_groups"].str.join("|").str.lower().str.get_dummies(sep="|")
    )
    meta = pd.concat([meta.drop(columns=["stereotyped_groups"]), groups_onehot], axis=1)

    # Adds a flag telling whether the label is the unknown answer.
    # NOTE(review): fetch_bbq_dataframe is defined elsewhere; it is assumed to
    # return 'example_id' and 'correct_answer_unknown' columns - confirm.
    bbq_df = fetch_bbq_dataframe(category=cat, root_folder=bbq_folder)
    bbq_df = bbq_df.filter(["example_id", "correct_answer_unknown"]).rename(
        columns={"correct_answer_unknown": "label_is_unknown"}
    )
    json_df = (
        json_df.drop(columns=["additional_metadata"])
        .reset_index(drop=True)
        .merge(bbq_df.reset_index(drop=True), on="example_id")
        # Re-index by example_id so the column-wise concat below aligns rows
        # by example id rather than by row position.
        .set_index("example_id", drop=False)
    )

    # Reads my results CSV file, indexed by example_id (must be unique)
    try:
        csv_df = (
            pd.read_csv(f)
            .set_index("example_id", verify_integrity=True)
            .drop(columns=["question"])  # already present in the BBQ frame
        )
    except Exception as e:
        print(f"Error reading {f}: {e}")
        continue

    # Merge the dataframes into a single one, keeping exactly the BBQ rows
    res_df = pd.concat([json_df, csv_df, meta], axis=1).reindex(json_df.index)

    # Writes the combined dataframe to a CSV file
    output_file = os.path.join(final_folder, f"{cat}--{model}.csv")
    res_df.to_csv(
        output_file,
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding='utf-8')

    print(f"Merged {f} into {output_file}")

This piece of code merges all the results for the same BBQ category obtained from different models into a single CSV for data analysis.

These are the files used in the data analysis section of the final report.

In [None]:
# Category whose per-model result files are combined into a single CSV.
bbq_category = "disability_status"

# Combine data/final/<category>--<model>.csv for every model into
# data/final/<category>--ALL.csv, adding a leading 'Model' column.
final_folder = os.path.join(data_root_folder, "final")
filenames = os.path.join(final_folder, bbq_category + "--*.csv")
output_file = os.path.join(final_folder, f"{bbq_category}--ALL.csv")

model_frames = []
merged_files = []
# sorted() makes the row order of the combined file reproducible.
for f in sorted(glob.glob(filenames)):
    # Parse the base name only so the code works with "/" and "\" separators;
    # the glob pattern guarantees a "--" separator in the name.
    model = os.path.basename(f).removesuffix(".csv").split("--")[1]

    # Skip the combined file produced by a previous run of this cell.
    if model == 'ALL':
        continue

    try:
        df = pd.read_csv(f)
        df.insert(loc=0, column='Model', value=model)
    except Exception as e:
        print(f"Error reading {f}: {e}")
        continue

    model_frames.append(df)
    merged_files.append(f)

# Concatenate once and write once instead of rewriting the file per model.
if model_frames:
    output_df = pd.concat(model_frames, ignore_index=True)
    output_df.to_csv(
        output_file,
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        encoding='utf-8')
    for f in merged_files:
        print(f"Merged {f} into {output_file}")
else:
    print(f"No result files found for {bbq_category}")
