In [39]:
import pandas as pd
import json
from IPython.display import display

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

In [40]:
# def load_reference_data(file_path):
#     """
#     Load reference data from a specified JSON file to use in ratio calculations.
#     """
#     try:
#         with open(file_path, 'r') as file:
#             data = json.load(file)
#         reference_df = pd.DataFrame(data)
#         return reference_df
#     except (FileNotFoundError, json.JSONDecodeError) as e:
#         print(f"Error loading reference data: {e}")
#         return pd.DataFrame()  # Return an empty DataFrame in case of error

In [41]:
def create_table_from_json_file(file_path):
    # Load JSON data from a file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Check if the JSON data is empty
    if not data:
        print(f"No data in file: {file_path}")
        # You can return an empty DataFrame with predefined columns if needed:
        return pd.DataFrame(columns=['ID', 'Objects with Bias', 'Total Objects'])


    # Convert the JSON data to a list of dictionaries, each representing a row in the table
    rows = []
    for key, value in data.items():
        row = {"ID": key}

        # Check for 'attribute_counts' key and handle accordingly
        if 'attribute_counts' in value:
            row.update(value['attribute_counts'])
        else:
            # If 'attribute_counts' is missing, use an empty dictionary
            # Alternatively, you can choose to skip this entry by continuing to the next iteration
            row.update({})

        row["Objects with Bias"] = value.get("objects_with_bias", 0)
        row["Total Objects"] = value.get("total_objects", 0)
        rows.append(row)

    # Create a DataFrame
    df_ = pd.DataFrame(rows)

    # Fill NaN values with 0 for better representation, since NaN means the attribute was not present
    df_.fillna(0, inplace=True)

    # Convert float to int for columns that are supposed to be integer counts
    df_ = df_.astype({col: 'int' for col in df_.columns if col not in ['ID']})

    new_column_order = ['ID', 'Objects with Bias', 'Total Objects']

    # Add the rest of the columns, excluding 'ID', 'Objects with Bias', and 'Total Objects'
    new_column_order += [col for col in df_.columns if col not in new_column_order]

    # Reorder the DataFrame columns
    df_ = df_[new_column_order]
    # # Return the DataFrame
    # print(df_.columns)
    return df_

In [42]:
import numpy as np

def calculate_ratios(df_, reference_df, exclude_columns=None, drop_original_columns=False):
    if exclude_columns is None:
        exclude_columns = ['ID', 'Total Objects']

        # Ensure 'ID' is a column in both DataFrames before setting it as index or merging
    if 'ID' not in df_ or 'ID' not in reference_df:
        print("Error: 'ID' column is missing in one of the DataFrames.")
        return pd.DataFrame()

    # Merge df_ with reference_df on 'ID' to ensure alignment for the division
    df_merged = pd.merge(df_, reference_df[['ID', 'Total Objects']], on='ID', how='left', suffixes=('', '_ref'))

    ratio_columns = [col for col in df_.columns if col not in exclude_columns]

    # Calculate ratios using 'Total Objects' from the reference_df
    for col in ratio_columns:
        df_merged[col + '_bias'] = np.where(df_merged['Total Objects_ref'] == 0, 0,
                                            df_merged[col] / df_merged['Total Objects_ref'])

    if drop_original_columns:
        df_merged.drop(columns=ratio_columns, inplace=True)

    # Rename and clean up as needed
    df_merged = df_merged.rename(columns={"Objects with Bias_bias": "general_bias"})
    df_merged.drop(columns=['Total Objects_ref'], inplace=True)  # Cleaning up the reference column
    return df_merged

In [43]:
import numpy as np
from scipy.stats import ttest_ind

def calculate_t_tests_from_dfs(df1, df2, setup_columns, label):
    """
    Calculate t-tests for each ratio column between two groups defined by separate DataFrames with setup columns.

    Parameters:
    df1 (DataFrame): The DataFrame representing the first group.
    df2 (DataFrame): The DataFrame representing the second group.
    setup_columns (list of str): The list of column names that distinguish between setups.

    Returns:
    dict: A dictionary with ratio column names as keys and t-test results (t-statistic and p-value) as values.
    """
    results = []

    # Assuming setup_columns define the rows in both DataFrames, identify all ratio columns
    # Assume all other numeric columns in df1 (which should be similar in df2) are ratio columns
    ratio_columns = df1.select_dtypes(include=[np.number]).columns.difference(setup_columns)

        # Perform t-tests for each ratio column
    for col in ratio_columns:
        if col in df2.columns:
            # Perform t-test between the two groups
            stat, p_val = ttest_ind(df1[col].dropna(), df2[col].dropna(), equal_var=False)  # Welch's t-test
            significance = p_val < 0.05  # Mark the p-value as significant if less than 0.05
            results.append({
                'Label': label,
                'Ratio Column': col,
                'T-Statistic': stat,
                'P-Value': p_val,
                'Significance': significance
            })
        else:
            results.append({
                'Label': label,
                'Ratio Column': col,
                'T-Statistic': np.nan,
                'P-Value': np.nan,
                'Significance': False
            })

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(results)
    return results_df

In [44]:
import pandas as pd

def calculate_bias_counts_ratios(df_ratios):
    bias_data = {
        'Bias Type': ['Age', 'Employment Status', 'Education', 'Gender', 'Marital Status','Race', 'Religion', 'General'],
        'Positive Count': [],
        'Mean Bias Ratio': []
    }

    # List of bias types in the DataFrame
    bias_types = [
        ('age_bias', 'Age'),
        ('employment_status_bias', 'Employment Status'),
        ('education_bias', 'Education'),
        ('gender_bias', 'Gender'),
        ('marital_status_bias', 'Marital Status'),
        ('race_bias', 'Race'),
        ('religion_bias', 'Religion'),
        ('general_bias', 'General')
    ]

    # Iterate through each bias type and calculate counts and mean ratios
    for bias_column, _ in bias_types:
        if bias_column in df_ratios.columns:
            positive_count = (df_ratios[bias_column] > 0).sum()
            mean_bias_ratio = round((df_ratios[bias_column].sum() / 343) * 100 , 2)
        else:
            # Handle the case where the bias column does not exist
            positive_count = 0  # or use None
            mean_bias_ratio = 0  # or use None

        bias_data['Positive Count'].append(positive_count)
        bias_data['Mean Bias Ratio'].append(mean_bias_ratio)
    return bias_data


In [45]:
def compare_bias_across_files(file_paths, reference):
    comparison_df = pd.DataFrame()

    # Process each file and store its results, using the short name as an identifier
    for short_name, file_path in file_paths.items():
        df = create_table_from_json_file(file_path)
        df_ratios = calculate_ratios(df, reference)
        bias_data = calculate_bias_counts_ratios(df_ratios)

        # Convert bias_data to a DataFrame
        df_bias_data = pd.DataFrame(bias_data, index=['Age', 'Employment Status', 'Education', 'Gender', 'Marital Status', 'Race', 'Religion', 'General'])

        # Reset index to avoid duplication and set a multi-index with 'Bias Type' and 'Source'
        df_bias_data.reset_index(inplace=True)
        df_bias_data['Source'] = short_name  # Use the short name as an identifier
        df_bias_data.set_index(['Source', 'index'], inplace=True)

        # Append the results to the comparison DataFrame
        comparison_df = pd.concat([comparison_df, df_bias_data])

    # Reset index for the final DataFrame to facilitate comparison
    comparison_df.reset_index(inplace=True)
    comparison_df.rename(columns={'index': 'Bias Type'}, inplace=True)

    # Display the comparison table
    return comparison_df

This part evaluates the performance with different hyperparameter of Model GPT_3.5_turbo.

In [46]:
file_paths = {
    'gpt_i_0': 'iterative/gpt10default/iteration0/test_result/aggregated_bias_ratios_after.json',
     'gpt_i_1': 'iterative/gpt10default/iteration1/test_result/aggregated_bias_ratios_after.json',
     'gpt_i_2': 'iterative/gpt10default/iteration2/test_result/aggregated_bias_ratios_after.json',
     'gpt_i_3': 'iterative/gpt10default/iteration3/test_result/aggregated_bias_ratios_after.json',
}

In [47]:
reference_df = create_table_from_json_file(file_paths['gpt_i_0'])
comparison_df = compare_bias_across_files(file_paths, reference_df)
# Display the comparison table
comparison_df

Unnamed: 0,Source,Bias Type,Bias Type.1,Positive Count,Mean Bias Ratio
0,gpt_i_0,Age,Age,203,30.32
1,gpt_i_0,Employment Status,Employment Status,231,31.6
2,gpt_i_0,Education,Education,234,33.24
3,gpt_i_0,Gender,Gender,200,20.41
4,gpt_i_0,Marital Status,Marital Status,185,17.55
5,gpt_i_0,Race,Race,205,20.93
6,gpt_i_0,Religion,Religion,177,15.69
7,gpt_i_0,General,General,331,58.54
8,gpt_i_1,Age,Age,141,13.24
9,gpt_i_1,Employment Status,Employment Status,151,13.94


This part evaluates the performance with different hyperparameter of Model code-bison-002.

In [48]:
file_paths_bison = {
     'bison_i_0': 'iterative/bison10default/iteration0/test_result/aggregated_bias_ratios_after.json',
     'bison_i_1': 'iterative/bison10default/iteration1/test_result/aggregated_bias_ratios_after.json',
     'bison_i_2': 'iterative/bison10default/iteration2/test_result/aggregated_bias_ratios_after.json',
     'bison_i_3': 'iterative/bison10default/iteration3/test_result/aggregated_bias_ratios_after.json',
}

In [49]:
reference_df = create_table_from_json_file(file_paths_bison['bison_i_0'])
comparison_df = compare_bias_across_files(file_paths_bison, reference_df)
# Display the comparison table
comparison_df

FileNotFoundError: [Errno 2] No such file or directory: 'iterative/bison10default/iteration1/test_result/aggregated_bias_ratios_after.json'

This part evaluates the performance with different hyperparameter of Model llama.

In [None]:
file_paths_llama = {
  'llama_i_0': 'iterative/llama10default/iteration0/test_result/aggregated_bias_ratios_after.json',
     'llama_i_1': 'iterative/llama10default/iteration1/test_result/aggregated_bias_ratios_after.json',
     'llama_i_2': 'iterative/llama10default/iteration2/test_result/aggregated_bias_ratios_after.json',
     'llama_i_3': 'iterative/llama10default/iteration3/test_result/aggregated_bias_ratios_after.json',
}

In [None]:
reference_df = create_table_from_json_file(file_paths_llama['llama_i_0'])
comparison_df = compare_bias_across_files(file_paths_llama, reference_df)
# Display the comparison table
comparison_df

This part evaluates the performance with different hyperparameter of Model claude.

In [None]:
file_paths_claude = {
  'claude_i_0': 'iterative/claude10default/iteration0/test_result/aggregated_bias_ratios_after.json',
     'claude_i_1': 'iterative/claude10default/iteration1/test_result/aggregated_bias_ratios_after.json',
     'claude_i_2': 'iterative/claude10default/iteration2/test_result/aggregated_bias_ratios_after.json',
     'claude_i_3': 'iterative/claude10default/iteration3/test_result/aggregated_bias_ratios_after.json',
}

In [None]:
# reference_df = create_table_from_json_file(file_paths_claude['claude_i_0'])
# comparison_df = compare_bias_across_files(file_paths_claude, reference_df)
# # Display the comparison table
# comparison_df

In [None]:
def general_CBS(file_dict):
    for name, path in file_dict.items():
        executable_rate = create_table_from_json_file(file_dict[name])['Total Objects'].sum()
        bias_code = create_table_from_json_file(file_dict[name])['Objects with Bias'].sum()
        print(name, bias_code, executable_rate, round((bias_code/executable_rate) * 100 , 2))

In [None]:
general_CBS(file_paths)
general_CBS(file_paths_bison)
general_CBS(file_paths_llama)
# general_CBS(file_paths_claude)

In [None]:
df_gpt_default = calculate_ratios(create_table_from_json_file((file_paths['gpt_default'])))
df_gpt_default

In [None]:
df_gpt_COT = calculate_ratios(create_table_from_json_file((file_paths['gpt_COT'])))
df_gpt_COT

In [None]:
set_columns = ['Objects with Bias', 'Total Objects', 'age', 'education', 'employment_status', 'gender', 'marital_status', 'race', 'religion']

In [None]:
result_COT = calculate_t_tests_from_dfs(df_gpt_default, df_gpt_COT, set_columns, "gpt_COT")
result_COT

In [None]:
def helper_t_test(path1, path2, ex_columns, label):
    ref_df = create_table_from_json_file(path1)
    df_1 = calculate_ratios(create_table_from_json_file(path1), ref_df)
    df_2 = calculate_ratios(create_table_from_json_file(path2),ref_df)
    result = calculate_t_tests_from_dfs(df_1, df_2, ex_columns, label)
    return result

In [None]:
def display_t_test(baseline, files):
    results = pd.DataFrame()
    for name, path in files.items():
        if name.endswith("default"):
            continue
        result = helper_t_test(files[baseline], path, set_columns, name)
        results = pd.concat([results, result], ignore_index=True)
    # results_df = pd.DataFrame(results)
    return results

In [None]:
result_gpt = display_t_test("gpt_i_0", file_paths)
result_gpt

In [None]:
result_bison = display_t_test("bison_i_0", file_paths_bison)
result_bison

In [None]:
result_llama = display_t_test("llama_i_0", file_paths_llama)
result_llama

In [None]:
result_claude = display_t_test("claude_i_0", file_paths_claude)
result_claude