In [11]:
import ast
import re
import unicodedata
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

warnings.filterwarnings('ignore')

### Print accuracy before and after correct context

(Not in the paper; this is an initial sanity check of the results.)

#### Helper functions for this task

In [2]:
def convert_time(time_str):
    """Convert a colon-separated time string into seconds and minutes.

    Supported layouts are ``HH:MM:SS``, ``MM:SS`` and ``SS``; each field may
    carry a fractional part (e.g. ``"1:02:03.5"``).

    Args:
        time_str: The time string to parse.

    Returns:
        A ``(total_seconds, total_minutes)`` tuple. Strings with more than
        three colon-separated fields are not parsed and yield ``(0, 1)``.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    pieces = time_str.split(':')
    field_count = len(pieces)

    # Anything longer than HH:MM:SS is not a recognised layout.
    if field_count > 3:
        return 0, 1

    numbers = [float(piece) for piece in pieces]
    if field_count == 3:
        hrs, mins, secs = numbers
        in_seconds = hrs * 3600 + mins * 60 + secs
    elif field_count == 2:
        mins, secs = numbers
        in_seconds = mins * 60 + secs
    else:
        # str.split always returns at least one field, so this is the SS case.
        in_seconds = numbers[0]

    return in_seconds, in_seconds / 60
    
def compare_times(time1: str, time2: str) -> bool:
    """Decide whether two free-form time strings denote roughly the same time.

    Two checks are applied in order:

    1. Digit containment: all non-digit characters are dropped and trailing
       zeros are stripped; if one digit string contains the other the times
       match (so ``"1:23.45"`` matches ``"01:23.450"``). This shortcut is only
       used when both strings actually contain digits, because an empty
       string is a substring of every string and would otherwise make any
       non-answer ("N/A", "unknown") match every time.
    2. Numeric closeness: both strings are parsed with ``convert_time`` and
       each of the seconds/minutes values of one is compared with each value
       of the other (all four pairings, including seconds against minutes);
       an absolute difference of at most 0.1 counts as a match.
       NOTE(review): for the minutes-vs-minutes pairing a 0.1 tolerance
       corresponds to 6 seconds - confirm this leniency is intended.

    Args:
        time1: First time string.
        time2: Second time string.

    Returns:
        True if either check matches; False otherwise, including when a
        string cannot be parsed as a time.
    """
    digits1 = ''.join(ch for ch in time1 if ch.isdigit()).rstrip('0')
    digits2 = ''.join(ch for ch in time2 if ch.isdigit()).rstrip('0')

    # Guard against '' in '...' being trivially True.
    if digits1 and digits2 and (digits1 in digits2 or digits2 in digits1):
        return True

    try:
        candidates1 = convert_time(time1)
        candidates2 = convert_time(time2)
    except ValueError:
        # At least one string is not a parseable time.
        return False

    return any(abs(t1 - t2) <= 0.1 for t1 in candidates1 for t2 in candidates2)

def normalize_name(name: str) -> str:
    """Reduce a name to lower-case ASCII letters with single-space separators.

    The name is split on whitespace; each token is lower-cased and stripped of
    every character outside ``a-z``. Accented letters are first decomposed
    (Unicode NFKD) so that their base letter survives the filter
    (``"José"`` -> ``"jose"`` rather than ``"jos"``). Tokens that end up empty
    (e.g. a lone ``"-"``) are dropped so the result never contains consecutive
    spaces.

    Args:
        name: The raw name.

    Returns:
        The normalized name; an empty string if no letters remain.
    """
    # NFKD splits e.g. "é" into "e" + a combining accent; the accent is then
    # removed by the a-z filter below. Letters without a decomposition
    # (such as "ø") are still dropped.
    decomposed = unicodedata.normalize('NFKD', name)
    tokens = (re.sub(r'[^a-z]', '', token.lower()) for token in decomposed.split())
    return " ".join(token for token in tokens if token)

def compare_names(name1: str, name2: str) -> bool:
    """Check whether two names refer to the same person after normalization.

    Both names are passed through ``normalize_name``. If either one consists
    of a single token, that token only has to occur as a substring of the
    other normalized name (the first name is checked first). Otherwise the
    two normalized names must be identical.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        True if the names are considered a match, False otherwise.
    """
    first = normalize_name(name1)
    second = normalize_name(name2)

    first_tokens = first.split()
    second_tokens = second.split()

    # A lone token (e.g. just a surname) is matched by containment.
    if len(first_tokens) == 1:
        return first_tokens[0] in second
    if len(second_tokens) == 1:
        return second_tokens[0] in first

    # Multi-token names must agree exactly.
    return first == second

def _accuracy_by_dataset(df: pd.DataFrame, correct_col: str) -> dict:
    """Mean of ``correct_col`` per ``dataset`` value plus an overall ``'All'`` entry."""
    accuracy = df.groupby('dataset')[correct_col].mean().to_dict()
    # The overall figure is the mean over all rows, not the mean of group means.
    accuracy['All'] = float(df[correct_col].mean())
    return accuracy


def calculate_prior_accuracy(df: pd.DataFrame) -> dict:
    """Accuracy of the model's prior (no-context) answers, per dataset.

    Rows are first de-duplicated on ``question`` (keeping the first
    occurrence) so that each question contributes once. The input frame is
    not modified.

    Args:
        df: Frame with ``question``, ``dataset`` and ``prior_correct`` columns.

    Returns:
        Dict mapping each dataset name to its mean ``prior_correct``, with an
        extra ``'All'`` key holding the mean over all de-duplicated rows.
    """
    unique_questions = df.drop_duplicates(subset='question', keep="first")
    return _accuracy_by_dataset(unique_questions, 'prior_correct')


def calculate_post_accuracy(df: pd.DataFrame) -> dict:
    """Accuracy of the model's with-context answers, per dataset.

    Unlike ``calculate_prior_accuracy`` no de-duplication is performed; the
    caller is expected to pass the rows it wants scored.

    Args:
        df: Frame with ``dataset`` and ``post_correct`` columns.

    Returns:
        Dict mapping each dataset name to its mean ``post_correct``, with an
        extra ``'All'`` key holding the mean over all rows.
    """
    return _accuracy_by_dataset(df, 'post_correct')

#### Run this cell to get the table

In [19]:
# Maps each model's file stem (under data/model_responses/) to its display name.
model_name_pretty_dict = {
    'claudeopus': 'Claude Opus',
    'claudesonnet': 'Claude Sonnet',
    'gemini15flash': 'Gemini 1.5 Flash',
    'gpt4': 'GPT-4o',
    'gpt35': 'GPT-3.5',
    'llama3': 'Llama-3-8b-Instruct',
}
# Models are processed in the order they are declared above.
model_names = list(model_name_pretty_dict)

In [5]:
# For every model, show per-dataset accuracy without context next to the
# accuracy obtained on the rows whose mod_type is '0'.
for model_name in model_names:
    df = pd.read_parquet(f'data/model_responses/{model_name}.pqt')

    accuracy_by_setting = {
        'Model Prior': calculate_prior_accuracy(df),
        'With Correct Context': calculate_post_accuracy(df[df['mod_type'] == '0']),
    }

    # Rows become datasets, columns become the two settings.
    prior_post_table = pd.DataFrame.from_dict(accuracy_by_setting, orient='index').T
    prior_post_table.index = [label.capitalize() for label in prior_post_table.index]
    prior_post_table = prior_post_table.reset_index()
    prior_post_table.columns = ['Dataset', 'Acc. Without Context', 'Acc. With Correct Context']

    print(model_name)
    display(prior_post_table)

claudeopus


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.566265,0.827309
1,Locations,0.55,0.935
2,Names,0.4,0.995
3,News,0.109244,0.966387
4,Records,0.717277,0.95288
5,Years,0.49,0.98
6,All,0.463224,0.938967


claudesonnet


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.534137,0.7751
1,Locations,0.405,0.93
2,Names,0.285,0.995
3,News,0.096639,0.936975
4,Records,0.507853,0.879581
5,Years,0.215,0.98
6,All,0.339593,0.911581


gemini15flash


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.212851,0.73494
1,Locations,0.325,0.92
2,Names,0.2,0.995
3,News,0.084034,0.957983
4,Records,0.507853,0.842932
5,Years,0.205,0.99
6,All,0.247261,0.902191


gpt4


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.578313,0.863454
1,Locations,0.575,0.925
2,Names,0.445,0.99
3,News,0.088235,0.970588
4,Records,0.628272,0.921466
5,Years,0.54,0.99
6,All,0.467136,0.941315


gpt35


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.445783,0.751004
1,Locations,0.41,0.875
2,Names,0.295,0.985
3,News,0.063025,0.907563
4,Records,0.591623,0.795812
5,Years,0.295,0.98
6,All,0.343505,0.878717


llama3


Unnamed: 0,Dataset,Acc. Without Context,Acc. With Correct Context
0,Drugs,0.317269,0.598394
1,Locations,0.29,0.915
2,Names,0.165,0.925
3,News,0.071429,0.911765
4,Records,0.376963,0.52356
5,Years,0.16,0.975
6,All,0.2277,0.805164


### Print Table 2

#### Helper functions for this task

In [8]:
def sample_to_match_smaller_group(subset: pd.DataFrame) -> pd.DataFrame:
    """Balance the prior-correct and context-correct rows by down-sampling.

    The rows with ``prior_correct == 1`` and the rows with
    ``context_correct == 1`` are each sampled without replacement (fixed
    ``random_state=1``) down to the size of the smaller of the two groups.

    Args:
        subset: Frame with ``prior_correct`` and ``context_correct`` columns.

    Returns:
        The sampled prior-correct rows followed by the sampled
        context-correct rows, with their original index labels.
    """
    prior_rows = subset[subset['prior_correct'] == 1]
    context_rows = subset[subset['context_correct'] == 1]

    # Both groups are cut to the size of the smaller one.
    target_size = min(len(prior_rows), len(context_rows))

    balanced_groups = [
        group.sample(n=target_size, replace=False, random_state=1)
        for group in (prior_rows, context_rows)
    ]
    return pd.concat(balanced_groups)

def _answers_match(dataset, candidate, response):
    """Compare a candidate answer with a response using the dataset's matching rule."""
    if dataset == 'records':
        return compare_times(candidate, response)
    if dataset == 'names':
        return compare_names(candidate, response)
    # All other datasets use exact equality.
    return candidate == response


def create_eval_df(df, response_col='post_response'):
    """Build the balanced evaluation frame and label which answer was chosen.

    Steps:
      1. Add ``context_correct`` (1 where ``mod_type == '0'``) and
         ``neither_correct`` to ``df``. These two columns are written onto the
         caller's frame in place, as before.
      2. Keep only rows where exactly one of ``prior_correct`` /
         ``context_correct`` is 1 and balance the two groups with
         ``sample_to_match_smaller_group``.
      3. For every remaining row, record whether the response in
         ``response_col`` matches the prior answer (``prior_chosen``) and
         whether it matches the context answer ``answer_mod``
         (``context_chosen``).

    The chosen-flags are assigned positionally rather than row by row through
    ``.loc[idx, ...]``: label-based writes hit every row sharing an index
    label, which silently corrupts the flags when the frame's index is not
    unique.

    Args:
        df: Frame with ``mod_type``, ``prior_correct``, ``dataset``,
            ``prior_response``, ``answer_mod`` and ``response_col`` columns.
        response_col: Column holding the response to evaluate.

    Returns:
        The balanced frame with integer ``context_chosen`` and
        ``prior_chosen`` columns added.
    """
    df['context_correct'] = (df['mod_type'] == '0').astype(int)
    df['neither_correct'] = ((df['context_correct'] == 0) & (df['prior_correct'] == 0)).astype(int)

    # XOR: exactly one of the two answer sources is correct.
    eval_df = df[(df['prior_correct'] == 1) ^ (df['context_correct'] == 1)]
    eval_df = sample_to_match_smaller_group(eval_df)

    prior_flags = []
    context_flags = []
    for _, row in eval_df.iterrows():
        response = row[response_col]
        prior_flags.append(int(_answers_match(row['dataset'], row['prior_response'], response)))
        context_flags.append(int(_answers_match(row['dataset'], row['answer_mod'], response)))

    eval_df['context_chosen'] = context_flags
    eval_df['prior_chosen'] = prior_flags
    return eval_df

In [9]:
def construct_2x2_matrix_(df: pd.DataFrame) -> pd.DataFrame:
    """Cross-tabulate which answer was chosen against which answer was correct.

    Despite the name the result has three rows: ``Prior Chosen``,
    ``Context Chosen`` and ``Neither Chosen``. The two columns are
    ``Prior Correct`` and ``Context Correct``. Each cell is a row count.

    Args:
        df: Frame with ``prior_chosen``, ``context_chosen``,
            ``prior_correct`` and ``context_correct`` columns.

    Returns:
        A 3x2 frame of counts.
    """
    chose_prior = df['prior_chosen'] == 1
    chose_context = df['context_chosen'] == 1
    chose_neither = (df['prior_chosen'] == 0) & (df['context_chosen'] == 0)
    choices = (chose_prior, chose_context, chose_neither)

    counts = {}
    for column_label, truth_col in (('Prior Correct', 'prior_correct'),
                                    ('Context Correct', 'context_correct')):
        is_correct = df[truth_col] == 1
        counts[column_label] = [(choice & is_correct).sum() for choice in choices]

    return pd.DataFrame(counts, index=['Prior Chosen', 'Context Chosen', 'Neither Chosen'])

def bootstrap_ci(data, num_bootstrap=1000, ci=95, random_state=None):
    """Percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: A pandas object supporting ``.sample`` and ``.mean``.
        num_bootstrap: Number of resamples (each the same size as ``data``,
            drawn with replacement).
        ci: Confidence level in percent.
        random_state: Optional seed. When given, the resamples are drawn from
            a dedicated ``numpy.random.RandomState`` so the interval is
            reproducible; when ``None`` (default) numpy's global random state
            is used.

    Returns:
        ``(lower, upper)`` percentile bounds of the bootstrapped means.
    """
    rng = None if random_state is None else np.random.RandomState(random_state)
    means = [
        data.sample(frac=1, replace=True, random_state=rng).mean()
        for _ in range(num_bootstrap)
    ]
    tail = (100 - ci) / 2
    lower = np.percentile(means, tail)
    upper = np.percentile(means, 100 - tail)
    return lower, upper

### Run this cell to print Table 2

In [70]:
# Number of bootstrap resamples and the seed that makes the intervals reproducible.
N_BOOTSTRAP = 1000
BOOTSTRAP_SEED = 0

for model_name in model_names:
    model_name_pretty = model_name_pretty_dict[model_name]
    df = pd.read_parquet(f'data/model_responses/{model_name}.pqt').reset_index()
    eval_df = create_eval_df(df)

    # Column-normalised rates: each column (which answer was correct) sums to 1.
    counts = construct_2x2_matrix_(eval_df)
    rates = counts.div(counts.sum(axis=0), axis=1)

    # One set of resamples is shared by all cells of the table instead of
    # redrawing N_BOOTSTRAP resamples for every cell.
    rng = np.random.RandomState(BOOTSTRAP_SEED)
    boot_rates = np.empty((N_BOOTSTRAP,) + rates.shape)
    for b in range(N_BOOTSTRAP):
        sample = eval_df.sample(frac=1, replace=True, random_state=rng)
        sample_counts = construct_2x2_matrix_(sample)
        sample_rates = sample_counts.div(sample_counts.sum(axis=0), axis=1)
        boot_rates[b] = sample_rates.to_numpy(dtype=float)

    lower_ci = np.percentile(boot_rates, 2.5, axis=0)
    upper_ci = np.percentile(boot_rates, 97.5, axis=0)

    # Build the display table separately so the numeric rates stay numeric.
    mat = pd.DataFrame(
        [
            [
                f"{rates.iat[i, j]:.3f} ({lower_ci[i, j]:.3f}, {upper_ci[i, j]:.3f})"
                for j in range(rates.shape[1])
            ]
            for i in range(rates.shape[0])
        ],
        index=rates.index,
        columns=rates.columns,
    )

    print(model_name_pretty)
    display(mat)

Claude Opus


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.585 (0.550, 0.620)","0.042 (0.028, 0.057)"
Context Chosen,"0.313 (0.279, 0.344)","0.901 (0.881, 0.922)"
Neither Chosen,"0.102 (0.081, 0.124)","0.057 (0.040, 0.074)"


Claude Sonnet


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.436 (0.404, 0.468)","0.051 (0.036, 0.065)"
Context Chosen,"0.401 (0.371, 0.432)","0.881 (0.860, 0.902)"
Neither Chosen,"0.163 (0.139, 0.187)","0.068 (0.052, 0.087)"


Gemini 1.5 Flash


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.388 (0.360, 0.415)","0.074 (0.058, 0.089)"
Context Chosen,"0.490 (0.461, 0.517)","0.860 (0.837, 0.881)"
Neither Chosen,"0.122 (0.102, 0.143)","0.066 (0.051, 0.081)"


GPT-4o


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.327 (0.292, 0.363)","0.041 (0.027, 0.056)"
Context Chosen,"0.608 (0.575, 0.642)","0.903 (0.882, 0.926)"
Neither Chosen,"0.065 (0.048, 0.083)","0.056 (0.038, 0.074)"


GPT-3.5


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.237 (0.213, 0.265)","0.057 (0.043, 0.072)"
Context Chosen,"0.626 (0.596, 0.654)","0.841 (0.818, 0.863)"
Neither Chosen,"0.137 (0.115, 0.159)","0.102 (0.082, 0.124)"


Llama-3-8b-Instruct


Unnamed: 0,Prior Correct,Context Correct
Prior Chosen,"0.208 (0.184, 0.230)","0.041 (0.030, 0.053)"
Context Chosen,"0.529 (0.499, 0.557)","0.793 (0.766, 0.818)"
Neither Chosen,"0.263 (0.237, 0.291)","0.166 (0.142, 0.190)"


## Table 3

### Helper functions for this task

In [78]:
def get_metrics(mat):
    """Derive accuracy and the two bias rates from a chosen-vs-correct count matrix.

    ``mat`` is indexed as ``mat[row][col]``. With the layout produced by
    ``construct_2x2_matrix_`` (rows: prior / context / neither chosen;
    columns: prior / context correct) the metrics are:

    * accuracy: share of all rows on the diagonal ``mat[0][0] + mat[1][1]``;
    * context bias: share of rows in ``mat[1][0]`` (context chosen while the
      prior was correct);
    * prior bias: share of rows in ``mat[0][1]`` (prior chosen while the
      context was correct).

    Args:
        mat: Array-like of counts with at least two rows and two columns.

    Returns:
        ``(accuracy, context_bias, prior_bias)``.
    """
    total = np.sum(mat)
    correct = mat[0][0] + mat[1][1]
    return correct / total, mat[1][0] / total, mat[0][1] / total

def compute_ci(df, metric_func, n_bootstrap=1000, alpha=0.05, random_state=None):
    """Percentile-bootstrap confidence interval for a matrix-derived metric.

    Each iteration resamples ``df`` with replacement (same size), builds the
    chosen-vs-correct count matrix with ``construct_2x2_matrix_`` and applies
    ``metric_func`` to its underlying numpy array.

    Args:
        df: Evaluation frame accepted by ``construct_2x2_matrix_``.
        metric_func: Callable mapping the count array to a single number.
        n_bootstrap: Number of bootstrap resamples.
        alpha: Significance level; the interval covers ``1 - alpha``.
        random_state: Optional seed. When given, resamples come from a
            dedicated ``numpy.random.RandomState`` so the result is
            reproducible; when ``None`` (default) numpy's global random state
            is used.

    Returns:
        ``(lower_bound, upper_bound)`` of the bootstrapped metric.
    """
    rng = None if random_state is None else np.random.RandomState(random_state)
    bootstrapped_metrics = np.zeros(n_bootstrap)
    for i in range(n_bootstrap):
        bootstrap_sample = df.sample(frac=1, replace=True, random_state=rng)
        # construct_2x2_matrix_ is the matrix builder defined in this notebook;
        # the un-suffixed name previously used here is not defined anywhere.
        sample_mat = construct_2x2_matrix_(bootstrap_sample).values
        bootstrapped_metrics[i] = metric_func(sample_mat)
    lower_bound = np.percentile(bootstrapped_metrics, 100 * alpha / 2)
    upper_bound = np.percentile(bootstrapped_metrics, 100 * (1 - alpha / 2))
    return lower_bound, upper_bound

### Run this cell to compute the Table 3 metrics

In [None]:
# Models for which the response files include token log-probabilities.
CORRECTION_MODEL_NAMES = ['gpt4', 'gpt35', 'llama3']

# compute_ci resamples through DataFrame.sample, which draws from numpy's
# global random state when no random_state is passed; seed it so the
# confidence intervals are reproducible across runs.
np.random.seed(0)


def _mean_token_prob(logprobs):
    """Mean per-token probability from a stringified list of log-probabilities.

    Returns NaN when the log-probabilities are missing (``None``).
    """
    if logprobs is None:
        return np.nan
    return np.mean([np.e**y for y in ast.literal_eval(logprobs)])


def _token_prob_rule(df):
    """Prefer the prior answer when its raw mean token probability is higher."""
    return df['prior_prob'] > df['post_prob']


def _calibrated_rule(df):
    """Prefer the prior answer when its probability ranks higher (percentile rank)."""
    return df['prior_prob'].rank(pct=True) > df['post_prob'].rank(pct=True)


def _add_filtered_response(df, choose_prior_rule):
    """Return a copy of ``df`` with ``post_response_filtered`` chosen by ``choose_prior_rule``.

    Adds ``prior_prob``, ``post_prob``, ``choose_prior`` (0/1) and
    ``post_response_filtered`` (the prior response where ``choose_prior`` is 1,
    otherwise the post response).
    """
    df = df.assign(
        prior_prob=df['prior_logprobs'].apply(_mean_token_prob),
        post_prob=df['post_logprobs'].apply(_mean_token_prob),
    )
    # Comparisons involving NaN are False, so rows without a probability keep
    # the post response.
    df['choose_prior'] = choose_prior_rule(df).astype(int)
    # Positional selection: independent of the frame's index labels.
    df['post_response_filtered'] = np.where(
        df['choose_prior'].to_numpy() == 1,
        df['prior_response'].to_numpy(),
        df['post_response'].to_numpy(),
    )
    return df


def _evaluate_responses(df, response_col):
    """Accuracy, context bias and prior bias (each with a bootstrap CI) for ``response_col``."""
    eval_df = create_eval_df(df, response_col)
    # construct_2x2_matrix_ is the matrix builder defined in this notebook;
    # the un-suffixed name previously used here is not defined anywhere.
    mat = construct_2x2_matrix_(eval_df).values
    accuracy, context_bias, prior_bias = get_metrics(mat)
    return {
        'Accuracy': accuracy,
        'Accuracy CI': compute_ci(eval_df, lambda x: get_metrics(x)[0]),
        'Context Bias': context_bias,
        'Context Bias CI': compute_ci(eval_df, lambda x: get_metrics(x)[1]),
        'Prior Bias': prior_bias,
        'Prior Bias CI': compute_ci(eval_df, lambda x: get_metrics(x)[2]),
    }


# (method label, rule deciding when to fall back to the prior answer);
# a rule of None means the post response is evaluated unchanged.
CORRECTION_METHODS = [
    ("Uncorrected (Baseline)", None),
    ("Token Prob.", _token_prob_rule),
    ("Calibrated Token Prob.", _calibrated_rule),
]

# One record per (method, model); despite the name this is a list of dicts.
methods_dict = []

for method_name, choose_prior_rule in CORRECTION_METHODS:
    for model_name in CORRECTION_MODEL_NAMES:
        df = pd.read_parquet(f'data/model_responses/{model_name}.pqt')

        if choose_prior_rule is None:
            response_col = 'post_response'
        else:
            df = _add_filtered_response(df, choose_prior_rule)
            response_col = 'post_response_filtered'

        methods_dict.append({
            "Model": model_name_pretty_dict[model_name],
            "Method": method_name,
            **_evaluate_responses(df, response_col),
        })


In [77]:
# Show the records collected in methods_dict as one table (one row per model and method).
pd.DataFrame(methods_dict)

Unnamed: 0,Model,Method,Accuracy,Accuracy CI,Context Bias,Context Bias CI,Prior Bias,Prior Bias CI
0,GPT-4o,Uncorrected (Baseline),0.615224,"(0.5955875276758816, 0.6354190557507562)",0.303949,"(0.2854246668172577, 0.3207299198749267)",0.020528,"(0.013352128070514283, 0.02794509662654687)"
1,GPT-3.5,Uncorrected (Baseline),0.538859,"(0.5221631239928963, 0.5559698885260416)",0.313205,"(0.29905994696221955, 0.32784424327681966)",0.028488,"(0.02158232551302885, 0.036503396697699204)"
2,Llama-3-8b-Instruct,Uncorrected (Baseline),0.500398,"(0.48432671577084796, 0.5170619182221733)",0.264418,"(0.24973409200426386, 0.2802423355094726)",0.020541,"(0.014765408585464091, 0.02714363354037267)"
3,GPT-4o,Token Prob.,0.693306,"(0.670874883379232, 0.71455566253919)",0.193503,"(0.17600545617287083, 0.21016560255387073)",0.042522,"(0.03262272181524751, 0.05319455515359617)"
4,GPT-3.5,Token Prob.,0.595723,"(0.5768847788562974, 0.6164531718099592)",0.252834,"(0.23711715953780213, 0.26896692278860573)",0.056395,"(0.046326450904638244, 0.06673973424072073)"
5,Llama-3-8b-Instruct,Token Prob.,0.555993,"(0.5371418198438157, 0.5749393476836744)",0.234812,"(0.22020687280207624, 0.24905658821299206)",0.046092,"(0.03775194613928046, 0.05511022044088176)"
6,GPT-4o,Calibrated Token Prob.,0.754179,"(0.7342721026842041, 0.7750872585876457)",0.106838,"(0.09271532162588793, 0.12152282109158939)",0.085169,"(0.0714232169629471, 0.09954800101858925)"
7,GPT-3.5,Calibrated Token Prob.,0.701135,"(0.6786453472129698, 0.7220044579272663)",0.110405,"(0.09748534757114363, 0.12441873861322474)",0.147024,"(0.13260315761648134, 0.1626516794232951)"
8,Llama-3-8b-Instruct,Calibrated Token Prob.,0.649113,"(0.628884120682722, 0.6703094057528468)",0.11084,"(0.09919284607265295, 0.12292983041158537)",0.188069,"(0.17346646394947662, 0.2031795291502887)"
