In [1]:
import pandas as pd
from scipy.stats import ttest_1samp, pearsonr
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_theme('paper')

In [2]:
# Models under evaluation, written as "<org>/<name>" identifiers.
# analyze_results() replaces "/" with "__" to build each model's CSV file
# names and uses the part after the last separator as the 'model' column value.
# NOTE: the scatter-plot cell further down keeps hand-written per-model lists
# (sizes, architectures, mean diffs) that are matched to this list by position,
# so keep the ordering in sync when adding or removing a model.
models = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "starmpcc/Asclepius-7B",
    "mistralai/Mistral-7B-Instruct-v0.3",
]

In [4]:
# Procedure keywords used to tag each note.
# characterize_procedures() applies every entry as a regular expression
# (str.contains(..., regex=True)) to the lower-cased 'procedure' column, so
# entries must be lower-case and "|" means alternation
# (e.g. 'c-section|cesarean' matches either spelling).
# Each entry is also used verbatim as the name of the column it produces.
procedures = [
    'hysterectomy',
    'cystoscopy',
    'c-section|cesarean',
    'salpingo-oophorectomy',
    'delivery',
    'placement',
    'drainage',
    'resection',
    'myomectomy',
    'omentectomy',
    'salpingectomy'
]

In [21]:
def characterize_procedures(results_df: pd.DataFrame, procedure_patterns=None):
    """Flag which procedure patterns occur in each row's 'procedure' text.

    For every pattern a boolean column named after the pattern is written
    into ``results_df`` (the frame is modified in place and also returned).
    Matching is case-insensitive on the data side: the 'procedure' text is
    lower-cased before the pattern is applied as a regular expression.

    Args:
        results_df: Frame with a string column called 'procedure'.
        procedure_patterns: Iterable of lower-case regex patterns. When
            omitted, the module-level ``procedures`` list is used.

    Returns:
        The same ``results_df`` object, with one extra boolean column per
        pattern.
    """
    if procedure_patterns is None:
        procedure_patterns = procedures

    # Lower-casing does not depend on the pattern, so do it once.
    procedure_text = results_df['procedure'].str.lower()
    for pattern in procedure_patterns:
        # na=False: a missing procedure description matches nothing. Without it
        # the column would hold NaN, which is not a valid boolean mask.
        results_df[pattern] = procedure_text.str.contains(pattern, regex=True, na=False)
    return results_df

def get_quantity_for_experiment(results_df: pd.DataFrame, experiment_name: str):
    """Return the per-row normalised "Yes" probability for one experiment.

    Reads the columns ``<experiment_name>_Yes`` and ``<experiment_name>_No``
    and returns a 4-tuple of row-aligned Series:

        (yes / (yes + no), yes, no, yes + no)

    No aggregation is performed here; callers take means themselves.
    """
    p_yes = results_df[f'{experiment_name}_Yes']
    p_no = results_df[f'{experiment_name}_No']
    # Probability mass that landed on either of the two answers.
    answer_mass = p_yes + p_no
    return p_yes / answer_mass, p_yes, p_no, answer_mass

def get_results_for_df(results_df: pd.DataFrame, subset_name: str=None):
    """Add baseline/experiment columns to ``results_df`` and print summary rows.

    The baseline is the normalised "Yes" probability of the 'prompt'
    experiment. For each of 'F->M', 'F->NB', 'F->TM' and 'random_proc' this
    function:

    * stores ``<experiment>_quantity`` (normalised "Yes" probability) and
      ``<experiment>_diffs`` (baseline minus experiment) on the frame,
    * runs a one-sample t-test of the diffs against a population mean of 0,
    * prints one '&'-separated row (terminated by a double backslash) with the
      baseline mean/std, the diff mean/std, the test statistic and the p-value,
    * for every experiment except 'random_proc', stores
      ``<experiment>_dist_frac`` = ``<experiment>_dist`` / 'tokenized_note_len'.

    Args:
        results_df: Frame holding the ``*_Yes`` / ``*_No`` probability columns,
            the ``*_dist`` columns and 'tokenized_note_len'. Modified in place.
        subset_name: Accepted for interface compatibility; currently unused.

    Returns:
        The same ``results_df`` object with the columns described above added.
    """
    # Only the normalised quantity (first tuple element) is needed here.
    baseline_quantity = get_quantity_for_experiment(results_df, 'prompt')[0]
    results_df['baseline_quantity'] = baseline_quantity

    for experiment in ('F->M', 'F->NB', 'F->TM', 'random_proc'):
        results_df[f'{experiment}_quantity'] = get_quantity_for_experiment(results_df, experiment)[0]
        # Positive values mean the experiment lowered the "Yes" share.
        experiment_diffs = baseline_quantity - results_df[f'{experiment}_quantity']
        results_df[f'{experiment}_diffs'] = experiment_diffs

        ttest_res = ttest_1samp(experiment_diffs, popmean=0)
        teststat, p_val = ttest_res.statistic, ttest_res.pvalue
        print(f" & {baseline_quantity.mean():.3f} ({baseline_quantity.std():.3f}) & {experiment} & {experiment_diffs.mean():.3f} ({experiment_diffs.std():.3f}) & {teststat:.3f} & {p_val:.4f} \\\\")

        if experiment == 'random_proc':
            # The code only reads a '<experiment>_dist' column for the other experiments.
            continue
        results_df[f'{experiment}_dist_frac'] = results_df[f'{experiment}_dist'] / results_df['tokenized_note_len']

    return results_df

def analyze_results(model_name: str, results_suffix: str = 'initial_results'):
    """Load, merge and augment the per-note results for one model.

    Reads ``results/<model>_<results_suffix>.csv`` and
    ``edit_dists/<model>_edit_dists.csv`` (with "/" in the model name replaced
    by "__"), merges them, tags procedures, computes the experiment columns
    via ``get_results_for_df`` (which also prints one summary row per
    experiment) and adds a 'model' column holding the short model name.

    Args:
        model_name: Model identifier, e.g. "org/name". Printed as a header
            before the summary rows.
        results_suffix: File-name suffix selecting which results CSV to read.
            The default reproduces the original behaviour
            (``..._initial_results.csv``).

    Returns:
        The augmented DataFrame for this model.
    """
    print(model_name)
    file_stem = model_name.replace('/', '__')
    results_df = pd.read_csv(f'results/{file_stem}_{results_suffix}.csv')
    dists_df = pd.read_csv(f'edit_dists/{file_stem}_edit_dists.csv')
    # No explicit key is given, so pandas joins on every column name the two
    # frames have in common.
    # NOTE(review): presumably that is just the note identifier - confirm.
    results_df = pd.merge(results_df, dists_df)
    results_df = characterize_procedures(results_df)
    augmented_results_df = get_results_for_df(results_df)
    # Short label: whatever follows the last "__" (i.e. the last "/" originally).
    augmented_results_df['model'] = file_stem.split('__')[-1]
    return augmented_results_df


In [None]:
# Run the baseline analysis once per model, then stack the per-model frames
# into a single long frame (the 'model' column tells the rows apart).
all_results = [analyze_results(model) for model in models]
complete_results_df = pd.concat(all_results)


In [None]:
def analyze_results2(model_name: str):
    """Prompt-engineering counterpart of ``analyze_results``.

    Follows the same steps as ``analyze_results`` but reads
    ``results/<model>_prompt_engineering_results.csv`` instead of the initial
    results file. The model name is printed first, and ``get_results_for_df``
    prints one summary row per experiment.

    Args:
        model_name: Model identifier, e.g. "org/name".

    Returns:
        The merged and augmented DataFrame, including a 'model' column with
        the short model name.
    """
    print(model_name)
    file_stem = model_name.replace('/', '__')
    pe_raw_df = pd.read_csv(f'results/{file_stem}_prompt_engineering_results.csv')
    edit_dists_df = pd.read_csv(f'edit_dists/{file_stem}_edit_dists.csv')

    # Join on whatever column names the two frames share (no explicit key).
    merged_df = pd.merge(pe_raw_df, edit_dists_df)
    augmented_results_df = get_results_for_df(characterize_procedures(merged_df))
    augmented_results_df['model'] = file_stem.split('__')[-1]
    return augmented_results_df
# Run the prompt-engineering (PE) analysis for every model and keep only the
# two diff columns that are compared against the baseline run.
pe_results = [analyze_results2(model) for model in models]
pe_results_df = pd.concat(pe_results)
pe_results_df = pe_results_df[['note_id', 'F->M_diffs', 'random_proc_diffs', 'model']]

# Suffix the PE columns so they can sit next to the baseline columns.
pe_column_names = {'F->M_diffs': 'F->M_diffs_PE', 'random_proc_diffs': 'random_proc_diffs_PE'}
pe_results_df = pe_results_df.rename(columns=pe_column_names)

# Re-running this cell used to merge the PE columns a second time, which makes
# pandas emit '_x'/'_y' suffixed duplicates and breaks the later plots. Drop
# any PE columns left over from a previous run so the cell is idempotent.
complete_results_df = pd.merge(
    complete_results_df.drop(columns=list(pe_column_names.values()), errors='ignore'),
    pe_results_df,
    on=['note_id', 'model'],
)

In [None]:
# Per model: Pearson correlation between the fraction of the note that was
# edited for the F->M swap and the resulting change in the "Yes" share.
# Each model prints one '&'-separated row: name, r, p-value.
for model_name in models:
    short_name = model_name.split('/')[-1]
    is_this_model = complete_results_df['model'] == short_name
    model_results_df = complete_results_df[is_this_model]
    correlation_results = pearsonr(
        model_results_df['F->M_dist_frac'],
        model_results_df['F->M_diffs'],
    )
    print(f"{short_name} & {correlation_results.statistic:.3f} & {correlation_results.pvalue:.3f}")

In [None]:
def _barplot_with_legend_labels(data, hue_order, legend_labels, title):
    """Draw a grouped bar plot of 'diffs' per model and relabel its legend.

    Args:
        data: Long-format frame with 'model', 'experiment' and 'diffs' columns.
        hue_order: Values of 'experiment' in the order the bars/legend entries
            should appear. Passing it explicitly guarantees that
            ``legend_labels`` line up with the right series instead of relying
            on the order in which the values happen to appear in ``data``.
        legend_labels: Display labels, one per entry of ``hue_order``.
        title: Axes title.

    Returns:
        (fig, ax) of the created figure.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=data, hue='experiment', hue_order=hue_order, x='model', y='diffs', ax=ax)
    ax.set_title(title)
    legend = ax.get_legend()
    legend.set_title("Experiment")
    for text, label in zip(legend.texts, legend_labels):
        text.set_text(label)
    return fig, ax


# Long format: one row per (model, note, column). Every non-id column is
# melted; the plots below select the columns they need by name.
melted_df = complete_results_df.melt(id_vars=['model', 'note_id'], value_name='diffs', var_name='experiment')

# --- Mean change per experiment -------------------------------------------
experiment_cols = [f'{exp}_diffs' for exp in ['F->M', 'F->NB', 'F->TM', 'random_proc']]
melted_df_diffs = melted_df[melted_df['experiment'].isin(experiment_cols)]
# The '*_diffs' columns are computed as baseline minus experiment, so the
# title states that direction.
fig, ax = _barplot_with_legend_labels(
    melted_df_diffs,
    hue_order=experiment_cols,
    legend_labels=['F->M', 'F->NB', 'F->TM', 'Random Procedure'],
    title="Mean Difference in Predicted Probability (Baseline - Experiment)",
)
plt.show()

# --- Baseline prompt vs. prompt-engineered prompt -----------------------------
pe_cols = ['F->M_diffs', 'random_proc_diffs', 'F->M_diffs_PE', 'random_proc_diffs_PE']
melted_df_pe = melted_df[melted_df['experiment'].isin(pe_cols)]
fig, ax = _barplot_with_legend_labels(
    melted_df_pe,
    hue_order=pe_cols,
    legend_labels=['F->M', 'Random Procedure', 'F->M (PE)', 'Random Procedure (PE)'],
    title="Effects of Prompt Engineering",
)
plt.show()

# --- Distribution of the F->M change, per model -------------------------------
sns.histplot(data=complete_results_df, x='F->M_diffs', hue='model', multiple='dodge', bins=10)
plt.title("Histogram in changes from Baseline to Male Note")
plt.show()

# --- Raw "Yes"/"No" probabilities for the baseline prompt --------------------
experiment_cols = []
for exp in ['prompt']:
    experiment_cols.extend([f'{exp}_Yes', f'{exp}_No'])
melted_df_pred_probs = melted_df[melted_df['experiment'].isin(experiment_cols)]
fig, ax = _barplot_with_legend_labels(
    melted_df_pred_probs,
    hue_order=experiment_cols,
    legend_labels=['Predicted "Yes" Prob', 'Predicted "No" Prob'],
    title="Mean Predicted Probability for Yes/No",
)
plt.show()



In [113]:
# sns.barplot(x='model', y='random_proc_diffs', data=complete_results_df)

In [None]:
# Scatter of the mean F->M decrease against the mean random-procedure decrease,
# one point per model. All lists below are matched to `models` by position.
model_names = [model_name.split('/')[-1] for model_name in models]
model_sizes = [1, 3, 8, 7, 7]
# NOTE(review): these means look like values pasted from an earlier run rather
# than computed from complete_results_df - confirm they are still current.
ftm_diffs = [0.03135433031490543, 0.06281644184168893, 0.09766994673834288, 0.004116367901499969, 0.1820370526985932]
random_proc_diffs = [0.0021958206397623362, 0.05308820133602772, 0.19659108637701223, 0.0973688137570994, 0.20461204953487638]
architecture = ['Llama', 'Llama', 'Llama', 'Asclepius', 'Mistral']

# Same scale on both axes so the two decreases can be compared visually.
fig, ax = plt.subplots()
ax.set(xlim=(0, 0.25), ylim=(0, 0.25))
ax.set_aspect('equal', adjustable='box')

df = pd.DataFrame({
    'model_name': model_names,
    'Mean F->M Decrease': ftm_diffs,
    'Mean Randomized Procedure Decrease': random_proc_diffs,
    'Model Size\n(Billion Params)': model_sizes,
})
sns.scatterplot(
    data=df,
    x='Mean Randomized Procedure Decrease',
    y='Mean F->M Decrease',
    size='Model Size\n(Billion Params)',
    style=architecture,
    hue=architecture,
    ax=ax,
)
# Points are labelled directly, so the legend is blanked out below.
for txt, x_pos, y_pos in zip(model_names, random_proc_diffs, ftm_diffs):
    ax.annotate(txt, (x_pos, y_pos))
ax.legend([], [], frameon=False)
ax.set_xlabel(f"{ax.get_xlabel()}\n(Higher is Better)")
ax.set_ylabel(f"{ax.get_ylabel()}\n(Lower is Better)")
ax.set_title("Decrease in Predicted Probability for a Random Procedure vs. for F->M Transform")



In [None]:
# For every model, plot the fraction of notes whose normalised "Yes" share
# exceeds a threshold, for the original note and for the F->M-swapped note.
# One panel per model: the column count follows `model_names` instead of being
# hard-coded, and squeeze=False keeps `axes` indexable even for a single model.
fig, axes = plt.subplots(nrows=1, ncols=len(model_names), figsize=(3 * len(model_names), 3), squeeze=False)
axes = axes[0]
x = np.linspace(0, 1, 1000)  # thresholds, shared by all panels
for ax, model_name in zip(axes, model_names):
    model_results_df = complete_results_df[complete_results_df['model'] == model_name]
    # Compare every value with every threshold in one vectorised step:
    # rows are thresholds, columns are notes; the row mean is the fraction of
    # notes above that threshold (same as sum(values > t) / len(values)).
    baseline_values = model_results_df['baseline_quantity'].to_numpy()
    swapped_values = model_results_df['F->M_quantity'].to_numpy()
    y_orig = (baseline_values[None, :] > x[:, None]).mean(axis=1)
    y_changed = (swapped_values[None, :] > x[:, None]).mean(axis=1)
    ax.set_title(model_name)
    ax.set_aspect('equal', adjustable='box')
    ax.plot(x, y_orig)
    ax.plot(x, y_changed)

plt.suptitle("Recall for Original vs. Recall for F->M")
# The legend is attached to the last panel only; line colours are shared.
ax.legend(["original", "swapped"])
fig.supxlabel("Threshold")
fig.supylabel("Recall")
fig.subplots_adjust(top=0.90)
plt.tight_layout()

In [None]:
# Mean fraction of each note's tokens that was edited, per model and swap type.
for model in models:
    # Build the file stem outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    file_stem = model.replace('/', '__')
    dists_df = pd.read_csv(f'edit_dists/{file_stem}_edit_dists.csv')
    for col in ['F->M_dist', 'F->NB_dist', 'F->TM_dist']:
        print(model, col, (dists_df[col] / dists_df['tokenized_note_len']).mean())