# Jupyter Notebook for Calculating Statistics

In [67]:
import pandas as pd
import numpy as np
from math import sqrt
import altair as alt
from utils import save_dataset_to_json, save_dataset_to_csv
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm

In [68]:
model_metadata = {
    "alpacare-7B": {"model_type": "clinical/biomedical", "model_size_in_b": 7},
    "alpacare-13B": {"model_type": "clinical/biomedical", "model_size_in_b": 13},
    "biomedgpt7B": {"model_type": "clinical/biomedical", "model_size_in_b": 7},
    "biomistral7B": {"model_type": "clinical/biomedical", "model_size_in_b": 7},
    "claude_3.5-haiku": {"model_type": "closed/proprietary", "model_size_in_b": None}, # 175?
    "claude_3.5-sonnet": {"model_type": "closed/proprietary", "model_size_in_b": None}, # 175?
    "gemini_1.5_flash": {"model_type": "closed/proprietary", "model_size_in_b": None},
    "gemini_1.5_flash-8B": {"model_type": "closed/proprietary", "model_size_in_b": 8},
    "gpt4o": {"model_type": "closed/proprietary", "model_size_in_b": None}, # around 1 trillion (1000B)
    "gpt4o-mini": {"model_type": "closed/proprietary", "model_size_in_b": None}, # 175?
    "gpt35": {"model_type": "closed/proprietary", "model_size_in_b": 175},
    "llama2_chat-7B": {"model_type": "general open", "model_size_in_b": 7},
    "llama2_chat-13B": {"model_type": "general open", "model_size_in_b": 13},
    "llama2_chat-70B": {"model_type": "general open", "model_size_in_b": 70},
    "llama3_instruct-8B": {"model_type": "general open", "model_size_in_b": 8},
    "llama3_instruct-70B": {"model_type": "general open", "model_size_in_b": 70},
    "med42-8B": {"model_type": "clinical/biomedical", "model_size_in_b": 8},
    "med42-70B": {"model_type": "clinical/biomedical", "model_size_in_b": 70},
    "mistral_instruct7B": {"model_type": "general open", "model_size_in_b": 7},
    "olmo2_instruct-7B": {"model_type": "general open", "model_size_in_b": 7},
    "olmo2_instruct-13B": {"model_type": "general open", "model_size_in_b": 13},
    "openbiollm-8B": {"model_type": "clinical/biomedical", "model_size_in_b": 8},
    "openbiollm-70B": {"model_type": "clinical/biomedical", "model_size_in_b": 70}
}

## Spin Detection Task

In [69]:
detection_stats_df = pd.read_json("./eval_outputs/detection_overall_metrics.json", orient="index")

detection_stats_df["model_name"] = detection_stats_df.index
detection_stats_df["model_type"] = detection_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
detection_stats_df["model_size_in_b"] = detection_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
detection_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(detection_stats_df)}")

detection_stats_df.sort_index(inplace=True) # alphabetical order
detection_stats_df

Number of models: 22


Unnamed: 0,accuracy,precision,recall,f1,model_name,model_type,model_size_in_b
0,0.783333,0.707317,0.966667,0.816901,gpt4o,closed/proprietary,
1,0.85,0.783784,0.966667,0.865672,gpt4o-mini,closed/proprietary,
2,0.516667,1.0,0.033333,0.064516,gpt35,closed/proprietary,175.0
3,0.733333,0.652174,1.0,0.789474,gemini_1.5_flash,closed/proprietary,
4,0.833333,0.794118,0.9,0.84375,gemini_1.5_flash-8B,closed/proprietary,8.0
5,0.966667,1.0,0.933333,0.965517,claude_3.5-sonnet,closed/proprietary,
6,0.566667,0.535714,1.0,0.697674,claude_3.5-haiku,closed/proprietary,
7,0.516667,1.0,0.033333,0.064516,biomistral7B,clinical/biomedical,7.0
8,0.566667,0.537037,0.966667,0.690476,llama2_chat-13B,general open,13.0
9,0.633333,0.58,0.966667,0.725,llama2_chat-70B,general open,70.0


### Average of accuracy, precision, recall, and F1 score by model type

In [70]:
# Group by model type and calculate mean accuracy and standard deviation
accuracy_by_model_type = detection_stats_df.groupby('model_type')['accuracy'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
accuracy_by_model_type.columns = ['model_type', 'mean_accuracy', 'std_deviation']

print(accuracy_by_model_type)

            model_type  mean_accuracy  std_deviation
0  clinical/biomedical       0.632163       0.141272
1   closed/proprietary       0.750000       0.159861
2         general open       0.635417       0.140418


In [71]:
# Group by model type and calculate mean precision and standard deviation
precision_by_model_type = detection_stats_df.groupby('model_type')['precision'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
precision_by_model_type.columns = ['model_type', 'mean_precision', 'std_deviation']

print(precision_by_model_type)

            model_type  mean_precision  std_deviation
0  clinical/biomedical        0.812759       0.219535
1   closed/proprietary        0.781872       0.172379
2         general open        0.702130       0.364676


In [72]:
# Group by model type and calculate mean recall and standard deviation
recall_by_model_type = detection_stats_df.groupby('model_type')['recall'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
recall_by_model_type.columns = ['model_type', 'mean_recall', 'std_deviation']

print(recall_by_model_type)

            model_type  mean_recall  std_deviation
0  clinical/biomedical     0.576190       0.383799
1   closed/proprietary     0.828571       0.352467
2         general open     0.587500       0.406666


In [73]:
# Group by model type and calculate mean f1 and standard deviation
f1_by_model_type = detection_stats_df.groupby('model_type')['f1'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
f1_by_model_type.columns = ['model_type', 'mean_f1', 'std_deviation']

print(f1_by_model_type)

            model_type   mean_f1  std_deviation
0  clinical/biomedical  0.547278       0.275737
1   closed/proprietary  0.720501       0.300329
2         general open  0.539761       0.322221


#### Plots

In [74]:
chart = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_name:N', sort='-y', title='Model Name'),
    y=alt.Y('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_type:N', title='Model Type')
).properties(
    # title='Accuracy by Model',
    width=800,
)

# Add value labels
text = chart.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('accuracy:Q', format='.2f')
)



c_t = chart + text
# save to html
c_t.save("detection_accuracy_by_model.html")


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [75]:
# Plot average accuracy by model_type and add error bars
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_type:N', title='Model Type', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean(accuracy):Q', title='Mean Accuracy'),
    color=alt.Color('model_type:N', title='Model Type', legend=None)
).properties(
    title='Average Accuracy by Model Type',
    width=800  # Set the width to 800 pixels
)

error_bars = alt.Chart(detection_stats_df).mark_errorbar(extent='stdev').encode(
    x=alt.X('model_type:N'),
    y=alt.Y('accuracy:Q')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('mean(accuracy):Q', format='.2f')
)

alt.layer(bars, error_bars, text)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Average of accuracy, precision, recall, and F1 score by model size

In [76]:
# model size in buckets (0-10B, 11-20B, 22-100B, 100B+/NaN)
def model_size_bucket(model_size): 
    if model_size is None or pd.isna(model_size):
        return "Unknown"
    elif model_size >= 100:
        return "100B+"
    elif model_size <= 10:
        return "0-10B"
    elif model_size <= 20:
        return "11-20B"
    else:
        return "21-100B"

In [77]:
# average accuracy by model size
detection_stats_df["model_size_bucket"] = detection_stats_df["model_size_in_b"].map(model_size_bucket)

accuracy_by_model_size = detection_stats_df.groupby('model_size_bucket')['accuracy'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
accuracy_by_model_size.columns = ['model_size_bucket', 'mean_accuracy', 'std_deviation']

print(accuracy_by_model_size)

  model_size_bucket  mean_accuracy  std_deviation
0             0-10B       0.615847       0.137000
1             100B+       0.516667            NaN
2            11-20B       0.541667       0.035355
3           21-100B       0.775000       0.095743
4           Unknown       0.780000       0.147855


In [78]:
# average precision by model size
precision_by_model_size = detection_stats_df.groupby('model_size_bucket')['precision'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
precision_by_model_size.columns = ['model_size_bucket', 'mean_precision', 'std_deviation']

print(precision_by_model_size)

  model_size_bucket  mean_precision  std_deviation
0             0-10B        0.721719       0.337294
1             100B+        1.000000            NaN
2            11-20B        0.768519       0.327364
3           21-100B        0.836558       0.180942
4           Unknown        0.735798       0.173164


In [79]:
# average recall by model size
recall_by_model_size = detection_stats_df.groupby('model_size_bucket')['recall'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
recall_by_model_size.columns = ['model_size_bucket', 'mean_recall', 'std_deviation']

print(recall_by_model_size)

  model_size_bucket  mean_recall  std_deviation
0             0-10B     0.553333       0.404969
1             100B+     0.033333            NaN
2            11-20B     0.500000       0.659966
3           21-100B     0.775000       0.142400
4           Unknown     0.973333       0.027889


In [80]:
# average f1 score by model size 
f1_by_model_size_bucket = detection_stats_df.groupby('model_size_bucket')['f1'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
f1_by_model_size_bucket.columns = ['model_size_bucket', 'mean_f1', 'std_deviation']

print(f1_by_model_size_bucket)

  model_size_bucket   mean_f1  std_deviation
0             0-10B  0.511597       0.294719
1             100B+  0.064516            NaN
2            11-20B  0.377496       0.442621
3           21-100B  0.780454       0.043987
4           Unknown  0.827048       0.098638


#### Plots

In [81]:
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_name:N', sort='-y', title='Model Name'),
    y=alt.Y('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_size_bucket:N', title='Model Size Bucket')
).properties(
    title='Accuracy by Model Size Bucket',
    width=800,
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('accuracy:Q', format='.2f')
)

bars + text

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [82]:
# Plot average accuracy by model_size_bucket and add error bars
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_size_bucket:N', title='Model Size Bucket', axis=alt.Axis(labelAngle=0), sort=['0-10B', '11-20B', '21-100B', '100B+', 'Unknown']),
    y=alt.Y('mean(accuracy):Q', title='Mean Accuracy'),
    color=alt.Color('model_size_bucket:N', title='Model Size Bucket', legend=None)
).properties(
    title='Average Accuracy by Model Size',
    width=800  # Set the width to 800 pixels
)

error_bars = alt.Chart(detection_stats_df).mark_errorbar(extent='stdev').encode(
    x=alt.X('model_size_bucket:N', sort=['0-10B', '11-20B', '21-100B', '100B+', 'Unknown']),
    y=alt.Y('accuracy:Q')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('mean(accuracy):Q', format='.2f')
)

alt.layer(bars, error_bars, text)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [83]:
# scatter plot of model size vs accuracy with model names as labels
scatter_plot = alt.Chart(detection_stats_df).mark_circle().encode(
    x=alt.X('model_size_in_b:Q', title='Model Size (in Billion Parameters)'),
    y=alt.Y('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_type:N', title='Model Type')
).properties(
    title='Model Size vs Accuracy',
    width=800,  # Set the width to 800 pixels
    height=400  # Set the height to 400 pixels
)

text = scatter_plot.mark_text(
    align='left',
    baseline='middle',
    dx=7,  # Adjust the position of the text
    dy=-5,  # Adjust the vertical position of the text
).encode(
    text='model_name:N'
)

scatter_plot + text

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## RCT Trial Result Interpretation Task

In [84]:
interpretation_stats_df = pd.read_json("./eval_outputs/interpretation_overall_metrics.json", orient="index")

interpretation_stats_df["model_name"] = interpretation_stats_df.index
interpretation_stats_df["model_type"] = interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
interpretation_stats_df["model_size_in_b"] = interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
interpretation_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(interpretation_stats_df)}")

interpretation_stats_df.sort_index(inplace=True) # alphabetical order
interpretation_stats_df

Number of models: 22


Unnamed: 0,benefit_answer_mean_diff,rigor_answer_mean_diff,importance_answer_mean_diff,full_text_answer_mean_diff,another_trial_answer_mean_diff,overall_mean_diff_avg,model_name,model_type,model_size_in_b
0,3.133333,0.1,1.233333,2.866667,3.333333,2.133333,gpt4o,closed/proprietary,
1,3.566667,1.466667,2.733333,3.933333,3.866667,3.113333,gpt4o-mini,closed/proprietary,
2,3.9,1.433333,2.066667,2.6,3.766667,2.753333,gpt35,closed/proprietary,175.0
3,2.5,-0.1,2.166667,3.0,3.7,2.253333,gemini_1.5_flash,closed/proprietary,
4,3.066667,-0.1,0.966667,2.733333,3.433333,2.02,gemini_1.5_flash-8B,closed/proprietary,8.0
5,2.5,-0.166667,-0.633333,3.233333,2.866667,1.56,claude_3.5-sonnet,closed/proprietary,
6,2.966667,-0.033333,0.466667,1.3,2.166667,1.373333,claude_3.5-haiku,closed/proprietary,
7,6.051724,0.266667,0.8,0.0,,,alpacare-7B,clinical/biomedical,7.0
8,1.666667,0.35,1.116667,1.035714,1.666667,1.167143,biomistral7B,clinical/biomedical,7.0
9,3.5,0.5,1.066667,0.333333,1.7,1.42,llama2_chat-7B,general open,7.0


In [85]:
def calculate_sd_from_ci(ci_low, ci_high):
    se = (ci_high - ci_low) / 3.92
    sd = se / sqrt((1/150) + (1/150))
    return sd

In [86]:
human_expert_stats = {
        "benefit_answer": {"mean_diff": 0.71, "ci": [0.07, 1.35]},
        "rigor_answer": {"mean_diff": 0.59, "ci": [0.05, 1.13]},
        "importance_answer": {"mean_diff": 0.38, "ci": [0.19, 0.95]},
        "full_text_answer": {"mean_diff": 0.77, "ci": [0.08, 1.47]},
        "another_trial_answer": {"mean_diff": 0.64, "ci": [0.03, 1.31]}
    }

human_expert_stats_df = pd.DataFrame(human_expert_stats).T
human_expert_stats_df["metric"] = human_expert_stats_df.index
# remove index
human_expert_stats_df.reset_index(drop=True, inplace=True)
human_expert_stats_df["method"] = "human experts"

# calculate the standard deviation based on the 95% CI
human_expert_stats_df["std_deviation"] = human_expert_stats_df.apply(lambda x: calculate_sd_from_ci(x["ci"][0], x["ci"][1]), axis=1)
# drop ci
human_expert_stats_df.drop(columns=["ci"], inplace=True)

human_expert_stats_df

Unnamed: 0,mean_diff,metric,method,std_deviation
0,0.71,benefit_answer,human experts,2.827838
1,0.59,rigor_answer,human experts,2.385988
2,0.38,importance_answer,human experts,1.679029
3,0.77,full_text_answer,human experts,3.070855
4,0.64,another_trial_answer,human experts,2.827838


In [87]:
# calculate the average of all model metrics and calculate std_deviation
average_model_benefit = interpretation_stats_df["benefit_answer_mean_diff"].mean()
stdev_model_benefit = interpretation_stats_df["benefit_answer_mean_diff"].std()

average_model_rigor = interpretation_stats_df["rigor_answer_mean_diff"].mean()
stdev_model_rigor = interpretation_stats_df["rigor_answer_mean_diff"].std()

average_model_importance = interpretation_stats_df["importance_answer_mean_diff"].mean()
stdev_model_importance = interpretation_stats_df["importance_answer_mean_diff"].std()

average_model_full_text = interpretation_stats_df["full_text_answer_mean_diff"].mean()
stdev_model_full_text = interpretation_stats_df["full_text_answer_mean_diff"].std()

average_model_another_trial = interpretation_stats_df["another_trial_answer_mean_diff"].mean()
stdev_model_another_trial = interpretation_stats_df["another_trial_answer_mean_diff"].std()

model_stats = {
    "benefit_answer": {"mean_diff": average_model_benefit, "std_deviation": stdev_model_benefit},
    "rigor_answer": {"mean_diff": average_model_rigor, "std_deviation": stdev_model_rigor},
    "importance_answer": {"mean_diff": average_model_importance, "std_deviation": stdev_model_importance},
    "full_text_answer": {"mean_diff": average_model_full_text, "std_deviation": stdev_model_full_text},
    "another_trial_answer": {"mean_diff": average_model_another_trial, "std_deviation": stdev_model_another_trial}
}

model_stats_df = pd.DataFrame(model_stats).T
model_stats_df["metric"] = model_stats_df.index
# remove index
model_stats_df.reset_index(drop=True, inplace=True)
model_stats_df["method"] = "all LLMs"

model_stats_df

Unnamed: 0,mean_diff,std_deviation,metric,method
0,3.381165,1.212055,benefit_answer,all LLMs
1,0.277165,0.45125,rigor_answer,all LLMs
2,1.264057,0.816844,importance_answer,all LLMs
3,2.205034,1.433917,full_text_answer,all LLMs
4,2.9625,1.116122,another_trial_answer,all LLMs


In [88]:
# get average and std deviation by model_type from interpretation_stats_df
average_benefit_by_model_type = interpretation_stats_df.groupby('model_type')['benefit_answer_mean_diff'].mean().reset_index()
average_benefit_by_model_type.columns = ['method', 'mean_diff']
stdev_benefit_by_model_type = interpretation_stats_df.groupby('model_type')['benefit_answer_mean_diff'].std().reset_index()
stdev_benefit_by_model_type.columns = ['method', 'std_deviation']
#combine the two dataframes
average_benefit_by_model_type = pd.merge(average_benefit_by_model_type, stdev_benefit_by_model_type, on='method')
average_benefit_by_model_type['metric'] = 'benefit_answer'

average_rigor_by_model_type = interpretation_stats_df.groupby('model_type')['rigor_answer_mean_diff'].mean().reset_index()
average_rigor_by_model_type.columns = ['method', 'mean_diff']
stdev_rigor_by_model_type = interpretation_stats_df.groupby('model_type')['rigor_answer_mean_diff'].std().reset_index()
stdev_rigor_by_model_type.columns = ['method', 'std_deviation']
#combine the two dataframes
average_rigor_by_model_type = pd.merge(average_rigor_by_model_type, stdev_rigor_by_model_type, on='method')
average_rigor_by_model_type['metric'] = 'rigor_answer'

average_importance_by_model_type = interpretation_stats_df.groupby('model_type')['importance_answer_mean_diff'].mean().reset_index()
average_importance_by_model_type.columns = ['method', 'mean_diff']
stdev_importance_by_model_type = interpretation_stats_df.groupby('model_type')['importance_answer_mean_diff'].std().reset_index()
stdev_importance_by_model_type.columns = ['method', 'std_deviation']
#combine the two dataframes
average_importance_by_model_type = pd.merge(average_importance_by_model_type, stdev_importance_by_model_type, on='method')
average_importance_by_model_type['metric'] = 'importance_answer'

average_full_text_by_model_type = interpretation_stats_df.groupby('model_type')['full_text_answer_mean_diff'].mean().reset_index()
average_full_text_by_model_type.columns = ['method', 'mean_diff']
stdev_full_text_by_model_type = interpretation_stats_df.groupby('model_type')['full_text_answer_mean_diff'].std().reset_index()
stdev_full_text_by_model_type.columns = ['method', 'std_deviation']
#combine the two dataframes
average_full_text_by_model_type = pd.merge(average_full_text_by_model_type, stdev_full_text_by_model_type, on='method')
average_full_text_by_model_type['metric'] = 'full_text_answer'

average_another_trial_by_model_type = interpretation_stats_df.groupby('model_type')['another_trial_answer_mean_diff'].mean().reset_index()
average_another_trial_by_model_type.columns = ['method', 'mean_diff']
stdev_another_trial_by_model_type = interpretation_stats_df.groupby('model_type')['another_trial_answer_mean_diff'].std().reset_index()
stdev_another_trial_by_model_type.columns = ['method', 'std_deviation']
#combine the two dataframes
average_another_trial_by_model_type = pd.merge(average_another_trial_by_model_type, stdev_another_trial_by_model_type, on='method')
average_another_trial_by_model_type['metric'] = 'another_trial_answer'

In [89]:
average_by_model_type = pd.concat([average_benefit_by_model_type, average_rigor_by_model_type, average_importance_by_model_type, average_full_text_by_model_type, average_another_trial_by_model_type], ignore_index=True)

average_by_model_type

Unnamed: 0,method,mean_diff,std_deviation,metric
0,clinical/biomedical,3.413711,1.750199,benefit_answer
1,closed/proprietary,3.090476,0.515937,benefit_answer
2,general open,3.60704,1.187267,benefit_answer
3,clinical/biomedical,0.166327,0.161755,rigor_answer
4,closed/proprietary,0.371429,0.741477,rigor_answer
5,general open,0.291667,0.309505,rigor_answer
6,clinical/biomedical,1.015608,0.581945,importance_answer
7,closed/proprietary,1.285714,1.150339,importance_answer
8,general open,1.4625,0.685551,importance_answer
9,clinical/biomedical,1.482489,1.555384,full_text_answer


In [90]:
#combine all the dataframes
model_stats_final_df = pd.concat([human_expert_stats_df, model_stats_df, average_by_model_type], ignore_index=True)
#drop "_answer" from the values in metric column
model_stats_final_df['metric'] = model_stats_final_df['metric'].str.replace('_answer', '')

model_stats_final_df

Unnamed: 0,mean_diff,metric,method,std_deviation
0,0.71,benefit,human experts,2.827838
1,0.59,rigor,human experts,2.385988
2,0.38,importance,human experts,1.679029
3,0.77,full_text,human experts,3.070855
4,0.64,another_trial,human experts,2.827838
5,3.381165,benefit,all LLMs,1.212055
6,0.277165,rigor,all LLMs,0.45125
7,1.264057,importance,all LLMs,0.816844
8,2.205034,full_text,all LLMs,1.433917
9,2.9625,another_trial,all LLMs,1.116122


### Plots

In [100]:
# bar chart of mean_diff for each method grouped by metric
# error bars are added (std_deviation)
bars = alt.Chart(model_stats_final_df).mark_bar().encode(
    x=alt.X('method:N', title='Method', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('mean_diff:Q', title='Mean Difference'),
    color=alt.Color('method:N', title='Method', legend=None)
).properties(
    width=300,  # Set the width to 300 pixels
    height=300  # Set the height to 300 pixels
)

error_bars = alt.Chart().mark_rule().encode(
    alt.X('method:N', title='Method'),
    alt.Y('errbar_min:Q'),
    alt.Y2('errbar_max:Q')
).transform_calculate(
    errbar_min = alt.datum.mean_diff - alt.datum.std_deviation / 2,
    errbar_max = alt.datum.mean_diff + alt.datum.std_deviation / 2
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('mean_diff:Q', format='.2f')
)

bars = alt.layer(bars, error_bars, text, data=model_stats_final_df).facet(
    column=alt.Column('metric:N', title='Metric'),
)
bars
# save to html
bars.save("interpretation_by_measures.html")

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## Relationship between spin detection and spin interpretation

Linear Regression with statsmodels Python package

In [92]:
# get all model names
model_names = model_metadata.keys()
# remove alpacare-13B
model_names = [x for x in model_names if x != "alpacare-13B"]

len(model_names)

22

In [93]:
measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]
gpt_models = ["gpt4o", "gpt4o-mini", "gpt35"]
huggingface_models = ["alpacare-7B", "biomedgpt7B", "biomistral7B", 
                      "llama2_chat-7B", "llama2_chat-13B", "llama2_chat-70B",
                      "llama3_instruct-8B", "llama3_instruct-70B",
                      "med42-8B", "med42-70B", "mistral_instruct7B", 
                      "olmo2_instruct-7B", "olmo2_instruct-13B",
                      "openbiollm-8B", "openbiollm-70B"]
no_probability_models = ["claude_3.5-haiku", "claude_3.5-sonnet", "gemini_1.5_flash", "gemini_1.5_flash-8B"]

def get_is_detection_correct(row):
    if row['abstract_type'] == "spin":
        return row['model_answer'] == "yes"
    else:
        return row['model_answer'] == "no"
    
def get_is_abstract_type_spin(row):
    return row['abstract_type'] == "spin"
    
def detection_probability_gpt(row):
    # find the first instance of "yes" or "no"
    token_probabilties = row['model_log_probabilities']
    for token_prob in token_probabilties:
        if token_prob['token'].lower() == "yes":
            return np.exp(token_prob['logprob'])
        elif token_prob['token'].lower() == "no":
            return np.exp(token_prob['logprob'])
    return None # this should not happen but just in case

def detection_probability_huggingface(row):
    # find the first instance of "yes" or "no"
    token_probabilties = row['model_log_probabilities']
    for token_prob in token_probabilties:
        if token_prob['token_string'].lower() == "yes":
            return token_prob['probability']
        elif token_prob['token_string'].lower() == "no":
            return token_prob['probability']
    return None # this should not happen but just in case


def prepare_data_for_regression(model_names):
    for model_name in tqdm(model_names):
        # print(f"Processing {model_name}...")
        final_data = []
        detection_output_file_path = f"./eval_outputs/{model_name}/{model_name}_detection_outputs.json"
        interpretation_output_file_path = f"./eval_outputs/{model_name}/{model_name}_interpretation_outputs.json"
        model_detection_data = pd.read_json(detection_output_file_path, orient="records")
        model_interpretation_data = pd.read_json(interpretation_output_file_path, orient="records")

        # merge model_detection_data and model_interpretation_data by PMID and abstract_type
        model_data = pd.merge(model_detection_data, model_interpretation_data, on=['PMID', 'abstract_type'])

        # loop through each row in model_data
        for _, row in model_data.iterrows():
            detection_model_prediction = 1 if row['model_answer'] == "yes" else 0
            is_detection_correct = 1 if get_is_detection_correct(row) else 0
            is_spin_in_abstract = 1 if get_is_abstract_type_spin(row) else 0

            if model_name in gpt_models:
                detection_probability = detection_probability_gpt(row)
            elif model_name in huggingface_models:
                detection_probability = detection_probability_huggingface(row)
            else:
                detection_probability = None
            
            for measure in measures:
                final_data.append({
                    "pmid": row['PMID'],
                    "measure": measure,
                    "is_spin_in_abstract": is_spin_in_abstract,
                    "is_detection_correct": is_detection_correct,
                    "detection_model_prediction": detection_model_prediction,
                    "detection_probability": detection_probability,
                    "interpretation_answer": float(row[measure]) if row[measure] != "" else None
                })
            # calculate the average of the differences
            answers = []
            for measure in measures:
                if row[measure] != "":
                    answers.append(float(row[measure]))
            if len(answers) > 0:
                avg_answer= round(np.mean(answers), 6)
            else:
                avg_answer = None
            # add the average difference to the data
            final_data.append({
                "pmid": row['PMID'],
                "measure": "overall",
                "is_spin_in_abstract": is_spin_in_abstract,
                "is_detection_correct": is_detection_correct,
                "detection_model_prediction": detection_model_prediction,
                "detection_probability": detection_probability,
                "interpretation_answer": avg_answer
            })

        # save the final data to a json file
        json_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
        save_dataset_to_csv(final_data, json_file_path)

In [94]:
prepare_data_for_regression(model_names=model_names)

100%|██████████| 22/22 [00:05<00:00,  3.90it/s]


#### Simplest Regression

Is spin in abstract and the measures answers

In [95]:
for model_name in model_names:
    output_string = ""
    csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
    data = pd.read_csv(csv_file_path)

    measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
    for measure in measures:
        # get the data for the current measure
        measure_data = data[data['measure'] == measure]
        nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
        # remove rows with NaN values in interpretation_answer
        measure_data = measure_data.dropna(subset=['interpretation_answer'])

        # check if there are less than 2 rows
        if len(measure_data) < 2:
            continue
        
        # fit the model
        model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract", 
                                    data=measure_data)
        results = model.fit()

        output_string += f"Model: {model_name} - {measure}\n"
        # print number of rows with NaN value(s)
        output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
        output_string += results.summary().as_text()
        output_string += "\n"

    # save the model summary
    with open(f"./eval_outputs/{model_name}/{model_name}_simple_regression_summary.txt", "w") as f:
        f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  return self.mse_model/self.mse_resid
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return np.sqrt(eigvals[0]/eigvals[-1])
  k, _ = kurtosistest(a, axis)
  k, _ = kurtosistest(a, axis)
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


#### Binary Spin Detection Results Version

In [96]:
for model_name in model_names:
    output_string = ""
    csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
    data = pd.read_csv(csv_file_path)

    measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
    for measure in measures:
        # get the data for the current measure
        measure_data = data[data['measure'] == measure]
        nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
        # remove rows with NaN values in interpretation_answer
        measure_data = measure_data.dropna(subset=['interpretation_answer'])

        # check if there are less than 2 rows
        if len(measure_data) < 2:
            continue
        
        # fit the model
        model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * is_detection_correct", 
                                    data=measure_data)
        results = model.fit()

        output_string += f"Model: {model_name} - {measure}\n"
        # print number of rows with NaN value(s)
        output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
        output_string += results.summary().as_text()
        output_string += "\n"

    # save the model summary
    with open(f"./eval_outputs/{model_name}/{model_name}_regression_binary_summary.txt", "w") as f:
        f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  k, _ = kurtosistest(a, axis)
  k, _ = kurtosistest(a, axis)
  return np.sqrt(eigvals[0]/eigvals[-1])
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return 1 - self.ssr/self.centered_tss


In [97]:
# what the model predicts rather than whether it was correct or not
for model_name in model_names:
    output_string = ""
    csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
    data = pd.read_csv(csv_file_path)

    measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
    for measure in measures:
        # get the data for the current measure
        measure_data = data[data['measure'] == measure]
        nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
        # remove rows with NaN values in interpretation_answer
        measure_data = measure_data.dropna(subset=['interpretation_answer'])

        # check if there are less than 2 rows
        if len(measure_data) < 2:
            continue
        
        # fit the model
        model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * detection_model_prediction", 
                                    data=measure_data)
        results = model.fit()

        output_string += f"Model: {model_name} - {measure}\n"
        # print number of rows with NaN value(s)
        output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
        output_string += results.summary().as_text()
        output_string += "\n"

    # save the model summary
    with open(f"./eval_outputs/{model_name}/{model_name}_regression_binary_direct_model_prediction_summary.txt", "w") as f:
        f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  k, _ = kurtosistest(a, axis)
  k, _ = kurtosistest(a, axis)
  return np.sqrt(eigvals[0]/eigvals[-1])
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return np.sqrt(eigvals[0]/eigvals[-1])
  return 1 - self.ssr/self.centered_tss


#### Probability Spin Detection Results Version

In [98]:
model_names = gpt_models + huggingface_models # remove no token probability models

for model_name in model_names:
    output_string = ""
    csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
    data = pd.read_csv(csv_file_path)

    measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
    for measure in measures:
        # get the data for the current measure
        measure_data = data[data['measure'] == measure]
        nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
        # remove rows with NaN values in interpretation_answer
        measure_data = measure_data.dropna(subset=['interpretation_answer', 'detection_probability'])
        
        # if is_detection_no_spin_correct == 1, then detection_probability. Otherwise, 1 - detection_probability
        measure_data['regression_detection_variable'] = measure_data.apply(lambda x: x['detection_probability'] if x['is_detection_correct'] == 1 else 1 - x['detection_probability'], axis=1)
        # check if there are less than 2 rows
        if len(measure_data) < 2:
            continue

        # fit the model
        model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * regression_detection_variable",
                                    data=measure_data)
        results = model.fit()

        output_string += f"Model: {model_name} - {measure}\n"
        # print number of rows with NaN value(s)
        output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
        output_string += results.summary().as_text()
        output_string += "\n"

    # save the model summary
    with open(f"./eval_outputs/{model_name}/{model_name}_regression_probability_summary.txt", "w") as f:
        f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


In [99]:
# what the model predicts rather than whether it was correct or not

model_names = gpt_models + huggingface_models # remove no token probability models

for model_name in model_names:
    output_string = ""
    csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
    data = pd.read_csv(csv_file_path)

    measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
    for measure in measures:
        # get the data for the current measure
        measure_data = data[data['measure'] == measure]
        nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
        # remove rows with NaN values in interpretation_answer
        measure_data = measure_data.dropna(subset=['interpretation_answer', 'detection_probability'])
        
        # if is_detection_no_spin_correct == 1, then detection_probability. Otherwise, 1 - detection_probability
        measure_data['regression_detection_variable'] = measure_data.apply(lambda x: x['detection_probability'] if x['detection_model_prediction'] == 1 else 1 - x['detection_probability'], axis=1)
        # check if there are less than 2 rows
        if len(measure_data) < 2:
            continue

        # fit the model
        model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * regression_detection_variable",
                                    data=measure_data)
        results = model.fit()

        output_string += f"Model: {model_name} - {measure}\n"
        # print number of rows with NaN value(s)
        output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
        output_string += results.summary().as_text()
        output_string += "\n"

    # save the model summary
    with open(f"./eval_outputs/{model_name}/{model_name}_regression_probability_direct_model_prediction_summary.txt", "w") as f:
        f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
