# Jupyter Notebook for Calculating Statistics

In [127]:
import pandas as pd
import numpy as np
from math import sqrt
import altair as alt
from utils import save_dataset_to_json, save_dataset_to_csv
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tqdm import tqdm
import tiktoken

In [106]:
model_metadata = {
    "alpacare-7B": {"model_type": "biomedical open", "model_size_in_b": 7},
    "biomedgpt7B": {"model_type": "biomedical open", "model_size_in_b": 7},
    "biomistral7B": {"model_type": "biomedical open", "model_size_in_b": 7},
    "claude_3.5-haiku": {"model_type": "generalist closed", "model_size_in_b": None}, # 175?
    "claude_3.5-sonnet": {"model_type": "generalist closed", "model_size_in_b": None}, # 175?
    "gemini_1.5_flash": {"model_type": "generalist closed", "model_size_in_b": None},
    "gemini_1.5_flash-8B": {"model_type": "generalist closed", "model_size_in_b": 8},
    "gpt4o": {"model_type": "generalist closed", "model_size_in_b": None}, # around 1 trillion (1000B)
    "gpt4o-mini": {"model_type": "generalist closed", "model_size_in_b": None}, # 175?
    "gpt35": {"model_type": "generalist closed", "model_size_in_b": 175},
    "llama2_chat-7B": {"model_type": "generalist open", "model_size_in_b": 7},
    "llama2_chat-13B": {"model_type": "generalist open", "model_size_in_b": 13},
    "llama2_chat-70B": {"model_type": "generalist open", "model_size_in_b": 70},
    "llama3_instruct-8B": {"model_type": "generalist open", "model_size_in_b": 8},
    "llama3_instruct-70B": {"model_type": "generalist open", "model_size_in_b": 70},
    "med42-8B": {"model_type": "biomedical open", "model_size_in_b": 8},
    "med42-70B": {"model_type": "biomedical open", "model_size_in_b": 70},
    "mistral_instruct7B": {"model_type": "generalist open", "model_size_in_b": 7},
    "olmo2_instruct-7B": {"model_type": "generalist open", "model_size_in_b": 7},
    "olmo2_instruct-13B": {"model_type": "generalist open", "model_size_in_b": 13},
    "openbiollm-8B": {"model_type": "biomedical open", "model_size_in_b": 8},
    "openbiollm-70B": {"model_type": "biomedical open", "model_size_in_b": 70}
}

## Spin Detection Task

In [65]:
detection_stats_df = pd.read_json("./eval_outputs/detection_overall_metrics.json", orient="index")

detection_stats_df["model_name"] = detection_stats_df.index
detection_stats_df["model_type"] = detection_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
detection_stats_df["model_size_in_b"] = detection_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
detection_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(detection_stats_df)}")

detection_stats_df.sort_index(inplace=True) # alphabetical order
detection_stats_df

Number of models: 22


Unnamed: 0,accuracy,precision,recall,f1,model_name,model_type,model_size_in_b
0,0.783333,0.707317,0.966667,0.816901,gpt4o,generalist closed,
1,0.85,0.783784,0.966667,0.865672,gpt4o-mini,generalist closed,
2,0.516667,1.0,0.033333,0.064516,gpt35,generalist closed,175.0
3,0.733333,0.652174,1.0,0.789474,gemini_1.5_flash,generalist closed,
4,0.833333,0.794118,0.9,0.84375,gemini_1.5_flash-8B,generalist closed,8.0
5,0.966667,1.0,0.933333,0.965517,claude_3.5-sonnet,generalist closed,
6,0.566667,0.535714,1.0,0.697674,claude_3.5-haiku,generalist closed,
7,0.516667,1.0,0.033333,0.064516,biomistral7B,biomedical open,7.0
8,0.566667,0.537037,0.966667,0.690476,llama2_chat-13B,generalist open,13.0
9,0.633333,0.58,0.966667,0.725,llama2_chat-70B,generalist open,70.0


### Average of accuracy, precision, recall, and F1 score by model type

In [66]:
# Group by model type and calculate mean accuracy and standard deviation
accuracy_by_model_type = detection_stats_df.groupby('model_type')['accuracy'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
accuracy_by_model_type.columns = ['model_type', 'mean_accuracy', 'std_deviation']

print(accuracy_by_model_type)

          model_type  mean_accuracy  std_deviation
0    biomedical open       0.632163       0.141272
1  generalist closed       0.750000       0.159861
2    generalist open       0.635417       0.140418


In [67]:
# Group by model type and calculate mean precision and standard deviation
precision_by_model_type = detection_stats_df.groupby('model_type')['precision'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
precision_by_model_type.columns = ['model_type', 'mean_precision', 'std_deviation']

print(precision_by_model_type)

          model_type  mean_precision  std_deviation
0    biomedical open        0.812759       0.219535
1  generalist closed        0.781872       0.172379
2    generalist open        0.702130       0.364676


In [68]:
# Group by model type and calculate mean recall and standard deviation
recall_by_model_type = detection_stats_df.groupby('model_type')['recall'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
recall_by_model_type.columns = ['model_type', 'mean_recall', 'std_deviation']

print(recall_by_model_type)

          model_type  mean_recall  std_deviation
0    biomedical open     0.576190       0.383799
1  generalist closed     0.828571       0.352467
2    generalist open     0.587500       0.406666


In [69]:
# Group by model type and calculate mean f1 and standard deviation
f1_by_model_type = detection_stats_df.groupby('model_type')['f1'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
f1_by_model_type.columns = ['model_type', 'mean_f1', 'std_deviation']

print(f1_by_model_type)

          model_type   mean_f1  std_deviation
0    biomedical open  0.547278       0.275737
1  generalist closed  0.720501       0.300329
2    generalist open  0.539761       0.322221


#### Plots

In [70]:
# Create a dictionary for custom labels
custom_labels = {
    "alpacare-7B": "AlpaCare 7B",
    "biomedgpt7B": "BioMedGPT 7B",
    "biomistral7B": "BioMistral 7B",
    "claude_3.5-haiku": "Claude3.5 Haiku", # 175?
    "claude_3.5-sonnet": "Claude3.5 Sonnet", # 175?
    "gemini_1.5_flash": "Gemini1.5 Flash",
    "gemini_1.5_flash-8B": "Gemini1.5 Flash 8B",
    "gpt4o": "GPT4o", # around 1 trillion (1000B)
    "gpt4o-mini": "GPT4o Mini", # 175?
    "gpt35": "GPT3.5",
    "llama2_chat-7B": "Llama2 Chat 7B",
    "llama2_chat-13B": "Llama2 Chat 13B",
    "llama2_chat-70B": "Llama2 Chat 70B",
    "llama3_instruct-8B": "Llama3 Instruct 8B",
    "llama3_instruct-70B": "Llama3 Instruct 70B",
    "med42-8B": "Med42 8B",
    "med42-70B": "Med42 70B",
    "mistral_instruct7B": "Mistral Instruct 7B",
    "olmo2_instruct-7B": "Olmo2 Instruct 7B",
    "olmo2_instruct-13B": "Olmo2 Instruct 13B",
    "openbiollm-8B": "OpenBioLM 8B",
    "openbiollm-70B": "OpenBioLM 70B"
}

detection_stats_df['model_name_custom'] = detection_stats_df['model_name'].map(custom_labels)

color_mapping = {
    'biomedical open': '#0868ac', 
    'generalist closed': '#7bccc4',
    'generalist open': '#bae4bc',
}

# Create the bar chart
chart = alt.Chart(detection_stats_df).mark_bar().encode(
    y=alt.Y('model_name_custom:N', sort='-x', title='Model Name'),
    x=alt.X('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_type:N', title='Model Type',
                    scale=alt.Scale(domain=list(color_mapping.keys()), range=list(color_mapping.values())),
                    legend=alt.Legend(
                    orient='none',
                    legendX=130, legendY=-45,
                    direction='horizontal',
                    titleAnchor='middle'))  # Legend at the bottom
).properties(
    width=800,
)

# Add value labels with increased font size
text = chart.mark_text(
    align='center',
    baseline='middle',
    fontWeight='bold',
    dx=20  # Adjust the position of the text
).encode(
    text=alt.Text('accuracy:Q', format='.2f'),
    color=alt.value('black'),
)

# Add a mean rule
avg_rule = alt.Chart(detection_stats_df).mark_rule(color='red').encode(
    x='mean(accuracy):Q',
    size=alt.value(2)
)

# Add a 50% chance rule
chance_rule = alt.Chart(detection_stats_df).mark_rule(color='gray').encode(
    x='min(accuracy):Q',
    size=alt.value(2),
    strokeDash=alt.value([10, 10])
)

# Increase font size for axis labels, titles, and other components
chart_config = {
    "axis": {"labelFontSize": 20, "titleFontSize": 22},  # Axis labels and titles
    "header": {"labelFontSize": 20, "titleFontSize": 22},  # Title and facet headers (if any)
    "legend": {"labelFontSize": 18, "titleFontSize": 20},  # Legend labels and titles
    "text": {"fontSize": 20},  # Text mark size
}

# Combine chart and text, and apply the config
c_t = chart + avg_rule + chance_rule + text
c_t = c_t.configure(**chart_config)  # Apply the global configuration

# Save to HTML
c_t.save("./plots/detection_accuracy_by_model.html")

# Display the chart
c_t

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [71]:
# # Create the bar chart
# chart = alt.Chart(detection_stats_df).mark_bar().encode(
#     y=alt.Y('model_name_custom:N', sort='-x', title='Model Name'),
#     x=alt.X('accuracy:Q', title='Accuracy'),
#    color=alt.Color('model_type:N', title='Model Type', legend=alt.Legend(
#         orient='none',
#         legendX=130, legendY=-45,
#         direction='horizontal',
#         titleAnchor='middle'), scale=alt.Scale(range=["#808080", "#A9A9A9", "#D3D3D3", "#BEBEBE"]))  # Legend at the bottom
# ).properties(
#     width=800,
# )

# # Add value labels with increased font size
# text = chart.mark_text(
#     align='center',
#     baseline='middle',
#     fontWeight='bold',
#     dx=18  # Adjust the position of the text
# ).encode(
#     text=alt.Text('accuracy:Q', format='.2f'),
#     color=alt.value('black')  # Set text color to black
# )

# # Add a mean rule
# rule = alt.Chart(detection_stats_df).mark_rule(color='gray').encode(
#     x='mean(accuracy):Q',
#     size=alt.value(2)
# )

# # Increase font size for axis labels, titles, and other components
# chart_config = {
#     "axis": {"labelFontSize": 20, "titleFontSize": 22},  # Axis labels and titles
#     "header": {"labelFontSize": 20, "titleFontSize": 22},  # Title and facet headers (if any)
#     "legend": {"labelFontSize": 18, "titleFontSize": 20},  # Legend labels and titles
#     "text": {"fontSize": 20},  # Text mark size
# }

# # Combine chart and text, and apply the config
# c_t = chart + rule + text
# c_t = c_t.configure(**chart_config)  # Apply the global configuration

# # Save to HTML
# c_t.save("./plots/detection_accuracy_by_model_gray.html")

# # Display the chart
# c_t

In [72]:
# Plot average accuracy by model_type and add error bars
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_type:N', title='Model Type', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('mean(accuracy):Q', title='Mean Accuracy'),
    color=alt.Color('model_type:N', title='Model Type', legend=None)
).properties(
    title='Average Accuracy by Model Type',
    width=800  # Set the width to 800 pixels
)

error_bars = alt.Chart(detection_stats_df).mark_errorbar(extent='stdev').encode(
    x=alt.X('model_type:N'),
    y=alt.Y('accuracy:Q')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('mean(accuracy):Q', format='.2f')
)

alt.layer(bars, error_bars, text)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Average of accuracy, precision, recall, and F1 score by model size

In [73]:
# model size in buckets (0-10B, 11-20B, 22-100B, 100B+/NaN)
def model_size_bucket(model_size): 
    if model_size is None or pd.isna(model_size):
        return "Unknown"
    elif model_size >= 100:
        return "100B+"
    elif model_size <= 10:
        return "0-10B"
    elif model_size <= 20:
        return "11-20B"
    else:
        return "21-100B"

In [74]:
# average accuracy by model size
detection_stats_df["model_size_bucket"] = detection_stats_df["model_size_in_b"].map(model_size_bucket)

accuracy_by_model_size = detection_stats_df.groupby('model_size_bucket')['accuracy'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
accuracy_by_model_size.columns = ['model_size_bucket', 'mean_accuracy', 'std_deviation']

print(accuracy_by_model_size)

  model_size_bucket  mean_accuracy  std_deviation
0             0-10B       0.615847       0.137000
1             100B+       0.516667            NaN
2            11-20B       0.541667       0.035355
3           21-100B       0.775000       0.095743
4           Unknown       0.780000       0.147855


In [75]:
# average precision by model size
precision_by_model_size = detection_stats_df.groupby('model_size_bucket')['precision'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
precision_by_model_size.columns = ['model_size_bucket', 'mean_precision', 'std_deviation']

print(precision_by_model_size)

  model_size_bucket  mean_precision  std_deviation
0             0-10B        0.721719       0.337294
1             100B+        1.000000            NaN
2            11-20B        0.768519       0.327364
3           21-100B        0.836558       0.180942
4           Unknown        0.735798       0.173164


In [76]:
# average recall by model size
recall_by_model_size = detection_stats_df.groupby('model_size_bucket')['recall'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
recall_by_model_size.columns = ['model_size_bucket', 'mean_recall', 'std_deviation']

print(recall_by_model_size)

  model_size_bucket  mean_recall  std_deviation
0             0-10B     0.553333       0.404969
1             100B+     0.033333            NaN
2            11-20B     0.500000       0.659966
3           21-100B     0.775000       0.142400
4           Unknown     0.973333       0.027889


In [77]:
# average f1 score by model size 
f1_by_model_size_bucket = detection_stats_df.groupby('model_size_bucket')['f1'].agg(['mean', 'std']).reset_index()

# Rename columns for clarity
f1_by_model_size_bucket.columns = ['model_size_bucket', 'mean_f1', 'std_deviation']

print(f1_by_model_size_bucket)

  model_size_bucket   mean_f1  std_deviation
0             0-10B  0.511597       0.294719
1             100B+  0.064516            NaN
2            11-20B  0.377496       0.442621
3           21-100B  0.780454       0.043987
4           Unknown  0.827048       0.098638


#### Plots

In [78]:
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_name:N', sort='-y', title='Model Name'),
    y=alt.Y('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_size_bucket:N', title='Model Size Bucket')
).properties(
    title='Accuracy by Model Size Bucket',
    width=800,
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('accuracy:Q', format='.2f')
)

bars + text

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [79]:
# Plot average accuracy by model_size_bucket and add error bars
bars = alt.Chart(detection_stats_df).mark_bar().encode(
    x=alt.X('model_size_bucket:N', title='Model Size Bucket', axis=alt.Axis(labelAngle=0), sort=['0-10B', '11-20B', '21-100B', '100B+', 'Unknown']),
    y=alt.Y('mean(accuracy):Q', title='Mean Accuracy'),
    color=alt.Color('model_size_bucket:N', title='Model Size Bucket', legend=None)
).properties(
    title='Average Accuracy by Model Size',
    width=800  # Set the width to 800 pixels
)

error_bars = alt.Chart(detection_stats_df).mark_errorbar(extent='stdev').encode(
    x=alt.X('model_size_bucket:N', sort=['0-10B', '11-20B', '21-100B', '100B+', 'Unknown']),
    y=alt.Y('accuracy:Q')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    dy=-5  # Adjust the position of the text
).encode(
    text=alt.Text('mean(accuracy):Q', format='.2f')
)

alt.layer(bars, error_bars, text)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [80]:
# scatter plot of model size vs accuracy with model names as labels
scatter_plot = alt.Chart(detection_stats_df).mark_circle().encode(
    x=alt.X('model_size_in_b:Q', title='Model Size (in Billion Parameters)'),
    y=alt.Y('accuracy:Q', title='Accuracy'),
    color=alt.Color('model_type:N', title='Model Type')
).properties(
    title='Model Size vs Accuracy',
    width=800,  # Set the width to 800 pixels
    height=400  # Set the height to 400 pixels
)

text = scatter_plot.mark_text(
    align='left',
    baseline='middle',
    dx=7,  # Adjust the position of the text
    dy=-5,  # Adjust the vertical position of the text
).encode(
    text='model_name:N'
)

scatter_plot + text

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## RCT Trial Result Interpretation Task

In [81]:
# "interpretation_overall_metrics.json"
# "gold_labelled_interpretation_overall_metrics.json"
# "model_output_labelled_interpretation_overall_metrics.json"

interpretation_stats_df = pd.read_json("./eval_outputs/interpretation_overall_metrics.json", orient="index")

interpretation_stats_df["model_name"] = interpretation_stats_df.index
interpretation_stats_df["model_type"] = interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
interpretation_stats_df["model_size_in_b"] = interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
interpretation_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(interpretation_stats_df)}")

interpretation_stats_df.sort_index(inplace=True) # alphabetical order
interpretation_stats_df

Number of models: 22


Unnamed: 0,benefit_answer_mean_diff,rigor_answer_mean_diff,importance_answer_mean_diff,full_text_answer_mean_diff,another_trial_answer_mean_diff,overall_mean_diff_avg,model_name,model_type,model_size_in_b
0,3.133333,0.1,1.233333,2.866667,3.333333,2.133333,gpt4o,generalist closed,
1,3.566667,1.466667,2.733333,3.933333,3.866667,3.113333,gpt4o-mini,generalist closed,
2,3.9,1.433333,2.066667,2.6,3.766667,2.753333,gpt35,generalist closed,175.0
3,2.5,-0.1,2.166667,3.0,3.7,2.253333,gemini_1.5_flash,generalist closed,
4,3.066667,-0.1,0.966667,2.733333,3.433333,2.02,gemini_1.5_flash-8B,generalist closed,8.0
5,2.5,-0.166667,-0.633333,3.233333,2.866667,1.56,claude_3.5-sonnet,generalist closed,
6,2.966667,-0.033333,0.466667,1.3,2.166667,1.373333,claude_3.5-haiku,generalist closed,
7,6.051724,0.266667,0.8,0.0,,,alpacare-7B,biomedical open,7.0
8,1.666667,0.35,1.116667,1.035714,1.666667,1.167143,biomistral7B,biomedical open,7.0
9,3.5,0.5,1.066667,0.333333,1.7,1.42,llama2_chat-7B,generalist open,7.0


In [82]:
human_expert_stats = {
        "benefit_answer": {"mean_diff": 0.71, "ci_lower": 0.07, "ci_upper": 1.35},
        "rigor_answer": {"mean_diff": -0.59, "ci_lower": -1.13, "ci_upper": -0.05},
        "importance_answer": {"mean_diff": -0.38, "ci_lower": -0.95, "ci_upper": 0.19},
        "full_text_answer": {"mean_diff": 0.77, "ci_lower": 0.08, "ci_upper": 1.47},
        "another_trial_answer": {"mean_diff": 0.64, "ci_lower": -0.03, "ci_upper": 1.31}
    }

human_expert_stats_df = pd.DataFrame(human_expert_stats).T
human_expert_stats_df["metric"] = human_expert_stats_df.index
# remove index
human_expert_stats_df.reset_index(drop=True, inplace=True)
human_expert_stats_df["method"] = "human experts"

human_expert_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,0.71,0.07,1.35,benefit_answer,human experts
1,-0.59,-1.13,-0.05,rigor_answer,human experts
2,-0.38,-0.95,0.19,importance_answer,human experts
3,0.77,0.08,1.47,full_text_answer,human experts
4,0.64,-0.03,1.31,another_trial_answer,human experts


In [83]:
def calculate_confidence_interval(df, df_column_name):
    mean_diff = df[df_column_name].mean()  # Calculate the mean
    std_dev = df[df_column_name].std()  # Calculate the standard deviation
    n = len(df[df_column_name])  # Sample size

    # Calculate the margin of error for 95% CI (z = 1.96)
    z = 1.96
    margin_of_error = z * (std_dev / sqrt(n))

    # Calculate the 95% Confidence Interval
    ci_lower = mean_diff - margin_of_error
    ci_upper = mean_diff + margin_of_error

    return ci_lower, ci_upper

In [84]:
# calculate the average of all model metrics and calculate 95% CI
average_model_benefit = interpretation_stats_df["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(interpretation_stats_df, "benefit_answer_mean_diff")

average_model_rigor = interpretation_stats_df["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(interpretation_stats_df, "rigor_answer_mean_diff")

average_model_importance = interpretation_stats_df["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(interpretation_stats_df, "importance_answer_mean_diff")

average_model_full_text = interpretation_stats_df["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(interpretation_stats_df, "full_text_answer_mean_diff")

average_model_another_trial = interpretation_stats_df["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(interpretation_stats_df, "another_trial_answer_mean_diff")

model_stats = {
    "benefit_answer": {"mean_diff": average_model_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

model_stats_df = pd.DataFrame(model_stats).T
model_stats_df["metric"] = model_stats_df.index
# remove index
model_stats_df.reset_index(drop=True, inplace=True)
model_stats_df["method"] = "all LLMs"

model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,3.381165,2.87468,3.88765,benefit_answer,all LLMs
1,0.277165,0.088599,0.46573,rigor_answer,all LLMs
2,1.264057,0.92272,1.605395,importance_answer,all LLMs
3,2.205034,1.605838,2.80423,full_text_answer,all LLMs
4,2.9625,2.496102,3.428898,another_trial_answer,all LLMs


In [85]:
# get average and 95% CI by model_type from interpretation_stats_df
average_benefit_by_model_type = interpretation_stats_df.groupby('model_type')['benefit_answer_mean_diff'].mean().reset_index()
average_benefit_by_model_type.columns = ['method', 'mean_diff']
ci_lower_benefit_by_model_type, ci_upper_benefit_by_model_type = calculate_confidence_interval(average_benefit_by_model_type, "mean_diff")
average_benefit_by_model_type['ci_lower'] = ci_lower_benefit_by_model_type
average_benefit_by_model_type['ci_upper'] = ci_upper_benefit_by_model_type
average_benefit_by_model_type['metric'] = 'benefit_answer'

average_rigor_by_model_type = interpretation_stats_df.groupby('model_type')['rigor_answer_mean_diff'].mean().reset_index()
average_rigor_by_model_type.columns = ['method', 'mean_diff']
ci_lower_rigor_by_model_type, ci_upper_rigor_by_model_type = calculate_confidence_interval(average_rigor_by_model_type, "mean_diff")
average_rigor_by_model_type['ci_lower'] = ci_lower_rigor_by_model_type
average_rigor_by_model_type['ci_upper'] = ci_upper_rigor_by_model_type
average_rigor_by_model_type['metric'] = 'rigor_answer'

average_importance_by_model_type = interpretation_stats_df.groupby('model_type')['importance_answer_mean_diff'].mean().reset_index()
average_importance_by_model_type.columns = ['method', 'mean_diff']
ci_lower_importance_by_model_type, ci_upper_importance_by_model_type = calculate_confidence_interval(average_importance_by_model_type, "mean_diff")
average_importance_by_model_type['ci_lower'] = ci_lower_importance_by_model_type
average_importance_by_model_type['ci_upper'] = ci_upper_importance_by_model_type
average_importance_by_model_type['metric'] = 'importance_answer'

average_full_text_by_model_type = interpretation_stats_df.groupby('model_type')['full_text_answer_mean_diff'].mean().reset_index()
average_full_text_by_model_type.columns = ['method', 'mean_diff']
ci_lower_full_text_by_model_type, ci_upper_full_text_by_model_type = calculate_confidence_interval(average_full_text_by_model_type, "mean_diff")
average_full_text_by_model_type['ci_lower'] = ci_lower_full_text_by_model_type
average_full_text_by_model_type['ci_upper'] = ci_upper_full_text_by_model_type
average_full_text_by_model_type['metric'] = 'full_text_answer'

average_another_trial_by_model_type = interpretation_stats_df.groupby('model_type')['another_trial_answer_mean_diff'].mean().reset_index()
average_another_trial_by_model_type.columns = ['method', 'mean_diff']
ci_lower_another_trial_by_model_type, ci_upper_another_trial_by_model_type = calculate_confidence_interval(average_another_trial_by_model_type, "mean_diff")
average_another_trial_by_model_type['ci_lower'] = ci_lower_another_trial_by_model_type
average_another_trial_by_model_type['ci_upper'] = ci_upper_another_trial_by_model_type
average_another_trial_by_model_type['metric'] = 'another_trial_answer'

In [86]:
average_by_model_type = pd.concat([average_benefit_by_model_type, average_rigor_by_model_type, average_importance_by_model_type, average_full_text_by_model_type, average_another_trial_by_model_type], ignore_index=True)

average_by_model_type

Unnamed: 0,method,mean_diff,ci_lower,ci_upper,metric
0,biomedical open,3.413711,3.075071,3.665747,benefit_answer
1,generalist closed,3.090476,3.075071,3.665747,benefit_answer
2,generalist open,3.60704,3.075071,3.665747,benefit_answer
3,biomedical open,0.166327,0.159475,0.393473,rigor_answer
4,generalist closed,0.371429,0.159475,0.393473,rigor_answer
5,generalist open,0.291667,0.159475,0.393473,rigor_answer
6,biomedical open,1.015608,0.999924,1.509291,importance_answer
7,generalist closed,1.285714,0.999924,1.509291,importance_answer
8,generalist open,1.4625,0.999924,1.509291,importance_answer
9,biomedical open,1.482489,1.441822,2.958409,full_text_answer


In [87]:
#combine all the dataframes
model_stats_final_df = pd.concat([human_expert_stats_df, model_stats_df, average_by_model_type], ignore_index=True)
#drop "_answer" from the values in metric column
model_stats_final_df['metric'] = model_stats_final_df['metric'].str.replace('_answer', '')

model_stats_final_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,0.71,0.07,1.35,benefit,human experts
1,-0.59,-1.13,-0.05,rigor,human experts
2,-0.38,-0.95,0.19,importance,human experts
3,0.77,0.08,1.47,full_text,human experts
4,0.64,-0.03,1.31,another_trial,human experts
5,3.381165,2.87468,3.88765,benefit,all LLMs
6,0.277165,0.088599,0.46573,rigor,all LLMs
7,1.264057,0.92272,1.605395,importance,all LLMs
8,2.205034,1.605838,2.80423,full_text,all LLMs
9,2.9625,2.496102,3.428898,another_trial,all LLMs


### Plots

In [88]:
# Create a mapping for custom facet titles
facet_title_mapping = {
    'benefit': 'Treatment Benefit',
    'rigor': 'Study Rigor',
    'importance': 'Study Importance',
    'full_text': 'Interest to Read Full-Text',
    'another_trial': 'Interest to Run Another Trial'
}

# Define the desired order for the facets
facet_order = ['Treatment Benefit', 'Study Rigor', 'Study Importance', 'Interest to Read Full-Text', 'Interest to Run Another Trial']

color_mapping = {
    'human experts': '#0868ac', 
    'all LLMs': '#43a2ca',  
    'generalist closed': '#7bccc4',  
    'generalist open': '#bae4bc',  
    'biomedical open': '#E3F4D4'
}

method_order = ['human experts', 'all LLMs', 'generalist closed', 'generalist open', 'biomedical open']

# Apply the mapping as a calculated field
chart_data = model_stats_final_df.copy()
chart_data['metric'] = chart_data['metric'].map(facet_title_mapping)

# Configure global font sizes
chart_config = {
    "axis": {"labelFontSize": 20, "titleFontSize": 22},  # Axis labels and titles
    "header": {"labelFontSize": 20, "titleFontSize": 22},  # Facet headers
    "legend": {"labelFontSize": 18, "titleFontSize": 20},  # Legend labels and titles
    "text": {"fontSize": 20},  # Text mark size
}

# Bar chart
bars = alt.Chart(chart_data).mark_bar().encode(
    x=alt.X('method:N', title=None, axis=alt.Axis(labelAngle=-45), sort=method_order),
    y=alt.Y('mean_diff:Q', title='Mean Difference'),
    color=alt.Color('method:N', title='Method', legend=None, scale=alt.Scale(domain=list(color_mapping.keys()), range=list(color_mapping.values())))
).properties(
    width=300,  # Set the width to 300 pixels
    height=300  # Set the height to 300 pixels
)

# Error bars
error_bars = alt.Chart(chart_data).mark_errorbar().encode(
    alt.X("method:N", sort=method_order),
    alt.Y("ci_lower:Q").title("Mean Difference"),
    alt.Y2("ci_upper:Q"),
    strokeWidth=alt.value(2),
    color=alt.value('gray')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    fontWeight='bold',
    dy=alt.expr(expr=alt.expr.if_(alt.datum.mean_diff >= 0, -1, 20))  # Adjust the position of the text    
).encode(
    text=alt.Text('mean_diff:Q', format='.2f'),
    color=alt.value('black')  # Set text color to black
)

# Combine layers and facet
chart = alt.layer(bars, error_bars, text, data=chart_data).facet(
    column=alt.Column('metric:N', title=None, sort=facet_order),
).configure(**chart_config)  # Apply the global configuration

# save to html
chart.save("./plots/interpretation_by_measures.html")

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


### Mitigation Strategies Results

In [89]:
model_stats_df['method'] = 'baseline'

In [90]:
# "interpretation_overall_metrics.json"
# "gold_labelled_interpretation_overall_metrics.json"
# "model_output_labelled_interpretation_overall_metrics.json"

gold_labelled_interpretation_stats_df = pd.read_json("./eval_outputs/gold_labelled_interpretation_overall_metrics.json", orient="index")

gold_labelled_interpretation_stats_df["model_name"] = gold_labelled_interpretation_stats_df.index
gold_labelled_interpretation_stats_df["model_type"] = gold_labelled_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
gold_labelled_interpretation_stats_df["model_size_in_b"] = gold_labelled_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
gold_labelled_interpretation_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(gold_labelled_interpretation_stats_df)}")

gold_labelled_interpretation_stats_df.sort_index(inplace=True) # alphabetical order
# gold_labelled_interpretation_stats_df

Number of models: 22


In [91]:
# calculate the average of all model metrics and calculate 95% CI
average_model_benefit = gold_labelled_interpretation_stats_df["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(gold_labelled_interpretation_stats_df, "benefit_answer_mean_diff")

average_model_rigor = gold_labelled_interpretation_stats_df["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(gold_labelled_interpretation_stats_df, "rigor_answer_mean_diff")

average_model_importance = gold_labelled_interpretation_stats_df["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(gold_labelled_interpretation_stats_df, "importance_answer_mean_diff")

average_model_full_text = gold_labelled_interpretation_stats_df["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(gold_labelled_interpretation_stats_df, "full_text_answer_mean_diff")

average_model_another_trial = gold_labelled_interpretation_stats_df["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(gold_labelled_interpretation_stats_df, "another_trial_answer_mean_diff")

model_stats = {
    "benefit_answer": {"mean_diff": average_model_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

gold_labelled_model_stats_df = pd.DataFrame(model_stats).T
gold_labelled_model_stats_df["metric"] = gold_labelled_model_stats_df.index
# remove index
gold_labelled_model_stats_df.reset_index(drop=True, inplace=True)
gold_labelled_model_stats_df["method"] = "+ ref labels"

gold_labelled_model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,1.939576,1.467239,2.411913,benefit_answer,+ ref labels
1,-1.43268,-1.804166,-1.061195,rigor_answer,+ ref labels
2,-0.446717,-0.918034,0.024599,importance_answer,+ ref labels
3,0.287751,-0.257165,0.832668,full_text_answer,+ ref labels
4,1.094828,0.541371,1.648284,another_trial_answer,+ ref labels


In [92]:
# "interpretation_overall_metrics.json"
# "gold_labelled_interpretation_overall_metrics.json"
# "model_output_labelled_interpretation_overall_metrics.json"

model_output_labelled_interpretation_stats_df = pd.read_json("./eval_outputs/model_output_labelled_interpretation_overall_metrics.json", orient="index")

model_output_labelled_interpretation_stats_df["model_name"] = model_output_labelled_interpretation_stats_df.index
model_output_labelled_interpretation_stats_df["model_type"] = model_output_labelled_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
model_output_labelled_interpretation_stats_df["model_size_in_b"] = model_output_labelled_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
model_output_labelled_interpretation_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(model_output_labelled_interpretation_stats_df)}")

model_output_labelled_interpretation_stats_df.sort_index(inplace=True) # alphabetical order
# model_output_labelled_interpretation_stats_df

Number of models: 22


In [93]:
# calculate the average of all model metrics and calculate 95% CI
average_model_benefit = model_output_labelled_interpretation_stats_df["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(model_output_labelled_interpretation_stats_df, "benefit_answer_mean_diff")

average_model_rigor = model_output_labelled_interpretation_stats_df["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(model_output_labelled_interpretation_stats_df, "rigor_answer_mean_diff")

average_model_importance = model_output_labelled_interpretation_stats_df["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(model_output_labelled_interpretation_stats_df, "importance_answer_mean_diff")

average_model_full_text = model_output_labelled_interpretation_stats_df["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(model_output_labelled_interpretation_stats_df, "full_text_answer_mean_diff")

average_model_another_trial = model_output_labelled_interpretation_stats_df["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(model_output_labelled_interpretation_stats_df, "another_trial_answer_mean_diff")

model_stats = {
    "benefit_answer": {"mean_diff": average_model_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

model_output_labelled_model_stats_df = pd.DataFrame(model_stats).T
model_output_labelled_model_stats_df["metric"] = model_output_labelled_model_stats_df.index
# remove index
model_output_labelled_model_stats_df.reset_index(drop=True, inplace=True)
model_output_labelled_model_stats_df["method"] = "+ model labels"

model_output_labelled_model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,2.588531,2.070864,3.106198,benefit_answer,+ model labels
1,-0.251387,-0.426146,-0.076627,rigor_answer,+ model labels
2,0.771471,0.488157,1.054785,importance_answer,+ model labels
3,1.357926,0.830169,1.885683,full_text_answer,+ model labels
4,2.147018,1.642268,2.651768,another_trial_answer,+ model labels


In [94]:
# "interpretation_overall_metrics.json"
# "gold_labelled_interpretation_overall_metrics.json"
# "model_output_labelled_interpretation_overall_metrics.json"
# "combined_detection_interpretation_overall_metrics.json"

combined_interpretation_stats_df = pd.read_json("./eval_outputs/combined_detection_interpretation_overall_metrics.json", orient="index")

combined_interpretation_stats_df["model_name"] = combined_interpretation_stats_df.index
combined_interpretation_stats_df["model_type"] = combined_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_type"])
combined_interpretation_stats_df["model_size_in_b"] = combined_interpretation_stats_df.index.map(lambda x: model_metadata[x]["model_size_in_b"])
# remove index
combined_interpretation_stats_df.reset_index(drop=True, inplace=True)

print(f"Number of models: {len(combined_interpretation_stats_df)}")

combined_interpretation_stats_df.sort_index(inplace=True) # alphabetical order
# combined_interpretation_stats_df

Number of models: 7


In [95]:
# calculate the average of all model metrics and calculate 95% CI
average_model_benefit = combined_interpretation_stats_df["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(combined_interpretation_stats_df, "benefit_answer_mean_diff")

average_model_rigor = combined_interpretation_stats_df["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(combined_interpretation_stats_df, "rigor_answer_mean_diff")

average_model_importance = combined_interpretation_stats_df["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(combined_interpretation_stats_df, "importance_answer_mean_diff")

average_model_full_text = combined_interpretation_stats_df["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(combined_interpretation_stats_df, "full_text_answer_mean_diff")

average_model_another_trial = combined_interpretation_stats_df["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(combined_interpretation_stats_df, "another_trial_answer_mean_diff")

model_stats = {
    "benefit_answer": {"mean_diff": average_model_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

combined_interpretation_stats_df = pd.DataFrame(model_stats).T
combined_interpretation_stats_df["metric"] = combined_interpretation_stats_df.index
# remove index
combined_interpretation_stats_df.reset_index(drop=True, inplace=True)
combined_interpretation_stats_df["method"] = "detect + interpret"

combined_interpretation_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,1.052381,0.497852,1.60691,benefit_answer,detect + interpret
1,-1.928571,-2.58693,-1.270213,rigor_answer,detect + interpret
2,-0.233333,-1.164081,0.697414,importance_answer,detect + interpret
3,-0.842857,-2.216687,0.530972,full_text_answer,detect + interpret
4,0.819048,0.163872,1.474223,another_trial_answer,detect + interpret


In [96]:
all_results = pd.concat([human_expert_stats_df, model_stats_df, gold_labelled_model_stats_df, model_output_labelled_model_stats_df, combined_interpretation_stats_df], ignore_index=True)

all_results

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,method
0,0.71,0.07,1.35,benefit_answer,human experts
1,-0.59,-1.13,-0.05,rigor_answer,human experts
2,-0.38,-0.95,0.19,importance_answer,human experts
3,0.77,0.08,1.47,full_text_answer,human experts
4,0.64,-0.03,1.31,another_trial_answer,human experts
5,3.381165,2.87468,3.88765,benefit_answer,baseline
6,0.277165,0.088599,0.46573,rigor_answer,baseline
7,1.264057,0.92272,1.605395,importance_answer,baseline
8,2.205034,1.605838,2.80423,full_text_answer,baseline
9,2.9625,2.496102,3.428898,another_trial_answer,baseline


In [97]:
# Create a mapping for custom facet titles
facet_title_mapping = {
    'benefit_answer': 'Treatment Benefit',
    'rigor_answer': 'Study Rigor',
    'importance_answer': 'Study Importance',
    'full_text_answer': 'Interest to Read Full-Text',
    'another_trial_answer': 'Interest to Run Another Trial'
}

# Define the desired order for the facets
facet_order = ['Treatment Benefit', 'Study Rigor', 'Study Importance', 'Interest to Read Full-Text', 'Interest to Run Another Trial']

color_mapping = {
    'human experts': '#0868ac',  
    'baseline': '#43a2ca',  
    '+ ref labels': '#7bccc4',  
    '+ model labels': '#bae4bc', 
    'detect + interpret': '#E3F4D4'  
}

method_order = ['human experts', 'baseline', '+ ref labels', '+ model labels', 'detect + interpret']

# Apply the mapping as a calculated field
chart_data = all_results.copy()
chart_data['metric'] = chart_data['metric'].map(facet_title_mapping)

# Configure global font sizes
chart_config = {
    "axis": {"labelFontSize": 20, "titleFontSize": 22},  # Axis labels and titles
    "header": {"labelFontSize": 20, "titleFontSize": 22},  # Facet headers
    "legend": {"labelFontSize": 18, "titleFontSize": 20},  # Legend labels and titles
    "text": {"fontSize": 20},  # Text mark size
}

# Bar chart
bars = alt.Chart(chart_data).mark_bar().encode(
    x=alt.X('method:N', title=None, axis=alt.Axis(labelAngle=-45), sort = method_order),
    y=alt.Y('mean_diff:Q', title='Mean Difference'),
    color=alt.Color('method:N', title='Method', legend=None, scale=alt.Scale(domain=list(color_mapping.keys()), range=list(color_mapping.values())))
).properties(
    width=300,  # Set the width to 300 pixels
    height=300  # Set the height to 300 pixels
)

# Error bars
error_bars = alt.Chart(chart_data).mark_errorbar().encode(
    alt.X("method:N", sort = method_order),
    alt.Y("ci_lower:Q").title("Mean Difference"),
    alt.Y2("ci_upper:Q"),
    strokeWidth=alt.value(2),
    color=alt.value('gray')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    fontWeight='bold',
    dy=alt.expr(expr=alt.expr.if_(alt.datum.mean_diff >= 0, -1, 20))  # Adjust the position of the text    
).encode(
    text=alt.Text('mean_diff:Q', format='.2f'),
    color=alt.value('black')  # Set text color to black
)

# Combine layers and facet
chart = alt.layer(bars, error_bars, text, data=chart_data).facet(
    column=alt.Column('metric:N', title=None, sort=facet_order),
).configure(**chart_config)  # Apply the global configuration

# save to html
chart.save("./plots/interpretation_by_measures_all_methods.html")

chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [35]:
interpretation_stats_df["method_category"] = "baseline"
gold_labelled_interpretation_stats_df["method_category"] = "gold_labelled"
model_output_labelled_interpretation_stats_df["method_category"] = "model_output_labelled"

all_interpretation_stats_df = pd.concat([interpretation_stats_df, gold_labelled_interpretation_stats_df, model_output_labelled_interpretation_stats_df], ignore_index=True)

all_interpretation_stats_df

Unnamed: 0,benefit_answer_mean_diff,rigor_answer_mean_diff,importance_answer_mean_diff,full_text_answer_mean_diff,another_trial_answer_mean_diff,overall_mean_diff_avg,model_name,model_type,model_size_in_b,method_category,overall_avg
0,3.133333,0.100000,1.233333,2.866667,3.333333,2.133333,gpt4o,generalist closed,,baseline,
1,3.566667,1.466667,2.733333,3.933333,3.866667,3.113333,gpt4o-mini,generalist closed,,baseline,
2,3.900000,1.433333,2.066667,2.600000,3.766667,2.753333,gpt35,generalist closed,175.0,baseline,
3,2.500000,-0.100000,2.166667,3.000000,3.700000,2.253333,gemini_1.5_flash,generalist closed,,baseline,
4,3.066667,-0.100000,0.966667,2.733333,3.433333,2.020000,gemini_1.5_flash-8B,generalist closed,8.0,baseline,
...,...,...,...,...,...,...,...,...,...,...,...
61,4.633333,-0.233333,1.366667,1.900000,2.766667,,olmo2_instruct-13B,generalist open,13.0,model_output_labelled,2.086667
62,3.304348,-0.068966,1.518519,2.518519,1.800000,,openbiollm-8B,biomedical open,8.0,model_output_labelled,1.814484
63,3.833333,-0.183333,1.000000,0.615385,4.733333,,openbiollm-70B,biomedical open,70.0,model_output_labelled,1.999744
64,2.866667,-0.133333,1.533333,2.600000,2.333333,,mistral_instruct7B,generalist open,7.0,model_output_labelled,1.840000


In [36]:
# fit the model
model = smf.ols(formula="benefit_answer_mean_diff ~ method_category", 
                            data=all_interpretation_stats_df)
results = model.fit()

print(results.summary())


tukey_oneway = pairwise_tukeyhsd(endog = all_interpretation_stats_df["benefit_answer_mean_diff"], groups = all_interpretation_stats_df["method_category"])

# Display the results
tukey_oneway.summary()

                               OLS Regression Results                               
Dep. Variable:     benefit_answer_mean_diff   R-squared:                       0.203
Model:                                  OLS   Adj. R-squared:                  0.178
Method:                       Least Squares   F-statistic:                     8.036
Date:                      Mon, 03 Feb 2025   Prob (F-statistic):           0.000779
Time:                              17:25:07   Log-Likelihood:                -103.85
No. Observations:                        66   AIC:                             213.7
Df Residuals:                            63   BIC:                             220.3
Df Model:                                 2                                         
Covariance Type:                  nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

group1,group2,meandiff,p-adj,lower,upper,reject
baseline,gold_labelled,-1.4416,0.0005,-2.3062,-0.577,True
baseline,model_output_labelled,-0.7926,0.0789,-1.6572,0.0719,False
gold_labelled,model_output_labelled,0.649,0.1773,-0.2156,1.5135,False


## Relationship between spin detection and spin interpretation

Linear Regression with statsmodels Python package

In [37]:
# get all model names
model_names = model_metadata.keys()
# remove alpacare-13B
model_names = [x for x in model_names if x != "alpacare-13B"]

len(model_names)

22

In [38]:
measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]
gpt_models = ["gpt4o", "gpt4o-mini", "gpt35"]
huggingface_models = ["alpacare-7B", "biomedgpt7B", "biomistral7B", 
                      "llama2_chat-7B", "llama2_chat-13B", "llama2_chat-70B",
                      "llama3_instruct-8B", "llama3_instruct-70B",
                      "med42-8B", "med42-70B", "mistral_instruct7B", 
                      "olmo2_instruct-7B", "olmo2_instruct-13B",
                      "openbiollm-8B", "openbiollm-70B"]
no_probability_models = ["claude_3.5-haiku", "claude_3.5-sonnet", "gemini_1.5_flash", "gemini_1.5_flash-8B"]

def get_is_detection_correct(row):
    if row['abstract_type'] == "spin":
        return row['model_answer'] == "yes"
    else:
        return row['model_answer'] == "no"
    
def get_is_abstract_type_spin(row):
    return row['abstract_type'] == "spin"
    
def detection_probability_gpt(row):
    # find the first instance of "yes" or "no"
    token_probabilties = row['model_log_probabilities']
    for token_prob in token_probabilties:
        if token_prob['token'].lower() == "yes":
            return np.exp(token_prob['logprob'])
        elif token_prob['token'].lower() == "no":
            return np.exp(token_prob['logprob'])
    return None # this should not happen but just in case

def detection_probability_huggingface(row):
    # find the first instance of "yes" or "no"
    token_probabilties = row['model_log_probabilities']
    for token_prob in token_probabilties:
        if token_prob['token_string'].lower() == "yes":
            return token_prob['probability']
        elif token_prob['token_string'].lower() == "no":
            return token_prob['probability']
    return None # this should not happen but just in case


def prepare_data_for_regression(model_names):
    for model_name in tqdm(model_names):
        # print(f"Processing {model_name}...")
        final_data = []
        detection_output_file_path = f"./eval_outputs/{model_name}/{model_name}_detection_outputs.json"
        interpretation_output_file_path = f"./eval_outputs/{model_name}/{model_name}_interpretation_outputs.json"
        model_detection_data = pd.read_json(detection_output_file_path, orient="records")
        model_interpretation_data = pd.read_json(interpretation_output_file_path, orient="records")

        # merge model_detection_data and model_interpretation_data by PMID and abstract_type
        model_data = pd.merge(model_detection_data, model_interpretation_data, on=['PMID', 'abstract_type'])

        # loop through each row in model_data
        for _, row in model_data.iterrows():
            detection_model_prediction = 1 if row['model_answer'] == "yes" else 0
            is_detection_correct = 1 if get_is_detection_correct(row) else 0
            is_spin_in_abstract = 1 if get_is_abstract_type_spin(row) else 0

            if model_name in gpt_models:
                detection_probability = detection_probability_gpt(row)
            elif model_name in huggingface_models:
                detection_probability = detection_probability_huggingface(row)
            else:
                detection_probability = None
            
            for measure in measures:
                final_data.append({
                    "pmid": row['PMID'],
                    "measure": measure,
                    "is_spin_in_abstract": is_spin_in_abstract,
                    "is_detection_correct": is_detection_correct,
                    "detection_model_prediction": detection_model_prediction,
                    "detection_probability": detection_probability,
                    "interpretation_answer": float(row[measure]) if row[measure] != "" else None
                })
            # calculate the average of the differences
            answers = []
            for measure in measures:
                if row[measure] != "":
                    answers.append(float(row[measure]))
            if len(answers) > 0:
                avg_answer= round(np.mean(answers), 6)
            else:
                avg_answer = None
            # add the average difference to the data
            final_data.append({
                "pmid": row['PMID'],
                "measure": "overall",
                "is_spin_in_abstract": is_spin_in_abstract,
                "is_detection_correct": is_detection_correct,
                "detection_model_prediction": detection_model_prediction,
                "detection_probability": detection_probability,
                "interpretation_answer": avg_answer
            })

        # save the final data to a json file
        json_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
        save_dataset_to_csv(final_data, json_file_path)

In [39]:
prepare_data_for_regression(model_names=model_names)

100%|██████████| 22/22 [00:01<00:00, 14.44it/s]


#### Simplest Regression

Is spin in abstract and the measures answers

In [40]:
# for model_name in model_names:
#     output_string = ""
#     csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
#     data = pd.read_csv(csv_file_path)

#     measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
#     for measure in measures:
#         # get the data for the current measure
#         measure_data = data[data['measure'] == measure]
#         nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
#         # remove rows with NaN values in interpretation_answer
#         measure_data = measure_data.dropna(subset=['interpretation_answer'])

#         # check if there are less than 2 rows
#         if len(measure_data) < 2:
#             continue
        
#         # fit the model
#         model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract", 
#                                     data=measure_data)
#         results = model.fit()

#         output_string += f"Model: {model_name} - {measure}\n"
#         # print number of rows with NaN value(s)
#         output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
#         output_string += results.summary().as_text()
#         output_string += "\n"

#     # save the model summary
#     with open(f"./eval_outputs/{model_name}/{model_name}_simple_regression_summary.txt", "w") as f:
#         f.write(output_string)

  return 1 - self.ssr/self.centered_tss
  return self.mse_model/self.mse_resid
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return np.sqrt(eigvals[0]/eigvals[-1])
  k, _ = kurtosistest(a, axis)
  k, _ = kurtosistest(a, axis)
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


##### Forest Plot for "Benefit" Linear Regression Results

In [62]:
import altair as alt
import pandas as pd

# Load the JSON data into a DataFrame
regression_results_df = pd.read_json("./eval_outputs/simple_linear_regression_benefit_data.json", orient="index")

# Ensure index is reset and available as a column
regression_results_df.reset_index(inplace=True)
regression_results_df = regression_results_df.rename(columns={'index': 'model_name'})

regression_results_df["model_name_custom"] = regression_results_df["model_name"].map(custom_labels)

# Create the Altair chart
points = alt.Chart(regression_results_df).mark_point(
    filled=True,
    color='red',
    size=50  # Increase point size
).encode(
    x=alt.X('coef:Q').title('Coefficient'),
    y=alt.Y('model_name_custom:N').title('LLM Name').sort(
        field='coef',  # Sort by coefficient values
        order='descending'
    )
).properties(
    width=600,
    height=300
)

# Add error bars
error_bars = points.mark_rule(
    strokeWidth=2  # Increase width of error bars
).encode(
    x='ci_lower:Q',
    x2='ci_upper:Q',
    size=alt.value(2)  # Set the width of error bars
)

# Add vertical line at x = 0.71
vertical_line = alt.Chart(pd.DataFrame({'x': [0.71]})).mark_rule(
    color='blue',
    strokeDash=[4, 4],  # Make it dashed
    strokeWidth=2
).encode(
    x='x:Q',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Add label for vertical line
label = alt.Chart(pd.DataFrame({'x': [0.71], 'y': [regression_results_df['model_name_custom'].iloc[0]]})).mark_text(
    text='Human Experts',
    align='center',
    dx=5,  # Adjust text position
    dy=-10,
    fontSize=14,
    fontWeight='bold',
).encode(
    x='x:Q',
    y=alt.value(0),  # Adjust position if necessary
    color=alt.value('#0868ac')  # Specify the color directly
)

# Define custom x-axis labels
custom_labels_df = pd.DataFrame({
    'x': [regression_results_df['coef'].min(), regression_results_df['coef'].max()],
    'text': ['Less susceptible to spin', 'More susceptible to spin']
})

# Define custom x-axis labels
left_arrow_df = pd.DataFrame({
    'x': [0.2],
    'text': ['←']
})

# Define custom x-axis labels
right_arrow_df = pd.DataFrame({
    'x': [7.7],
    'text': ['→']
})

# Create a text layer for custom x-axis labels
custom_x_labels = alt.Chart(custom_labels_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=24, # adjust horizontal positioning
    fontSize=14,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Create a text layer for custom x-axis labels
custom_x_left_arrow = alt.Chart(left_arrow_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=10, # adjust horizontal positioning
    fontSize=24,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Create a text layer for custom x-axis labels
custom_x_right_arrow = alt.Chart(right_arrow_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=0, # adjust horizontal positioning
    fontSize=24,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Combine all layers, including the new x-axis labels
chart = alt.layer(error_bars, points, vertical_line, label, custom_x_labels, custom_x_left_arrow, custom_x_right_arrow).configure_axis(
    labelFontSize=16,
    titleFontSize=18
).configure_title(
    fontSize=20
)

# # Save to HTML
chart.save("./plots/simple_regression_benefit_data.html")

chart


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df

In [None]:
# FOR KAREN

import altair as alt
import pandas as pd

# TODO: CHANGE FILE NAME HERE
# Load the JSON data into a DataFrame
regression_results_df = pd.read_json("./eval_outputs/simple_linear_regression_benefit_data.json", orient="index")

# Ensure index is reset and available as a column
regression_results_df.reset_index(inplace=True)
regression_results_df = regression_results_df.rename(columns={'index': 'model_name'})

regression_results_df["model_name_custom"] = regression_results_df["model_name"].map(custom_labels)

# Create the Altair chart
points = alt.Chart(regression_results_df).mark_point(
    filled=True,
    color='red',
    size=50  # Increase point size
).encode(
    x=alt.X('coef:Q').title('Coefficient'),
    y=alt.Y('model_name_custom:N').title('LLM Name').sort(
        field='coef',  # Sort by coefficient values
        order='descending'
    )
).properties(
    width=600,
    height=300
)

# Add error bars
error_bars = points.mark_rule(
    strokeWidth=2  # Increase width of error bars
).encode(
    x='ci_lower:Q',
    x2='ci_upper:Q',
    size=alt.value(2)  # Set the width of error bars
)

# Add vertical line at x = 0.71
# TODO: change the x value to what the human expert values are
# 1	-0.590000	rigor_answer	human experts
# 2	-0.380000	importance_answer	human experts
# 3	0.770000	full_text_answer	human experts
# 4	0.640000	another_trial_answer	human experts
vertical_line = alt.Chart(pd.DataFrame({'x': [0.71]})).mark_rule(
    color='blue',
    strokeDash=[4, 4],  # Make it dashed
    strokeWidth=2
).encode(
    x='x:Q',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Add label for vertical line
label = alt.Chart(pd.DataFrame({'x': [0.71], 'y': [regression_results_df['model_name_custom'].iloc[0]]})).mark_text(
    text='Human Experts',
    align='center',
    dx=5,  # Adjust text position
    dy=-10,
    fontSize=14,
    fontWeight='bold',
).encode(
    x='x:Q',
    y=alt.value(0),  # Adjust position if necessary
    color=alt.value('#0868ac')  # Specify the color directly
)

# Define custom x-axis labels
custom_labels_df = pd.DataFrame({
    'x': [regression_results_df['coef'].min(), regression_results_df['coef'].max()],
    'text': ['Less susceptible to spin', 'More susceptible to spin']
})

# TODO: hard coded x-axis for the arrows
# MIGHT NEED TO ADJUST AS NEEDED
# Define custom x-axis labels
left_arrow_df = pd.DataFrame({
    'x': [0.2],
    'text': ['←']
})

# TODO: hard coded x-axis for the arrows
# MIGHT NEED TO ADJUST AS NEEDED
# Define custom x-axis labels
right_arrow_df = pd.DataFrame({
    'x': [7.7],
    'text': ['→']
})

# Create a text layer for custom x-axis labels
custom_x_labels = alt.Chart(custom_labels_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=24, # adjust horizontal positioning
    fontSize=14,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Create a text layer for custom x-axis labels
custom_x_left_arrow = alt.Chart(left_arrow_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=10, # adjust horizontal positioning
    fontSize=24,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Create a text layer for custom x-axis labels
custom_x_right_arrow = alt.Chart(right_arrow_df).mark_text(
    align='center',
    baseline='bottom',
    dy=195,  # Adjust vertical positioning
    dx=0, # adjust horizontal positioning
    fontSize=24,
    fontWeight='bold',
).encode(
    x='x:Q',
    text='text:N',
    color=alt.value('#0868ac')  # Specify the color directly
)

# Combine all layers, including the new x-axis labels
chart = alt.layer(error_bars, points, vertical_line, label, custom_x_labels, custom_x_left_arrow, custom_x_right_arrow).configure_axis(
    labelFontSize=16,
    titleFontSize=18
).configure_title(
    fontSize=20
)

# Save to HTML
# TODO: 
# CHANGE FILE NAME HERE
chart.save("./plots/simple_regression_benefit_data.html")

chart


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df

#### Binary Spin Detection Results Version

In [None]:
# for model_name in model_names:
#     output_string = ""
#     csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
#     data = pd.read_csv(csv_file_path)

#     measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
#     for measure in measures:
#         # get the data for the current measure
#         measure_data = data[data['measure'] == measure]
#         nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
#         # remove rows with NaN values in interpretation_answer
#         measure_data = measure_data.dropna(subset=['interpretation_answer'])

#         # check if there are less than 2 rows
#         if len(measure_data) < 2:
#             continue
        
#         # fit the model
#         model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * is_detection_correct", 
#                                     data=measure_data)
#         results = model.fit()

#         output_string += f"Model: {model_name} - {measure}\n"
#         # print number of rows with NaN value(s)
#         output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
#         output_string += results.summary().as_text()
#         output_string += "\n"

#     # save the model summary
#     with open(f"./eval_outputs/{model_name}/{model_name}_regression_binary_summary.txt", "w") as f:
#         f.write(output_string)

In [None]:
# # what the model predicts rather than whether it was correct or not
# for model_name in model_names:
#     output_string = ""
#     csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
#     data = pd.read_csv(csv_file_path)

#     measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
#     for measure in measures:
#         # get the data for the current measure
#         measure_data = data[data['measure'] == measure]
#         nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
#         # remove rows with NaN values in interpretation_answer
#         measure_data = measure_data.dropna(subset=['interpretation_answer'])

#         # check if there are less than 2 rows
#         if len(measure_data) < 2:
#             continue
        
#         # fit the model
#         model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * detection_model_prediction", 
#                                     data=measure_data)
#         results = model.fit()

#         output_string += f"Model: {model_name} - {measure}\n"
#         # print number of rows with NaN value(s)
#         output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
#         output_string += results.summary().as_text()
#         output_string += "\n"

#     # save the model summary
#     with open(f"./eval_outputs/{model_name}/{model_name}_regression_binary_direct_model_prediction_summary.txt", "w") as f:
#         f.write(output_string)

##### Forest Plot for "Benefit" Linear Regression Results (Binary Model Prediction)

In [62]:
# # Load the JSON data into a DataFrame
# regression_results_df = pd.read_json("./eval_outputs/model_detection_prediction_linear_regression_benefit_data.json", orient="index")

# # Ensure index is reset and available as a column
# regression_results_df.reset_index(inplace=True)
# regression_results_df = regression_results_df.rename(columns={'index': 'model_name'})

# regression_results_df["model_name_custom"] = regression_results_df["model_name"].map(custom_labels)

# # Create the Altair chart
# points = alt.Chart(regression_results_df).mark_point(
#     filled=True,
#     color='red',
#     size=50  # Increase point size
# ).encode(
#     x=alt.X('coef:Q').title('Coefficient'),
#     y=alt.Y('model_name_custom:N', title='LLM Name').sort(
#         field='coef',  # Sort by coefficient values
#         order='descending'
#     )
# ).properties(
#     width=600,
#     height=300  # Increase height for more space between error bars
# )

# # Add error bars
# error_bars = points.mark_rule(
#     strokeWidth=2  # Increase width of error bars
# ).encode(
#     x='ci_lower:Q',
#     x2='ci_upper:Q',
#     # y='model_name:N',  # Align the error bars with the points
#     size=alt.value(2)  # Set the width of error bars
# )

# # Combine the points and error bars
# chart = error_bars + points

# # Apply the configuration directly to the chart
# chart = chart.configure_axis(
#     labelFontSize=16,  # Increase font size for axis labels
#     titleFontSize=18   # Increase font size for axis title
# ).configure_title(
#     fontSize=20  # Increase font size for chart title (if any)
# )

# # Save to HTML
# chart.save("./plots/model_detection_prediction_regression_benefit_data.html")

# # Display the chart
# chart

#### Probability Spin Detection Results Version

In [None]:
# model_names = gpt_models + huggingface_models # remove no token probability models

# for model_name in model_names:
#     output_string = ""
#     csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
#     data = pd.read_csv(csv_file_path)

#     measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
#     for measure in measures:
#         # get the data for the current measure
#         measure_data = data[data['measure'] == measure]
#         nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
#         # remove rows with NaN values in interpretation_answer
#         measure_data = measure_data.dropna(subset=['interpretation_answer', 'detection_probability'])
        
#         # if is_detection_no_spin_correct == 1, then detection_probability. Otherwise, 1 - detection_probability
#         measure_data['regression_detection_variable'] = measure_data.apply(lambda x: x['detection_probability'] if x['is_detection_correct'] == 1 else 1 - x['detection_probability'], axis=1)
#         # check if there are less than 2 rows
#         if len(measure_data) < 2:
#             continue

#         # fit the model
#         model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * regression_detection_variable",
#                                     data=measure_data)
#         results = model.fit()

#         output_string += f"Model: {model_name} - {measure}\n"
#         # print number of rows with NaN value(s)
#         output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
#         output_string += results.summary().as_text()
#         output_string += "\n"

#     # save the model summary
#     with open(f"./eval_outputs/{model_name}/{model_name}_regression_probability_summary.txt", "w") as f:
#         f.write(output_string)

In [None]:
# # what the model predicts rather than whether it was correct or not

# model_names = gpt_models + huggingface_models # remove no token probability models

# for model_name in model_names:
#     output_string = ""
#     csv_file_path = f"./eval_outputs/{model_name}/{model_name}_combined_data.csv"
#     data = pd.read_csv(csv_file_path)

#     measures = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer", "overall"]
#     for measure in measures:
#         # get the data for the current measure
#         measure_data = data[data['measure'] == measure]
#         nan_rows_number = measure_data['interpretation_answer'].isnull().sum()
#         # remove rows with NaN values in interpretation_answer
#         measure_data = measure_data.dropna(subset=['interpretation_answer', 'detection_probability'])
        
#         # if is_detection_no_spin_correct == 1, then detection_probability. Otherwise, 1 - detection_probability
#         measure_data['regression_detection_variable'] = measure_data.apply(lambda x: x['detection_probability'] if x['detection_model_prediction'] == 1 else 1 - x['detection_probability'], axis=1)
#         # check if there are less than 2 rows
#         if len(measure_data) < 2:
#             continue

#         # fit the model
#         model = smf.ols(formula="interpretation_answer ~ is_spin_in_abstract * regression_detection_variable",
#                                     data=measure_data)
#         results = model.fit()

#         output_string += f"Model: {model_name} - {measure}\n"
#         # print number of rows with NaN value(s)
#         output_string += f"Number of rows with NaN value(s) in {model_name}: {nan_rows_number}\n"
#         output_string += results.summary().as_text()
#         output_string += "\n"

#     # save the model summary
#     with open(f"./eval_outputs/{model_name}/{model_name}_regression_probability_direct_model_prediction_summary.txt", "w") as f:
#         f.write(output_string)

## PLS LLM Interpretations

### Average Tokens

In [103]:
# get all PLS outputs from all LLMs
enc = tiktoken.get_encoding("o200k_base") # for gpt-4o and gpt-4o mini

model_token_stats = {}
number_of_tokens_total = [] # for all models
for model_name in model_names:
    csv_file_path = f"./pls_outputs/{model_name}/{model_name}_outputs.csv"
    data = pd.read_csv(csv_file_path)
    # calculate the number of tokens for each row in plain_language_summary
    plain_language_summaries = data['plain_language_summary'].tolist()

    number_of_tokens = []
    for summary in plain_language_summaries:
        token_integers = enc.encode(summary)
        number_of_tokens.append(len(token_integers))
        number_of_tokens_total.append(len(token_integers))

    # average number of tokens
    average_number_of_tokens = np.mean(number_of_tokens)
    # SD of tokens
    sd_number_of_tokens = np.std(number_of_tokens)
    model_token_stats[model_name] = {"average_number_of_tokens": average_number_of_tokens, "sd_number_of_tokens": sd_number_of_tokens}

model_token_stats_df = pd.DataFrame(model_token_stats).T
model_token_stats_df["model_name"] = model_token_stats_df.index

model_token_stats_df

Unnamed: 0,average_number_of_tokens,sd_number_of_tokens,model_name
alpacare-7B,120.416667,56.264936,alpacare-7B
biomedgpt7B,195.0,59.498459,biomedgpt7B
biomistral7B,140.766667,73.807941,biomistral7B
claude_3.5-haiku,225.033333,15.084171,claude_3.5-haiku
claude_3.5-sonnet,230.516667,19.185491,claude_3.5-sonnet
gemini_1.5_flash,214.216667,30.953778,gemini_1.5_flash
gemini_1.5_flash-8B,207.7,37.768285,gemini_1.5_flash-8B
gpt4o,207.85,44.255254,gpt4o
gpt4o-mini,253.783333,38.084595,gpt4o-mini
gpt35,94.483333,18.754992,gpt35


In [104]:
# get the average across all
average_number_of_tokens = np.mean(number_of_tokens_total)
sd_number_of_tokens = np.std(number_of_tokens_total)

average_number_of_tokens, sd_number_of_tokens

(208.10378787878787, 67.01472464306116)

In [105]:
# average across all models
average_number_of_tokens = model_token_stats_df["average_number_of_tokens"].mean()
sd_number_of_tokens = model_token_stats_df["sd_number_of_tokens"].mean()

average_number_of_tokens, sd_number_of_tokens

(208.10378787878787, 37.62843632260712)

### LLM Evaluation Results

In [45]:
claude_evaluator_results = pd.read_json("./pls_outputs/_interpretation_eval_results/claude_3.5-sonnet/claude_3.5-sonnet_interpretation_overall_metrics.json", orient="index")
gpt4o_mini_evaluator_results = pd.read_json("./pls_outputs/_interpretation_eval_results/gpt4o-mini/gpt4o-mini_interpretation_overall_metrics.json", orient="index")

In [46]:
def calculate_confidence_interval(df, df_column_name):
    mean_diff = df[df_column_name].mean()  # Calculate the mean
    std_dev = df[df_column_name].std()  # Calculate the standard deviation
    n = len(df[df_column_name])  # Sample size

    # Calculate the margin of error for 95% CI (z = 1.96)
    z = 1.96
    margin_of_error = z * (std_dev / sqrt(n))

    # Calculate the 95% Confidence Interval
    ci_lower = mean_diff - margin_of_error
    ci_upper = mean_diff + margin_of_error

    return ci_lower, ci_upper

In [47]:
# calculate the average of all model metrics and calculate 95% CI
average_model_pls_benefit = gpt4o_mini_evaluator_results["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(gpt4o_mini_evaluator_results, "benefit_answer_mean_diff")

average_pls_model_rigor = gpt4o_mini_evaluator_results["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(gpt4o_mini_evaluator_results, "rigor_answer_mean_diff")

average_pls_model_importance = gpt4o_mini_evaluator_results["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(gpt4o_mini_evaluator_results, "importance_answer_mean_diff")

average_pls_model_full_text = gpt4o_mini_evaluator_results["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(gpt4o_mini_evaluator_results, "full_text_answer_mean_diff")

average_pls_model_another_trial = gpt4o_mini_evaluator_results["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(gpt4o_mini_evaluator_results, "another_trial_answer_mean_diff")

gpt4o_mini_pls_model_stats = {
    "benefit_answer": {"mean_diff": average_model_pls_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_pls_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_pls_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_pls_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_pls_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

pls_gpt4o_mini_model_stats_df = pd.DataFrame(gpt4o_mini_pls_model_stats).T
pls_gpt4o_mini_model_stats_df["metric"] = pls_gpt4o_mini_model_stats_df.index
# remove index
pls_gpt4o_mini_model_stats_df.reset_index(drop=True, inplace=True)
pls_gpt4o_mini_model_stats_df["evaluator"] = "GPT4o Mini"

pls_gpt4o_mini_model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,evaluator
0,3.592424,3.573721,3.611128,benefit_answer,GPT4o Mini
1,1.35303,1.332122,1.373939,rigor_answer,GPT4o Mini
2,2.730303,2.707594,2.753012,importance_answer,GPT4o Mini
3,3.686364,3.65517,3.717557,full_text_answer,GPT4o Mini
4,3.868182,3.840497,3.895866,another_trial_answer,GPT4o Mini


In [48]:
# calculate the average of all model metrics and calculate 95% CI
average_model_pls_benefit = claude_evaluator_results["benefit_answer_mean_diff"].mean()
ci_lower_model_benefit, ci_upper_model_benefit = calculate_confidence_interval(claude_evaluator_results, "benefit_answer_mean_diff")

average_pls_model_rigor = claude_evaluator_results["rigor_answer_mean_diff"].mean()
ci_lower_model_rigor, ci_upper_model_rigor = calculate_confidence_interval(claude_evaluator_results, "rigor_answer_mean_diff")

average_pls_model_importance = claude_evaluator_results["importance_answer_mean_diff"].mean()
ci_lower_model_importance, ci_upper_model_importance = calculate_confidence_interval(claude_evaluator_results, "importance_answer_mean_diff")

average_pls_model_full_text = claude_evaluator_results["full_text_answer_mean_diff"].mean()
ci_lower_model_full_text, ci_upper_model_full_text = calculate_confidence_interval(claude_evaluator_results, "full_text_answer_mean_diff")

average_pls_model_another_trial = claude_evaluator_results["another_trial_answer_mean_diff"].mean()
ci_lower_model_another_trial, ci_upper_model_another_trial = calculate_confidence_interval(claude_evaluator_results, "another_trial_answer_mean_diff")

claude_pls_model_stats = {
    "benefit_answer": {"mean_diff": average_model_pls_benefit, "ci_lower": ci_lower_model_benefit, "ci_upper": ci_upper_model_benefit},
    "rigor_answer": {"mean_diff": average_pls_model_rigor, "ci_lower": ci_lower_model_rigor, "ci_upper": ci_upper_model_rigor},
    "importance_answer": {"mean_diff": average_pls_model_importance, "ci_lower": ci_lower_model_importance, "ci_upper": ci_upper_model_importance},
    "full_text_answer": {"mean_diff": average_pls_model_full_text, "ci_lower": ci_lower_model_full_text, "ci_upper": ci_upper_model_full_text},
    "another_trial_answer": {"mean_diff": average_pls_model_another_trial, "ci_lower": ci_lower_model_another_trial, "ci_upper": ci_upper_model_another_trial}
}

pls_claude_model_stats_df = pd.DataFrame(claude_pls_model_stats).T
pls_claude_model_stats_df["metric"] = pls_claude_model_stats_df.index
# remove index
pls_claude_model_stats_df.reset_index(drop=True, inplace=True)
pls_claude_model_stats_df["evaluator"] = "Claude 3.5 Sonnet"

pls_claude_model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,evaluator
0,2.471212,2.46632,2.476105,benefit_answer,Claude 3.5 Sonnet
1,-0.175758,-0.182107,-0.169408,rigor_answer,Claude 3.5 Sonnet
2,-0.337879,-0.345682,-0.330075,importance_answer,Claude 3.5 Sonnet
3,2.818182,2.804775,2.831589,full_text_answer,Claude 3.5 Sonnet
4,2.706061,2.687012,2.725109,another_trial_answer,Claude 3.5 Sonnet


In [49]:
# combine two dataframes
all_pls_model_stats_df = pd.concat([pls_gpt4o_mini_model_stats_df, pls_claude_model_stats_df], ignore_index=True)

all_pls_model_stats_df

Unnamed: 0,mean_diff,ci_lower,ci_upper,metric,evaluator
0,3.592424,3.573721,3.611128,benefit_answer,GPT4o Mini
1,1.35303,1.332122,1.373939,rigor_answer,GPT4o Mini
2,2.730303,2.707594,2.753012,importance_answer,GPT4o Mini
3,3.686364,3.65517,3.717557,full_text_answer,GPT4o Mini
4,3.868182,3.840497,3.895866,another_trial_answer,GPT4o Mini
5,2.471212,2.46632,2.476105,benefit_answer,Claude 3.5 Sonnet
6,-0.175758,-0.182107,-0.169408,rigor_answer,Claude 3.5 Sonnet
7,-0.337879,-0.345682,-0.330075,importance_answer,Claude 3.5 Sonnet
8,2.818182,2.804775,2.831589,full_text_answer,Claude 3.5 Sonnet
9,2.706061,2.687012,2.725109,another_trial_answer,Claude 3.5 Sonnet


In [61]:
# create altair grouped barchart
# grouped by metric and evaluator

# Create a mapping for custom facet titles
facet_title_mapping = {
    'benefit_answer': 'Benefit',
    'rigor_answer': 'Rigor',
    'importance_answer': 'Importance',
    'full_text_answer': 'Full-Text',
    'another_trial_answer': 'Another Trial'
}

# Define the desired order for the facets
facet_order = ['Benefit', 'Rigor', 'Importance', 'Full-Text', 'Another Trial']

color_mapping = {
    'Claude 3.5 Sonnet': '#0868ac',  
    'GPT4o Mini': '#43a2ca',  
}

method_order = ['Claude 3.5 Sonnet', 'GPT4o Mini']

# Apply the mapping as a calculated field
chart_data = all_pls_model_stats_df.copy()
chart_data['metric'] = chart_data['metric'].map(facet_title_mapping)

# Configure global font sizes
chart_config = {
    "axis": {"labelFontSize": 20, "titleFontSize": 22},  # Axis labels and titles
    "header": {"labelFontSize": 20, "titleFontSize": 22},  # Facet headers
    "legend": {"labelFontSize": 18, "titleFontSize": 20},  # Legend labels and titles
    "text": {"fontSize": 20},  # Text mark size
}

# Bar chart
bars = alt.Chart(chart_data).mark_bar().encode(
    x=alt.X('evaluator:N', title=None, axis=alt.Axis(labelAngle=-45), sort = method_order),
    y=alt.Y('mean_diff:Q', title='Mean Difference'),
    color=alt.Color('evaluator:N', title='Evaluator', legend=None, scale=alt.Scale(domain=list(color_mapping.keys()), range=list(color_mapping.values())))
).properties(
    width=120,  # Set the width to 300 pixels
    height=250  # Set the height to 300 pixels
)

# Error bars
error_bars = alt.Chart(chart_data).mark_errorbar().encode(
    alt.X("evaluator:N", sort = method_order),
    alt.Y("ci_lower:Q").title("Mean Difference"),
    alt.Y2("ci_upper:Q"),
    strokeWidth=alt.value(2),
    color=alt.value('gray')
)

# Add value labels
text = bars.mark_text(
    align='center',
    baseline='bottom',
    fontWeight='bold',
    dy=alt.expr(expr=alt.expr.if_(alt.datum.mean_diff >= 0, -1, 20))  # Adjust the position of the text    
).encode(
    text=alt.Text('mean_diff:Q', format='.2f'),
    color=alt.value('black')  # Set text color to black
)

# Combine layers and facet
chart = alt.layer(bars, text, error_bars, data=chart_data).facet(
    column=alt.Column('metric:N', title=None, sort=facet_order),
).configure(**chart_config)  # Apply

# save to html
chart.save("./plots/pls_evaluator_comparison_by_measures.html")

chart



  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
