In [None]:
import pandas as pd
import json

# Read the CSV file with the model answers
df = pd.read_csv('./outputs/gpt35/gpt35_outputs.csv')

# get unique PMID values in a list
pmids = df['PMID'].unique()

# column names for the 5 questions
column_names = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]

# Maps "<column>_avg" -> mean paired difference (no_spin - spin) over all PMIDs,
# plus "overall_avg" -> mean of the per-column averages.
metrics = {}
for col in column_names:
    # per-PMID differences for the current question
    column_dffs = []
    for pmid in pmids:
        # Get the rows for the current PMID
        pmid_rows = df[df['PMID'] == pmid]
        # get the 'spin' answer (first matching row)
        spin_answer = pmid_rows.loc[pmid_rows['abstract_type'] == 'spin', col].values[0]
        # get the 'no spin' answer (first matching row)
        no_spin_answer = pmid_rows.loc[pmid_rows['abstract_type'] == 'no_spin', col].values[0]
        # subtract the 'spin' answer from the 'no spin' answer
        column_dffs.append(no_spin_answer - spin_answer)

    if not column_dffs:
        raise ValueError(f"No PMIDs found; cannot average differences for '{col}'")

    # Average the differences of ALL PMIDs. Calling .mean() on the loop variable
    # would only report the difference of the last PMID processed.
    # float() turns numpy scalars into plain floats so json.dump can serialise them.
    column_avg = float(sum(column_dffs) / len(column_dffs))

    metrics[f"{col}_avg"] = column_avg
    print(f"Average differences for '{col}':")
    print(column_avg)

# Average across all columns
overall_avg = sum(metrics.values()) / len(metrics)
metrics['overall_avg'] = overall_avg

print(f"\nOverall average difference across all answers:")
print(overall_avg)

# Save the results to a JSON file
with open('./outputs/gpt35/gpt35_differences_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)


In [7]:
import pandas as pd
import json
from statistics import mean

# Load the interpretation outputs of the model
df = pd.read_csv('./eval_outputs/openbiollm-70B/openbiollm-70B_interpretation_outputs.csv')

# the 5 question columns that are scored
column_names = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]

# Maps "<column>_diff" -> mean(spin) - mean(no_spin); "overall_avg" is added at the end.
diff_metrics = {}
for col in column_names:
    # Work on a copy whose answers are strings, so failed outputs can be spotted.
    df_copy = df.copy()
    df_copy[col] = df_copy[col].astype(str)

    # True for answers that are empty, missing ("nan" after the cast) or contain "Error"
    is_bad = df_copy[col].map(lambda val: val in ("", "nan") or "Error" in val)
    if is_bad.any():
        print(f"Column '{col}' has some 'Error' or empty string values. Removing these rows from the metrics...")
        # Drop every row of an affected PMID, not only the failing row itself.
        error_pmids = list(set(df_copy.loc[is_bad, 'PMID'].tolist()))
        df_copy = df_copy[~df_copy['PMID'].isin(error_pmids)]
        print(f"PMIDs with 'Error' or empty string values in column '{col}': {error_pmids}")
        print(f"Number of rows after removing 'Error' or empty string values: {len(df_copy)}")

    # mean answer per abstract type for this column
    spin_avg = df_copy.loc[df_copy['abstract_type'] == 'spin', col].astype(float).mean()
    no_spin_avg = df_copy.loc[df_copy['abstract_type'] == 'no_spin', col].astype(float).mean()

    diff = spin_avg - no_spin_avg
    diff_metrics[f"{col}_diff"] = diff
    print(f"Mean difference for '{col}': {diff}")

# Mean of the per-column differences
overall_avg = mean(diff_metrics.values())
diff_metrics['overall_avg'] = overall_avg

print(f"\nOverall mean difference across all answers: {overall_avg}")

# Persist the metrics as JSON
with open('./eval_outputs/openbiollm-70B/openbiollm-70B_mean_differences_metrics.json', 'w') as f:
    json.dump(diff_metrics, f, indent=4)

Mean difference for 'benefit_answer': 4.4
Mean difference for 'rigor_answer': 0.0
Mean difference for 'importance_answer': 0.6333333333333337
Column 'full_text_answer' has some 'Error' or empty string values. Removing these rows from the metrics...
PMIDs with 'Error' or empty string values in column 'full_text_answer': [20087643, 11261827, 20530276, 15947110, 22112969, 12177098, 21471562, 16148021, 18794551, 17179098, 16314619, 9093724]
Number of rows after removing 'Error' or empty string values: 36
Mean difference for 'full_text_answer': 1.0555555555555554
Mean difference for 'another_trial_answer': 4.933333333333334

Overall mean difference across all answers: 2.2044444444444444


In [10]:
import pandas as pd

# Interpretation output files to inspect
files = [
    './eval_outputs/biomistral7B/biomistral7B_interpretation_outputs.csv',
    './eval_outputs/openbiollm-8B/openbiollm-8B_interpretation_outputs.csv',
    './eval_outputs/openbiollm-70B/openbiollm-70B_interpretation_outputs.csv',
    './eval_outputs/llama2_chat-70B/llama2_chat-70B_interpretation_outputs.csv'
]

# For every file, report how many PMIDs have a failed answer in each question column
for file_path in files:
    print(f"\nReading file: {file_path}")
    df = pd.read_csv(file_path)

    # column names for the 5 questions
    column_names = ["benefit_answer", "rigor_answer", "importance_answer", "full_text_answer", "another_trial_answer"]

    diff_metrics = {}
    for col in column_names:
        # cast the answers to strings so missing values show up as "nan"
        df[col] = df[col].astype(str)
        # rows whose answer is empty, missing or contains "Error"
        bad_rows = df[col].map(lambda answer: answer in ("", "nan") or "Error" in answer)
        # de-duplicated PMIDs of those rows
        error_pmids = list(set(df.loc[bad_rows, 'PMID'].tolist()))
        print(f"Column '{col}' ======= Number of PMIDs with error: {len(error_pmids)}")

    print("#################")


Reading file: ./eval_outputs/biomistral7B/biomistral7B_interpretation_outputs.csv
#################

Reading file: ./eval_outputs/openbiollm-8B/openbiollm-8B_interpretation_outputs.csv
#################

Reading file: ./eval_outputs/openbiollm-70B/openbiollm-70B_interpretation_outputs.csv
#################

Reading file: ./eval_outputs/llama2_chat-70B/llama2_chat-70B_interpretation_outputs.csv
#################


In [5]:
import pandas as pd
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the CSV file
df = pd.read_csv('./eval_outputs/openbiollm-8B/openbiollm-8B_detection_outputs.csv')

# Missing answers are read from the CSV as NaN (a float); a substring test such as
# `"Error" in x` raises TypeError on them. Map them to "" so they are screened out
# together with the empty answers.
answers = df["model_answer"].fillna("").astype(str)

# rows whose model output is an error message or empty/missing
invalid = answers.str.contains("Error", regex=False) | (answers == "")
if invalid.any():
    print("Model's output has some 'Error' or empty string values. Removing these rows from the metrics...")
    print(f"Number of rows after removing 'Error' or empty string values: {int((~invalid).sum())}")

# Keep only the valid rows. The explicit copy makes the column assignments below
# act on an independent frame instead of a slice of the original one.
df = df[~invalid].copy()
if df.empty:
    raise ValueError("No valid model answers left; cannot calculate the metrics.")

# convert the spin and model_answer to binary values
# NOTE(review): only the exact string "yes" counts as a positive prediction;
# confirm the model outputs are already normalised (case / whitespace).
df["ground_truth"] = (df["abstract_type"] == "spin").astype(int)
df["model_answer"] = (df["model_answer"] == "yes").astype(int)

# calculate the metrics (accuracy, precision, recall, f1 score)
accuracy = accuracy_score(df["ground_truth"], df["model_answer"])
precision = precision_score(df["ground_truth"], df["model_answer"])
recall = recall_score(df["ground_truth"], df["model_answer"])
f1 = f1_score(df["ground_truth"], df["model_answer"])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
}

# Save the results to a JSON file
with open('./eval_outputs/openbiollm-8B/openbiollm-8B_detection_outputs.json', 'w') as f:
    json.dump(metrics, f, indent=4)

Accuracy: 0.5
Precision: 0.5
Recall: 1.0
F1: 0.6666666666666666
