In [57]:
from utils import load_csv_file, save_dataset_to_json
import random
import tiktoken

random.seed(0)

ImportError: cannot import name 'load_json_file' from 'utils' (/Users/hyesunyun/Documents/nlp/llms_spin_med_lit/MedLitSpin/code/utils.py)

In [46]:
# full list of models
models = [
    "alpacare-7B",
    "biomedgpt7B",
    "biomistral7B",
    "claude_3.5-haiku", 
    "claude_3.5-sonnet",
    "gemini_1.5_flash",
    "gemini_1.5_flash-8B",
    "gpt4o", 
    "gpt4o-mini",
    "gpt35",
    "llama2_chat-7B",
    "llama2_chat-13B",
    "llama2_chat-70B",
    "llama3_instruct-8B",
    "llama3_instruct-70B",
    "med42-8B",
    "med42-70B",
    "mistral_instruct7B",
    "olmo2_instruct-7B",
    "olmo2_instruct-13B",
    "openbiollm-8B",
    "openbiollm-70B"
]

In [47]:
sampled_summaries = []
for model in models:
    file_path = f"./pls_outputs/{model}/{model}_outputs.csv"
    summaries = load_csv_file(file_path)

    # get unique PMIDs
    pmids = list(set([summary['PMID'] for summary in summaries]))
    # shuffle PMIDs
    random.shuffle(pmids)
    # sample 5 PMIDs randomly with a seed
    sampled_pmids = random.sample(pmids, 5)

    # get summaries for sampled PMIDs
    for summary in summaries:
        summary_pmid = summary["PMID"]
        if summary_pmid in sampled_pmids:
            plain_language_summary = summary["plain_language_summary"]
            # parts = plain_language_summary.split("\n\n", 1)  # Split at the first occurrence only
            # plain_language_summary = parts[1] if len(parts) > 1 else plain_language_summary
            sampled_summary = {
                "model": model,
                "pmid": summary_pmid,
                "from_abstract_type": summary["abstract_type"],
                "summary": plain_language_summary
            }
            sampled_summaries.append(sampled_summary)

In [2]:
import json

with open('./pls_outputs/sampled_summaries_for_human_eval.json', 'w') as outfile:
    for entry in sampled_summaries:
        json.dump(entry, outfile)
        outfile.write('\n')

JSONDecodeError: Extra data: line 2 column 1 (char 911)

In [5]:
# read jsonl file
sampled_summaries = []
with open('./pls_outputs/sampled_summaries_for_human_eval.json', 'r') as f:
    for line in f:
        sampled_summaries.append(json.loads(line))

# add a unique id for each entry
for i, entry in enumerate(sampled_summaries):
    # change the summary key to text
    entry["text"] = entry.pop("summary")

# save the dataset to jsonl
with open('./pls_outputs/sampled_summaries_for_human_eval.json', 'w') as outfile:
    for entry in sampled_summaries:
        json.dump(entry, outfile)
        outfile.write('\n')

In [64]:
from docx import Document
import json

# read from json file
with open("./pls_outputs/sampled_summaries_for_human_eval.json", "r") as f:
    sampled_summaries = json.load(f)
data = sampled_summaries

def create_word_doc(data, filename="output.docx"):
    doc = Document()
    doc.add_heading('Sampled Summaries for Evaluation', 0)
    doc.add_paragraph(f"Number of sampled summaries: {len(data)}")
    doc.add_paragraph(f"Number of unique PMIDs: {len(set([summary['pmid'] for summary in data]))}")
    doc.add_paragraph("\n")

    for entry in data:
        pmid = entry["pmid"]
        model = entry["model"]
        summary = entry["text"]
        doc.add_paragraph(f"PMID: {pmid}", style='Heading2')
        
        llm_name_paragraph = doc.add_paragraph()
        llm_name_paragraph.add_run("LLM Name:").bold = True
        llm_name_paragraph.add_run(f" {model}")
        
        summary_paragraph = doc.add_paragraph()
        summary_paragraph.add_run("Plain Language Summary:").bold = True
        summary_paragraph.add_run(f" {summary}")
        
        doc.add_paragraph("\n")

    # Save the document
    doc.save(filename)
    print(f"Document saved as {filename}")

create_word_doc(sampled_summaries, "./pls_outputs/sampled_plain_language_summaries.docx")

Document saved as ./pls_outputs/sampled_plain_language_summaries.docx


In [65]:
# stats of the sampled summaries
print(f"Number of sampled summaries: {len(sampled_summaries)}")
print(f"Number of unique PMIDs: {len(set([summary['pmid'] for summary in sampled_summaries]))}")

# get average token of summaries
encoding = tiktoken.get_encoding("cl100k_base")
token_counts = [len(encoding.encode(summary["summary"])) for summary in sampled_summaries]
average_token_count = sum(token_counts) / len(token_counts)
print(f"Average token count: {average_token_count}")

# count for each unique PMID
pmids = [summary["pmid"] for summary in sampled_summaries]
pmid_count = {pmid: pmids.count(pmid) for pmid in set(pmids)}
# sort by count
pmid_count = dict(sorted(pmid_count.items(), key=lambda item: item[1], reverse=True))
print(f"PMID count: {pmid_count}")

# count for each from_abstract_type
abstract_types = [summary["from_abstract_type"] for summary in sampled_summaries]
abstract_type_count = {abstract_type: abstract_types.count(abstract_type) for abstract_type in set(abstract_types)}
print(f"Abstract type count: {abstract_type_count}")

Number of sampled summaries: 220
Number of unique PMIDs: 30
Average token count: 204.9318181818182
PMID count: {'17264336': 16, '20448107': 12, '16314619': 12, '18955454': 12, '18794551': 12, '9093724': 10, '21041710': 10, '10547391': 10, '20673585': 10, '21060024': 10, '11261827': 10, '17179098': 10, '19273714': 8, '20973267': 8, '16504757': 6, '20800381': 6, '20564068': 6, '21471562': 6, '15947110': 6, '22112969': 6, '12177098': 4, '20530276': 4, '20153039': 4, '17530429': 4, '21399726': 4, '10637238': 4, '20087643': 4, '17134892': 2, '17173959': 2, '16148021': 2}
Abstract type count: {'no_spin': 110, 'spin': 110}
