In [15]:
import re
import time
from pymed import PubMed
import pymed

# Running statistics, updated by replace_entry() through ``global``:
#   total_matches    - citation entries handed to replace_entry()
#   credible_matches - entries for which PubMed returned an article with a title
total_matches, credible_matches = 0, 0

# Citation matcher, assembled piece by piece so every capturing group is labelled.
# Expected shape of an entry:  YYYY, authors, "title", journal<br>remainder
pattern = re.compile(
    r'(\d{4}),\s+'   # group 1: year, exactly four digits
    r'([^,]+),\s+'   # group 2: authors, everything up to the next comma
    r'"(.*?)",\s+'   # group 3: title, the text between the double quotes
    r'([^<]+)'       # group 4: journal, everything up to the next "<"
    r'(<br>.*)',     # group 5: remainder, from "<br>" to the end of that line
    # Kept from the original; the pattern has no ^/$ anchors for it to affect.
    flags=re.MULTILINE,
)

# Shared PubMed client used by replace_entry() for every title lookup.
# NOTE(review): the tool name and e-mail below look like placeholders --
# presumably they should be replaced with real values; confirm before production use.
pubmed = PubMed(
    tool="MyTool",
    email="my@email.address",
)

def replace_entry(match):
    """
    Rewrite one matched bibliographic entry after looking its title up on PubMed.

    Written as the replacement callback for ``pattern.sub``: it receives the
    match for a single citation and returns the text that should replace it.

    Args:
        match: ``re.Match`` produced by the module-level ``pattern``. Groups 1-5
            are read as year, authors, title, journal and the remainder that
            starts at ``<br>``.

    Returns:
        str: one of three results.
          * PubMed returned an article with a title: the citation rebuilt as
            ``(year, authors, "<PubMed title> 🎆🎆🎆", journal)``.
          * PubMed returned no usable article: the "AI hallucinated" note that
            quotes the original remainder (group 5).
          * The PubMed lookup itself failed (any exception): the matched text,
            unchanged. A failed lookup says nothing about whether the citation
            exists, so the source is neither removed nor labelled hallucinated.

    Side effects:
        Increments the global ``total_matches`` on every call and
        ``credible_matches`` when a title was found; prints progress messages.
    """
    global total_matches, credible_matches
    total_matches += 1

    year = match.group(1).strip()
    authors = match.group(2).strip()
    original_title = match.group(3).strip()
    journal = match.group(4).strip()
    orig_summary = match.group(5).strip()

    print(f"Querying PubMed for title: {original_title}")
    new_title = None
    try:
        # The object returned by query() can be a lazy iterator whose article
        # fetch only runs while it is consumed, so the loop has to sit inside
        # the try block as well -- otherwise a fetch/parse error would escape
        # and abort the whole substitution.
        for article in pubmed.query(original_title, max_results=5):
            # Take the first real article record that carries a title.
            if isinstance(article, pymed.article.PubMedArticle) and article.title:
                new_title = article.title.strip()
                break
    except Exception as e:
        # Best-effort boundary around the external service: report and keep
        # the entry exactly as it was instead of declaring it hallucinated.
        print("PubMed query error:", e)
        print("PubMed lookup failed, citation left unchanged.")
        return match.group(0)

    if new_title is None:
        print("Could not find title for article, citation marked as hallucinated.")
        # Instead of removing everything, return a message with the original summary.
        return (
            f'The AI hallucinated so the source was removed, the content might however be of value, '
            f'but be cautious > "{orig_summary}"'
        )

    credible_matches += 1
    new_title = new_title + " 🎆🎆🎆"
    # NOTE(review): the remainder captured in group 5 is not re-attached here,
    # so it disappears from the output for credible citations -- confirm this
    # is intended.
    return f"({year}, {authors}, \"{new_title}\", {journal})"

def generate_new_summary(summary_text):
    """
    Strip Markdown code fences from the text, then rewrite every citation in it.

    The input is trimmed of surrounding whitespace. Every "```markdown" opener
    (with the whitespace after it) and every "```" (with the whitespace before
    it) is deleted. Each remaining match of the module-level ``pattern`` is then
    replaced by whatever ``replace_entry`` returns for it.

    Args:
        summary_text: Markdown text to process.

    Returns:
        str: the unfenced text with its citation entries replaced.
    """
    fence_regex = r'```markdown\s*|\s*```'
    unfenced_text = re.sub(fence_regex, '', summary_text.strip())
    return pattern.sub(replace_entry, unfenced_text)

def process_markdown_file(input_filepath, output_filepath):
    """
    Run the citation rewrite over one Markdown file and save the result.

    The whole input file is read as UTF-8, passed through
    ``generate_new_summary``, and the returned text is written as UTF-8 to the
    output path. A confirmation line is printed afterwards.

    Args:
        input_filepath: path of the Markdown file to read.
        output_filepath: path the transformed Markdown is written to.

    Returns:
        None. Errors raised while opening, reading or writing the files are
        not handled here and propagate to the caller.
    """
    with open(input_filepath, mode='r', encoding='utf8') as source:
        original_text = source.read()

    # The whole document is transformed in memory before anything is written.
    rewritten_text = generate_new_summary(original_text)

    with open(output_filepath, mode='w', encoding='utf8') as target:
        target.write(rewritten_text)

    print(f"Updated markdown file saved to: {output_filepath}")

if __name__ == "__main__":
    # Example run: rewrite the citations of one Markdown report.
    # Replace both paths below with your own input and output locations.
    input_md = (
        "./output/text_files/automated_comparison/validation_o3-mini-GSEA-1.md"
    )
    output_md = (
        "./output/text_files/automated_comparison/test_validation_o3-mini-GSEA-1.md"
    )
    process_markdown_file(input_filepath=input_md, output_filepath=output_md)


Updated markdown file saved to: ./output/text_files/automated_comparison/test_validation_o3-mini-GSEA-1.md
