In [None]:
import html
import json
import re
from collections import defaultdict

from IPython.display import HTML

def load_journal_full_names(filename="journals.json"):
    """Load the journal abbreviation table from a JSON file.

    The file must contain a JSON object whose keys are journal abbreviations
    and whose values are the corresponding full names. Keys are normalized
    for lookup (lowercased, with "." and "," removed); values are returned
    unchanged as plain text.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A dict mapping normalized abbreviation -> full journal name, or an
        empty dict (after printing an error message) if the file does not
        exist.
    """
    # Keep the try body limited to the operation that can raise
    # FileNotFoundError so unrelated errors are not masked.
    try:
        with open(filename, "r", encoding="utf-8") as file:
            data = json.load(file)
    except FileNotFoundError:
        # Report the path that was actually requested rather than the
        # default name, so a wrong custom path is easy to spot.
        print(f"Error: {filename} not found!")
        return {}

    return {
        key.lower().replace(".", "").replace(",", ""): value
        for key, value in data.items()
    }

def normalize_journal_name(journal_name):
    """Return a comparison key for a journal name: lowercased, with "." and "," removed."""
    lowered = journal_name.lower()
    # Delete both punctuation characters in a single pass.
    return lowered.translate(str.maketrans("", "", ".,"))

def classify_reference(reference):
    """Classify a reference string into a coarse category.

    Checks are applied in order and the first hit wins:
      1. "arXiv"      - the text contains "arXiv".
      2. "Conference" - the text contains "Proceedings" or "Conference", or
                        the word "in" directly follows the closing quote of
                        the title (as in: “Title,” in 2023 IEEE ...).
      3. "Journal"    - a "vol. <digits>" marker is present.
      4. "Dataset"    - the text contains "Dataset".
      5. "Other"      - none of the above.

    Args:
        reference: The raw reference text.

    Returns:
        One of "arXiv", "Conference", "Journal", "Dataset", "Other".
    """
    if "arXiv" in reference:
        return "arXiv"

    # A bare '"in " in reference' test matches almost every English title
    # ("detection in the image", ...) and would label journal papers as
    # conference papers. Only treat "in" as a venue marker when it comes
    # right after the closing title quote (curly or straight).
    venue_intro = re.search(r'[”"]\s*in\s', reference)
    if "Proceedings" in reference or "Conference" in reference or venue_intro:
        return "Conference"

    if re.search(r"vol\.\s*\d+", reference):
        return "Journal"
    if "Dataset" in reference:
        return "Dataset"
    return "Other"

def extract_year(reference):
    """Return the first standalone 19xx/20xx year in the reference, or "Unknown" if there is none."""
    found = re.search(r'\b(19|20)\d{2}\b', reference)
    if found is None:
        return "Unknown"
    # group(0) is the whole four-digit match, not just the century prefix.
    return found.group(0)

def extract_journal_name(reference, journal_full_names, missing_journals, ref_index):
    """Find the journal abbreviation in a reference and expand it when possible.

    Two patterns are tried in order, both anchored on the curly-quoted title:
    first the text between the closing quote and ", vol.", then a run of
    letters/whitespace between the closing quote and the next comma.

    Args:
        reference: The raw reference text.
        journal_full_names: Mapping of normalized abbreviation -> full name.
        missing_journals: Mapping of abbreviation -> list of "[n]" labels;
            an entry is appended whenever the abbreviation is not in
            journal_full_names.
        ref_index: Number used to build the "[n]" label.

    Returns:
        The full journal name if known, otherwise the abbreviation as found,
        or "Unknown Journal" when neither pattern matches.
    """
    candidate_patterns = (
        r'“.*?”\s*(.*?)\s*,\s*vol\.',
        r'“.*?”\s*([A-Za-z\s]+),',
    )
    for pattern in candidate_patterns:
        hit = re.search(pattern, reference)
        if hit is None:
            continue
        abbreviation = hit.group(1).strip()
        lookup_key = normalize_journal_name(abbreviation)
        if lookup_key not in journal_full_names:
            # Record which reference used an abbreviation we cannot expand.
            missing_journals[abbreviation].append(f"[{ref_index}]")
            return abbreviation
        return journal_full_names[lookup_key]
    return "Unknown Journal"

def check_format_issues(reference):
    """
    Report which bibliographic components are missing from a reference.

    Three things are verified:
      - a volume marker ("vol. <digits>") is present;
      - an issue marker ("no. <digits>") is present;
      - a page marker ("p." or "pp.") is present and its value contains a
        dash ("-" or "–"), i.e. it looks like a page range.

    Returns a tuple (issues, details_string): issues is the list of problem
    messages in the order volume, issue, pages; details_string is a
    parenthesised summary such as " (vol: missing, pp: Missing)", or an
    empty string when nothing is wrong.
    """
    has_volume = re.search(r"vol\.\s*(\d+)", reference) is not None
    has_issue = re.search(r"no\.\s*(\d+)", reference) is not None

    # Pages only count as valid when the matched value is a range.
    page_hit = re.search(r"p{1,2}\.\s*([\d–-]+)", reference)
    has_page_range = page_hit is not None and any(
        dash in page_hit.group(1) for dash in ("-", "–")
    )

    # (check passed?, message for the issues list, fragment for the summary)
    checks = (
        (has_volume, "Missing Volume", "vol: missing"),
        (has_issue, "Missing Issue Number", "no: missing"),
        (has_page_range, "Incorrect or Missing Page Numbers", "pp: Missing"),
    )
    failed = [(message, detail) for ok, message, detail in checks if not ok]

    issues = [message for message, _ in failed]
    if failed:
        details_string = " (" + ", ".join(detail for _, detail in failed) + ")"
    else:
        details_string = ""

    return issues, details_string

def format_reference_html(index, reference, journal_full_names, missing_journals):
    """
    Generate an HTML-formatted version of a single reference.

    The reference is parsed with regular expressions into authors, title,
    journal/volume/issue, pages and DOI, and re-assembled on one line as
    "[index]<TAB>authors, “Title” <i>Journal</i>, vol. V, no. N, pp. X–Y, date."
    followed by the DOI URL on its own tab-indented line. Parts that are not
    found are left empty. The journal name is wrapped in <i> tags and the
    whole entry in a <pre> tag so tabs and line breaks keep their alignment.

    All text taken from the reference (and from the journal table) is
    HTML-escaped before being embedded, so characters such as "&" or "<"
    cannot break the markup.

    Args:
        index: Number printed in the leading "[n]" label.
        reference: The raw reference text.
        journal_full_names: Mapping of normalized abbreviation -> full name,
            forwarded to extract_journal_name.
        missing_journals: Mapping that extract_journal_name appends to when
            an abbreviation is unknown.

    Returns:
        The HTML string for this reference.
    """
    author_pattern = r"^\[\d+\]\s*(.*?),\s*“"
    title_pattern = r"“(.*?)”"
    journal_pattern = r"([A-Za-z\s\.\-]+),\s*vol\.\s*(\S+),\s*no\.\s*(\d+),?"
    # Fallback for journal entries that carry a volume but no issue number;
    # without it such references would lose their journal and volume.
    volume_only_pattern = r"([A-Za-z\s\.\-]+),\s*vol\.\s*(\S+?),"
    pages_pattern = r"p{1,2}\.\s*([\d–-]+)(?:,\s*([^,]+))?,"
    doi_pattern = r"doi:\s*(\S+)"

    authors = re.search(author_pattern, reference)
    title = re.search(title_pattern, reference)
    journal = re.search(journal_pattern, reference)
    volume_only = None if journal else re.search(volume_only_pattern, reference)
    pages = re.search(pages_pattern, reference)
    doi = re.search(doi_pattern, reference)

    formatted_authors = html.escape(authors.group(1), quote=False) if authors else ""
    # Title-case first, then escape: escaping first would let .title()
    # corrupt entities (e.g. "&lt;" -> "&Lt;").
    formatted_title = (
        f"“{html.escape(title.group(1).title(), quote=False)}”" if title else ""
    )

    if journal or volume_only:
        journal_name = html.escape(
            extract_journal_name(reference, journal_full_names, missing_journals, index),
            quote=False,
        )
        if journal:
            volume = html.escape(journal.group(2), quote=False)
            formatted_journal = (
                f"<i>{journal_name}</i>, vol. {volume}, no. {journal.group(3)},"
            )
        else:
            volume = html.escape(volume_only.group(2), quote=False)
            formatted_journal = f"<i>{journal_name}</i>, vol. {volume},"
    else:
        formatted_journal = ""

    if pages:
        pages_text = pages.group(1)
        # group(2) is the optional text after the pages (typically the date);
        # it is None when that part of the pattern did not participate.
        extra_text = html.escape(pages.group(2) or "", quote=False)
        if extra_text:
            formatted_pages = f"pp. {pages_text}, {extra_text}."
        else:
            formatted_pages = f"pp. {pages_text}."
    else:
        formatted_pages = ""

    # The DOI token usually ends with the reference's closing period, which
    # is sentence punctuation rather than part of the DOI; drop it so the
    # resulting URL resolves.
    doi_value = doi.group(1).rstrip(".,;") if doi else ""
    formatted_doi = (
        f"\n\tDOI: https://doi.org/{html.escape(doi_value, quote=False)}"
        if doi_value
        else ""
    )

    # Use a <pre> tag to preserve the alignment (including tab characters).
    html_ref = (
        f"<pre>[{index}]\t{formatted_authors}, {formatted_title} "
        f"{formatted_journal} {formatted_pages}{formatted_doi}</pre>"
    )
    return html_ref

def process_references_html(input_text, start_index=1):
    """
    Format a block of references as a single HTML string.

    The input is split into individual references at every newline that is
    followed by a "[n]" label. References are renumbered consecutively from
    start_index (the numbers in the input are not reused) and each one is
    formatted with format_reference_html; the results are joined with
    newlines.

    Args:
        input_text: Text containing one or more references, each starting
            with a "[n]" label on its own line.
        start_index: Number assigned to the first reference.

    Returns:
        The concatenated HTML, or an empty string when the input is empty
        or contains only whitespace.
    """
    stripped = (input_text or "").strip()
    if not stripped:
        # Nothing to format: avoid emitting a bogus empty "[n]" entry and
        # skip loading the journal table altogether.
        return ""

    journal_full_names = load_journal_full_names()
    # Filled in by the formatting helpers with abbreviations that have no
    # known full name; it is not reported by this function.
    missing_journals = defaultdict(list)
    references = re.split(r'\n(?=\[\d+\])', stripped)

    return "\n".join(
        format_reference_html(number, ref, journal_full_names, missing_journals)
        for number, ref in enumerate(references, start=start_index)
    )

# --- Example Usage in Jupyter Notebook ---

# Sample bibliography used to exercise the formatter. Each entry starts on its
# own line with a "[n]" label followed by a tab; titles are wrapped in curly
# quotes (“ ”), which is what the parsing regexes above key on.
input_references = """[1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer statistics, 2023,” CA. Cancer J. Clin., vol. 73, no. 1, pp. 17–48, Jan. 2023, doi: 10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical review and study on object detection techniques in the image,” Int. J. Model. Simul. Sci. Comput., vol. 12, no. 05, p. 2150031, Oct. 2021, doi: 10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey of target detection based on deep convolutional neural networks,” Opt. Precis. Eng., vol. 28, no. 5, pp. 1152–1164, 2020, doi: 10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust real-time face detection,” in Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, Vancouver, BC, Canada: IEEE Comput. Soc, 2001, pp. 747–747, doi: 10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms of Oriented Gradients for Human Detection,” in 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR’05), San Diego, CA, USA: IEEE, 2005, pp. 886–893, doi: 10.1109/CVPR.2005.177.
[6]	M. Hussain, “YOLO-v1 to YOLO-v8, the Rise of YOLO and Its Complementary Nature toward Digital Manufacturing and Industrial Defect Detection,” Machines, vol. 11, no. 7, p. 677, Jun. 2023, doi: 10.3390/machines11070677.
[7]	G. Sharma, R. Dave, J. Sanadya, P. Sharma, and K. K. Sharma, “Various types and management of breast cancer: An overview,” J. Adv. Pharm. Technol. Res., vol. 1, no. 2, p. 109, 2010, doi: 10.4103/2231-4040.72251.
[8]	C. Li et al., “YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications,” Sep. 07, 2022, arXiv: arXiv:2209.02976, doi: 10.48550/arXiv.2209.02976.
[9]	S. Zahia, D. Sierra-Sosa, B. Garcia-Zapirain, and A. Elmaghraby, “Tissue classification and segmentation of pressure injuries using convolutional neural networks,” Comput. Methods Programs Biomed., vol. 159, pp. 51–58, Jun. 2018, doi: 10.1016/j.cmpb.2018.02.018.
[10]	X. Sun, X. Wang, J. Liu, and H. Huang, “Classic YOLO Series Target Detection Algorithms and Their Application in Breast Cancer Detection,” J. Comput. Syst. Appl., vol. 32, no. 12, pp. 52–62, 2023, doi: 10.15888/j.cnki.csa.009351.
[11]	F. Prinzi, M. Insalaco, A. Orlando, S. Gaglio, and S. Vitabile, “A Yolo-Based Model for Breast Cancer Detection in Mammograms,” Cogn. Comput., vol. 16, no. 1, pp. 107–120, Jan. 2024, doi: 10.1007/s12559-023-10189-6.
[12]	P. K. Samanta, A. Basuli, N. K. Rout, and G. Panda, “Improved Breast Cancer Detection from Ultrasound Images Using YOLOv8 Model,” in 2023 IEEE 3rd International Conference on Applied Electromagnetics, Signal Processing, & Communication (AESPC), Bhubaneswar, India: IEEE, Nov. 2023, pp. 1–6, doi: 10.1109/AESPC59761.2023.10390341.
[13]	H. Gui et al., “FS-YOLOv9: A Frequency and Spatial Feature-Based YOLOv9 for Real-time Breast Cancer Detection,” Acad. Radiol., Oct. 2024, doi: 10.1016/j.acra.2024.09.048.
[14]	L. Zheng et al., “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena,” Dec. 24, 2023, arXiv: arXiv:2306.05685, doi: 10.48550/arXiv.2306.05685.
[15]	A. Y. Yuan et al., “Hybrid deep learning network for vascular segmentation in photoacoustic imaging,” Biomed. Opt. Express, vol. 11, no. 11, p. 6445, Nov. 2020, doi: 10.1364/BOE.409246.
[16]	W. Al-Dhabyani, M. Gomaa, H. Khaled, and A. Fahmy, “Dataset of breast ultrasound images,” Data Brief, vol. 28, p. 104863, Feb. 2020, doi: 10.1016/j.dib.2019.104863.
"""

# Format every reference; numbering restarts at the default start_index of 1.
html_output = process_references_html(input_references)
# Render the HTML output in the notebook: as the final expression of the
# cell, the HTML object is what the notebook displays.
HTML(html_output)
