In [10]:
import re
import json
from collections import defaultdict

def load_journal_full_names(filename="journals.json"):
    """Load the journal abbreviation table from a JSON file.

    The file is parsed with ``json.load`` and is iterated with ``.items()``,
    so it must contain a JSON object. Each key is normalized for lookup by
    lowercasing it and removing every "." and ","; values are returned
    unchanged (plain text).

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A dict mapping normalized key -> full journal name. When the file
        does not exist an error line is printed and an empty dict is
        returned, so callers fall back to the abbreviations as written.
    """
    try:
        # Keep the try body limited to the I/O that can raise FileNotFoundError.
        with open(filename, "r", encoding="utf-8") as file:
            data = json.load(file)
    except FileNotFoundError:
        # Name the path that was actually requested, not a fixed file name.
        print(f"Error: {filename} not found!")
        return {}

    return {
        key.lower().replace(".", "").replace(",", ""): value
        for key, value in data.items()
    }

def normalize_journal_name(journal_name):
    """Return the lookup form of a journal name: lowercased, with "." and "," removed."""
    lowered = journal_name.lower()
    return "".join(char for char in lowered if char not in ".,")

def classify_reference(reference):
    """Classify a reference as "arXiv", "Conference", "Journal", "Dataset" or "Other".

    The checks run in that order and the first one that applies wins:

    * "arXiv"      - the text contains "arXiv".
    * "Conference" - the text contains "Proceedings" or "Conference", or the
                     word "in" directly follows the closing quote of the title.
    * "Journal"    - the text contains a volume marker such as "vol. 12".
    * "Dataset"    - the text contains "Dataset".
    * "Other"      - none of the above.

    Args:
        reference: Raw reference text.

    Returns:
        One of the five category names listed above.
    """
    if "arXiv" in reference:
        return "arXiv"

    # Only treat "in" as a venue marker when it comes right after the title's
    # closing quote. A plain substring test for "in " also fires on words that
    # end in "in" and on the preposition inside a title (e.g. "Detection in
    # Mammograms"), which mislabels journal articles as conference papers.
    venue_introduced_by_in = re.search(r'[”"]\s*in\s', reference) is not None
    if "Proceedings" in reference or "Conference" in reference or venue_introduced_by_in:
        return "Conference"

    if re.search(r"vol\.\s*\d+", reference):
        return "Journal"
    if "Dataset" in reference:
        return "Dataset"
    return "Other"

def extract_year(reference):
    """Return the first standalone four-digit 19xx/20xx number in the text, or "Unknown"."""
    found = re.search(r'\b(19|20)\d{2}\b', reference)
    if found is None:
        return "Unknown"
    # group(0) is the whole four-digit match, not just the century prefix.
    return found.group(0)

def extract_journal_name(reference, journal_full_names, missing_journals, ref_index):
    """Return the journal named after the quoted title, expanded to its full name when known.

    Two patterns are tried in order: the text between the closing title quote
    and ", vol.", then a run of letters/whitespace that ends at the first comma
    after the title. The first pattern that matches decides the result.

    Args:
        reference: Raw reference text with the title inside curly quotes.
        journal_full_names: Mapping of normalized abbreviation -> full name.
        missing_journals: Mapping of abbreviation -> list of "[n]" tags; it is
            appended to in place when the abbreviation has no full name.
        ref_index: Reference number used to build the "[n]" tag.

    Returns:
        The full name if the abbreviation is known, the abbreviation as written
        if it is not, or "Unknown Journal" when neither pattern matches.
    """
    candidate_patterns = (
        r'“.*?”\s*(.*?)\s*,\s*vol\.',
        r'“.*?”\s*([A-Za-z\s]+),',
    )
    for pattern in candidate_patterns:
        hit = re.search(pattern, reference)
        if not hit:
            continue
        short_name = hit.group(1).strip()
        lookup_key = normalize_journal_name(short_name)
        if lookup_key not in journal_full_names:
            # Remember which reference used an abbreviation we could not expand.
            missing_journals[short_name].append(f"[{ref_index}]")
            return short_name
        return journal_full_names[lookup_key]
    return "Unknown Journal"

def check_format_issues(reference):
    """
    Check a reference for missing components and extract its page numbers.

    Pages are accepted in both the range form ("pp. 17–48", with an en dash or
    an ASCII hyphen) and the single-page / article-number form ("p. 677"),
    matching what format_reference accepts when it formats the entry.

    Args:
        reference: Raw reference text.

    Returns:
        A tuple (issues, extracted_pages):
          - issues: list of messages, in this order when present:
            "Incorrect or Missing Page Numbers", "Invalid DOI format",
            "Missing Volume", "Missing Issue Number".
          - extracted_pages: the page text that follows "p."/"pp.", or
            "Missing" when no page marker was found.
    """
    issues = []

    # \b keeps the marker from matching the tail of an abbreviation ("Comp. 3").
    pages_match = re.search(r"\bpp?\.\s*([\d–\-]+)", reference)
    extracted_pages = pages_match.group(1) if pages_match else "Missing"

    if not pages_match:
        issues.append("Incorrect or Missing Page Numbers")
    if not re.search(r"doi:\s*10\.", reference):
        issues.append("Invalid DOI format")
    if not re.search(r"vol\.\s*\d+", reference):
        issues.append("Missing Volume")
    if not re.search(r"no\.\s*\d+", reference):
        issues.append("Missing Issue Number")

    return issues, extracted_pages

def format_reference(index, reference, journal_full_names, missing_journals):
    """
    Format a single reference and collect its analysis data.

    The formatted entry has the shape
    ``[n]<TAB>authors, “Title” Journal, vol. V, no. N, pp. P, date.`` followed
    by a DOI line. Components that cannot be found in the text are left out
    without leaving doubled spaces or a dangling comma behind.

    Args:
        index: Number printed in brackets at the start of the entry.
        reference: Raw reference text; the title is expected in curly quotes.
        journal_full_names: Mapping of normalized abbreviation -> full name.
        missing_journals: Mapping of abbreviation -> list of "[n]" tags. It is
            passed to extract_journal_name, which appends to it in place when
            an abbreviation has no full name.

    Returns:
        A 7-tuple ``(formatted_ref, ref_type, index, year, formatted_journal,
        issues, extracted_pages)`` where ``formatted_journal`` is the
        "name, vol. V, [no. N,]" fragment (empty string when no journal/volume
        was found) and the last two items come from check_format_issues.
    """
    author_pattern = r"^\[\d+\]\s*(.*?),\s*“"
    title_pattern = r"“(.*?)”"
    # The issue part ("no. N") is optional: some journals only give a volume,
    # and requiring it would drop the journal and volume from the output.
    journal_pattern = r"([A-Za-z\s\.\-]+),\s*vol\.\s*([^\s,]+),(?:\s*no\.\s*(\d+),?)?"
    # Accept "p." and "pp.", en dash and ASCII hyphen ranges; \b keeps the
    # marker from matching the tail of an abbreviation that ends in "p.".
    pages_pattern = r"\bpp?\.\s*([\d–\-]+)(?:,\s*([^,]+))?,"
    doi_pattern = r"doi:\s*(\S+)"

    authors = re.search(author_pattern, reference)
    title = re.search(title_pattern, reference)
    journal = re.search(journal_pattern, reference)
    pages = re.search(pages_pattern, reference)
    doi = re.search(doi_pattern, reference)

    formatted_authors = authors.group(1) if authors else ""
    # NOTE(review): str.title() also lowercases the tail of acronyms
    # ("YOLO" -> "Yolo"); kept because it is the existing output convention.
    formatted_title = f"“{title.group(1).title()}”" if title else ""

    if journal:
        journal_name = extract_journal_name(reference, journal_full_names, missing_journals, index)
        formatted_journal = f"{journal_name}, vol. {journal.group(2)},"
        if journal.group(3):
            formatted_journal += f" no. {journal.group(3)},"
    else:
        formatted_journal = ""

    if pages:
        pages_text = pages.group(1)
        # Group 2 is the text between the pages and the next comma (usually
        # the date); it is None when nothing sits between them.
        extra_text = pages.group(2) or ""
        if extra_text:
            formatted_pages = f"pp. {pages_text}, {extra_text}."
        else:
            formatted_pages = f"pp. {pages_text}."
    else:
        formatted_pages = ""

    # DOI printed as a link-like string with zero-width spaces to prevent auto-hyperlinking.
    formatted_doi = f"\n\tDOI: https:\u200B//\u200Bdoi.org/{doi.group(1)}" if doi else ""

    # Join only the components that exist so a missing journal or page range
    # does not leave doubled spaces, and a missing author list does not leave
    # a leading ", ".
    body = " ".join(part for part in (formatted_title, formatted_journal, formatted_pages) if part)
    author_prefix = f"{formatted_authors}, " if formatted_authors else ""
    formatted_ref = f"[{index}]\t{author_prefix}{body}{formatted_doi}"

    ref_type = classify_reference(reference)
    year = extract_year(reference)
    issues, extracted_pages = check_format_issues(reference)

    return formatted_ref, ref_type, index, year, formatted_journal, issues, extracted_pages

def process_references(input_text, start_index=1):
    """
    Process and analyze a block of numbered references.

    The text is split at every newline that is followed by a bracketed number
    ("[12]"), so a reference may span several lines. Blank input yields no
    references.

    Args:
        input_text: All references as one string.
        start_index: Number given to the first reference; later references
            are numbered consecutively.

    Returns:
        A tuple of three values:
          1. A string of all formatted references joined by newlines.
          2. A list with one "Reference [n] <year> Issues: ..." line per reference.
          3. A summary dict with the keys "ref_type_counts" (type -> list of
             (ref_index, year)), "jca_years" (year -> indices of
             Journal/Conference/arXiv references), "overall_years"
             (year -> indices of all references) and "missing_journals"
             (abbreviation -> list of "[n]" tags).
    """
    journal_full_names = load_journal_full_names()
    missing_journals = defaultdict(list)
    # Drop blank chunks: splitting an empty string yields [""], which would
    # otherwise be formatted as a phantom reference.
    references = [
        chunk
        for chunk in re.split(r'\n(?=\[\d+\])', input_text.strip())
        if chunk.strip()
    ]
    formatted_references = []
    detailed_issues = []
    # Dictionary: reference type -> list of (ref_index, year)
    ref_type_counts = {"Journal": [], "Conference": [], "arXiv": [], "Other": []}
    # For paper years among Journal/Conference/arXiv
    jca_years = defaultdict(list)
    # For overall paper years (all references)
    overall_years = defaultdict(list)

    for ref_number, ref in enumerate(references, start=start_index):
        (formatted_ref, ref_type, ref_index, year, _formatted_journal,
         issues, extracted_pages) = format_reference(
            ref_number, ref, journal_full_names, missing_journals)
        formatted_references.append(formatted_ref)

        if issues:
            issues_str = ", ".join(issues)
            if "Incorrect or Missing Page Numbers" in issues:
                issues_str += f" (pp: {extracted_pages})"
        else:
            issues_str = "None"
        detailed_issues.append(f"Reference [{ref_index}] {year} Issues: {issues_str}")

        # setdefault: classify_reference can return types that are not
        # pre-seeded above (e.g. "Dataset"); a plain lookup would raise KeyError.
        ref_type_counts.setdefault(ref_type, []).append((ref_index, year))
        overall_years[year].append(ref_index)
        if ref_type in ("Journal", "Conference", "arXiv"):
            jca_years[year].append(ref_index)

    summary_stats = {
        "ref_type_counts": ref_type_counts,
        "jca_years": dict(jca_years),
        "overall_years": dict(overall_years),
        "missing_journals": dict(missing_journals)
    }

    return "\n".join(formatted_references), detailed_issues, summary_stats

def report_reference_fields(reference, ref_index):
    """
    Extract and return a detailed report of the fields in a reference.

    The report has one "Label: value" line for each of, in this order:
    Authors, Title, Journal, Volume, Issue, Page numbers, Year, DOI.
    A field that cannot be found is reported as "Missing" (the year, which
    comes from extract_year, is reported as "Unknown").

    Page numbers are recognised after "p." or "pp." and may use an en dash or
    an ASCII hyphen, so single-page entries and hyphenated ranges are reported
    in full instead of as "Missing" or truncated.

    Args:
        reference: Raw reference text; the title is expected in curly quotes.
        ref_index: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The report as a single newline-joined string.
    """
    def first_group(pattern):
        # Text captured by the pattern's first group, or "Missing" if no match.
        found = re.search(pattern, reference)
        return found.group(1) if found else "Missing"

    fields = [
        ("Authors", first_group(r"^\[\d+\]\s*(.*?),\s*“")),
        ("Title", first_group(r"“(.*?)”")),
        ("Journal", first_group(r'“.*?”\s*(.*?)\s*,\s*vol\.')),
        ("Volume", first_group(r"vol\.\s*(\d+)")),
        ("Issue", first_group(r"no\.\s*(\d+)")),
        # \b keeps the marker from matching the tail of an abbreviation.
        ("Page numbers", first_group(r"\bpp?\.\s*([\d–\-]+)")),
        ("Year", extract_year(reference)),
        ("DOI", first_group(r"doi:\s*(\S+)")),
    ]

    return "\n".join(f"{label}: {value}" for label, value in fields)

# --- Execution ---

# Sample bibliography processed by the run below: numbered references, one per
# line, each starting with its bracketed number. Titles are enclosed in curly
# quotes (“…”), which the extraction patterns in this file look for.
input_references = """[1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer statistics, 2023,” CA. Cancer J. Clin., vol. 73, no. 1, pp. 17–48, Jan. 2023, doi: 10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical review and study on object detection techniques in the image,” Int. J. Model. Simul. Sci. Comput., vol. 12, no. 05, p. 2150031, Oct. 2021, doi: 10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey of target detection based on deep convolutional neural networks,” Opt. Precis. Eng., vol. 28, no. 5, pp. 1152–1164, 2020, doi: 10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust real-time face detection,” in Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, Vancouver, BC, Canada: IEEE Comput. Soc, 2001, pp. 747–747, doi: 10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms of Oriented Gradients for Human Detection,” in 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR’05), San Diego, CA, USA: IEEE, 2005, pp. 886–893, doi: 10.1109/CVPR.2005.177.
[6]	M. Hussain, “YOLO-v1 to YOLO-v8, the Rise of YOLO and Its Complementary Nature toward Digital Manufacturing and Industrial Defect Detection,” Machines, vol. 11, no. 7, p. 677, Jun. 2023, doi: 10.3390/machines11070677.
[7]	G. Sharma, R. Dave, J. Sanadya, P. Sharma, and K. K. Sharma, “Various types and management of breast cancer: An overview,” J. Adv. Pharm. Technol. Res., vol. 1, no. 2, p. 109, 2010, doi: 10.4103/2231-4040.72251.
[8]	C. Li et al., “YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications,” Sep. 07, 2022, arXiv: arXiv:2209.02976, doi: 10.48550/arXiv.2209.02976.
[9]	S. Zahia, D. Sierra-Sosa, B. Garcia-Zapirain, and A. Elmaghraby, “Tissue classification and segmentation of pressure injuries using convolutional neural networks,” Comput. Methods Programs Biomed., vol. 159, pp. 51–58, Jun. 2018, doi: 10.1016/j.cmpb.2018.02.018.
[10]	X. Sun, X. Wang, J. Liu, and H. Huang, “Classic YOLO Series Target Detection Algorithms and Their Application in Breast Cancer Detection,” J. Comput. Syst. Appl., vol. 32, no. 12, pp. 52–62, 2023, doi: 10.15888/j.cnki.csa.009351.
[11]	F. Prinzi, M. Insalaco, A. Orlando, S. Gaglio, and S. Vitabile, “A Yolo-Based Model for Breast Cancer Detection in Mammograms,” Cogn. Comput., vol. 16, no. 1, pp. 107–120, Jan. 2024, doi: 10.1007/s12559-023-10189-6.
[12]	P. K. Samanta, A. Basuli, N. K. Rout, and G. Panda, “Improved Breast Cancer Detection from Ultrasound Images Using YOLOv8 Model,” in 2023 IEEE 3rd International Conference on Applied Electromagnetics, Signal Processing, & Communication (AESPC), Bhubaneswar, India: IEEE, Nov. 2023, pp. 1–6, doi: 10.1109/AESPC59761.2023.10390341.
[13]	H. Gui et al., “FS-YOLOv9: A Frequency and Spatial Feature-Based YOLOv9 for Real-time Breast Cancer Detection,” Acad. Radiol., Oct. 2024, doi: 10.1016/j.acra.2024.09.048.
[14]	L. Zheng et al., “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena,” Dec. 24, 2023, arXiv: arXiv:2306.05685, doi: 10.48550/arXiv.2306.05685.
[15]	A. Y. Yuan et al., “Hybrid deep learning network for vascular segmentation in photoacoustic imaging,” Biomed. Opt. Express, vol. 11, no. 11, p. 6445, Nov. 2020, doi: 10.1364/BOE.409246.
[16]	W. Al-Dhabyani, M. Gomaa, H. Khaled, and A. Fahmy, “Dataset of breast ultrasound images,” Data Brief, vol. 28, p. 104863, Feb. 2020, doi: 10.1016/j.dib.2019.104863.
"""

def _year_sort_key(year):
    """Sort key for year labels: numeric years by value, anything else lowest.

    extract_year returns the string "Unknown" when a reference has no year;
    calling int() on it would raise ValueError and abort the summary.
    """
    return (1, int(year)) if year.isdigit() else (0, 0)


# Format every reference and gather the per-reference issues and summary data.
formatted_output, detailed_issues, summary_stats = process_references(input_references, start_index=1)

print("Formatted References:\n", formatted_output)

print("\nFormat Issues Detected:")
for issue in detailed_issues:
    print(issue)

# Field-by-field dump of the raw (unformatted) references, split the same way
# process_references splits them.
print("\nDetailed Report per Reference:")
original_refs = re.split(r'\n(?=\[\d+\])', input_references.strip())
for i, ref in enumerate(original_refs, start=1):
    print(f"\nReference [{i}] Fields:")
    print(report_reference_fields(ref, i))

# --- New Summary Outputs ---

print("\nReference Type Counts:")
for ref_type, entries in summary_stats["ref_type_counts"].items():
    entries_str = " ".join(f"[{idx}] {yr}" for idx, yr in entries)
    print(f"{ref_type}: {len(entries)}")
    print(entries_str)

# Years are listed newest first; references without a year come last.
print("\nPaper Year (Journal/Conference/arXiv):")
for year in sorted(summary_stats["jca_years"], key=_year_sort_key, reverse=True):
    refs = sorted(summary_stats["jca_years"][year])
    refs_str = ", ".join(f"[{idx}]" for idx in refs)
    print(f"{year}: {len(refs)} ({refs_str})")

print("\nPaper Year (Total):")
for year in sorted(summary_stats["overall_years"], key=_year_sort_key, reverse=True):
    refs = sorted(summary_stats["overall_years"][year])
    refs_str = ", ".join(f"[{idx}]" for idx in refs)
    print(f"{year}: {len(refs)} ({refs_str})")

print("\nJournals Not Found in JSON (Check Manually):")
if summary_stats["missing_journals"]:
    for journal, ref_list in summary_stats["missing_journals"].items():
        ref_str = " ".join(ref_list)
        print(f"{journal}: {ref_str}")
else:
    print("None")


Formatted References:
 [1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer Statistics, 2023,” CA: A Cancer Journal for Clinicians, vol. 73, no. 1, pp. 17–48, Jan. 2023.
	DOI: https:​//​doi.org/10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical Review And Study On Object Detection Techniques In The Image,” International Journal of Modeling, Simulation, and Scientific Computing, vol. 12, no. 05, pp. 2150031, Oct. 2021.
	DOI: https:​//​doi.org/10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey Of Target Detection Based On Deep Convolutional Neural Networks,” Optics and Precision Engineering, vol. 28, no. 5, pp. 1152–1164, 2020.
	DOI: https:​//​doi.org/10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust Real-Time Face Detection,”  pp. 747–747.
	DOI: https:​//​doi.org/10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms Of Oriented Gradients For Human Detection,”  pp. 886–893.
	DOI: https:​//​doi.org/1