In [3]:
import re
import json
from collections import defaultdict

# Load journal full names from external file
def load_journal_full_names(filename="journals.json"):
    """Load and normalize journal full names from a JSON file."""
    try:
        with open(filename, "r", encoding="utf-8") as file:
            data = json.load(file)
            return {key.lower().replace(".", "").replace(",", ""): value for key, value in data.items()}
    except FileNotFoundError:
        print("Error: journals.json not found!")
        return {}

# Normalize journal name before matching
def normalize_journal_name(journal_name):
    """Normalize journal name (remove dots, lowercase) for comparison."""
    return journal_name.lower().replace(".", "").replace(",", "")

def classify_reference(reference):
    """Classify references into categories (Journal, Conference, arXiv, Dataset, Other)."""
    if "arXiv" in reference:
        return "arXiv"
    elif "Proceedings" in reference or "Conference" in reference or "in " in reference:
        return "Conference"
    elif re.search(r"vol\.\s*\d+", reference):  # If there's a volume number, it's likely a Journal
        return "Journal"
    elif "Dataset" in reference:
        return "Dataset"
    else:
        return "Other"

def extract_year(reference):
    """Extract the year from a reference."""
    match = re.search(r'\b(19|20)\d{2}\b', reference)
    return match.group(0) if match else "Unknown"

def extract_journal_name(reference, journal_full_names, missing_journals, ref_index):
    """Extract journal name and replace it with full name if abbreviation is found."""
    match = re.search(r'“.*?”\s*(.*?)\s*,\s*vol\.', reference)
    if match:
        abbr_name = match.group(1).strip()
        normalized_name = normalize_journal_name(abbr_name)
        if normalized_name in journal_full_names:
            return journal_full_names[normalized_name]  # Replace with full name
        else:
            missing_journals[abbr_name].append(f"[{ref_index}]")  # Store reference number in brackets
            return abbr_name  # Keep abbreviation
    elif re.search(r'“.*?”\s*([A-Za-z\s]+),', reference):  # Backup detection
        abbr_name = re.search(r'“.*?”\s*([A-Za-z\s]+),', reference).group(1).strip()
        normalized_name = normalize_journal_name(abbr_name)
        if normalized_name in journal_full_names:
            return journal_full_names[normalized_name]
        else:
            missing_journals[abbr_name].append(f"[{ref_index}]")
            return abbr_name
    else:
        return "Unknown Journal"

def check_format_issues(reference):
    """Check for missing components in the reference."""
    issues = []
    if not re.search(r"doi:\s*\S+", reference):  # Check for missing DOI
        issues.append("Missing DOI")
    if not re.search(r"vol\.\s*\d+", reference):  # Check for missing volume
        issues.append("Missing Volume")
    if not re.search(r"no\.\s*\d+", reference):  # Check for missing issue number
        issues.append("Missing Issue Number")
    if not re.search(r"pp\.\s*\d+", reference):  # Check for missing pages
        issues.append("Missing Page Numbers")
    return issues

def format_reference(index, reference, journal_full_names, missing_journals):
    """Format references correctly and classify them."""
    author_pattern = r"^\[\d+\]\s*(.*?),\s*“"
    title_pattern = r"“(.*?)”"
    journal_pattern = r"([A-Za-z\s\.\-]+),\s*vol\.\s*(\S+),\s*no\.\s*(\d+),?"
    pages_pattern = r"pp?\.\s*(\d+–?\d*),?\s*(\d{4})"
    doi_pattern = r"doi:\s*(\S+)"

    authors = re.search(author_pattern, reference)
    title = re.search(title_pattern, reference)
    journal = re.search(journal_pattern, reference)
    pages = re.search(pages_pattern, reference)
    doi = re.search(doi_pattern, reference)

    formatted_authors = authors.group(1) if authors else ""
    formatted_title = f"“{title.group(1)}”" if title else ""
    formatted_journal = f"{extract_journal_name(reference, journal_full_names, missing_journals, index)}, vol. {journal.group(2)}, no. {journal.group(3)}," if journal else ""
    formatted_pages = f"pp. {pages.group(1)}, {pages.group(2)}." if pages else ""

    # Fixing DOI Formatting: Always on a new line, indented correctly
    formatted_doi = f"\n\tDOI: {doi.group(1)}" if doi else ""

    # Ensure proper alignment with tab spaces
    formatted_ref = f"[{index}]\t{formatted_authors}, {formatted_title} {formatted_journal} {formatted_pages}{formatted_doi}"
    
    ref_type = classify_reference(reference)
    year = extract_year(reference)
    issues = check_format_issues(reference)
    
    return formatted_ref, ref_type, index, year, formatted_journal, issues

def process_references(input_text, start_index=1):
    """Process and analyze references."""
    journal_full_names = load_journal_full_names()  # Load journal mapping
    references = re.split(r'\n(?=\[\d+\])', input_text.strip())  
    formatted_references = []
    reference_types = defaultdict(list)
    format_issues = []
    paper_years = defaultdict(list)  # Store journal/conference/arXiv paper counts by year
    missing_journals = defaultdict(list)  # Store missing journal abbreviations and their reference numbers

    for i, ref in enumerate(references):
        formatted_ref, ref_type, ref_index, year, journal_name, issues = format_reference(start_index + i, ref, journal_full_names, missing_journals)
        formatted_references.append(formatted_ref)
        reference_types[ref_type].append(f"[{ref_index}] {year}")
        
        if ref_type in ["Journal", "Conference", "arXiv"]:  # Include arXiv in paper year tracking
            paper_years[year].append(f"[{ref_index}]")

        if issues:
            format_issues.append(f"[{ref_index}] {year} Issues: {', '.join(issues)}")

    return "\n".join(formatted_references), reference_types, format_issues, missing_journals, paper_years

input_references = """[1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer statistics, 2023,” CA. Cancer J. Clin., vol. 73, no. 1, pp. 17–48, Jan. 2023, doi: 10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical review and study on object detection techniques in the image,” Int. J. Model. Simul. Sci. Comput., vol. 12, no. 05, p. 2150031, Oct. 2021, doi: 10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey of target detection based on deep convolutional neural networks,” Opt. Precis. Eng., vol. 28, no. 5, pp. 1152–1164, 2020, doi: 10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust real-time face detection,” in Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, Vancouver, BC, Canada: IEEE Comput. Soc, 2001, pp. 747–747. doi: 10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms of Oriented Gradients for Human Detection,” in 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR’05), San Diego, CA, USA: IEEE, 2005, pp. 886–893. doi: 10.1109/CVPR.2005.177.
[6]	M. Hussain, “YOLO-v1 to YOLO-v8, the Rise of YOLO and Its Complementary Nature toward Digital Manufacturing and Industrial Defect Detection,” Machines, vol. 11, no. 7, p. 677, Jun. 2023, doi: 10.3390/machines11070677.
[7]	G. Sharma, R. Dave, J. Sanadya, P. Sharma, and K. K. Sharma, “Various types and management of breast cancer: An overview,” J. Adv. Pharm. Technol. Res., vol. 1, no. 2, p. 109, 2010, doi: 10.4103/2231-4040.72251.
[8]	C. Li et al., “YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications,” Sep. 07, 2022, arXiv: arXiv:2209.02976. doi: 10.48550/arXiv.2209.02976.
[9]	S. Zahia, D. Sierra-Sosa, B. Garcia-Zapirain, and A. Elmaghraby, “Tissue classification and segmentation of pressure injuries using convolutional neural networks,” Comput. Methods Programs Biomed., vol. 159, pp. 51–58, Jun. 2018, doi: 10.1016/j.cmpb.2018.02.018.
[10]	X. Sun, X. Wang, J. Liu, and H. Huang, “Classic YOLO Series Target Detection Algorithms and Their Application in Breast Cancer Detection,” J. Comput. Syst. Appl., vol. 32, no. 12, pp. 52–62, 2023, doi: 10.15888/j.cnki.csa.009351.
[11]	F. Prinzi, M. Insalaco, A. Orlando, S. Gaglio, and S. Vitabile, “A Yolo-Based Model for Breast Cancer Detection in Mammograms,” Cogn. Comput., vol. 16, no. 1, pp. 107–120, Jan. 2024, doi: 10.1007/s12559-023-10189-6.
[12]	P. K. Samanta, A. Basuli, N. K. Rout, and G. Panda, “Improved Breast Cancer Detection from Ultrasound Images Using YOLOv8 Model,” in 2023 IEEE 3rd International Conference on Applied Electromagnetics, Signal Processing, & Communication (AESPC), Bhubaneswar, India: IEEE, Nov. 2023, pp. 1–6. doi: 10.1109/AESPC59761.2023.10390341.
[13]	H. Gui et al., “FS-YOLOv9: A Frequency and Spatial Feature-Based YOLOv9 for Real-time Breast Cancer Detection,” Acad. Radiol., Oct. 2024, doi: 10.1016/j.acra.2024.09.048.
[14]	L. Zheng et al., “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena,” Dec. 24, 2023, arXiv: arXiv:2306.05685. doi: 10.48550/arXiv.2306.05685.
[15]	A. Y. Yuan et al., “Hybrid deep learning network for vascular segmentation in photoacoustic imaging,” Biomed. Opt. Express, vol. 11, no. 11, p. 6445, Nov. 2020, doi: 10.1364/BOE.409246.
[16]	W. Al-Dhabyani, M. Gomaa, H. Khaled, and A. Fahmy, “Dataset of breast ultrasound images,” Data Brief, vol. 28, p. 104863, Feb. 2020, doi: 10.1016/j.dib.2019.104863.


"""

# Run
formatted_output, reference_counts, format_issues, missing_journals, paper_years = process_references(input_references, start_index=1)

# Print formatted references
print("Formatted References:\n", formatted_output)

# Print reference type counts
print("\nReference Type Counts:")
for ref_type, ref_list in reference_counts.items():
    print(f"{ref_type}: {len(ref_list)}")
    print(" ".join(ref_list))

# Print format issues detected
if format_issues:
    print("\nFormat Issues Detected:")
    for issue in format_issues:
        print(issue)

# Print paper years including arXiv
print("\nPaper Year (Journal/Conference/arXiv):")
for year, refs in sorted(paper_years.items(), reverse=True):
    print(f"{year}: {len(refs)} ({', '.join(refs)})")

# Print missing journal abbreviations with reference numbers for manual checking
if missing_journals:
    print("\nJournals Not Found in JSON (Check Manually):")
    for journal, ref_numbers in missing_journals.items():
        print(f"{journal}: {', '.join(ref_numbers)}")


Formatted References:
 [1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer statistics, 2023,” CA: A Cancer Journal for Clinicians, vol. 73, no. 1, 
	DOI: 10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical review and study on object detection techniques in the image,” International Journal of Modeling, Simulation, and Scientific Computing, vol. 12, no. 05, pp. 215, 0031.
	DOI: 10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey of target detection based on deep convolutional neural networks,” Optics and Precision Engineering, vol. 28, no. 5, pp. 1152–1164, 2020.
	DOI: 10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust real-time face detection,”  
	DOI: 10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms of Oriented Gradients for Human Detection,”  
	DOI: 10.1109/CVPR.2005.177.
[6]	M. Hussain, “YOLO-v1 to YOLO-v8, the Rise of YOLO and Its Complementary Nature toward Digital Manufacturing and Indust