# In [1]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import html
import re
import xml.etree.ElementTree as ET
from urllib.parse import quote

import feedparser
import requests
from IPython.display import HTML

# --- Sample Input References ---
# Raw bibliography text processed by the main section of this script.
# Expected layout, as consumed by the code below:
#   * one reference per line (the text is split on newlines; blank lines are skipped),
#   * an optional leading "[n]" index, which remove_leading_numbering() strips,
#   * the DOI introduced by a "doi:" marker, which extract_doi() looks for.
# Lines without a "doi:" marker are emitted unchanged, without any metadata lookup.
input_references = """[1]	R. L. Siegel, K. D. Miller, N. S. Wagle, and A. Jemal, “Cancer statistics, 2023,” CA. Cancer J. Clin., vol. 73, no. 1, pp. 17–48, Jan. 2023, doi: 10.3322/caac.21763.
[2]	K. V. Sriram and R. H. Havaldar, “Analytical review and study on object detection techniques in the image,” Int. J. Model. Simul. Sci. Comput., vol. 12, no. 05, p. 2150031, Oct. 2021, doi: 10.1142/S1793962321500318.
[3]	L. Fan, H. Zhao, H. Zhao, H. Hu, and Z. Wang, “Survey of target detection based on deep convolutional neural networks,” Opt. Precis. Eng., vol. 28, no. 5, pp. 1152–1164, 2020, doi: 10.3788/ope.20202805.1152.
[4]	P. Viola and M. Jones, “Robust real-time face detection,” in Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, Vancouver, BC, Canada: IEEE Comput. Soc, 2001, pp. 747–747, doi: 10.1109/ICCV.2001.937709.
[5]	N. Dalal and B. Triggs, “Histograms of Oriented Gradients for Human Detection,” in 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR’05), San Diego, CA, USA: IEEE, 2005, pp. 886–893, doi: 10.1109/CVPR.2005.177.
[6]	M. Hussain, “YOLO-v1 to YOLO-v8, the Rise of YOLO and Its Complementary Nature toward Digital Manufacturing and Industrial Defect Detection,” Machines, vol. 11, no. 7, p. 677, Jun. 2023, doi: 10.3390/machines11070677.
[7]	G. Sharma, R. Dave, J. Sanadya, P. Sharma, and K. K. Sharma, “Various types and management of breast cancer: An overview,” J. Adv. Pharm. Technol. Res., vol. 1, no. 2, p. 109, 2010, doi: 10.4103/2231-4040.72251.
[8]	C. Li et al., “YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications,” Sep. 07, 2022, arXiv: arXiv:2209.02976, doi: 10.48550/arXiv.2209.02976.
[9]	S. Zahia, D. Sierra-Sosa, B. Garcia-Zapirain, and A. Elmaghraby, “Tissue classification and segmentation of pressure injuries using convolutional neural networks,” Comput. Methods Programs Biomed., vol. 159, pp. 51–58, Jun. 2018, doi: 10.1016/j.cmpb.2018.02.018.
[10]	X. Sun, X. Wang, J. Liu, and H. Huang, “Classic YOLO Series Target Detection Algorithms and Their Application in Breast Cancer Detection,” J. Comput. Syst. Appl., vol. 32, no. 12, pp. 52–62, 2023, doi: 10.15888/j.cnki.csa.009351.
[11]	F. Prinzi, M. Insalaco, A. Orlando, S. Gaglio, and S. Vitabile, “A Yolo-Based Model for Breast Cancer Detection in Mammograms,” Cogn. Comput., vol. 16, no. 1, pp. 107–120, Jan. 2024, doi: 10.1007/s12559-023-10189-6.
[12]	P. K. Samanta, A. Basuli, N. K. Rout, and G. Panda, “Improved Breast Cancer Detection from Ultrasound Images Using YOLOv8 Model,” in 2023 IEEE 3rd International Conference on Applied Electromagnetics, Signal Processing, & Communication (AESPC), Bhubaneswar, India: IEEE, Nov. 2023, pp. 1–6, doi: 10.1109/AESPC59761.2023.10390341.
[13]	H. Gui et al., “FS-YOLOv9: A Frequency and Spatial Feature-Based YOLOv9 for Real-time Breast Cancer Detection,” Acad. Radiol., Oct. 2024, doi: 10.1016/j.acra.2024.09.048.
[14]	L. Zheng et al., “Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena,” Dec. 24, 2023, arXiv: arXiv:2306.05685, doi: 10.48550/arXiv.2306.05685.
[15]	A. Y. Yuan et al., “Hybrid deep learning network for vascular segmentation in photoacoustic imaging,” Biomed. Opt. Express, vol. 11, no. 11, p. 6445, Nov. 2020, doi: 10.1364/BOE.409246.
[16]	W. Al-Dhabyani, M. Gomaa, H. Khaled, and A. Fahmy, “Dataset of breast ultrasound images,” Data Brief, vol. 28, p. 104863, Feb. 2020, doi: 10.1016/j.dib.2019.104863.
"""

def remove_leading_numbering(reference):
    """Strip a leading bracketed index such as '[8]' from *reference*.

    Whitespace around the index is removed with it. Text that does not begin
    with such an index is returned unchanged.
    """
    prefix = re.match(r'^\s*\[\d+\]\s*', reference)
    if prefix is None:
        return reference
    # Keep everything after the matched "[n]" prefix.
    return reference[prefix.end():]

def extract_doi(reference):
    """Return the DOI that follows a ``doi:`` marker in *reference*.

    The marker is matched case-insensitively and the DOI is the run of
    non-whitespace characters after it, with trailing ``.``, ``,`` and ``;``
    removed (references usually end the DOI with sentence punctuation).

    Returns:
        The DOI string, or ``None`` when there is no marker or nothing but
        punctuation follows it.
    """
    doi_match = re.search(r"doi:\s*(\S+)", reference, re.IGNORECASE)
    if doi_match is None:
        return None
    doi = doi_match.group(1).rstrip(".,;")
    # "doi: ." would otherwise yield an empty string instead of None.
    return doi or None

def get_crossref_details(doi, timeout=10):
    """Query the CrossRef API for DOI metadata.

    Args:
        doi: DOI to look up, e.g. ``"10.3322/caac.21763"``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The ``message`` object of the CrossRef response as a dict, or an empty
        dict when the request fails, the status code is not 200, or the
        response body is not the expected JSON object.
    """
    # Percent-encode the DOI so characters such as "#", "?" or "<" cannot
    # alter the request URL; "/" stays literal because it is part of the DOI.
    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return {}
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failures, timeouts and undecodable bodies are all treated
        # as "no metadata" so one bad reference does not stop the run.
        print(f"Exception while querying DOI {doi}: {e}")
        return {}

    if not isinstance(payload, dict):
        return {}
    message = payload.get("message", {})
    return message if isinstance(message, dict) else {}

def get_arxiv_details(doi):
    """Query the arXiv API for metadata if the DOI is an arXiv DOI.

    arXiv DOIs have the form ``<prefix>/arXiv.<id>``. The part after the first
    ``/`` is inspected; if it does not mention arXiv, no request is made.

    Returns:
        A dict using the same keys this script reads from CrossRef records
        (``title``, ``author``, ``published-online``, ``container-title``),
        or an empty dict when the DOI is not an arXiv DOI or arXiv returns no
        usable entry.
    """
    # Split on the first "/" only, so old-style identifiers that contain a
    # slash themselves (e.g. "hep-th/9901001") are not truncated.
    _, _, suffix = doi.partition("/")
    if "arxiv" not in suffix.lower():
        return {}

    # Drop the "arXiv." prefix regardless of letter case; the check above is
    # case-insensitive, so the removal has to be as well.
    arxiv_id = re.sub(r"^arxiv[.:]", "", suffix, flags=re.IGNORECASE)
    url = f"https://export.arxiv.org/api/query?id_list={quote(arxiv_id, safe='/')}"
    feed = feedparser.parse(url)
    if not feed.entries:
        return {}

    entry = feed.entries[0]
    # NOTE(review): the arXiv API reports errors as a feed entry whose id
    # points at ".../api/errors#..."; such an entry is not paper metadata.
    if "/api/errors" in entry.get("id", ""):
        return {}

    published_date = entry.get("published", entry.get("updated", "Unknown"))
    year = published_date[:4] if published_date != "Unknown" else "Unknown"
    return {
        "title": [entry.get("title", "No title found")],
        # arXiv only provides a full display name, so it is stored in "family".
        "author": [
            {"given": "", "family": author.get("name", "")}
            for author in entry.get("authors", [])
        ],
        "published-online": {"date-parts": [[year]]},
        "container-title": ["arXiv"],
    }

def get_doi_details(doi):
    """Look up metadata for *doi*, preferring arXiv for arXiv DOIs.

    A DOI containing "arxiv" (in any letter case) is sent to the arXiv lookup
    first. CrossRef is queried for every other DOI, and also when the arXiv
    lookup returns nothing.
    """
    looks_like_arxiv = "arxiv" in doi.lower()
    arxiv_details = get_arxiv_details(doi) if looks_like_arxiv else None
    return arxiv_details or get_crossref_details(doi)

def format_authors(authors):
    """Format a list of author records as a single display string.

    Args:
        authors: Iterable of dicts with optional ``given``, ``family`` and
            ``name`` keys. ``name`` is used only when neither ``given`` nor
            ``family`` holds any text.

    Returns:
        ``"Not available"`` when there is no usable author, ``"<first author>
        et al."`` when there are four or more, otherwise the names joined by
        ``", "``.
    """
    if not authors:
        return "Not available"

    author_names = []
    for author in authors:
        # `or ""` guards against keys that are present but set to None.
        given = (author.get("given") or "").strip()
        family = (author.get("family") or "").strip()
        # Join only the non-empty parts so a missing given name does not
        # leave a stray leading space.
        full_name = " ".join(part for part in (given, family) if part)
        if not full_name:
            full_name = (author.get("name") or "").strip()
        if full_name:
            author_names.append(full_name)

    if not author_names:
        return "Not available"
    if len(author_names) >= 4:
        return f"{author_names[0]} et al."
    return ", ".join(author_names)

def get_year(details, original_ref=""):
    """Extract the publication year from the metadata.

    The date fields are tried in order of preference: ``published-print``,
    ``published-online``, ``published``, ``issued``. A field is skipped when
    it is missing, empty, or its first date part is not a numeric year, so an
    empty ``published-print`` no longer hides a valid ``published-online``.

    If no field yields a year, the first standalone four-digit number starting
    with 19 or 20 in *original_ref* is used.

    Returns:
        The year as a string, or ``"Unknown"`` if none could be determined.
    """
    for field in ("published-print", "published-online", "published", "issued"):
        date_parts = (details.get(field) or {}).get("date-parts") or [[]]
        first_date = date_parts[0]
        if first_date and first_date[0]:
            candidate = str(first_date[0]).strip()
            # Rejects placeholders such as "Unknown" stored by the arXiv lookup.
            if candidate.isdigit():
                return candidate

    # Fallback: try to parse a year from the original reference text.
    if original_ref:
        match = re.search(r'\b(19|20)\d{2}\b', original_ref)
        if match:
            return match.group(0)
    return "Unknown"

def format_title(title):
    """Collapse whitespace in *title* and capitalise the first letter of each word.

    Unlike ``str.title()``, this leaves words that already contain capital
    letters untouched, so acronyms and product names ("LLM", "YOLOv6",
    "arXiv") keep their spelling, and an apostrophe does not start a new word
    ("what's" becomes "What's", not "What'S").

    A title written entirely in upper case is lower-cased first so it still
    comes out in title case.
    """
    # Remove newline characters and extra spaces.
    normalized = " ".join(title.split())
    if normalized.isupper():
        normalized = normalized.lower()

    def _capitalize_word(match):
        word = match.group(0)
        # Only touch all-lowercase words; mixed-case words are deliberate.
        return word[0].upper() + word[1:] if word.islower() else word

    # A "word" is a run of letters, optionally continued across one apostrophe.
    return re.sub(r"[^\W\d_]+(?:['’][^\W\d_]+)?", _capitalize_word, normalized)

def _html_text(value):
    """Return *value* as HTML-safe text, without double-escaping existing entities."""
    return html.escape(html.unescape(str(value)), quote=False)


def _page_clause(page):
    """Return 'pp. <range>' for a page range, 'p. <page>' for a single page."""
    label = "pp." if re.search(r"[-–]", str(page)) else "p."
    return f"{label} {_html_text(page)}"


def _citation_paragraph(index, text, doi):
    """Wrap an already-escaped citation *text* and its DOI link line in the <p> markup."""
    return (
        f'<p style="white-space: pre;">[{index}]\t{text}'
        f'<br>DOI: https://doi.org/{_html_text(doi)}</p>'
    )


def format_citation(index, doi, details, original_reference):
    """
    Return a formatted citation string in HTML format.

    Args:
        index: Number shown in square brackets in front of the citation.
        doi: DOI of the work; printed as a https://doi.org/ URL on its own line.
        details: Metadata dict (CrossRef-style keys). May be empty or None.
        original_reference: The reference text as supplied by the user.

    The layout depends on the kind of record:
      * no metadata: the original reference with its "doi: ..." tail removed;
      * "Data in Brief" or a record with "article-number": journal, volume
        and article number;
      * conference (the source mentions "conference", or there are pages but
        no volume): source and pages;
      * arXiv DOI: authors and title only;
      * anything else: journal article with volume, issue and pages.

    All text taken from the metadata or the reference is HTML-escaped, so
    markup embedded in a title is shown literally rather than interpreted.
    The year is omitted when it could not be determined.
    """
    if not details:
        ref_without_doi = re.split(
            r"\bdoi:\s*", original_reference, flags=re.IGNORECASE
        )[0].strip().rstrip(",")
        body = html.escape(ref_without_doi, quote=False)
        return _citation_paragraph(index, f"{body}.", doi)

    authors = _html_text(format_authors(details.get("author", [])))
    # `or` also covers records that carry an empty title list.
    titles = details.get("title") or ["No title found"]
    title = _html_text(format_title(titles[0]))
    year = get_year(details, original_reference)
    container = details.get("container-title") or []
    source = container[0] if container else ""
    volume = details.get("volume", "Not available")
    issue = details.get("issue", "Not available")
    page = details.get("page", "")

    doi_lower = doi.lower() if doi else ""
    source_lower = source.lower()
    source_html = _html_text(source)

    # For Data in Brief or similar - check for "article-number" field.
    if "data in brief" in source_lower or "article-number" in details:
        article_no = details.get("article-number", page)
        parts = [f"{authors}, “{title},” <i>{source_html}</i>"]
        if volume != "Not available":
            parts.append(f"vol. {_html_text(volume)}")
        if article_no:
            parts.append(f"Article no. {_html_text(article_no)}")
    # For conference papers - include event info.
    elif "conference" in source_lower or (volume == "Not available" and page):
        parts = [f"{authors}, “{title},” {source_html}"]
        if page:
            parts.append(_page_clause(page))
    # For arXiv preprints: authors and title only, full stop inside the quotes.
    elif "arxiv" in doi_lower:
        return _citation_paragraph(index, f"{authors}, “{title}.”", doi)
    # Default: Journal article formatting.
    else:
        parts = [f"{authors}, “{title},” <i>{source_html}</i>"]
        if volume != "Not available":
            parts.append(f"vol. {_html_text(volume)}")
        if issue != "Not available":
            parts.append(f"no. {_html_text(issue)}")
        if page:
            parts.append(_page_clause(page))

    # get_year() reports a missing year as "Unknown"; do not print that.
    if year and year != "Unknown":
        parts.append(_html_text(year))
    return _citation_paragraph(index, ", ".join(parts) + ".", doi)

# --- Main Processing ---
# One reference per non-blank line of the input text.
references_list = [ref.strip() for ref in input_references.strip().split("\n") if ref.strip()]
formatted_citations = []
issues = []

for i, ref in enumerate(references_list, start=1):
    clean_ref = remove_leading_numbering(ref)
    doi = extract_doi(clean_ref)
    if not doi:
        # Nothing to look up: emit the reference text as-is (HTML-escaped).
        formatted_citations.append(
            f'<p style="white-space: pre;">[{i}]\t{html.escape(clean_ref, quote=False)}</p>'
        )
        continue

    details = get_doi_details(doi)
    if not details:
        issues.append(f"[{i}] could not find the DOI metadata for DOI: {doi}")
    # format_citation() falls back to the original reference text when the
    # metadata is empty, so both outcomes go through the same call. This also
    # avoids a backslash inside an f-string expression, which is a syntax
    # error before Python 3.12.
    formatted_citations.append(format_citation(i, doi, details, clean_ref))

issues_section = (
    "<h2>Issues</h2>"
    + "\n".join(f"<p>{html.escape(issue, quote=False)}</p>" for issue in issues)
    if issues
    else ""
)

html_output = "<html><body>" + "\n".join(formatted_citations) + issues_section + "</body></html>"

# Render the HTML in a Jupyter Notebook.
HTML(html_output)

# Alternatively, print the HTML code for copying:
# print(html_output)
