In [1]:
import sys
import os

# Make the parent of the kernel's working directory importable so that the
# `src` package used below can be found.
# A notebook has no `__file__`; the original `os.path.dirname("__file__")`
# operated on a plain string and always produced "", so the path is simply "..".
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(parent_dir)

In [2]:
from loguru import logger

# Configure loguru: replace the default handler with a console sink and a
# size-rotated file sink, both at INFO level.
logger.remove()  # Remove every registered handler (including the default one)
console_handler_id = logger.add(sys.stderr, level="INFO")  # Console handler
# `logger.add` returns the id of the new handler. Binding it keeps the cell
# from echoing a stray integer as its output and allows a targeted
# `logger.remove(file_handler_id)` later on.
file_handler_id = logger.add("example.log", level="INFO", rotation="2 MB")  # File handler

2

In [3]:
import json
from src.parsers.pubmed_parser import PubmedParser

In [4]:
# Parser instance with citation-count and journal-ranking lookups both enabled.
pubmed = PubmedParser(
    get_citation_count_bool=True,
    get_journal_ranking_bool=True,
)

# Example: using PubmedParser with a PubMed ID (PMID)

In [5]:
# Build the JSON-style dict for a single article, looked up by its PubMed ID.
example_pmid = "36464825"
pubmed.build_pubmed_json_from_pmid(example_pmid)



{'abstract': [{'text': '\n\nBACKGROUND/AIMS:\nEndoscopic submucosal dissection (ESD) is an effective method for resecting gastric adenomas and adenocarcinomas. A significant discrepancy was observed between endoscopic and pathological sizes in samples obtained from patients undergoing ESD. This study elucidates the factors affecting size discrepancy after formalin fixation.\n\n\nMETHODS:\nThe records of 64 patients with 69 lesions were analyzed, including 50 adenomas and 19 adenocarcinomas. Data on location, gross shape, histology, and size after fixation in formalin were collected.\n\n\nRESULTS:\nThe mean size of the resected specimen appeared to decrease after formalin fixation (37.5 mm prefixation vs. 35.8 mm postfixation, p<0.05). The mean long axis diameter of the lesions was 20.3±7.9 mm prefixation and 13.4±7.9 mm postfixation. Size differences in lesions smaller than 20 mm were significantly greater than those in lesions larger than 20 mm (7.6±5.6 mm vs. 2.5±5.8 mm, p<0.01). In 

# Fetch the XML file from the PubMed API

In [6]:
# Fetch the raw XML for one article and show only a bounded preview, so the
# notebook output is not flooded by the complete document.
XML_PREVIEW_CHARS = 2000  # upper bound on the number of characters printed

pubmed_xml = pubmed.get_pubmed_article_xml("36464825")

# `print(obj)` renders `str(obj)`, so slicing the string form shows the same
# text as before, just cut off after XML_PREVIEW_CHARS characters.
xml_text = str(pubmed_xml)
print(xml_text[:XML_PREVIEW_CHARS])
if len(xml_text) > XML_PREVIEW_CHARS:
    print(f"... [truncated, {len(xml_text)} characters in total]")

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM"><PMID Version="1">36464825</PMID><DateRevised><Year>2022</Year><Month>12</Month><Day>14</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Print">2234-2400</ISSN><JournalIssue CitedMedium="Print"><Volume>55</Volume><Issue>6</Issue><PubDate><Year>2022</Year><Month>Nov</Month></PubDate></JournalIssue><Title>Clinical endoscopy</Title><ISOAbbreviation>Clin Endosc</ISOAbbreviation></Journal><ArticleTitle>Underestimation of endoscopic size in large gastric epithelial neoplasms.</ArticleTitle><Pagination><StartPage>760</StartPage><EndPage>766</EndPage><MedlinePgn>760-766</MedlinePgn></Pagination><ELocationID EIdType="doi" ValidYN="Y">10.5946/ce.2021.269</ELocationID><Abstract><AbstractText Label="BACKGROUND/A

# Process a single XML file

In [7]:
# Parser that additionally retrieves the citation count and the journal ranking from APIs.
pubmed = PubmedParser(
    get_citation_count_bool=True,
    get_journal_ranking_bool=True,
)

In [8]:
# Parse one article that is stored as a standalone XML file.
single_article_xml_path = "example xml/temp_3.xml"
formatted_json = pubmed.build_pubmed_json(single_article_xml_path)
formatted_json



{'abstract': [{'text': 'Switch-based adaptive dynamic programming (ADP) is an optimal control problem in which a cost must be minimized by switching among a family of dynamical modes. When the system dimension increases, the solution to switch-based ADP is made prohibitive by the exponentially increasing structure of the value function approximator and by the exponentially increasing modes. This technical correspondence proposes a distributed computational method for solving switch-based ADP. The method relies on partitioning the system into agents, each one dealing with a lower dimensional state and a few local modes. Each agent aims to minimize a local version of the global cost while avoiding that its local switching strategy has conflicts with the switching strategies of the neighboring agents. A heuristic algorithm based on the consensus dynamics and Nash equilibrium is proposed to avoid such conflicts. The effectiveness of the proposed method is verified via traffic and building 

# Process a large .xml.gz file containing roughly 5,000 or more articles

In [9]:
# These large XML files come from https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
# Citation-count and journal-ranking lookups are switched off for this bulk run.
pubmed = PubmedParser(get_citation_count_bool=False, get_journal_ranking_bool=False)

dicts_out = pubmed.parse_pubmed_xml_iter("../data/pubmed23n1181.xml.gz")

save_path = "../data/pubmed"
# exist_ok=True creates the directory only when it is missing, without a
# separate exists() check that could race with another process.
os.makedirs(save_path, exist_ok=True)

# Write every parsed article to its own JSON file named after its PMID.
for article in dicts_out:
    if article is None:  # the parser may hand back None; skip those entries
        continue
    pmid = article["pmid"]
    out_file = os.path.join(save_path, f"pubmed_{pmid}.json")
    # Explicit encoding so the files do not depend on the platform default.
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(article, f)

[32m2025-05-23 18:18:14.955[0m | [1mINFO    [0m | [36msrc.parsers.pubmed_parser[0m:[36mparse_pubmed_xml_iter[0m:[36m339[0m - [1mStarting to parse XML file: ../data/pubmed23n1181.xml.gz[0m
[32m2025-05-23 18:18:17.621[0m | [1mINFO    [0m | [36msrc.parsers.pubmed_parser[0m:[36mbuild_pubmed_json[0m:[36m276[0m - [1mNo abstract found in pubmed article 32297852[0m
[32m2025-05-23 18:18:18.260[0m | [1mINFO    [0m | [36msrc.parsers.pubmed_parser[0m:[36mbuild_pubmed_json[0m:[36m276[0m - [1mNo abstract found in pubmed article 32372739[0m
[32m2025-05-23 18:18:18.385[0m | [1mINFO    [0m | [36msrc.parsers.pubmed_parser[0m:[36mbuild_pubmed_json[0m:[36m276[0m - [1mNo abstract found in pubmed article 32412552[0m
[32m2025-05-23 18:18:18.440[0m | [1mINFO    [0m | [36msrc.parsers.pubmed_parser[0m:[36mbuild_pubmed_json[0m:[36m276[0m - [1mNo abstract found in pubmed article 32437920[0m
[32m2025-05-23 18:18:18.575[0m | [1mINFO    [0m | [36msrc.p

In [10]:
# Clean up: delete the output directory written by the bulk-parsing cell.

import shutil

# Nothing to delete when the directory is not there.
if not os.path.exists(save_path):
    print(f"Path {save_path} does not exist")
else:
    try:
        # rmtree deletes the directory together with everything inside it
        shutil.rmtree(save_path)
        print(f"Successfully removed {save_path}")
    except Exception as e:
        print(f"Error removing directory: {e}")

Successfully removed ../data/pubmed
