In [None]:
import time
from tqdm import tqdm
from openpyxl import load_workbook
from openpyxl.styles import Font
from Bio import Entrez

# Required: NCBI asks for a contactable email address; an API key is even better (it raises the rate limits).
# NOTE(review): this is a personal address hardcoded in the notebook - consider reading it from an
# environment variable before sharing or committing.
Entrez.email = "haiyecrab996@gmail.com"
# Optional: Entrez.api_key = "YOUR_NCBI_API_KEY"  (do not commit a real key; load it from the environment)


def get_geo_id_and_title(gse: str) -> tuple[str, str]:
    """返回 (gds_id, title)"""
    with Entrez.esearch(db="gds", term=f"{gse}[Accession]", retmax=1) as h:
        rec = Entrez.read(h)
    ids = rec.get("IdList", [])
    if not ids:
        return None, None
    gds_id = ids[0]
    with Entrez.esummary(db="gds", id=gds_id) as h:
        summ = Entrez.read(h)
    title = None
    try:
        title = " ".join(str(summ[0]["title"]).split())
    except Exception:
        pass
    return gds_id, title


def get_pmids_via_elink_from_gds(gds_id: str) -> list[str]:
    """Use ELink to obtain the PubMed IDs associated with a GDS record.

    Args:
        gds_id: Entrez ``gds`` database ID to start from.

    Returns:
        The linked PMIDs as plain strings, de-duplicated, in the order ELink
        returned them. Empty when the response has no ``pubmed`` links (or no
        link sets at all).

    Raises:
        Whatever ``Entrez.elink`` / ``Entrez.read`` raise on network or
        parsing failures; these are not caught here.
    """
    with Entrez.elink(dbfrom="gds", db="pubmed", id=gds_id) as h:
        linksets = Entrez.read(h)

    # Walk every link set instead of indexing [0], so an empty response yields
    # [] rather than an IndexError.
    pmids = [
        str(link["Id"])
        for linkset in linksets
        for ldb in linkset.get("LinkSetDb", [])
        if ldb.get("DbTo") == "pubmed"
        for link in ldb.get("Link", [])
    ]
    # dict keys keep first-seen order, which de-duplicates while preserving order.
    return list(dict.fromkeys(pmids))

In [None]:
def main(xlsx_path=R"C:\Users\10784\Downloads\文献统计.xlsx", sheet_name="Sheet1",
         gse_col=1, pmid_col=3, title_col=4, start_row=1,
         sleep_sec=0.4):
    """Fill in GEO titles and linked PMIDs for the GSE accessions in a workbook.

    Rows are read from ``start_row`` downwards and scanning stops at the first
    empty cell in ``gse_col``. For every row whose value looks like a GSE
    accession, the title is written to ``title_col`` and the PMID list (with a
    hyperlink to the first PMID) to ``pmid_col``. The workbook is then saved
    back to ``xlsx_path``, overwriting the input file.

    Args:
        xlsx_path: Workbook to read and overwrite.
            NOTE(review): the default is a machine-specific absolute path.
        sheet_name: Name of the worksheet to process.
        gse_col, pmid_col, title_col: 1-based column indexes (Excel style).
        start_row: 1-based first row to read. With the default of 1 a header
            row is still scanned, and is skipped only because its text does not
            start with "GSE"; pass 2 to skip a header explicitly.
        sleep_sec: Pause after each processed row, to respect NCBI rate limits.
    """
    wb = load_workbook(xlsx_path)
    ws = wb[sheet_name]
    body_font = Font(name="Verdana", size=9)

    # Collect the rows to process (stop at the first empty GSE cell).
    rows = []
    r = start_row
    while True:
        val = ws.cell(row=r, column=gse_col).value
        if val is None or str(val).strip() == "":
            break
        rows.append(r)
        r += 1

    for r in tqdm(rows, desc="Processing"):
        raw_gse = ws.cell(row=r, column=gse_col).value
        # Keep only the part before the first underscore of the cell text.
        gse = str(raw_gse).strip().split("_")[0]

        # Skip content that is not in GSE style (no request, so no sleep).
        if not gse.upper().startswith("GSE"):
            continue

        try:
            gds_id, title = get_geo_id_and_title(gse)
        except Exception as e:
            # One failed request must not abort the run and lose the rows
            # already processed; leave this row's cells untouched so a re-run
            # can fill them in.
            print(f"[WARN] GEO lookup failed @ row {r}, {gse}: {e}")
            time.sleep(sleep_sec)
            continue

        # Write Title (empty string when there is none).
        cell = ws.cell(row=r, column=title_col)
        cell.value = title or ""
        cell.font = body_font
        print(f"{gse} -> Title: {title}", end=" ")

        pmids = []
        # Use ELink to obtain the PMIDs linked to the GDS record.
        if gds_id:
            try:
                pmids = get_pmids_via_elink_from_gds(gds_id)
            except Exception as e:
                print(f"[WARN] ELink失败 @ row {r}, GDS {gds_id}: {e}")

        if pmids:
            cell = ws.cell(row=r, column=pmid_col)
            # Single quotes inside the f-string: reusing the outer quote type is
            # a SyntaxError before Python 3.12.
            cell.value = f"PMID: {','.join(pmids)}"
            cell.hyperlink = f"https://pubmed.ncbi.nlm.nih.gov/{pmids[0]}/"
            cell.font = body_font
        else:
            ws.cell(row=r, column=pmid_col).value = ""

        print(f"-> PMIDs: {pmids}")

        # Comply with NCBI frequency limits (recommended ≥0.34s without API key)
        time.sleep(sleep_sec)

    wb.save(xlsx_path)
    print("finished ✅ writing to", xlsx_path)


# Run the workflow with its default arguments when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [None]:
# Ad-hoc check: look up the GDS ID and title for a single accession.
get_geo_id_and_title('GSE198372')

In [None]:
# Ad-hoc check: list the PMIDs that ELink associates with one GDS ID
# (presumably the ID returned by the lookup in the previous cell - TODO confirm).
get_pmids_via_elink_from_gds('200198372')