In [5]:
#!/usr/bin/env python3
import csv
import sys
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Path of the tab-separated UCSC table export that main() hands to load_gene_urls.
UCSC_TRNA_TABLE = "hg19_gencode"
# Destination CSV that main() writes the scraped variant rows to.
OUT_CSV = "test_first10_tRNA_variants.csv"

# One shared HTTP session so every page request carries the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; gtrnadb-test/0.1)"


def clean_trna_url(raw):
    """
    Normalize a raw ``trnaUrl`` cell into a single URL string.

    Some cells carry a stray prefix glued in front of the real URL, e.g.
    ``"...Hsapi19/http://gtrnadb2..."``. In that case everything before the
    embedded URL is dropped.

    Args:
        raw: the cell value; may be None, empty, or padded with whitespace.

    Returns:
        The stripped URL string, or None when ``raw`` is None, empty, or
        whitespace only.
    """
    if not raw:
        return None
    s = raw.strip()
    if not s:
        return None
    # Search for a full scheme marker after position 0 instead of the bare
    # substring "http", so a path such as ".../httpd-1.html" is not mistaken
    # for the start of a second URL.
    starts = [i for i in (s.find("http://", 1), s.find("https://", 1)) if i != -1]
    if starts:
        s = s[min(starts):]
    return s


def to_gtrnadb_org(url):
    """
    Re-host a gene page URL on https://gtrnadb.org, keeping only its path.

    UCSC stores links such as
    http://gtrnadb2.ucsc.edu/genomes/eukaryota/Hsapi19/genes/XXX.html;
    this returns
    https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/XXX.html.

    Returns None when ``url`` is falsy or when it parses to an empty path.
    """
    if url:
        # e.g. /genomes/eukaryota/Hsapi19/genes/XXX.html
        gene_path = urlparse(url).path
        if gene_path:
            return "https://gtrnadb.org" + gene_path
    return None


def load_gene_urls(tsv_path):
    """
    Read the ``trnaUrl`` column of a tab-separated table and return gene URLs.

    Each cell is passed through clean_trna_url and then to_gtrnadb_org; rows
    for which that yields nothing are skipped.

    Args:
        tsv_path: path of a tab-separated file whose first line is the header.

    Returns:
        List of rewritten URLs, in file order.

    Raises:
        RuntimeError: the header has no ``trnaUrl`` column, or the file is
            empty and therefore has no header at all.
    """
    urls = []
    with open(tsv_path, newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        # DictReader.fieldnames is None for an empty file; treat that the same
        # as a header without the column instead of failing with a TypeError.
        if "trnaUrl" not in (reader.fieldnames or ()):
            raise RuntimeError("UCSC 表头里没有 trnaUrl 这一列。")
        for row in reader:
            u = to_gtrnadb_org(clean_trna_url(row.get("trnaUrl", "")))
            if u:
                urls.append(u)
    return urls


def parse_gene_page(url):
    """
    Download one gene page and extract the rows of its DNA-variants table.

    The table is the first ``<table>`` whose first ``<th>`` text starts with
    "tRNA Position" and that has some ``<th>`` containing "Genomic Position".

    Args:
        url: address of the gene page.

    Returns:
        ``(gene_id, variants)``. ``gene_id`` is the last path segment of
        ``url`` with ".html" removed. ``variants`` is a list with one list of
        cell strings per table row that has ``<td>`` cells; it is empty when
        no matching table is found.

    Raises:
        Whatever ``session.get`` or ``raise_for_status`` raise (network
        failures, non-success HTTP status).
    """
    response = session.get(url, timeout=20)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "lxml")

    gene_id = url.rstrip("/").rsplit("/", 1)[-1].replace(".html", "")

    def is_variant_table(candidate):
        # First header is "tRNA Position..." and some header mentions "Genomic Position".
        titles = [th.get_text(strip=True) for th in candidate.find_all("th")]
        return (
            bool(titles)
            and titles[0].startswith("tRNA Position")
            and any("Genomic Position" in title for title in titles)
        )

    def cell_text(td):
        # Prefer the text of a link inside the cell; fall back to the cell's own text.
        link = td.find("a")
        link_text = link.get_text(strip=True) if link else ""
        return link_text or td.get_text(strip=True)

    # Locate the "DNA Variants" table.
    variant_table = next(
        (table for table in page.find_all("table") if is_variant_table(table)),
        None,
    )
    if variant_table is None:
        return gene_id, []

    # Rows without <td> cells (e.g. the header row) are skipped.
    variants = [
        [cell_text(td) for td in cells]
        for cells in (tr.find_all("td") for tr in variant_table.find_all("tr"))
        if cells
    ]
    return gene_id, variants


def main(limit=None):
    """
    Scrape the DNA-variant table of every gene listed in UCSC_TRNA_TABLE into OUT_CSV.

    Args:
        limit: optional non-negative cap on how many gene URLs are processed
            (e.g. 10 for a quick smoke test). None, the default, processes
            every URL.

    Progress and errors are written to stderr. A page that fails to download
    or parse is reported and skipped, so one bad gene does not abort the run.
    """
    urls = load_gene_urls(UCSC_TRNA_TABLE)
    if limit is not None:
        urls = urls[:limit]
    total = len(urls)

    # Report the real number of URLs; the list is no longer fixed at ten.
    print(f"{total} gene URLs (already改成 gtrnadb.org):", file=sys.stderr)
    for u in urls:
        print("  ", u, file=sys.stderr)

    with open(OUT_CSV, "w", newline="") as fout:
        w = csv.writer(fout)
        w.writerow([
            "tRNA_Gene",
            "tRNA_Position",
            "Genomic_Position",
            "dbSNP_ID",
            "Ref_Alt_Allele",
            "Common_SNP",
            "OneK_Genome",
            "Effect",
        ])

        for i, url in enumerate(urls, 1):
            print(f"[{i}/{total}] Fetching {url}", file=sys.stderr)
            try:
                gene_id, rows = parse_gene_page(url)
            except Exception as e:
                # Deliberately broad: log the failure and move on to the next gene.
                print(f"  ERROR {url}: {e}", file=sys.stderr)
                time.sleep(0.3)
                continue

            if not rows:
                print(f"  No DNA variants for {gene_id}", file=sys.stderr)
                time.sleep(0.2)
                continue

            print(f"  {gene_id}: {len(rows)} variants", file=sys.stderr)
            for cols in rows:
                # Pad or truncate to the 7 variant columns declared in the header row.
                cols = (cols + [""] * 7)[:7]
                w.writerow([gene_id] + cols)

            # Short pause between requests to go easy on the server.
            time.sleep(0.3)

    print(f"Done. Written to {OUT_CSV}", file=sys.stderr)


if __name__ == "__main__":
    main()


First 10 gene URLs (already改成 gtrnadb.org):
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/nm-tRNA-Tyr-GTA-chr1-125.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Und-NNN-chr1-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Und-NNN-4-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Asn-GTT-5-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Asn-GTT-18-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Glu-TTC-4-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Gly-CCC-1-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Val-CAC-11-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Gly-CCC-4-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Val-CAC-10-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Gly-CCC-5-1.html
   https://gtrnadb.org/genomes/eukaryota/Hsapi19/genes/tRNA-Val-CAC-11-2.html
   https://gtrnad

In [6]:
import requests
from pprint import pprint

# Exploratory request: fetch the first page of COLOC records for one gene
# and pretty-print the raw JSON to inspect its structure.
url = "https://ngdc.cncb.ac.cn/colocdb/api/coloc"

params = {
    "pageSize": 10,
    "pageIndex": 1,
    "gene_id": "TYW5",   # the parameter just provided
}

# Always pass a timeout: without one, requests waits indefinitely on a stalled server.
r = requests.get(url, params=params, timeout=30)
r.raise_for_status()
j = r.json()

pprint(j)


{'data': [{'chr': 2.0,
           'coloc_snp': 'rs796364',
           'gene_id': 'TYW5',
           'gwas_p': 9.41e-17,
           'id': 490,
           'locus': 'rs796364_FTCDNL1',
           'molecule': 'eQTL',
           'nsnps': 1757.0,
           'oldtissue': 'Adipose_Subcutaneous',
           'p1_threshold': 5e-08,
           'p2_threshold': 5e-08,
           'pp4_pp3': 16.0189615452235,
           'pp_h0_abf': 2.21897992267875e-15,
           'pp_h1_abf': 9.2692350630342e-05,
           'pp_h2_abf': 1.42901890882073e-12,
           'pp_h3_abf': 0.058752545212053,
           'pp_h4_abf': 0.941154762435883,
           'probe': 'ENSG00000162971.10',
           'qtl': 'Adipose_Subcutaneous_eQTL',
           'qtl_p': 7.84942e-08,
           'qtl_population': 'EUR',
           'snp_pp_h4': 0.115035427460555,
           'source': 'GTEx v8',
           'tissue': 'Adipose Subcutaneous',
           'top_snp': 'rs796364',
           'top_snp_gene': 'FTCDNL1',
           'trait': 'P00981',


In [17]:
import requests
import pandas as pd
import time

COLOC_BASE_URL = "https://ngdc.cncb.ac.cn/colocdb/api/coloc"
# BASE_URL is kept for any code that still reads it. Another cell in this
# notebook rebinds BASE_URL to the SMR endpoint, so fetch_coloc_for_gene uses
# the dedicated COLOC_BASE_URL and no longer depends on cell execution order.
BASE_URL = COLOC_BASE_URL


def fetch_coloc_for_gene(gene_id, page_size=100):
    """
    Download every COLOC record for one gene, following the API's pagination.

    Args:
        gene_id: value sent as the ``gene_id`` query parameter (e.g. "TYW5").
        page_size: value sent as ``pageSize`` with every request.

    Returns:
        List of the record dicts taken from each page's ``data`` array.

    Raises:
        requests.HTTPError: a page request returned an error status.
        KeyError: the response JSON lacks ``meta``, ``meta["total"]`` or ``data``.
    """
    all_rows = []
    page_index = 1

    while True:
        params = {
            "pageSize": page_size,
            "pageIndex": page_index,
            "gene_id": gene_id,
        }

        r = requests.get(COLOC_BASE_URL, params=params, timeout=30)
        r.raise_for_status()
        j = r.json()

        total = j["meta"]["total"]
        rows = j["data"]

        print(f"page {page_index}: got {len(rows)} / total {total}")

        # An empty page means there is nothing further to fetch.
        if not rows:
            break

        all_rows.extend(rows)

        # Compare against what was actually collected rather than
        # page_index * page_size: if the server returns fewer rows per page
        # than requested, the product overshoots and would stop too early.
        if len(all_rows) >= total:
            break

        page_index += 1
        time.sleep(0.3)  # brief pause between page requests

    return all_rows


In [None]:
if __name__ == "__main__":
    # Download all COLOC records for one gene and save them as a CSV.
    gene = "TYW5"
    rows = fetch_coloc_for_gene(gene)

    print(f"Total rows collected: {len(rows)}")

    if rows:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_{gene}.csv"
        df = pd.DataFrame(rows)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No rows for {gene}; nothing saved.")

In [8]:
if __name__ == "__main__":
    # Download all COLOC records for one gene and save them as a CSV.
    gene = "NSUN6"
    rows = fetch_coloc_for_gene(gene)

    print(f"Total rows collected: {len(rows)}")

    if rows:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_{gene}.csv"
        df = pd.DataFrame(rows)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No rows for {gene}; nothing saved.")


page 1: got 4 / total 4
Total rows collected: 4
Saved to COLOCdb_NSUN6.csv


In [9]:
if __name__ == "__main__":
    # Download all COLOC records for one gene and save them as a CSV.
    gene = "TRMT61A"
    rows = fetch_coloc_for_gene(gene)

    print(f"Total rows collected: {len(rows)}")

    if rows:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_{gene}.csv"
        df = pd.DataFrame(rows)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No rows for {gene}; nothing saved.")

page 1: got 100 / total 685
page 2: got 100 / total 685
page 3: got 100 / total 685
page 4: got 100 / total 685
page 5: got 100 / total 685
page 6: got 100 / total 685
page 7: got 85 / total 685
Total rows collected: 685
Saved to COLOCdb_TRMT61A.csv


In [18]:
if __name__ == "__main__":
    # Download all COLOC records for one gene and save them as a CSV.
    gene = "QTRT1"
    rows = fetch_coloc_for_gene(gene)

    print(f"Total rows collected: {len(rows)}")

    if rows:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_{gene}.csv"
        df = pd.DataFrame(rows)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No rows for {gene}; nothing saved.")

page 1: got 74 / total 74
Total rows collected: 74
Saved to COLOCdb_QTRT1.csv


In [12]:
import requests
import pandas as pd
import time

SMR_BASE_URL = "https://ngdc.cncb.ac.cn/colocdb/api/smr"
# BASE_URL is still rebound here for any code that reads it after this cell,
# but fetch_smr_for_gene uses the dedicated SMR_BASE_URL so it does not depend
# on which cell assigned BASE_URL last.
BASE_URL = SMR_BASE_URL


def fetch_smr_for_gene(gene, page_size=100):
    """
    Download every SMR record for one gene, following the API's pagination.

    Args:
        gene: value sent as the ``gene`` query parameter (e.g. "NSUN2").
        page_size: value sent as ``pageSize`` with every request.

    Returns:
        List of the record dicts taken from each page's ``data`` array.

    Raises:
        requests.HTTPError: a page request returned an error status.
    """
    all_records = []
    page_index = 1

    while True:
        params = {
            "pageSize": page_size,
            "pageIndex": page_index,
            "gene": gene,
        }

        resp = requests.get(SMR_BASE_URL, params=params, timeout=30)
        resp.raise_for_status()
        j = resp.json()

        # Same response layout as the coloc endpoint: {"meta": {"total": ...}, "data": [...]}.
        meta = j.get("meta") or {}
        # None means the server did not report a total. Defaulting to 0 would
        # satisfy the stop test after the first page and silently truncate.
        total = meta.get("total")
        records = j.get("data") or []

        print(f"[SMR] page {page_index}, got {len(records)}, total={total}")

        # An empty page means there is nothing further to fetch.
        if not records:
            break

        all_records.extend(records)

        # Stop once everything has been collected. Count the rows actually
        # received, since a page may hold fewer rows than requested. With an
        # unknown total, keep paging until an empty page comes back.
        if total is not None and len(all_records) >= total:
            break

        page_index += 1
        time.sleep(0.3)  # brief pause between page requests

    return all_records


In [None]:

if __name__ == "__main__":
    # Download all SMR records for one gene and save them as a CSV.
    gene = "NSUN2"
    all_smr = fetch_smr_for_gene(gene)
    print(f"Total SMR rows for {gene}: {len(all_smr)}")

    if all_smr:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_SMR_{gene}.csv"
        df = pd.DataFrame(all_smr)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No SMR rows for {gene}; nothing saved.")

In [13]:
if __name__ == "__main__":
    # Download all SMR records for one gene and save them as a CSV.
    gene = "TRMT61A"
    all_smr = fetch_smr_for_gene(gene)
    print(f"Total SMR rows for {gene}: {len(all_smr)}")

    if all_smr:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_SMR_{gene}.csv"
        df = pd.DataFrame(all_smr)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No SMR rows for {gene}; nothing saved.")

[SMR] page 1, got 100, total=935
[SMR] page 2, got 100, total=935
[SMR] page 3, got 100, total=935
[SMR] page 4, got 100, total=935
[SMR] page 5, got 100, total=935
[SMR] page 6, got 100, total=935
[SMR] page 7, got 100, total=935
[SMR] page 8, got 100, total=935
[SMR] page 9, got 100, total=935
[SMR] page 10, got 35, total=935
Total SMR rows for TRMT61A: 935
Saved to COLOCdb_SMR_TRMT61A.csv


In [14]:
if __name__ == "__main__":
    # Download all SMR records for one gene and save them as a CSV.
    gene = "QTRT1"
    all_smr = fetch_smr_for_gene(gene)
    print(f"Total SMR rows for {gene}: {len(all_smr)}")

    if all_smr:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_SMR_{gene}.csv"
        df = pd.DataFrame(all_smr)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No SMR rows for {gene}; nothing saved.")

[SMR] page 1, got 100, total=587
[SMR] page 2, got 100, total=587
[SMR] page 3, got 100, total=587
[SMR] page 4, got 100, total=587
[SMR] page 5, got 100, total=587
[SMR] page 6, got 87, total=587
Total SMR rows for QTRT1: 587
Saved to COLOCdb_SMR_QTRT1.csv


In [15]:
if __name__ == "__main__":
    # Download all SMR records for one gene and save them as a CSV.
    gene = "TYW5"
    all_smr = fetch_smr_for_gene(gene)
    print(f"Total SMR rows for {gene}: {len(all_smr)}")

    if all_smr:
        # Build the file name once so the written file and the log line cannot drift apart.
        out_path = f"COLOCdb_SMR_{gene}.csv"
        df = pd.DataFrame(all_smr)
        df.to_csv(out_path, index=False)
        print(f"Saved to {out_path}")
    else:
        # Say explicitly that no file was written.
        print(f"No SMR rows for {gene}; nothing saved.")

[SMR] page 1, got 100, total=302
[SMR] page 2, got 100, total=302
[SMR] page 3, got 100, total=302
[SMR] page 4, got 2, total=302
Total SMR rows for TYW5: 302
Saved to COLOCdb_SMR_TYW5.csv
