In [2]:
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

def _read_bg_sites(path):
    """Load one BED/VCF file as a two-column ``chr``/``pos`` DataFrame.

    Args:
        path: ``pathlib.Path`` of an existing ``.bed``, ``.vcf`` or
            ``.vcf.gz`` file (suffix matching is case-insensitive).

    Returns:
        A DataFrame with columns ``chr`` and ``pos`` (1-based), or ``None``
        when the file is skipped (unknown suffix, BED with fewer than three
        columns, or no data rows). A ``[WARN]`` line is printed for every
        skipped file.
    """
    suffix = path.suffix.lower()
    is_gz_vcf = path.name.lower().endswith(".vcf.gz")

    if suffix != ".bed" and suffix != ".vcf" and not is_gz_vcf:
        print(f"[WARN] 未识别后缀（不是 .bed / .vcf / .vcf.gz？），跳过: {path}")
        return None

    try:
        if suffix == ".bed":
            # Read the chromosome column as text so bare names such as
            # "1" / "X" never go through mixed-type inference.
            bed = pd.read_csv(
                path, sep="\t", header=None, comment="#", dtype={0: str}
            )
            if bed.shape[1] < 3:
                print(f"[WARN] BED 列数不足 3，跳过: {path}")
                return None
            # BED starts are 0-based, half-open; start + 1 is the 1-based
            # coordinate of the first base. Only that base is kept, so
            # intervals longer than 1 bp contribute a single site.
            sites = pd.DataFrame(
                {"chr": bed[0], "pos": bed[1].astype("int64") + 1}
            )
            label = "BED"
        else:
            # VCF POS is already 1-based. pandas infers gzip compression
            # from the file name, so plain and gzipped VCFs share one call.
            sites = pd.read_csv(
                path,
                sep="\t",
                comment="#",
                header=None,
                usecols=[0, 1],
                names=["chr", "pos"],
                dtype={"chr": str, "pos": "int64"},
            )
            label = "VCF.GZ" if is_gz_vcf else "VCF"
    except pd.errors.EmptyDataError:
        # Empty or comment-only file: warn and skip, like other bad inputs,
        # instead of aborting the whole merge.
        print(f"[WARN] 文件为空或仅含注释，跳过: {path}")
        return None

    print(f"[INFO] {label} 读取完成: {path} (n={len(sites):,})")
    return sites


def build_bg_table(bg_files, out_prefix: Path, desc: str = ""):
    """Merge BED/VCF files into one de-duplicated background-site table.

    Each input contributes (chr, pos) pairs:

    - BED: ``pos = start + 1`` (0-based start converted to 1-based).
    - VCF / VCF.GZ: ``pos`` is the POS column as-is.

    A leading ``chr`` is stripped from chromosome names, duplicates are
    dropped, and rows are sorted by ``chr`` then ``pos``. ``chr`` is compared
    as text, so the order is lexicographic ("1", "10", ..., "2").

    The table is written as a gzip-compressed TSV named
    ``{out_prefix}.tsv.gz``. The suffix is appended to the full prefix name,
    so ``Path("bg_sites_293T.hg38")`` yields ``bg_sites_293T.hg38.tsv.gz``.
    The output directory is created if it does not exist.

    Args:
        bg_files: Iterable of input paths (``str`` or ``pathlib.Path``).
            Missing, unrecognised, malformed or empty files are skipped with
            a warning.
        out_prefix: Output path without the ``.tsv.gz`` suffix.
        desc: Label used in log lines and as the progress-bar description.

    Returns:
        Tuple ``(bg_all, tsv_gz_path)``: the merged DataFrame and the path
        of the file that was written.

    Raises:
        RuntimeError: If no input file yielded any data.
    """
    # Materialise once: the inputs are walked twice (listing + reading) and
    # callers may pass plain strings or a one-shot iterator.
    paths = [Path(f) for f in bg_files]
    out_prefix = Path(out_prefix)

    print(f"[INFO] {desc} 将合并以下文件：")
    for path in paths:
        print("  ", path)

    bg_dfs = []
    for path in tqdm(paths, desc=desc):
        if not path.exists():
            print(f"[WARN] 不存在，跳过: {path}")
            continue
        sites = _read_bg_sites(path)
        if sites is not None:
            bg_dfs.append(sites)

    if not bg_dfs:
        raise RuntimeError(f"{desc}: 一个合法的背景文件都没读到，请检查路径和后缀。")

    print(f"[INFO] {desc} 合并所有 DataFrame ...")
    bg_all = pd.concat(bg_dfs, ignore_index=True)

    # Normalise chromosome names by dropping a leading 'chr'.
    bg_all["chr"] = bg_all["chr"].astype(str).str.replace(r"^chr", "", regex=True)
    bg_all["pos"] = bg_all["pos"].astype("int64")

    # De-duplicate, then sort.
    bg_all = (
        bg_all.drop_duplicates()
        .sort_values(["chr", "pos"])
        .reset_index(drop=True)
    )

    print(f"[DONE] {desc} 合并后背景位点总数: {len(bg_all):,}")
    print(bg_all.head())

    # Append ".tsv.gz" to the full name; Path.with_suffix would replace a
    # trailing component such as ".hg38".
    tsv_gz_path = out_prefix.parent / (out_prefix.name + ".tsv.gz")
    # Create the target directory up front so a long merge is not lost to a
    # missing folder at write time.
    tsv_gz_path.parent.mkdir(parents=True, exist_ok=True)
    bg_all.to_csv(tsv_gz_path, sep="\t", index=False, compression="gzip")
    print(f"[SAVE] {desc} 压缩 TSV: {tsv_gz_path}  （chr,pos 两列，已 gzip 压缩）")

    return bg_all, tsv_gz_path


# ========= 1) 293T in-house (DIY) background =========

# Directory with the 293T mock/input call sets; "~" is expanded to the
# current user's home directory.
BG_293T_DIR = Path(
    "~/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF"
).expanduser()

# Two BED site lists plus one VCF, all resolved relative to BG_293T_DIR.
bg_files_293t = [
    BG_293T_DIR / fname
    for fname in (
        "293T-EMX1-Mock-Input.site_index.rmdup.bed",
        "293T_DddA11_FalsePositive_off-target-list.bed",
        "293T-Mock-Input-covaris_bwa_hg38_sort_rmdup.recall.merge.Genotype.filter.rmdup_signal.vcf",
    )
]

# Merge them and write ./bg_sites_293T.hg38.tsv.gz.
bg_293t, path_293t = build_bg_table(
    bg_files_293t,
    Path("./bg_sites_293T.hg38"),
    "293T DIY background",
)

# ========= 2) GATK Resource Bundle background =========

# Absolute location of the hg38 GATK resource bundle VCFs.
GATK_DIR = Path(
    "/lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38"
)

# Bundle VCFs to merge, listed by file name and resolved against GATK_DIR.
bg_files_gatk = [
    GATK_DIR / fname
    for fname in (
        "resources_broad_hg38_v0_1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf",
        "resources_broad_hg38_v0_1000G_omni2.5.hg38.vcf",
        "resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf",
        "resources_broad_hg38_v0_Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf",
        "resources_broad_hg38_v0_hapmap_3.3.hg38.vcf",
        "resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf",
        "resources_broad_hg38_v0_Homo_sapiens_assembly38.known_indels.vcf",
        "resources_broad_hg38_v0_Mills_and_1000G_gold_standard.indels.hg38.vcf",
    )
]

# Merge them and write ./bg_sites_GATKbundle.hg38.tsv.gz.
bg_gatk, path_gatk = build_bg_table(
    bg_files_gatk,
    Path("./bg_sites_GATKbundle.hg38"),
    "GATK bundle background",
)

# Report where both background tables were written.
print("\n[SUMMARY]")
for label, written in (
    ("293T DIY  背景文件:", path_293t),
    ("GATK bundle 背景文件:", path_gatk),
):
    print(label, written)

[INFO] 293T DIY background 将合并以下文件：
   /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-EMX1-Mock-Input.site_index.rmdup.bed
   /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T_DddA11_FalsePositive_off-target-list.bed
   /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-Mock-Input-covaris_bwa_hg38_sort_rmdup.recall.merge.Genotype.filter.rmdup_signal.vcf


293T DIY background:  33%|███████▎              | 1/3 [00:00<00:01,  1.28it/s]

[INFO] BED 读取完成: /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-EMX1-Mock-Input.site_index.rmdup.bed (n=719,215)
[INFO] BED 读取完成: /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T_DddA11_FalsePositive_off-target-list.bed (n=99)


293T DIY background: 100%|██████████████████████| 3/3 [00:07<00:00,  2.45s/it]

[INFO] VCF 读取完成: /lustre3/chengqiyi_pkuhpc/folder_for_learning/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-Mock-Input-covaris_bwa_hg38_sort_rmdup.recall.merge.Genotype.filter.rmdup_signal.vcf (n=4,402,636)
[INFO] 293T DIY background 合并所有 DataFrame ...





[DONE] 293T DIY background 合并后背景位点总数: 5,111,285
  chr    pos
0   1  13418
1   1  14464
2   1  14653
3   1  14654
4   1  14677
[SAVE] 293T DIY background 压缩 TSV: bg_sites_293T.tsv.gz  （chr,pos 两列，已 gzip 压缩）
[INFO] GATK bundle background 将合并以下文件：
   /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf
   /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G_omni2.5.hg38.vcf
   /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf
   /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf
   /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_hapmap_3.3.hg38.vcf
   /lustre1/c

GATK bundle background:  12%|██▎               | 1/8 [02:36<18:12, 156.06s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G.phase3.integrated.sites_only.no_MATCHED_REV.hg38.vcf (n=67,213,480)


GATK bundle background:  25%|████▊              | 2/8 [02:41<06:43, 67.18s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G_omni2.5.hg38.vcf (n=2,442,902)


GATK bundle background:  38%|███████▏           | 3/8 [03:30<04:55, 59.04s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_1000G_phase1.snps.high_confidence.hg38.vcf (n=28,565,336)


GATK bundle background:  50%|█████████▌         | 4/8 [03:30<02:23, 35.89s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf (n=198,735)


GATK bundle background:  62%|███████████▉       | 5/8 [03:33<01:12, 24.08s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_hapmap_3.3.hg38.vcf (n=4,150,474)


GATK bundle background:  75%|██████████████▎    | 6/8 [04:34<01:12, 36.45s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf (n=60,691,395)


GATK bundle background:  88%|████████████████▋  | 7/8 [04:36<00:25, 25.16s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_Homo_sapiens_assembly38.known_indels.vcf (n=1,917,671)


GATK bundle background: 100%|███████████████████| 8/8 [04:36<00:00, 34.62s/it]

[INFO] VCF 读取完成: /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/GATK_resource_bundle_hg38/resources_broad_hg38_v0_Mills_and_1000G_gold_standard.indels.hg38.vcf (n=1,271,815)
[INFO] GATK bundle background 合并所有 DataFrame ...





[DONE] GATK bundle background 合并后背景位点总数: 92,946,466
  chr    pos
0   1  10019
1   1  10109
2   1  10139
3   1  10144
4   1  10146
[SAVE] GATK bundle background 压缩 TSV: bg_sites_GATKbundle.tsv.gz  （chr,pos 两列，已 gzip 压缩）

[SUMMARY]
293T DIY  背景文件: bg_sites_293T.tsv.gz
GATK bundle 背景文件: bg_sites_GATKbundle.tsv.gz
