In [1]:
import pandas as pd
import yaml
import os

# ===================== Load YAML configuration =====================
# Read the file as UTF-8 explicitly so that non-ASCII values decode the same
# way regardless of the platform's default locale encoding.
with open("config.yaml", "r", encoding="utf-8") as f:
    # yaml.safe_load returns None for an empty document; fall back to an
    # empty mapping so the .get() lookups below do not raise AttributeError.
    config = yaml.safe_load(f) or {}

# A section that is present but left empty in the YAML (e.g. a bare
# "overlap:") loads as None, so normalise it to an empty dict as well.
common_params = config.get("common") or {}
overlap_params = config.get("overlap") or {}

# Optional setting: falls back to 0.6 when the key is absent.
THRESHOLD = overlap_params.get("THRESHOLD", 0.6)

# Required settings: a missing key raises KeyError right here, at load time.
AF3_FILE = overlap_params["AF3_FILE"]
GD_FILE = overlap_params["GD_FILE"]
MAP_FILE = overlap_params["MAP_FILE"]
OUTPUT_FILE_SIMPLE = overlap_params["OUTPUT_FILE_SIMPLE"]
OUTPUT_FILE_MAPPED = overlap_params["OUTPUT_FILE_MAPPED"]

In [6]:
# ===================== 2) Mapped overlap =====================
def _parse_residue_ids(cell):
    """Parse one comma-separated residue-ID cell into a set of ints.

    Args:
        cell: A single value taken from a DataFrame column, normally a string
            such as "12, 15, 98".

    Returns:
        The set of integer IDs found in the cell. A missing cell (NaN/None)
        yields an empty set.

    Raises:
        ValueError: If a non-empty token cannot be converted with int().
    """
    # An empty CSV field is read by pandas as NaN; str(NaN) is "nan", which
    # int() rejects, so missing cells must be handled before splitting.
    if pd.isna(cell):
        return set()
    # A column holding one ID per row plus at least one blank is read as
    # float, so a lone ID can arrive as 12.0 (str -> "12.0", which int()
    # rejects). Accept whole-number floats directly.
    if isinstance(cell, float) and cell.is_integer():
        return {int(cell)}
    return {int(token) for token in map(str.strip, str(cell).split(",")) if token}


af3_df = pd.read_csv(AF3_FILE)
gd_df = pd.read_csv(GD_FILE)
map_df = pd.read_csv(MAP_FILE)

# Build the mapping: PDB residue -> AF3 residue.
# Rows where either side is missing are skipped (int(NaN) would raise).
mapping = {
    int(pdb_id): int(af3_id)
    for pdb_id, af3_id in zip(map_df["pdb_residue_id"], map_df["af3_residue_id"])
    if pd.notna(pdb_id) and pd.notna(af3_id)
}

# Union of the residue IDs listed across all rows of the ground-truth table.
gd_residues = set()
for cell in gd_df["antigen_binding_sites_gd"]:
    gd_residues |= _parse_residue_ids(cell)

# Translate the ground-truth residues through the mapping. Residues without a
# mapping entry are dropped, so total_gd_count counts mapped residues only.
gd_mapped_residues = {mapping[r] for r in gd_residues if r in mapping}
total_gd_count = len(gd_mapped_residues)

results = []
for model_name, sites in zip(af3_df["model_name"], af3_df["antigen_binding_sites_AF3"]):
    af3_residues = _parse_residue_ids(sites)
    overlap_count = len(af3_residues & gd_mapped_residues)
    # Guard against division by zero when no ground-truth residue was mapped.
    overlap_ratio = overlap_count / total_gd_count if total_gd_count > 0 else 0.0
    results.append({
        "model_name": model_name,
        "overlap_count": overlap_count,
        "total_gd_count": total_gd_count,
        "overlap_ratio": round(overlap_ratio, 4)
    })

# Passing the columns explicitly keeps "overlap_ratio" present even when the
# AF3 table has no rows, so the sort below does not raise KeyError.
results_df = pd.DataFrame(
    results,
    columns=["model_name", "overlap_count", "total_gd_count", "overlap_ratio"],
).sort_values("overlap_ratio", ascending=False)
results_df.to_csv(OUTPUT_FILE_MAPPED, index=False)
print(f"✅ 输出文件: {OUTPUT_FILE_MAPPED}")

# Keep only the models whose overlap ratio reaches the configured threshold.
high_overlap_df = results_df[results_df["overlap_ratio"] >= THRESHOLD]
print(f"\n✅ overlap_ratio >= {THRESHOLD} 的模型：")
print(high_overlap_df)

✅ 输出文件: /home/yuyang/lb_yaml/data/IL23/af3_gd_overlap_mapped.csv

✅ overlap_ratio >= 0.6 的模型：
   model_name  overlap_count  total_gd_count  overlap_ratio
77   MJ00D_98             12              15         0.8000
25  MJ00D_188             12              15         0.8000
31  MJ00D_226             12              15         0.8000
18  MJ00D_164             11              15         0.7333
54  MJ00D_350             11              15         0.7333
5   MJ00D_114             11              15         0.7333
33  MJ00D_240             10              15         0.6667
23  MJ00D_180             10              15         0.6667
64  MJ00D_390             10              15         0.6667
19  MJ00D_167             10              15         0.6667
72   MJ00D_57             10              15         0.6667
43  MJ00D_286             10              15         0.6667
71   MJ00D_55             10              15         0.6667
73   MJ00D_64             10              15         0.6667
74   M