In [4]:
import pandas as pd

# 文件路径
gd_csv = "/home/yuyang/lb/data/IL23/ppi_summary_gd.csv"
mapping_csv = "/home/yuyang/lb/data/IL23/ppi_summary_mapping.csv"
output_csv = "/home/yuyang/lb/data/IL23/af3_gd_overlap.csv"

# 读取 CSV
gd_df = pd.read_csv(gd_csv)
mapping_df = pd.read_csv(mapping_csv)

# GD 的结合位点字典
gd_dict = {}
for _, row in gd_df.iterrows():
    residues = str(row["antigen_binding_sites_gd"]).split(",")
    residues = set(int(r.strip()) for r in residues if r.strip())
    gd_dict[row["model_name"]] = residues

# 计算每个 AF3 模型的 overlap
results = []
for _, row in mapping_df.iterrows():
    model_name = row["model_name"]
    
    # AF3 位点映射到 GD 的位点
    af3_residues = str(row["antigen_binding_sites_groundtruth"]).split(",")
    af3_residues = set(int(r.strip()) for r in af3_residues if r.strip())
    
    # 这里假设 GD 的 model_name 是 '5njd'
    gd_residues = gd_dict.get("5njd", set())
    
    # 计算重叠
    overlap_count = len(af3_residues & gd_residues)
    total_gd_count = len(gd_residues)
    overlap_ratio = overlap_count / total_gd_count if total_gd_count > 0 else 0.0
    
    results.append({
        "model_name": model_name,
        "overlap_count": overlap_count,
        "total_gd_count": total_gd_count,
        "overlap_ratio": overlap_ratio
    })

# 保存结果并按 overlap 从高到低排序
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("overlap_ratio", ascending=False)
results_df.to_csv(output_csv, index=False)

print(f"✅ 输出文件: {output_csv}")
print(results_df)


✅ 输出文件: /home/yuyang/lb/data/IL23/af3_gd_overlap.csv
   model_name  overlap_count  total_gd_count  overlap_ratio
25  MJ00D_210              2              15       0.133333
27  MJ00D_240              2              15       0.133333
31  MJ00D_268              2              15       0.133333
23   MJ00D_20              2              15       0.133333
40  MJ00D_297              2              15       0.133333
..        ...            ...             ...            ...
54  MJ00D_391              0              15       0.000000
56   MJ00D_43              0              15       0.000000
57   MJ00D_47              0              15       0.000000
62   MJ00D_64              0              15       0.000000
63   MJ00D_78              0              15       0.000000

[64 rows x 4 columns]
