In [31]:
from pathlib import Path
import traceback
import pandas as pd
from tqdm import tqdm

In [32]:
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np

atoms_list = {
    "backbone": ["N", "CA", "C", "O"],
}



def get_paratope_rmsds(pdb_code: str, model_name: str, benchmark_folder: Path):
    parser = PDBParser()
    native = benchmark_folder/f'{pdb_code}/{pdb_code}_true_complex.pdb'
    if 'ensemble' in model_name:
        model_type = model_name.split('_')[0]
        rank = model_name.split("_")[-1]
        if 'ABB2' in model_name:
            model = benchmark_folder/f'{pdb_code}/{model_type}_ensemble_models_{pdb_code}/{rank}_refined.pdb'
        else:
            model = benchmark_folder/f'{pdb_code}/{model_type}_ensemble_models_{pdb_code}/{pdb_code}_rank_{rank}.pdb'
    else:
        model = benchmark_folder/f'{pdb_code}/{model_name}_{pdb_code}_antibody_model_imgt.pdb'

    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}
    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()
                     if chain.id in model_chains.keys()}

    if set(native_chains.keys()) != set(model_chains.keys()):
        raise ValueError("Model chain ids not equal to native chain ids.")

    paratope_def_nums = {chain_id: set() for chain_id in model_chains.keys()}

    with open(benchmark_folder/f'{pdb_code}/{pdb_code}_constraint_pairs.txt') as file:
        file.readline()
        for line in file:
            antibody_line = line.split(':')[0]
            words = antibody_line.strip().split(',')
            if words[1][-1].isalpha():
                insert_code = words[1][-1]
                number = int(words[1][:-1])
            else:
                insert_code = ' '
                number = int(words[1])
            paratope_def_nums[words[0]].add((' ', number, insert_code))
            model_resname =  model_chains[words[0]][(' ', number, insert_code)].resname
            if model_resname != words[2].upper() :
                raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                 f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")

    try:
        native_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in native_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]

        model_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in model_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]
    except Exception as e:
        print(traceback.format_exc())
        raise ValueError(f"For {pdb_code=}, {model_name=}, got error {e}")


    svd = SVDSuperimposer()
    svd.set(np.array(native_paratope_atom_coords), np.array(model_paratope_atom_coords))
    svd.run()
    rmsd_para = svd.get_rms()
    row = {'pdb': pdb_code, 'model': model_name, 'rmsd_paratope': rmsd_para}

    return row

In [33]:
records = []
model_names = (["ABodyBuilder2", "ABlooper", "AF2", "IgFold"]+ [f"ABB2_ensemble_rank{i}" for i in range(4)]
               + [f"IgFold_ensemble_rank_{i}" for i in range(4)] + [f"AF2_ensemble_rank_{i}" for i in range(5)]
               )
benchmark_folder = Path('../../benchmark_haddock_27_July_2024')
for path in tqdm(list(benchmark_folder.iterdir())):
    if path.is_dir():
        pdb_code = path.name
    else:
        continue
    print(pdb_code)
    for model_name in model_names:
        try:
            records.append(get_paratope_rmsds(pdb_code,model_name,benchmark_folder))
        except Exception as e:
            print(f"Got error {e} for {pdb_code=}, {model_name=}.")

  0%|          | 0/84 [00:00<?, ?it/s]

7rfb_A0-B0


  1%|          | 1/84 [00:02<03:32,  2.56s/it]

7ps1_A0-B0


  2%|▏         | 2/84 [00:05<03:33,  2.60s/it]

7kql_H0-L0


  4%|▎         | 3/84 [00:07<03:27,  2.56s/it]

7si0_I0-J0


  5%|▍         | 4/84 [00:09<03:04,  2.31s/it]

7q0i_H0-L0


  6%|▌         | 5/84 [00:11<02:45,  2.09s/it]

7mzi_H0-L0


  7%|▋         | 6/84 [00:13<02:32,  1.95s/it]

7k9j_H0-L0


  8%|▊         | 7/84 [00:15<02:50,  2.22s/it]

7rah_B0-A0


 10%|▉         | 8/84 [00:17<02:34,  2.03s/it]

7lr3_H0-L0


 12%|█▏        | 10/84 [00:18<01:44,  1.42s/it]

7qu2_A0-B0


 13%|█▎        | 11/84 [00:20<01:42,  1.40s/it]

7mzm_H0-L0
Traceback (most recent call last):
  File "/var/folders/z8/jlwrv8zn6h9fzlz4kqcsnq2m0000gq/T/ipykernel_49478/313181647.py", line 51, in get_paratope_rmsds
    native_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
  File "/var/folders/z8/jlwrv8zn6h9fzlz4kqcsnq2m0000gq/T/ipykernel_49478/313181647.py", line 53, in <listcomp>
    for atom in native_chains[chain_id][residue_key]
  File "/Users/dcutting/Projects/ai-antibodies/.venv/lib/python3.10/site-packages/Bio/PDB/Chain.py", line 108, in __getitem__
    return Entity.__getitem__(self, id)
  File "/Users/dcutting/Projects/ai-antibodies/.venv/lib/python3.10/site-packages/Bio/PDB/Entity.py", line 45, in __getitem__
    return self.child_dict[id]
KeyError: (' ', 111, 'E')

Got error For pdb_code='7mzm_H0-L0', model_name='ABodyBuilder2', got error (' ', 111, 'E') for pdb_code='7mzm_H0-L0', model_name='ABodyBuilder2'.
Traceback (most recent call last):
  File "/var/folders/z8/jlwrv8zn6h9fzlz4kqcsnq2m0000gq/T/ipy

 14%|█▍        | 12/84 [00:21<01:40,  1.39s/it]

Traceback (most recent call last):
  File "/var/folders/z8/jlwrv8zn6h9fzlz4kqcsnq2m0000gq/T/ipykernel_49478/313181647.py", line 51, in get_paratope_rmsds
    native_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
  File "/var/folders/z8/jlwrv8zn6h9fzlz4kqcsnq2m0000gq/T/ipykernel_49478/313181647.py", line 53, in <listcomp>
    for atom in native_chains[chain_id][residue_key]
  File "/Users/dcutting/Projects/ai-antibodies/.venv/lib/python3.10/site-packages/Bio/PDB/Chain.py", line 108, in __getitem__
    return Entity.__getitem__(self, id)
  File "/Users/dcutting/Projects/ai-antibodies/.venv/lib/python3.10/site-packages/Bio/PDB/Entity.py", line 45, in __getitem__
    return self.child_dict[id]
KeyError: (' ', 111, 'E')

Got error For pdb_code='7mzm_H0-L0', model_name='AF2_ensemble_rank_4', got error (' ', 111, 'E') for pdb_code='7mzm_H0-L0', model_name='AF2_ensemble_rank_4'.
7kyo_H0-L0


 15%|█▌        | 13/84 [00:23<01:39,  1.40s/it]

7b0b_H0-L0


 17%|█▋        | 14/84 [00:24<01:37,  1.40s/it]

7qny_A0-B0


 18%|█▊        | 15/84 [00:25<01:36,  1.40s/it]

7n4i_H0-L0


 19%|█▉        | 16/84 [00:29<02:10,  1.92s/it]

7k7h_H0-L0


 20%|██        | 17/84 [00:30<02:04,  1.86s/it]

7l7e_C0-D0


 21%|██▏       | 18/84 [00:32<02:00,  1.83s/it]

7msq_D0-E0


 23%|██▎       | 19/84 [00:33<01:49,  1.69s/it]

7ps2_A0-B0


 24%|██▍       | 20/84 [00:35<01:42,  1.60s/it]

7s0b_A0-B0


 25%|██▌       | 21/84 [00:36<01:37,  1.54s/it]

7bbg_H0-L0


 26%|██▌       | 22/84 [00:38<01:34,  1.52s/it]

7r8l_H0-L0


 27%|██▋       | 23/84 [00:39<01:27,  1.43s/it]

7mzh_H0-L0


 29%|██▊       | 24/84 [00:40<01:23,  1.39s/it]

7nx3_B0-C0


 30%|██▉       | 25/84 [00:42<01:31,  1.54s/it]

7l7r_B0-A0


 31%|███       | 26/84 [00:44<01:30,  1.55s/it]

7q0g_A0-B0


 32%|███▏      | 27/84 [00:45<01:21,  1.43s/it]

7phu_B0-C0


 33%|███▎      | 28/84 [00:46<01:21,  1.46s/it]

7seg_H0-L0


 35%|███▍      | 29/84 [00:47<01:14,  1.36s/it]

7s4s_H0-L0


 36%|███▌      | 30/84 [00:49<01:12,  1.34s/it]

7phw_B0-C0


 37%|███▋      | 31/84 [00:50<01:14,  1.41s/it]

7lr4_H0-L0


 38%|███▊      | 32/84 [00:52<01:11,  1.37s/it]

7l7d_H0-L0


 39%|███▉      | 33/84 [00:53<01:13,  1.44s/it]

7l0l_H0-L0


 40%|████      | 34/84 [00:55<01:16,  1.53s/it]

7mzj_H0-L0


 42%|████▏     | 35/84 [00:56<01:11,  1.47s/it]

7qu1_A0-B0


 43%|████▎     | 36/84 [00:57<01:07,  1.40s/it]

7e72_A0-B0


 44%|████▍     | 37/84 [00:59<01:03,  1.35s/it]

7daa_H0-L0


 45%|████▌     | 38/84 [01:00<00:57,  1.24s/it]

7np1_H0-L0


 46%|████▋     | 39/84 [01:01<00:56,  1.26s/it]

7n4j_H0-L0


 48%|████▊     | 40/84 [01:02<00:55,  1.27s/it]

7dk2_A0-B0


 49%|████▉     | 41/84 [01:04<00:55,  1.29s/it]

7pr0_H0-L0


 50%|█████     | 42/84 [01:05<00:56,  1.35s/it]

7s13_H0-L0


 51%|█████     | 43/84 [01:07<00:57,  1.41s/it]

7l7r_D0-C0


 52%|█████▏    | 44/84 [01:08<00:57,  1.45s/it]

7f7e_C0-L0


 54%|█████▎    | 45/84 [01:10<00:57,  1.46s/it]

7rk1_C0-C1


 55%|█████▍    | 46/84 [01:11<00:53,  1.41s/it]

7s11_H0-L0


 56%|█████▌    | 47/84 [01:12<00:49,  1.34s/it]

7bnv_H0-L0


 57%|█████▋    | 48/84 [01:13<00:47,  1.32s/it]

7mzf_H0-L0


 58%|█████▊    | 49/84 [01:15<00:44,  1.26s/it]

7kn4_H0-L0


 60%|█████▉    | 50/84 [01:16<00:43,  1.28s/it]

7r89_C0-D0


 61%|██████    | 51/84 [01:18<00:46,  1.41s/it]

7kf1_H0-L0


 62%|██████▏   | 52/84 [01:19<00:43,  1.35s/it]

7e3o_H0-L0


 63%|██████▎   | 53/84 [01:20<00:41,  1.33s/it]

7lf7_A0-B0


 64%|██████▍   | 54/84 [01:22<00:41,  1.37s/it]

7mdj_A0-B0


 65%|██████▌   | 55/84 [01:23<00:37,  1.30s/it]

7shu_E0-F0


 67%|██████▋   | 56/84 [01:24<00:36,  1.29s/it]

7kez_H0-L0


 68%|██████▊   | 57/84 [01:25<00:33,  1.26s/it]

7lfa_B0-D0


 69%|██████▉   | 58/84 [01:27<00:37,  1.45s/it]

7ps6_H0-L0


 70%|███████   | 59/84 [01:28<00:33,  1.36s/it]

7mrz_X0-Y0


 71%|███████▏  | 60/84 [01:30<00:35,  1.48s/it]

7lfb_H0-L0


 73%|███████▎  | 61/84 [01:31<00:31,  1.38s/it]

7sem_B0-C0


 74%|███████▍  | 62/84 [01:33<00:30,  1.41s/it]

7rk2_C0-C1


 75%|███████▌  | 63/84 [01:34<00:31,  1.51s/it]

7qnw_A0-B0


 76%|███████▌  | 64/84 [01:36<00:28,  1.45s/it]

7mzk_N0-M0


 77%|███████▋  | 65/84 [01:37<00:26,  1.41s/it]

7kn3_H0-L0


 79%|███████▊  | 66/84 [01:38<00:24,  1.39s/it]

7rah_D0-C0


 80%|███████▉  | 67/84 [01:40<00:23,  1.40s/it]

7soe_B0-L0


 81%|████████  | 68/84 [01:42<00:27,  1.71s/it]

7ps4_H0-L0


 82%|████████▏ | 69/84 [01:43<00:23,  1.57s/it]

7shz_K0-L0


 83%|████████▎ | 70/84 [01:45<00:20,  1.50s/it]

7ps0_H0-L0


 85%|████████▍ | 71/84 [01:46<00:18,  1.43s/it]

7bbj_H0-L0


 86%|████████▌ | 72/84 [01:48<00:18,  1.51s/it]

7vux_H0-L0


 87%|████████▋ | 73/84 [01:49<00:15,  1.42s/it]

7vmu_A0-A1


 88%|████████▊ | 74/84 [01:50<00:13,  1.35s/it]

7coe_H0-L0


 89%|████████▉ | 75/84 [01:51<00:12,  1.33s/it]

7ps6_C0-D0


 90%|█████████ | 76/84 [01:53<00:10,  1.32s/it]

7q0g_H0-L0


 92%|█████████▏| 77/84 [01:54<00:08,  1.29s/it]

7mzg_H0-L0


 93%|█████████▎| 78/84 [01:55<00:07,  1.25s/it]

7pi7_B0-C0


 94%|█████████▍| 79/84 [01:57<00:06,  1.33s/it]

7e5o_H0-L0


 95%|█████████▌| 80/84 [01:58<00:05,  1.32s/it]

7n3i_H0-L0


 96%|█████████▋| 81/84 [01:59<00:03,  1.27s/it]

7ps2_H0-L0


 98%|█████████▊| 82/84 [02:00<00:02,  1.28s/it]

7rks_H0-L0


 99%|█████████▉| 83/84 [02:02<00:01,  1.36s/it]

7kf0_H0-L0


100%|██████████| 84/84 [02:03<00:00,  1.47s/it]


In [34]:
paratope_rmsd_df = pd.DataFrame.from_records(records)

In [35]:
paratope_rmsd_df.to_csv('../data/paratope_rmsds.csv')

In [36]:
print(paratope_rmsd_df)

             pdb                model  rmsd_paratope
0     7rfb_A0-B0        ABodyBuilder2       3.427635
1     7rfb_A0-B0             ABlooper       3.984238
2     7rfb_A0-B0                  AF2       3.929652
3     7rfb_A0-B0               IgFold       2.799389
4     7rfb_A0-B0  ABB2_ensemble_rank0       3.427635
...          ...                  ...            ...
1389  7kf0_H0-L0  AF2_ensemble_rank_0       2.011022
1390  7kf0_H0-L0  AF2_ensemble_rank_1       1.873005
1391  7kf0_H0-L0  AF2_ensemble_rank_2       2.009313
1392  7kf0_H0-L0  AF2_ensemble_rank_3       2.049127
1393  7kf0_H0-L0  AF2_ensemble_rank_4       1.850511

[1394 rows x 3 columns]
