In [1]:
from pathlib import Path
import traceback
import pandas as pd
from tqdm import tqdm

In [6]:
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np

# Named groups of atom names; get_paratope_rmsds reads the "backbone" group
# to decide which atoms of each paratope residue enter the RMSD.
atoms_list = dict(backbone=["N", "CA", "C", "O"])



def get_paratope_rmsds(pdb_code: str, model_name: str, benchmark_folder: Path):
    """Compute the paratope backbone RMSD between a predicted model and the native complex.

    Paratope residues are read from ``{pdb_code}_constraint_pairs.txt``: the first
    line is skipped and, for every other non-blank line, the text before the first
    ``:`` is parsed as ``chain_id,residue_number[insertion_code],residue_name``.
    The atoms named in ``atoms_list['backbone']`` are then paired *by atom name*
    between native and model, superimposed with ``SVDSuperimposer``, and the RMSD
    after superposition is reported.

    Args:
        pdb_code: Name of the benchmark sub-folder; also used in the file names.
        model_name: Selects which model file to load. ``'ABodyBuilder2'`` and names
            containing ``'ensemble'`` have dedicated file-name patterns; any other
            name uses ``{model_name}_{pdb_code}_antibody_model_imgt.pdb``.
        benchmark_folder: Folder containing one sub-folder per ``pdb_code``.

    Returns:
        A dict with keys ``'pdb'``, ``'model'`` and ``'rmsd_paratope'``.

    Raises:
        ValueError: If the model has a chain id absent from the native structure,
            a constraint residue is missing from either structure, the model
            residue name disagrees with the constraint file, a backbone atom is
            present in only one of the two structures, or no paratope atoms are
            found at all.
    """
    parser = PDBParser()
    entry_dir = benchmark_folder / pdb_code
    native = entry_dir / f'{pdb_code}_true_complex.pdb'
    if model_name == 'ABodyBuilder2':
        model = entry_dir / f'{model_name}_{pdb_code}_antibody_model.pdb'
    elif 'ensemble' in model_name:
        # e.g. "ABB2_ensemble_rank0" -> ".../ABB2_ensemble_models_<pdb>/rank0_refined.pdb"
        model = (entry_dir / f'ABB2_ensemble_models_{pdb_code}'
                 / f'{model_name.split("_")[-1]}_refined.pdb')
    else:
        model = entry_dir / f'{model_name}_{pdb_code}_antibody_model_imgt.pdb'

    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}
    # Only native chains that also exist in the model are kept (the native file is
    # the full complex, the model only covers part of it).
    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()
                     if chain.id in model_chains}

    if set(native_chains) != set(model_chains):
        raise ValueError("Model chain ids not equal to native chain ids.")

    paratope_def_nums = {chain_id: set() for chain_id in model_chains}

    with open(entry_dir / f'{pdb_code}_constraint_pairs.txt') as file:
        file.readline()  # the first line is not a constraint and is skipped
        for line in file:
            if not line.strip():
                # Tolerate blank (e.g. trailing) lines instead of failing on words[1].
                continue
            antibody_line = line.split(':')[0]
            words = antibody_line.strip().split(',')
            chain_id, residue_label, constraint_resname = words[0], words[1], words[2]
            if residue_label[-1].isalpha():
                insert_code = residue_label[-1]
                number = int(residue_label[:-1])
            else:
                insert_code = ' '
                number = int(residue_label)
            residue_key = (' ', number, insert_code)
            try:
                model_residue = model_chains[chain_id][residue_key]
            except KeyError as err:
                raise ValueError(
                    f"For {pdb_code=}, {model_name=}, constraint residue "
                    f"{chain_id}:{number}{insert_code.strip()} is missing from the model."
                ) from err
            paratope_def_nums[chain_id].add(residue_key)
            model_resname = model_residue.resname
            if model_resname != constraint_resname.upper():
                raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                 f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")

    native_paratope_atom_coords = []
    model_paratope_atom_coords = []
    for chain_id, residue_keys in paratope_def_nums.items():
        # Sorted so the atom order does not depend on set/hash iteration order.
        for residue_key in sorted(residue_keys):
            _, number, insert_code = residue_key
            residue_label = f"{chain_id}:{number}{insert_code.strip()}"
            try:
                native_residue = native_chains[chain_id][residue_key]
            except KeyError as err:
                raise ValueError(
                    f"For {pdb_code=}, {model_name=}, paratope residue {residue_label} "
                    f"is missing from the native structure."
                ) from err
            model_residue = model_chains[chain_id][residue_key]

            # Pair atoms explicitly by name: relying on the order in which atoms
            # appear in each file would silently pair the wrong atoms whenever
            # the two files list them differently.
            for atom_name in atoms_list['backbone']:
                in_native = atom_name in native_residue
                in_model = atom_name in model_residue
                if not in_native and not in_model:
                    continue  # absent from both: nothing to compare
                if in_native != in_model:
                    lacking = 'model' if in_native else 'native'
                    raise ValueError(
                        f"For {pdb_code=}, {model_name=}, backbone atom {atom_name} of "
                        f"residue {residue_label} is missing from the {lacking} structure."
                    )
                native_paratope_atom_coords.append(native_residue[atom_name].coord)
                model_paratope_atom_coords.append(model_residue[atom_name].coord)

    if not native_paratope_atom_coords:
        raise ValueError(f"For {pdb_code=}, {model_name=}, no paratope backbone atoms were found.")

    svd = SVDSuperimposer()
    svd.set(np.array(native_paratope_atom_coords), np.array(model_paratope_atom_coords))
    svd.run()
    rmsd_para = svd.get_rms()
    row = {'pdb': pdb_code, 'model': model_name, 'rmsd_paratope': rmsd_para}

    return row

In [7]:
# Models to evaluate: four single-model predictors plus the four ranked
# members of the ABB2 ensemble.
model_names = ["ABodyBuilder2", "ABlooper", "AF2", "IgFold"]
model_names.extend(f"ABB2_ensemble_rank{rank}" for rank in range(4))

benchmark_folder = Path('../../benchmark_haddock_23_May_2023')

# One record (dict) per successfully processed (pdb_code, model_name) pair.
records = []
for path in tqdm(list(benchmark_folder.iterdir())):
    # Each benchmark entry is a directory named after its PDB code;
    # anything else in the folder is ignored.
    if not path.is_dir():
        continue
    pdb_code = path.name
    for model_name in model_names:
        # Best effort: a failure for one pair is reported and skipped so the
        # remaining pairs are still processed.
        try:
            records.append(get_paratope_rmsds(pdb_code, model_name, benchmark_folder))
        except Exception as e:
            print(f"Got error {e} for {pdb_code=}, {model_name=}.")

100%|██████████| 83/83 [00:56<00:00,  1.47it/s]


In [8]:
# Collect the per-(pdb, model) result dicts into a single table.
paratope_rmsd_df = pd.DataFrame.from_records(data=records)

In [9]:
# Persist the table; the DataFrame index is written as the first (unnamed) column.
paratope_rmsd_df.to_csv(path_or_buf='../data/paratope_rmsds.csv', index=True)

In [10]:
# Plain-text preview of the results table (pandas truncates long frames).
print(repr(paratope_rmsd_df))

      pdb                model  rmsd_paratope
0    7q0g        ABodyBuilder2       3.199848
1    7q0g             ABlooper       2.675287
2    7q0g                  AF2       3.306487
3    7q0g               IgFold       2.866815
4    7q0g  ABB2_ensemble_rank0       3.370159
..    ...                  ...            ...
643  7s13               IgFold       1.441026
644  7s13  ABB2_ensemble_rank0       1.308578
645  7s13  ABB2_ensemble_rank1       1.444428
646  7s13  ABB2_ensemble_rank2       1.178569
647  7s13  ABB2_ensemble_rank3       1.269890

[648 rows x 3 columns]
