In [34]:
import traceback
import warnings
import pandas as pd
from pathlib import Path

from tqdm import tqdm

from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np

atoms_list = {
    "backbone": ["N", "CA", "C", "O"],
    "ca": ["CA"]
}

In [35]:
def get_paratope_confs(pdb_code: str, benchmark_folder: Path, cdr_epi_vague: bool = False):
    parser = PDBParser()
    native = benchmark_folder/f'{pdb_code}/{pdb_code}_true_complex.pdb'
    model = benchmark_folder/f'{pdb_code}/ABodyBuilder2_{pdb_code}_antibody_model_imgt.pdb'

    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}
    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()
                     if chain.id in model_chains.keys()}

    if set(native_chains.keys()) != set(model_chains.keys()):
        raise ValueError("Model chain ids not equal to native chain ids.")

    paratope_def_nums = {chain_id: set() for chain_id in model_chains.keys()}
    if cdr_epi_vague:
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_residue_constraints_antibody.csv') as file:
            file.readline()
            for line in file:
                words = line.strip().split(',')
                if words[1][-1].isalpha():
                    insert_code = words[1][-1]
                    number = int(words[1][:-1])
                else:
                    insert_code = ' '
                    number = int(words[1])
                model_resname =  model_chains[words[0]][(' ', number, insert_code)].resname
                if model_resname != words[2].upper() :
                    raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                     f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")
                residue_key = (' ', number, insert_code)
                if residue_key not in native_chains[words[0]].child_dict.keys():
                    warnings.warn(f'For {pdb_code} and {cdr_epi_vague=} chain {words[0]} {residue_key} not found in native model')
                    continue
                paratope_def_nums[words[0]].add(residue_key)
    else:
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_constraint_pairs.txt') as file:
            file.readline()
            for line in file:
                antibody_line = line.split(':')[0]
                words = antibody_line.strip().split(',')
                if words[1][-1].isalpha():
                    insert_code = words[1][-1]
                    number = int(words[1][:-1])
                else:
                    insert_code = ' '
                    number = int(words[1])
                model_resname =  model_chains[words[0]][(' ', number, insert_code)].resname
                if model_resname != words[2].upper() :
                    raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                     f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")
                residue_key = (' ', number, insert_code)
                if residue_key not in native_chains[words[0]].child_dict.keys():
                    warnings.warn(f'For {pdb_code} and {cdr_epi_vague=} chain {words[0]} {residue_key} not found in native model')
                    continue
                paratope_def_nums[words[0]].add(residue_key)

    try:
        native_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in native_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]

        model_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in model_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]
    except Exception as e:
        print(traceback.format_exc())
        raise ValueError(f"For {pdb_code=} got error {e}")

    svd = SVDSuperimposer()
    svd.set(np.array(native_paratope_atom_coords), np.array(model_paratope_atom_coords))
    svd.run()
    rmsd_para = svd.get_rms()

    model_paratope_atom_conf = [atom.bfactor  for chain_id, residue_keys
                                           in paratope_def_nums.items() for residue_key in residue_keys
                                           for atom in model_chains[chain_id][residue_key]
                                           if atom.get_id() in atoms_list['backbone']]

    para_ave_conf = np.mean(model_paratope_atom_conf)
    row = {'pdb': pdb_code, 'rmsd_paratope': rmsd_para, 'para_ave_conf': para_ave_conf}

    return row

In [36]:
records = []
benchmark_folder = Path('../../benchmark_haddock_27_July_2024')
for path in tqdm(list(benchmark_folder.iterdir())):
    if path.is_dir():
        pdb_code = path.name
    else:
        continue
    try:
        record = get_paratope_confs(pdb_code,benchmark_folder)
        vague_para_record = get_paratope_confs(pdb_code,benchmark_folder,cdr_epi_vague=True)
        record.update({'rmsd_vague_paratope': vague_para_record['rmsd_paratope'],
                       'para_vague_ave_conf': vague_para_record['para_ave_conf']})
        records.append(record)
    except Exception as e:
        print(f"Got error {e} for {pdb_code=}.")

100%|██████████| 84/84 [00:17<00:00,  4.87it/s]


In [38]:
print(records)

[{'pdb': '7rfb_A0-B0', 'rmsd_paratope': 3.4276353690090215, 'para_ave_conf': 1.3023076923076924, 'rmsd_vague_paratope': 2.533765600163948, 'para_vague_ave_conf': 0.9035365853658537}, {'pdb': '7ps1_A0-B0', 'rmsd_paratope': 0.8221345756908988, 'para_ave_conf': 0.35965517241379297, 'rmsd_vague_paratope': 0.6657394513053291, 'para_vague_ave_conf': 0.2836764705882353}, {'pdb': '7kql_H0-L0', 'rmsd_paratope': 1.7402885918875601, 'para_ave_conf': 0.6943478260869566, 'rmsd_vague_paratope': 1.467130836094075, 'para_vague_ave_conf': 0.5026666666666667}, {'pdb': '7si0_I0-J0', 'rmsd_paratope': 1.6979439865258368, 'para_ave_conf': 0.31470588235294106, 'rmsd_vague_paratope': 1.1042709970541276, 'para_vague_ave_conf': 0.21789473684210525}, {'pdb': '7q0i_H0-L0', 'rmsd_paratope': 2.108229823664613, 'para_ave_conf': 1.1522222222222223, 'rmsd_vague_paratope': 1.6564223530307292, 'para_vague_ave_conf': 0.8725316455696204}, {'pdb': '7mzi_H0-L0', 'rmsd_paratope': 1.0951701539549537, 'para_ave_conf': 0.405714

In [39]:
abb2_antibody_rmsd_conf_df = pd.DataFrame.from_records(records)

In [40]:
abb2_antibody_rmsd_conf_df

Unnamed: 0,pdb,rmsd_paratope,para_ave_conf,rmsd_vague_paratope,para_vague_ave_conf
0,7rfb_A0-B0,3.427635,1.302308,2.533766,0.903537
1,7ps1_A0-B0,0.822135,0.359655,0.665739,0.283676
2,7kql_H0-L0,1.740289,0.694348,1.467131,0.502667
3,7si0_I0-J0,1.697944,0.314706,1.104271,0.217895
4,7q0i_H0-L0,2.108230,1.152222,1.656422,0.872532
...,...,...,...,...,...
78,7e5o_H0-L0,1.473584,0.633571,1.188293,0.480845
79,7n3i_H0-L0,1.150733,0.366250,0.985651,0.300571
80,7ps2_H0-L0,3.188704,0.881600,2.437336,0.503151
81,7rks_H0-L0,4.215847,1.155909,3.271277,0.828193


In [41]:
abb2_antibody_rmsd_conf_df.to_csv('../data/ABB2_antibody_rmsd_conf.csv')