In [22]:
import traceback

import pandas as pd
from pathlib import Path

from tqdm import tqdm

from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np

atoms_list = {
    "backbone": ["N", "CA", "C", "O"],
    "ca": ["CA"]
}

In [28]:
def get_paratope_confs(pdb_code: str, benchmark_folder: Path, cdr_epi_vague: bool = False):
    parser = PDBParser()
    native = benchmark_folder/f'{pdb_code}/{pdb_code}_true_complex.pdb'
    model = benchmark_folder/f'{pdb_code}/ABodyBuilder2_{pdb_code}_antibody_model.pdb'

    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}
    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()
                     if chain.id in model_chains.keys()}

    if set(native_chains.keys()) != set(model_chains.keys()):
        raise ValueError("Model chain ids not equal to native chain ids.")

    paratope_def_nums = {chain_id: set() for chain_id in model_chains.keys()}
    if cdr_epi_vague:
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_residue_constraints_antibody.csv') as file:
            file.readline()
            for line in file:
                words = line.strip().split(',')
                if words[1][-1].isalpha():
                    insert_code = words[1][-1]
                    number = int(words[1][:-1])
                else:
                    insert_code = ' '
                    number = int(words[1])
                paratope_def_nums[words[0]].add((' ', number, insert_code))
                model_resname =  model_chains[words[0]][(' ', number, insert_code)].resname
                if model_resname != words[2].upper() :
                    raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                     f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")
    else:
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_constraint_pairs.txt') as file:
            file.readline()
            for line in file:
                antibody_line = line.split(':')[0]
                words = antibody_line.strip().split(',')
                if words[1][-1].isalpha():
                    insert_code = words[1][-1]
                    number = int(words[1][:-1])
                else:
                    insert_code = ' '
                    number = int(words[1])
                paratope_def_nums[words[0]].add((' ', number, insert_code))
                model_resname =  model_chains[words[0]][(' ', number, insert_code)].resname
                if model_resname != words[2].upper() :
                    raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                                     f"got mismatching residue to constraint, {model_resname=}, constraint={words[2]}")

    try:
        native_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in native_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]

        model_paratope_atom_coords = [list(atom.coord)  for chain_id, residue_keys
                                       in paratope_def_nums.items() for residue_key in residue_keys
                                       for atom in model_chains[chain_id][residue_key]
                                       if atom.get_id() in atoms_list['backbone']]
    except Exception as e:
        print(traceback.format_exc())
        raise ValueError(f"For {pdb_code=} got error {e}")

    svd = SVDSuperimposer()
    svd.set(np.array(native_paratope_atom_coords), np.array(model_paratope_atom_coords))
    svd.run()
    rmsd_para = svd.get_rms()

    model_paratope_atom_conf = [atom.bfactor  for chain_id, residue_keys
                                           in paratope_def_nums.items() for residue_key in residue_keys
                                           for atom in model_chains[chain_id][residue_key]
                                           if atom.get_id() in atoms_list['backbone']]

    para_ave_conf = np.mean(model_paratope_atom_conf)
    row = {'pdb': pdb_code, 'rmsd_paratope': rmsd_para, 'para_ave_conf': para_ave_conf}

    return row

In [29]:
records = []
benchmark_folder = Path('/Users/dcutting/Library/CloudStorage/Box-Box/Exscientia - Bonvin Lab share/benchmark_haddock_23_May_2023')
for path in tqdm(list(benchmark_folder.iterdir())):
    if path.is_dir():
        pdb_code = path.name
    else:
        continue
    try:
        record = get_paratope_confs(pdb_code,benchmark_folder)
        vague_para_record = get_paratope_confs(pdb_code,benchmark_folder,cdr_epi_vague=True)
        record.update({'rmsd_vague_paratope': vague_para_record['rmsd_paratope'],
                       'para_vague_ave_conf': vague_para_record['para_ave_conf']})
        records.append(record)
    except Exception as e:
        print(f"Got error {e} for {pdb_code=}.")

 54%|█████▍    | 45/83 [00:07<00:05,  6.66it/s]

Got error (' ', 31, ' ') for pdb_code='7kn4'.


100%|██████████| 83/83 [00:13<00:00,  6.23it/s]


In [30]:
print(records)

[{'pdb': '7q0g', 'rmsd_paratope': 3.199847589461224, 'para_ave_conf': 0.6395454545454545, 'rmsd_vague_paratope': 2.4799229976237367, 'para_vague_ave_conf': 0.44591549295774646}, {'pdb': '7q0i', 'rmsd_paratope': 2.1469648848815446, 'para_ave_conf': 1.1522222222222223, 'rmsd_vague_paratope': 1.6317770988611637, 'para_vague_ave_conf': 0.8725316455696204}, {'pdb': '7pi7', 'rmsd_paratope': 0.89587879213425, 'para_ave_conf': 0.46366666666666667, 'rmsd_vague_paratope': 0.7819444458210867, 'para_vague_ave_conf': 0.37742857142857145}, {'pdb': '7f7e', 'rmsd_paratope': 2.9947311470700604, 'para_ave_conf': 1.4594117647058826, 'rmsd_vague_paratope': 2.93353236272712, 'para_vague_ave_conf': 0.8200000000000001}, {'pdb': '7n4i', 'rmsd_paratope': 2.188082120803512, 'para_ave_conf': 0.8457692307692307, 'rmsd_vague_paratope': 1.50823018334282, 'para_vague_ave_conf': 0.4516666666666667}, {'pdb': '7mdj', 'rmsd_paratope': 1.3516068686214953, 'para_ave_conf': 0.6404761904761906, 'rmsd_vague_paratope': 1.2458

In [31]:
abb2_antibody_rmsd_conf_df = pd.DataFrame.from_records(records)

In [32]:
abb2_antibody_rmsd_conf_df.to_csv('../data/ABB2_antibody_rmsd_conf.csv')