In [65]:
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from Bio.Align import PairwiseAligner
from Bio.SeqUtils import seq1

# Named groups of atom names; get_antigen_rmsd reads the "backbone" group to
# decide which atoms of each residue contribute coordinates to the RMSD.
atoms_list = dict(backbone=["N", "CA", "C", "O"])

def _read_epitope_resnums(constraint_file: Path, chains: dict, pdb_code, label: str) -> dict:
    """Collect epitope residue numbers per chain from a constraint-pairs file.

    The first line of the file is skipped. Every remaining non-blank line is
    split on ':' and its second field is parsed as 'chain_id,resnum,restype'.

    Args:
        constraint_file: Path of the constraint-pairs text file.
        chains: Mapping of chain id -> Bio.PDB chain the constraints refer to.
        pdb_code: Case identifier, used only in error messages.
        label: 'native' or 'model'; used only in error messages.

    Returns:
        Dict mapping every chain id in ``chains`` to the set of residue numbers
        listed for that chain.

    Raises:
        ValueError: If the residue name in the structure differs from the one
            recorded in the constraint file.
        KeyError: If the chain or residue named by a constraint is absent from
            ``chains``.
    """
    epitope_nums = {chain_id: set() for chain_id in chains}
    with constraint_file.open() as inf:
        next(inf, None)  # skip the first line
        for line in inf:
            if not line.strip():
                # Tolerate blank/trailing lines instead of failing on split(':')[1].
                continue
            antigen_line = line.split(':')[1]
            chain_id, resnum, restype = antigen_line.strip().split(',')
            resnum = int(resnum)
            # Residues are looked up with blank hetero flag and insertion code.
            found_restype = chains[chain_id][(' ', resnum, ' ')].resname
            if found_restype != restype:
                raise ValueError(f"For {pdb_code=}, chain id {chain_id} residue {resnum}, "
                                 f"got mismatching residue to constraint, "
                                 f"{label}_restype={found_restype!r}, constraint={restype}")
            epitope_nums[chain_id].add(resnum)
    return epitope_nums


def _paired_backbone_coords(nat_res, model_res):
    """Return backbone coordinates of two residues, paired atom-by-atom by name.

    Only atom names from ``atoms_list['backbone']`` that exist in BOTH residues
    are used, so the two returned lists always have equal length and matching
    order even when one structure lacks an atom or orders its atoms differently.
    """
    nat_coords, model_coords = [], []
    for atom_name in atoms_list['backbone']:
        if atom_name in nat_res and atom_name in model_res:
            nat_coords.append(list(nat_res[atom_name].coord))
            model_coords.append(list(model_res[atom_name].coord))
    return nat_coords, model_coords


def _superimposed_rmsd(native_coords, model_coords, pdb_code, label: str):
    """Superimpose two equally long coordinate lists and return the RMSD."""
    if not native_coords:
        # Without this guard the superimposer fails with an unrelated-looking error.
        raise ValueError(f"For {pdb_code=}, no paired backbone atoms available for {label} RMSD.")
    svd = SVDSuperimposer()
    svd.set(np.array(native_coords), np.array(model_coords))
    svd.run()
    return svd.get_rms()


def get_antigen_rmsd(pdb_code, benchmark_folder: Path) -> dict:
    """Compute backbone RMSD between a native antigen and its AF2 model.

    Reads, from ``benchmark_folder/<pdb_code>/``:
      * ``<pdb_code>_antigen.pdb`` (native structure)
      * ``<pdb_code>_AF2_antigen_model.pdb`` (model structure)
      * ``<pdb_code>_constraint_pairs.txt`` (epitope residues, native numbering)
      * ``<pdb_code>_AF2_constraint_pairs.txt`` (epitope residues, model numbering)

    Chains are paired by their order of appearance in the two files. Within
    each chain pair the residue sequences are aligned with ``PairwiseAligner``
    and only aligned residues contribute backbone atoms. Two superpositions are
    performed: one over all aligned residues and one over the subset whose
    native residue number is listed as an epitope residue.

    Args:
        pdb_code: Name of the case directory and prefix of its files.
        benchmark_folder: Directory containing one sub-directory per case.

    Returns:
        Dict with keys ``'pdb'``, ``'rmsd'`` and ``'rmsd_epitope'``.

    Raises:
        ValueError: On a residue-type mismatch between a structure and its
            constraint file, or when no atoms are available for a superposition.
        AssertionError: If a native epitope residue is aligned to a model
            residue that is not listed as an epitope residue.
    """
    parser = PDBParser()
    case_dir = benchmark_folder / f'{pdb_code}'
    native = case_dir / f'{pdb_code}_antigen.pdb'
    model = case_dir / f'{pdb_code}_AF2_antigen_model.pdb'

    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()}
    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}

    # Positional pairing: the i-th native chain is compared to the i-th model
    # chain; zip() stops at the shorter of the two.
    chain_id_mappings = dict(zip(native_chains.keys(), model_chains.keys()))

    epitope_native_def_nums = _read_epitope_resnums(
        case_dir / f'{pdb_code}_constraint_pairs.txt', native_chains, pdb_code, 'native')
    epitope_model_def_nums = _read_epitope_resnums(
        case_dir / f'{pdb_code}_AF2_constraint_pairs.txt', model_chains, pdb_code, 'model')

    atom_coords_native = []
    atom_coords_model = []
    atom_coords_native_epitope = []
    atom_coords_model_epitope = []

    aligner = PairwiseAligner()  # default settings; reused for every chain pair

    for native_chain_id, model_chain_id in chain_id_mappings.items():
        nat_ress = list(native_chains[native_chain_id].get_residues())
        model_ress = list(model_chains[model_chain_id].get_residues())

        seq_native = "".join(seq1(res.resname) for res in nat_ress)
        seq_model = "".join(seq1(res.resname) for res in model_ress)

        # Use the first reported alignment; .aligned holds matching index
        # blocks for the native (row 0) and model (row 1) sequences.
        nat_blocks, model_blocks = aligner.align(seq_native, seq_model)[0].aligned
        nat_ress_aligned = [res for start, end in nat_blocks for res in nat_ress[start:end]]
        model_ress_aligned = [res for start, end in model_blocks for res in model_ress[start:end]]

        for nat_res, model_res in zip(nat_ress_aligned, model_ress_aligned):
            nat_atoms, mod_atoms = _paired_backbone_coords(nat_res, model_res)

            atom_coords_native += nat_atoms
            atom_coords_model += mod_atoms

            if nat_res.get_id()[1] in epitope_native_def_nums[native_chain_id]:
                assert model_res.get_id()[1] in epitope_model_def_nums[model_chain_id], (
                    f"For {pdb_code=}, native epitope residue {nat_res.get_id()[1]} "
                    f"(chain {native_chain_id}) aligned to model residue {model_res.get_id()[1]} "
                    f"(chain {model_chain_id}), which is not a model epitope residue")
                atom_coords_native_epitope += nat_atoms
                atom_coords_model_epitope += mod_atoms

    rmsd = _superimposed_rmsd(atom_coords_native, atom_coords_model, pdb_code, 'antigen')
    rmsd_epi = _superimposed_rmsd(atom_coords_native_epitope, atom_coords_model_epitope,
                                  pdb_code, 'epitope')

    return {'pdb': pdb_code, 'rmsd': rmsd, 'rmsd_epitope': rmsd_epi}

In [66]:
# Compute one result row per benchmark case; each case lives in its own
# sub-directory whose name is used as the PDB code.
benchmark_folder = Path('../../benchmark_haddock_27_July_2024')
records = []
for path in tqdm(list(benchmark_folder.iterdir())):
    if not path.is_dir():
        # Plain files in the benchmark folder are not cases.
        continue
    pdb_code = path.name
    records.append(get_antigen_rmsd(pdb_code, benchmark_folder))


100%|██████████| 84/84 [00:09<00:00,  8.73it/s]


In [67]:
# Assemble the collected per-case result rows into a single table.
epitope_rmsd_df = pd.DataFrame.from_records(records)


In [68]:
# Show the table of per-case RMSD values as the cell output.
epitope_rmsd_df

Unnamed: 0,pdb,rmsd,rmsd_epitope
0,7rfb_A0-B0,0.970089,1.407753
1,7ps1_A0-B0,0.539861,0.409709
2,7kql_H0-L0,2.579659,2.495614
3,7si0_I0-J0,1.020640,0.524872
4,7q0i_H0-L0,2.453988,4.086787
...,...,...,...
78,7e5o_H0-L0,1.813552,0.509658
79,7n3i_H0-L0,0.746927,0.549763
80,7ps2_H0-L0,3.629126,0.338868
81,7rks_H0-L0,0.903424,1.095239


In [69]:
# Save the results to CSV. With to_csv's default index=True the row index is
# written as an unnamed first column.
epitope_rmsd_df.to_csv('../data/AF2_antigen_rmsd.csv')