In [1]:
import traceback

import pandas as pd
from pathlib import Path

from tqdm import tqdm

from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB import PDBParser
import numpy as np

# Named atom selections: each key maps to the PDB atom names that belong to it.
# get_plddts uses 'backbone' when collecting coordinates for superposition and
# 'ca' when reading per-residue B-factor values.
atoms_list = dict(
    backbone=["N", "CA", "C", "O"],
    ca=["CA"],
)

def get_plddt_cat(plddt: float) -> str:
    """Map a pLDDT value onto a coarse quality label.

    Args:
        plddt: pLDDT score to classify.

    Returns:
        'bad' when the score is below 70, 'good' when it is below 90,
        and 'high' for everything else.
    """
    # Upper bounds are checked in ascending order; the first one the score
    # falls under decides the label.
    for upper_bound, label in ((70, 'bad'), (90, 'good')):
        if plddt < upper_bound:
            return label
    return 'high'

def get_rmsd_cat(rmsd: float) -> str:
    """Map an RMSD value onto a coarse quality label.

    Args:
        rmsd: RMSD value to classify.

    Returns:
        'bad' when the value is greater than 5.0, 'good' when it is at least
        2.0 (and not greater than 5.0), and 'high' for everything else.
    """
    return 'bad' if rmsd > 5.0 else ('good' if rmsd >= 2.0 else 'high')

In [20]:
def _parse_residue_key(residue_label: str) -> tuple:
    """Turn a residue label such as ``"52"`` or ``"52A"`` into a Bio.PDB-style residue id.

    A trailing alphabetic character is taken as the insertion code; otherwise the
    insertion code is a single space. The first (hetero-flag) field is always ``' '``.
    """
    if residue_label[-1].isalpha():
        return (' ', int(residue_label[:-1]), residue_label[-1])
    return (' ', int(residue_label), ' ')


def _add_constraint_residue(words: list, native_chains: dict, region_def_nums: dict, pdb_code: str) -> None:
    """Register one constraint row in ``region_def_nums`` and validate it against the native structure.

    Args:
        words: Row fields; ``words[0]`` is the chain id, ``words[1]`` the residue label
            and ``words[2]`` the residue name.
        native_chains: Mapping of chain id to native chain.
        region_def_nums: Mapping of chain id to the set of selected residue ids (mutated in place).
        pdb_code: Entry identifier, used only in the error message.

    Raises:
        KeyError: if the chain id or residue is not present in the native structure.
        ValueError: if the native residue name differs from the one in the row.
    """
    residue_key = _parse_residue_key(words[1])
    _, number, insert_code = residue_key
    region_def_nums[words[0]].add(residue_key)
    native_resname = native_chains[words[0]][residue_key].resname
    if native_resname != words[2].upper():
        raise ValueError(f"For {pdb_code=}, chain id {words[0]} residue {number}{insert_code}, "
                         f"got mismatching residue to constraint, {native_resname=}, constraint={words[2]}")


def get_plddts(pdb_code: str, benchmark_folder: Path, region_type: str, model_type: str) -> dict:
    """Compute backbone RMSD and mean CA B-factor (pLDDT) of a model over a chosen region.

    The native structure is read from ``<pdb_code>/<pdb_code>_true_complex.pdb`` and the model
    from ``<pdb_code>/AF2_<pdb_code>_antibody_model_imgt.pdb`` (antibody) or
    ``<pdb_code>/<pdb_code>_AF2_antigen_model.pdb`` (antigen), all under ``benchmark_folder``.

    Args:
        pdb_code: Name of the benchmark entry (also its sub-directory name).
        benchmark_folder: Directory holding one sub-directory per entry.
        region_type: ``'CDR-EpiVague'`` (residues listed in
            ``<pdb_code>_residue_constraints_<model_type>.csv``), ``'Para-Epi'`` (residues listed
            in ``<pdb_code>_constraint_pairs.txt``) or ``'full'`` (every native residue).
        model_type: ``'antibody'`` or ``'antigen'``.

    Returns:
        A dict with keys ``'pdb'``, ``'rmsd_region'`` and ``'plddt_ave_region'``.

    Raises:
        ValueError: on an unrecognised ``model_type``/``region_type``, on antibody chain-id
            mismatch between model and native, on a constraint whose residue name does not match
            the native structure, when region residues cannot be collected, or when the native and
            model backbone selections are empty or of different sizes.
    """
    allowed_model_types = {"antibody", "antigen"}
    if model_type not in allowed_model_types:
        raise ValueError(f"Unrecognised {model_type=}, must be one of {allowed_model_types}")

    allowed_region_types = {'CDR-EpiVague', 'Para-Epi', 'full'}
    if region_type not in allowed_region_types:
        raise ValueError(f"Unrecognised {region_type=}, must be one of {allowed_region_types}")

    parser = PDBParser()
    native = benchmark_folder/f'{pdb_code}/{pdb_code}_true_complex.pdb'
    if model_type == 'antibody':
        model = benchmark_folder/f'{pdb_code}/AF2_{pdb_code}_{model_type}_model_imgt.pdb'
    else:
        model = benchmark_folder/f'{pdb_code}/{pdb_code}_AF2_{model_type}_model.pdb'

    model_chains = {chain.id: chain for chain in parser.get_structure('model', model).get_chains()}
    # For antibodies only the native chains that also exist in the model are kept, and the
    # two id sets must then coincide; for antigens every native chain is kept.
    native_chains = {chain.id: chain for chain in parser.get_structure('native', native).get_chains()
                     if model_type != 'antibody' or chain.id in model_chains}
    if model_type == 'antibody' and set(native_chains) != set(model_chains):
        raise ValueError("Model chain ids not equal to native chain ids.")

    # chain id -> set of residue ids that make up the requested region
    region_def_nums = {chain_id: set() for chain_id in native_chains}

    if region_type == 'CDR-EpiVague':
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_residue_constraints_{model_type}.csv') as file:
            file.readline()  # first line is discarded (presumably a header row)
            for line in file:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                _add_constraint_residue(line.strip().split(','), native_chains, region_def_nums, pdb_code)
    elif region_type == 'Para-Epi':
        with open(benchmark_folder/f'{pdb_code}/{pdb_code}_constraint_pairs.txt') as file:
            file.readline()  # first line is discarded (presumably a header row)
            for line in file:
                if not line.strip():
                    continue  # tolerate blank / trailing lines
                # Each line is "<antibody fields>:<antigen fields>"; use the side matching model_type.
                antibody_line, antigen_line = line.split(':')
                chosen_side = antibody_line if model_type == 'antibody' else antigen_line
                _add_constraint_residue(chosen_side.strip().split(','), native_chains, region_def_nums, pdb_code)
    else:
        for chain_id, chain in native_chains.items():
            for res in chain:
                region_def_nums[chain_id].add(res.full_id[-1])

    try:
        if model_type == 'antibody':
            # Antibody model and native share chain ids and residue ids, so look residues up by key.
            native_region_res = [native_chains[chain_id][residue_key] for chain_id, residue_keys
                                 in region_def_nums.items() for residue_key in residue_keys]
            model_region_res = [model_chains[chain_id][residue_key] for chain_id, residue_keys
                                in region_def_nums.items() for residue_key in residue_keys]
        else:
            # NOTE(review): antigen residues are paired purely by position in the flattened residue
            # lists (native_chains here holds every chain of the native file). This assumes model and
            # native enumerate the antigen residues in the same order -- TODO confirm against the data.
            all_model_res = [res for chain in model_chains.values() for res in chain.get_residues()]
            all_native_res = [res for chain in native_chains.values() for res in chain.get_residues()]
            model_region_res = []
            native_region_res = []
            for model_res, native_res in zip(all_model_res, all_native_res):
                # full_id[-1] is the residue id, full_id[-2] the chain id.
                if native_res.full_id[-1] in region_def_nums[native_res.full_id[-2]]:
                    model_region_res.append(model_res)
                    native_region_res.append(native_res)
        native_region_atom_coords = [list(atom.coord) for res in native_region_res for atom in res
                                     if atom.get_id() in atoms_list['backbone']]

        model_region_atom_coords = [list(atom.coord) for res in model_region_res for atom in res
                                    if atom.get_id() in atoms_list['backbone']]
    except Exception as e:
        print(traceback.format_exc())
        # Chain the original exception so the root cause stays attached to the ValueError.
        raise ValueError(f"For {pdb_code=} got error {e}") from e

    # Fail with an explicit message instead of an opaque shape/index error from the superimposer.
    if not native_region_atom_coords or len(native_region_atom_coords) != len(model_region_atom_coords):
        raise ValueError(f"For {pdb_code=}, {region_type=}, {model_type=}: cannot superimpose "
                         f"{len(native_region_atom_coords)} native and "
                         f"{len(model_region_atom_coords)} model backbone atoms.")

    svd = SVDSuperimposer()
    svd.set(np.array(native_region_atom_coords), np.array(model_region_atom_coords))
    svd.run()
    rmsd_region = svd.get_rms()

    # The per-residue score is read from the B-factor field of the model's CA atoms.
    model_region_atom_plddt = [atom.bfactor for res in model_region_res for atom in res
                               if atom.get_id() in atoms_list['ca']]

    region_ave_plddt = np.mean(model_region_atom_plddt)

    row = {'pdb': pdb_code, 'rmsd_region': rmsd_region, 'plddt_ave_region': region_ave_plddt}
    return row

In [20]:
# Build one summary row per benchmark entry for the antibody models: RMSD, mean pLDDT and
# their category labels over the full structure, the 'CDR-EpiVague' region and the 'Para-Epi' region.
records = []
benchmark_folder = Path('../../benchmark_haddock_23_May_2023')
for path in tqdm(list(benchmark_folder.iterdir())):
    # Only sub-directories are benchmark entries; anything else in the folder is skipped.
    if not path.is_dir():
        continue
    pdb_code = path.name
    try:
        para_epi_record = get_plddts(pdb_code,benchmark_folder,region_type='Para-Epi',model_type='antibody')
        vague_record = get_plddts(pdb_code,benchmark_folder,region_type='CDR-EpiVague',model_type='antibody')
        full_record = get_plddts(pdb_code,benchmark_folder,region_type='full',model_type='antibody')

        # Same four columns per region, emitted in the order full -> vague -> para_epi.
        record = {'pdb': full_record['pdb']}
        for label, region_record in (('full', full_record),
                                     ('vague', vague_record),
                                     ('para_epi', para_epi_record)):
            record[f'rmsd_{label}'] = region_record['rmsd_region']
            record[f'plddt_ave_{label}'] = region_record['plddt_ave_region']
            record[f'rmsd_{label}_cat'] = get_rmsd_cat(region_record['rmsd_region'])
            record[f'plddt_{label}_cat'] = get_plddt_cat(region_record['plddt_ave_region'])

        records.append(record)
    except Exception as e:
        # Best effort: an entry that fails for any region is reported and left out of the table.
        print(f"Got error {e} for {pdb_code=}.")
# from_records is a classmethod, so call it on the class rather than on a throwaway instance.
df_antibody = pd.DataFrame.from_records(records)

  0%|          | 0/83 [00:00<?, ?it/s]

In [21]:
# Save the antibody summary table. to_csv is called with its defaults, so the
# DataFrame index is written as the first column of the file.
df_antibody.to_csv('../data/AF2_antibody_rmsd_plddt_multi_regions.csv')

In [22]:
# Build one summary row per benchmark entry for the antigen models: RMSD, mean pLDDT and
# their category labels over the full structure, the 'CDR-EpiVague' region and the 'Para-Epi' region.
records = []
# Use the same notebook-relative benchmark location as the antibody run rather than an absolute
# path inside one user's home directory, so the notebook is runnable on other machines.
# NOTE(review): confirm this resolves to the same shared benchmark folder that was hard-coded before.
benchmark_folder = Path('../../benchmark_haddock_23_May_2023')

for path in tqdm(list(benchmark_folder.iterdir())):
    # Only sub-directories are benchmark entries; anything else in the folder is skipped.
    if not path.is_dir():
        continue
    pdb_code = path.name

    # Unlike the antibody run, exceptions are not caught here, so a failing entry stops the loop.
    para_epi_record = get_plddts(pdb_code,benchmark_folder,region_type='Para-Epi',model_type='antigen')
    vague_record = get_plddts(pdb_code,benchmark_folder,region_type='CDR-EpiVague',model_type='antigen')
    full_record = get_plddts(pdb_code,benchmark_folder,region_type='full',model_type='antigen')

    # Same four columns per region, emitted in the order full -> vague -> para_epi.
    record = {'pdb': full_record['pdb']}
    for label, region_record in (('full', full_record),
                                 ('vague', vague_record),
                                 ('para_epi', para_epi_record)):
        record[f'rmsd_{label}'] = region_record['rmsd_region']
        record[f'plddt_ave_{label}'] = region_record['plddt_ave_region']
        record[f'rmsd_{label}_cat'] = get_rmsd_cat(region_record['rmsd_region'])
        record[f'plddt_{label}_cat'] = get_plddt_cat(region_record['plddt_ave_region'])

    records.append(record)
# from_records is a classmethod, so call it on the class rather than on a throwaway instance.
df_antigen = pd.DataFrame.from_records(records)

100%|██████████| 83/83 [00:27<00:00,  3.07it/s]


In [23]:
# Save the antigen summary table. to_csv is called with its defaults, so the
# DataFrame index is written as the first column of the file.
df_antigen.to_csv('../data/AF2_antigen_rmsd_plddt_multi_regions.csv')