In [1]:
from pathlib import Path
import pandas as pd
import tarfile
from io import BytesIO
from joblib import Parallel, delayed

In [2]:
# Directory holding the downloaded Zenodo data. Download the data to this path before
# running the notebook; the `*.tgz` archives inside it are read further down.
haddock_results = Path('../../results_24_july_2024')

In [3]:
def get_top_dockq(df: pd.DataFrame, n_ranks: int = 1):
    """Return the best DockQ among the top-ranked rows of a table.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns 'caprieval_rank' and 'dockq'.
    n_ranks : int, default 1
        Rows whose 'caprieval_rank' is one of 1..n_ranks (inclusive) are considered.

    Returns
    -------
    The maximum 'dockq' value over the selected rows (NaN when no row is selected).
    """
    accepted_ranks = range(1, n_ranks + 1)
    is_top_ranked = df['caprieval_rank'].isin(accepted_ranks)
    return df.loc[is_top_ranked, 'dockq'].max()

# Lookup tables: human-readable label -> fragment used to build the path of a
# `capri_ss.tsv` file inside each archive. Insertion order determines record order.

# Antibody model name -> abbreviation used in the run folder name.
model_name_run_dict = {
    'ABodyBuilder2': 'ab',
    'AlphaFold2': 'af2',
    'ABlooper': 'abl',
    'IgFold': 'ig',
}

# Docking protocol label -> protocol part of the run folder name.
protocols_dict = {
    'CDR-VagueEpi-AA': 'CDR-EpiVag-AA-mpi-50-50',
    'Para-Epi': 'Para-Epi-mpi-50-50',
}

# Antigen structure source -> prefix of the run folder name (empty for experimental).
antigen_dict = {
    'alphafold2': 'af2',
    'experimental': '',
}

# Docking stage label -> evaluation folder within the run folder.
stage_to_eval_dict = {
    'Rigid-body': '2_caprieval',
    'Refinement': '5_caprieval',
    'Clustering': '7_caprieval',
}

In [6]:
def get_pdb_records(pdb_tar_file_path: Path):
    """Collect top-N DockQ records for every run stored in one archive.

    The PDB name is the archive file name with the '_capri.tgz' suffix removed. For
    each combination of antigen, protocol, stage and antibody model the member
    ``<pdb>/run-<antigen><model>-<protocol>/<eval_folder>/capri_ss.tsv`` is read as a
    tab-separated table, and one record is produced for each ``topn`` in (1, 10).

    Parameters
    ----------
    pdb_tar_file_path : Path
        Path to the tar archive of one PDB entry.

    Returns
    -------
    list of dict
        Records with keys 'pdb', 'antigen', 'model_name', 'protocol', 'stage',
        'topn' and 'topn_dockq' (the value returned by ``get_top_dockq``), ordered
        antigen -> protocol -> stage -> model -> topn.

    Raises
    ------
    KeyError
        If an expected ``capri_ss.tsv`` member is not present in the archive.
    """
    records = []
    pdb_name = pdb_tar_file_path.name.removesuffix('_capri.tgz')
    with tarfile.open(pdb_tar_file_path, 'r') as tfile:
        for antigen, antigen_str in antigen_dict.items():
            for protocol, protocol_str in protocols_dict.items():
                for stage, eval_folder in stage_to_eval_dict.items():
                    for model_name, model_run_str in model_name_run_dict.items():
                        dockq_fpath = f'{pdb_name}/run-{antigen_str}{model_run_str}-{protocol_str}/{eval_folder}/capri_ss.tsv'
                        # Extract and parse each table once; it does not depend on
                        # topn, so it must not be re-read inside the topn loop.
                        with tfile.extractfile(tfile.getmember(dockq_fpath)) as file_object:
                            df = pd.read_csv(BytesIO(file_object.read()), sep='\t')
                        for topn in [1, 10]:
                            records.append({
                                'pdb': pdb_name,
                                'antigen': antigen,
                                'model_name': model_name,
                                'protocol': protocol,
                                'stage': stage,
                                'topn': topn,
                                'topn_dockq': get_top_dockq(df, topn),
                            })
    return records

In [7]:
# Sort the archive paths: glob yields them in a filesystem-dependent order, and the
# results (returned in input order) determine the row order of the table built below.
tarfile_paths = sorted(haddock_results.glob('*.tgz'))
if not tarfile_paths:
    # Fail loudly instead of silently producing an empty table when the data is missing.
    raise FileNotFoundError(
        f"No '*.tgz' archives found in {haddock_results.resolve()}; "
        "download the Zenodo data to that directory first."
    )

# One task per archive; n_jobs=-2 leaves one CPU free.
with Parallel(n_jobs=-2, backend='loky',verbose=10) as parallel:
    nested_records = parallel(delayed(get_pdb_records)(tarfile_path) for tarfile_path in tarfile_paths)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 9 concurrent workers.
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-2)]: Done  23 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-2)]: Done  32 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-2)]: Done  43 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-2)]: Done  54 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-2)]: Done  75 out of  83 | elapsed:   28.2s remaining:    3.0s
[Parallel(n_jobs=-2)]: Done  83 out of  83 | elapsed:   29.8s finished


In [8]:
# Flatten the per-archive lists of records into a single table, one row per record.
flat_records = []
for pdb_records in nested_records:
    flat_records.extend(pdb_records)
df = pd.DataFrame(flat_records)

In [9]:
# Write the collected records to CSV (the DataFrame index is written as the first column).
df.to_csv('../data/topn_dockq.csv')