# About
This notebook generates all of the DODO-derived domains used in the paper from an input AlphaFold2-derived proteome.

In [None]:
import os
import shutil

import pandas as pd
import protfasta
from dodo import build
from tqdm import tqdm

In [None]:
# define where the alphafold proteome used for input is
# the rootdir should be a directory holding the PDB files from AlphaFold2. Importantly the filename structure
# should be `AF-<UniProtID>-F1-model_v4.pdb` because this then gets parsed by the code below to map UniProt ID to filename
rootdir = '../data/UP000002311_559292_YEAST_v4'
outroot = '../data/domains_dodo'

# build mapping of uniprot IDs to filenames
# anything that does not follow the AlphaFold naming convention (e.g. `.DS_Store` or `.ipynb_checkpoints`)
# is reported and skipped: previously such an entry either crashed the parse with an IndexError (no '-' in
# the name) or was silently registered under a bogus ID
uid2fn = {}
for entry in os.listdir(rootdir):
    parts = entry.split('-')
    if len(parts) < 3 or parts[0] != 'AF' or not entry.endswith('.pdb'):
        print(f"Skipping '{entry}': name does not match AF-<UniProtID>-F1-model_v4.pdb")
        continue

    # second dash-separated field is the UniProt accession
    uid2fn[parts[1]] = entry


In [None]:
# the code here cycles through each PDB file in the proteome and runs DODO on it, writing the result to
# {outroot}/<UniProtID>/<original filename>. Proteins that already have an output directory are skipped, so
# the cell can be re-run to resume an interrupted job.
# NOTE(review): just_fds=True presumably restricts the output to folded domains - confirm against the DODO docs
newly_generated = 0
found_existing = 0
failed = 0

# create the output root (and any missing parents) so the per-protein mkdir below cannot fail on a fresh checkout
os.makedirs(outroot, exist_ok=True)

# tqdm wraps the sized dict view directly (rather than an enumerate generator) so it can show a total and ETA
for uid, fn in tqdm(uid2fn.items()):

    outdir = f'{outroot}/{uid}'
    infile = f'{rootdir}/{fn}'

    # an existing directory means this protein was processed on a previous run
    if os.path.exists(outdir):
        found_existing = found_existing + 1
        continue

    os.mkdir(outdir)

    try:
        build.pdb_from_pdb(infile, out_path=f'{outdir}/{fn}', just_fds=True, verbose=False)
    except Exception as e:
        # deliberately broad: one bad structure should not abort the whole proteome.
        # The directory was created just above by this iteration, so it is safe to remove; leaving it behind
        # would make the next run count this protein as already done and never retry it.
        shutil.rmtree(outdir, ignore_errors=True)
        failed = failed + 1
        print(f"Failed on {uid} ({e})... carrying on though!")
    else:
        newly_generated = newly_generated + 1

print('')
print(f"Generated domains for {newly_generated} new proteins")
print(f"Found domains for {found_existing} existing proteins")
print(f"Failed to generate domains for {failed} proteins")
      
