# About
This notebook extracts all Chainsaw-defined domains used in the paper from an input AlphaFold2-derived proteome. Note that it actually generates two versions of the decomposed proteome:


1. `domains_chainsaw` are the domains where we decompose even internal regions into isolated subdomains (every segment of a discontinuous domain is written out as its own structure). This is the most "aggressive" domain decomposition approach.
2. `domains_chainsaw_extended` are the domains where we use the broader chainsaw definitions (each domain runs from the start of its first segment to the end of its last segment). This is a much less aggressive version; its analysis is not in the current version of the manuscript, but we include it for completeness.

In [None]:
import pandas as pd
import os
import protfasta
from finches.utils import folded_domain_utils
from tqdm import tqdm

In [None]:
# Load the chainsaw output (a TSV written by the chainsaw program) into memory,
# one list element per line with the trailing newline retained.
with open('../data/chainsaw_domain_definitions.tsv', 'r') as fh:
    content = list(fh)

In [None]:
# Define where the AlphaFold2 proteome used as input lives, and where the
# extracted domains are written.
#
# `rootdir` should be a directory containing the AlphaFold2 PDB files, named
# `AF-<UniProtID>-F1-model_v4.pdb`. The filename is parsed below (second
# '-'-separated field) to map each UniProt ID to its filename.

rootdir = '../data/UP000002311_559292_YEAST_v4'
outroot = '../data/domains_chainsaw'

# build mapping of UniProt IDs to filenames
uid2fn = {}
for entry in os.listdir(rootdir):
    fields = entry.split('-')

    # os.listdir returns every directory entry, so stray files that do not
    # follow the AF-<UniProtID>-... pattern (e.g. .DS_Store) can show up here;
    # skip them rather than failing with an IndexError
    if len(fields) < 2:
        continue

    uid = fields[1]
    uid2fn[uid] = entry


In [None]:
# Extract every chainsaw domain segment into its own PDB file under `outroot`.
#
# Each data line of the chainsaw output is tab-separated. Column 0 holds the
# structure identifier (the UniProt ID is its second '-'-separated field) and
# column 4 holds the domain assignment: entries separated by '_', each made of
# one or more comma-separated `start-end` segments. In this version every
# individual segment is written out as a separate file.

errors = 0
newly_generated = 0
found_existing = 0

# set to False to print the offending line whenever a segment cannot be parsed
silent_errors = True

# the first line of the file is skipped (presumably the TSV header row)
for line in tqdm(content[1:]):
    sline = line.strip().split('\t')

    uid = sline[0].split('-')[1]

    infile = f'{rootdir}/{uid2fn[uid]}'
    outdir = f'{outroot}/{uid}'

    # makedirs (rather than mkdir) so that a missing `outroot` is created too
    os.makedirs(outdir, exist_ok=True)

    boundaries = []
    domains = sline[4].split("_")
    for d in domains:
        sub_domains = d.split(',')
        for sd in sub_domains:
            try:
                (start, end) = sd.split('-')
            except ValueError:
                # segment is not of the form `start-end`; count it and move on
                errors = errors + 1
                if silent_errors is False:
                    print(f'Error unpacking on {line}.\nSkipping..')
                continue
            start = int(start)
            end = int(end)

            # only keep segments where end - start is at least 10
            if end - start >= 10:
                boundaries.append([start, end])

    for d in boundaries:

        # NB: This +1 offset is actually a tiny error and I'm not sure why it was introduced, but
        # the chainsaw number uses a 1-indexing not a zero indexing. For consistency, this is kept
        # here but in the future I'd probably recommend not offsetting by +1
        start = d[0] + 1
        end = d[1]

        outfile = f'{outdir}/{uid}_{start}_{end}.pdb'

        # never overwrite: files from a previous run are only counted
        if not os.path.exists(outfile):
            folded_domain_utils.extract_and_write_domains(infile, outfile, start, end)
            newly_generated = newly_generated + 1
        else:
            found_existing = found_existing + 1

print('')
# both counters count domain files, not proteins
print(f"Generated domains for {newly_generated} new domains")
print(f"Found domains for {found_existing} existing domains")
print(f"Had {errors} error lines (skipped)")

            

In [None]:
# Output location for the "extended" (less aggressive) domain definitions.
outroot = '../data/domains_chainsaw_extended'

# build mapping of UniProt IDs to filenames (the ID is the second
# '-'-separated field of each filename in `rootdir`)
uid2fn = {}
for entry in os.listdir(rootdir):
    fields = entry.split('-')

    # os.listdir returns every directory entry, so stray files without a
    # '-'-separated ID field (e.g. .DS_Store) can show up here; skip them
    # rather than failing with an IndexError
    if len(fields) < 2:
        continue

    uid = fields[1]
    uid2fn[uid] = entry

In [None]:
# Extract chainsaw domains using the broader ("extended") definition and write
# each one to its own PDB file under `outroot`.
#
# Each data line of the chainsaw output is tab-separated. Column 0 holds the
# structure identifier (the UniProt ID is its second '-'-separated field) and
# column 4 holds the domain assignment: entries separated by '_', each made of
# one or more comma-separated `start-end` segments. In this version a domain
# spans from the start of its first segment to the end of its last segment.

errors = 0
newly_generated = 0
found_existing = 0

# set to True to suppress the message printed when a domain cannot be parsed
silent_errors = False

# the first line of the file is skipped (presumably the TSV header row)
for line in tqdm(content[1:]):
    sline = line.strip().split('\t')

    uid = sline[0].split('-')[1]

    infile = f'{rootdir}/{uid2fn[uid]}'
    outdir = f'{outroot}/{uid}'

    # makedirs (rather than mkdir) so that a missing `outroot` is created too
    os.makedirs(outdir, exist_ok=True)

    boundaries = []
    domains = sline[4].split("_")

    for d in domains:
        local_domains = d.split(',')
        try:
            start = int(local_domains[0].split('-')[0])
            end = int(local_domains[-1].split('-')[1])
        except (ValueError, IndexError):
            # ValueError: a field is not an integer; IndexError: the last
            # segment has no '-' separator. Either way the entry is not of the
            # form `start-end`, so count it and move on.
            errors = errors + 1
            if silent_errors is False:
                print(f'Error unpacking on {line}.\nSkipping..')
            continue

        # only keep domains where end - start is at least 10
        if end - start >= 10:
            boundaries.append([start, end])

    for d in boundaries:

        # NB: This +1 offset is actually a tiny error and I'm not sure why it was introduced, but
        # the chainsaw number uses a 1-indexing not a zero indexing. For consistency, this is kept
        # here but in the future I'd probably recommend not offsetting by +1
        start = d[0] + 1
        end = d[1]

        outfile = f'{outdir}/{uid}_{start}_{end}.pdb'

        # never overwrite: files from a previous run are only counted
        if not os.path.exists(outfile):
            folded_domain_utils.extract_and_write_domains(infile, outfile, start, end)
            newly_generated = newly_generated + 1
        else:
            found_existing = found_existing + 1

print('')
print(f"Generated domains for {newly_generated} new domains")
print(f"Found domains for {found_existing} existing domains")
print(f"Had {errors} error lines (skipped)")
    