# ABOUT
This file calculates fractional and absolute counts of surface-accessible residues. Note that we use the `SASA_ONLY` flag, a hacky flag that can be passed to a FoldedDomain object to skip some expensive operations during initialization. This makes any other FoldedDomain functionality unusable, but it shifts this analysis from ~6 hours to ~15 minutes.

## Output
This script generates the files

`../data/shprd_files/shprd_{mode_name}_per_res_SASA.tsv`

where {mode_name} will be either `chainsaw` or `dodo`.

In [1]:
import numpy as np
import os
import sys

from tqdm import tqdm
from finches.utils import folded_domain_utils

# Resolve the sibling `code` directory (one level up from the current working
# directory) and register it on sys.path so that modules living there can be
# imported from this notebook.
local_code_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'code'))

# only register the directory once, even if this cell is re-run
if not any(entry == local_code_path for entry in sys.path):
    sys.path.append(local_code_path)


from domain_def import Domain


In [None]:
# settings: which set of domain definitions this run analyses
mode_name = 'chainsaw'

# only the two supported domain sources are accepted
if not (mode_name == 'chainsaw' or mode_name == 'dodo'):
    raise Exception('Mode must be either chainsaw or dodo!')

# directory tree that is walked to find the domain files for this mode
hard_root = '../data/domains_' + mode_name
 

The cell below takes about 12 minutes to run on my MacBook Pro (M3), which is a little while, but down from the ~6 hours it took previously (!) when we didn't have the `SASA_ONLY` keyword for the FoldedDomain object.

In [None]:
# Walk the domain directory tree and collect, for every domain file, the
# boundaries parsed from its filename plus its sequence and per-residue SASA.
#
# uid2domains maps each uid (parsed from the filename) to a list of
# [start, end, seq, sasa] entries, one per successfully loaded domain file.
# start and end are kept as the strings parsed from the filename.
uid2domains = {}

# for debugging: set to True to bail out of every loop level
breakflag = False

# get the list of all the files in the directory and subdirectories
for root, dirs, files in os.walk(hard_root):

    if breakflag:
        break

    # skip directories that contain no subdirectories (os.walk also visits
    # each top-level directory after hard_root; those are handled below)
    if len(dirs) == 0:
        continue

    # for each top-level directory (dirs) we're going to cycle through
    # the contents. tqdm wraps the list directly so it knows the total and
    # can show a real progress bar.
    for tl_dir in tqdm(dirs):
        if breakflag:
            break

        # get files in each top-level directory; separate names are used so
        # the outer loop's root/dirs/files are not shadowed
        for _sub_root, _sub_dirs, tl_files in os.walk(f'{hard_root}/{tl_dir}/'):
            if breakflag:
                break

            # cycle through each file in each directory
            for file in tl_files:

                full_name = f"{hard_root}/{tl_dir}/{file}"

                # do some mode specific parsing
                if mode_name == 'chainsaw':
                    uid = file.split('_')[0]
                    try:
                        start = file.split('_')[1]
                        end = file.split('_')[2].split('.')[0]
                    except IndexError:
                        print(f'Skipping {file}')
                        continue
                elif mode_name == 'dodo':
                    try:
                        # uid parsing lives inside the try so that a stray
                        # file without a '-' is skipped rather than crashing
                        # the whole run
                        uid = file.split('-')[1]
                        start = file.split('_')[-2]
                        end = file.split('_')[-1].split('.')[0]

                        # inverted boundaries are routed to the same skip
                        # path as unparseable filenames
                        if int(end) < int(start):
                            print(full_name)
                            raise IndexError('This is an error...')

                    except (IndexError, ValueError):
                        print(f'Skipping {file}')
                        continue

                # build a finches folded domain; the constructor call must sit
                # inside the try so that one unreadable file is reported and
                # skipped instead of aborting the whole (long-running) loop
                try:
                    x = folded_domain_utils.FoldedDomain(full_name, SASA_ONLY=True)
                except Exception:
                    print(f"Godamn, something went wrong with {full_name} so we're skipping it")
                    continue

                # amino acid sequence of PDB file
                seq = x.sequence

                # per-residue SASA value
                sasa = x.sasa

                # assign each domain to each UID; the list is only created
                # once a domain has actually loaded, so failed files do not
                # leave empty entries behind
                uid2domains.setdefault(uid, []).append([start, end, seq, sasa])
                
                

0it [00:00, ?it/s]

6039it [12:40,  7.94it/s]


Having generated the mapping, we then write out a SHEPHARD track file.

In [4]:
# Write one row per residue to the SHEPHARD track file, tab-separated:
#   uid, position, the literal 'sasa_site', residue, rounded SASA
# The context manager guarantees the file is flushed and closed even if a
# malformed entry raises part-way through.
with open(f'../data/shprd_files/shprd_{mode_name}_per_res_SASA.tsv', 'w') as fh:

    for uid, entries in uid2domains.items():
        for entry in entries:
            # each entry is [start, end, seq, sasa]
            start = entry[0]
            seq = entry[2]

            # SASA values are rounded to the nearest integer for output
            rounded_sasa = [str(int(round(i))) for i in entry[3]]

            # position of the first residue; identical for every row of this
            # domain, so it is computed once
            offset = int(start) + 1

            for i, sasa in enumerate(rounded_sasa):
                fh.write(f"{uid}\t{i+offset}\tsasa_site\t{seq[i]}\t{sasa}\n")
