# ABOUT
This file calculates fractional and absolute counts of surface-accessible residues

In [1]:
import numpy as np
import matplotlib
import pandas as pd
import os
import sys

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Set such that PDF fonts export in a manner that they
# are editable in illustrator/affinity
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# set to define axes linewidths
matplotlib.rcParams['axes.linewidth'] = 0.5

# this defines some prefactors so inline figures look nice
# on a retina macbook. These can be commented out without any
# issue and are solely asthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# UPDATE 2020-12-31 (my preferred font is Avenir...)
font = {'family' : 'avenir',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)

from tqdm import tqdm
import pickle
from sparrow import Protein
import protfasta

from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list,fcluster
import matplotlib.patches as mpatches

import itertools


In [None]:
# lets you import from the code directory here
local_code_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'code'))

if local_code_path not in sys.path:
    sys.path.append(local_code_path)
from finches.utils import folded_domain_utils
from finches import CALVADOS_frontend
from domain_def import Domain


cf = CALVADOS_frontend()
from sparrow.data.amino_acids import VALID_AMINO_ACIDS
import protfasta

In [3]:
def get_sasa_info(x):
    """
    Function that will calculate the surface residue identity and SASA 
    (Solvent Accessible Surface Area) for a given folded domain object.

    Parameters:

        x : folded_domain_utils.FoldeDomain
            The folded domain object to analyze.

    Returns:
        tuple: A tuple containing two dictionaries:
            - surface_res: A dictionary with residue types as keys and their counts as values.
            - surface_res_sasa: A dictionary with residue types as keys and their total SASA as values.
        
    """

    # Initialize dictionaries to store surface residue counts and SASA values
    surface_res = {}
    surface_res_sasa = {}

    
    for k in x.surface_positions:
        res = x.sequence[k]
    
        if res not in surface_res:
            surface_res[res] = 0
            surface_res_sasa[res] = 0 

        surface_res[res] = surface_res[res] + 1
        surface_res_sasa[res] = surface_res_sasa[res] + x.sasa[k]
    return (surface_res, surface_res_sasa)



    

In [4]:
hard_root = '../data/domains_chainsaw'

# settings
SURFACE_THRESH = 0.10
SURFACE_THRESH_NAME = int(SURFACE_THRESH*100)
 

In [None]:
uid2domains = {}
uid2scores = {}
breakflag = False
for root, dirs, files in os.walk(hard_root):
    if breakflag:
        break

    # NO IDEA why this is needed by walk seems to be fucked..?
    if len(dirs) ==0:
        continue

    
    # cycle through each directory
    for i, tl_dir in enumerate(dirs):
        if breakflag:
            break

        # get files in teach tol level directroy
        for root, dirs, files in tqdm(os.walk(f'{hard_root}/{tl_dir}/')):
            if breakflag:
                break
            

            # cycle through each file in each directory
            for file in files:                        
                
                full_name = f"{hard_root}/{tl_dir}/{file}"
                uid = file.split('_')[0]
                start = file.split('_')[1]
                end = file.split('_')[2].split('.')[0]
                if uid not in uid2domains:
                    uid2domains[uid] = []                    
                

                # build a finches folded domain
                x = folded_domain_utils.FoldeDomain(full_name, surface_thresh=SURFACE_THRESH)            

                
                # calculate suface residue idenity and SASA
                try:

                    # write a SASA vis file just so we have it for spot checking...
                    #x.write_SASA_vis_file(filename=f"{hard_root}/{tl_dir}/" + uid +"_"+start+"_"+end + "10_.vis")
                    
                    surface_res, surface_res_sasa = get_sasa_info(x)

                    # build magic strings
                    surface_res_sasa = dict(sorted(surface_res_sasa.items()))
                    SASA_string = "\t".join([f"{k}_SASA : {round(surface_res_sasa[k],2)}"for k in surface_res_sasa])

                    surface_res = dict(sorted(surface_res.items()))
                    res_count_string = "\t".join([f"{k}_SASA_count : {surface_res[k]}"for k in surface_res])

                    uid2domains[uid].append([start, end, f'SASA_v1_{SURFACE_THRESH_NAME}', SASA_string, res_count_string, x.surface_fraction])
                    
                except Exception as e:
                    print(e)
                    print(f"Oh darn something went wrong with {hard_root}/{tl_dir}/ + uid... Skipping...")


1it [00:04,  4.36s/it]
0it [00:00, ?it/s]

In [8]:
with open(f'../data/shprd_files/shprd_chainsaw_domain_sasa_v1_{SURFACE_THRESH_NAME}.tsv','w') as fh:
    for uid in uid2domains:
        for d in uid2domains[uid]:
            domain_id = f"{uid}_{d[0]}_{d[1]}"

            interaction_string = f"fraction_sasa:{round(d[5],2)}" + "\t" + d[3] + "\t" +d[4]
            
            fh.write(f"{uid}\t{d[0]}\t{d[1]}\tglobular_chainsaw_sasa_v1_{SURFACE_THRESH_NAME}\t{interaction_string}\n")
