# ABOUT
This file builds a SHEPHARD file with attractive/repulsive interaction attributes for each CHAINSAW globular domain.

NOTE - this analysis has already been run and does not need to be re-run. It takes a long time.

In [None]:
import numpy as np
import matplotlib
import sys
import os


import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
	
# Export settings and axes styling, applied in one go:
#   * pdf/ps fonttype 42 -> fonts in exported PDF/PS files stay editable in
#     illustrator/affinity
#   * axes.linewidth     -> line width used for the axes
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'axes.linewidth': 0.5,
})

# these settings make inline figures look sharp on a retina
# MacBook. They can be commented out without any issue and
# are solely aesthetic.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global figure font: Avenir at normal weight
# (UPDATE 2020-12-31 - preferred font is Avenir).
font = dict(family='avenir', weight='normal')
matplotlib.rc('font', **font)

from tqdm import tqdm
import pickle
from sparrow import Protein
import protfasta

from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list,fcluster
import matplotlib.patches as mpatches

import itertools


In [None]:
# Make the sibling ../code directory (relative to the current working
# directory) importable from this notebook, adding it to sys.path only once.
local_code_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'code'))

if sys.path.count(local_code_path) == 0:
    sys.path.append(local_code_path)

In [None]:
import pandas as pd
import os
from finches.utils import folded_domain_utils
from finches import CALVADOS_frontend
from domain_def import Domain


# Single shared CALVADOS frontend instance; its IMC_object attribute is what
# gets passed to the per-domain surface-epsilon calculations further down.
cf = CALVADOS_frontend()
from sparrow.data.amino_acids import VALID_AMINO_ACIDS
import protfasta

In [None]:
# Load the CALVADOS fingerprint sequences and re-key them by the first two
# characters of each sequence (the FASTA headers are discarded). Sequences
# that share their first two characters collapse onto one key, last one wins.
raw_fingerprints = protfasta.read_fasta('../data/fingerprint_calvados.fasta')
fingerprints = {sequence[0:2]: sequence for sequence in raw_fingerprints.values()}

In [None]:
# Exploratory traversal of the CHAINSAW domain tree. The body only skips
# directories that have no subdirectories and does nothing otherwise, so
# this cell has no effect beyond binding root/dirs/files.
for root, dirs, files in os.walk('../data/domains_chainsaw/'):
    if not dirs:
        continue

In [None]:
# Score every CHAINSAW domain structure file against every fingerprint
# sequence.
#
#   uid2domains : uid -> list of [start, end] (strings parsed from file names)
#   uid2scores  : uid -> {"<uid>_<start>_<end>": [[attractive, repulsive], ...]}
#                 one pair per fingerprint, in `fingerprints` iteration order
uid2domains = {}
uid2scores = {}
surface_threshold = 0.4
input_root = '../data/domains_chainsaw/'

# os.walk() is recursive: after input_root it also yields every subdirectory
# in turn (each with an empty `dirs`). Only the first tuple is needed to list
# the top-level directories, so take that one and stop. A missing input_root
# yields nothing, in which case the default gives an empty list.
top_level_dirs = next(os.walk(input_root), (input_root, [], []))[1]

# sorted() keeps the processing (and therefore output) order reproducible
# across machines; handing tqdm a list lets it report a total.
for tl_dir in tqdm(sorted(top_level_dirs)):

    # get files in each top-level directory; dedicated loop names so nothing
    # from the listing above is shadowed
    for sub_root, _sub_dirs, domain_files in os.walk(os.path.join(input_root, tl_dir)):

        # cycle through each file in each directory
        for file in sorted(domain_files):
            print(file)

            # expected file name: <uid>_<start>_<end>.<ext>; anything with
            # fewer fields (e.g. .DS_Store) is skipped instead of raising
            name_parts = file.split('_')
            if len(name_parts) < 3:
                print(f"skipping {file}: name is not <uid>_<start>_<end>.<ext>")
                continue
            uid, start = name_parts[0], name_parts[1]
            end = name_parts[2].split('.')[0]

            # join with the directory the file was actually found in, so the
            # path is also right for files below the first directory level
            full_name = os.path.join(sub_root, file)

            # build a finches folded domain
            # NOTE(review): the class is spelled `FoldeDomain` here; confirm
            # this matches the name exposed by the installed finches version.
            x = folded_domain_utils.FoldeDomain(full_name, surface_thresh=surface_threshold)

            # one [attractive, repulsive] pair per fingerprint: each value is
            # the sum of what the corresponding surface-epsilon call returns,
            # rounded to 3 decimals
            domain_scores = []
            for f_seq in fingerprints.values():
                attractive = np.sum(x.calculate_attractive_surface_epsilon(f_seq, cf.IMC_object))
                repulsive = np.sum(x.calculate_repulsive_surface_epsilon(f_seq, cf.IMC_object))
                domain_scores.append([round(attractive, 3), round(repulsive, 3)])

            # record the domain only once it has been scored, so both
            # dictionaries always describe the same set of domains
            domain_name = f"{uid}_{start}_{end}"
            uid2domains.setdefault(uid, []).append([start, end])
            uid2scores.setdefault(uid, {})[domain_name] = domain_scores
                    
                    

        
# Write one tab-separated line per domain:
#   uid, start, end, "globular_chainsaw", then for every fingerprint key
#   "<key>_attractive:<value>" and "<key>_repulsive:<value>"
# The surface threshold is part of the output file name.
with open(f'../data/shprd_files/shprd_chainsaw_domains_{str(surface_threshold)}.tsv','w') as fh:
    for uid, domains in uid2domains.items():
        for start, end in domains:
            domain_id = f"{uid}_{start}_{end}"
            domain_scores = uid2scores[uid][domain_id]

            # scores are stored positionally, in fingerprint iteration order
            attributes = []
            for idx, f_k in enumerate(fingerprints):
                attractive, repulsive = domain_scores[idx]
                attributes.append(f"{f_k}_attractive:{attractive}")
                attributes.append(f"{f_k}_repulsive:{repulsive}")
            interaction_string = "\t".join(attributes).strip()

            fh.write(f"{uid}\t{start}\t{end}\tglobular_chainsaw\t{interaction_string}\n")
