In [1]:
import local_seqtools.general_utils as tools
import pandas as pd
import numpy as np
import local_conservation_analysis_pipeline.group_conservation_objects as group_tools
import json
import os
import re
import sys
from pathlib import Path
from Bio import AlignIO, Seq, SeqIO, Align

import matplotlib.pyplot as plt
plt.style.use('custom_standard')
# plt.style.use('custom_small')
import seaborn as sns
# pd.options.plotting.backend = "plotly"

# %load_ext autoreload
# %autoreload 2

In [2]:
# Project-local environment settings. `ORTHODB_DATABASE` is the shared handle whose
# `data_all_seqrecords_dict` is indexed by gene id in the helper functions below.
import local_env_variables.env_variables as env
ORTHODB_DATABASE = env.orthoDBDatabase()

In [3]:
def uni2odb_ids(uniprot_ids, id_labels=None):
    """Resolve UniProt ids to OrthoDB gene ids and return the reverse lookup.

    Parameters
    ----------
    uniprot_ids : iterable
        UniProt ids passed one by one to
        ``uniprotid_search.uniprotid_2_odb_gene_id``.
    id_labels : dict, optional
        ``{uniprot_id: label}`` used only to make the "could not find"
        message more readable. Ids missing from the mapping (or no mapping
        at all) are reported by id alone.

    Returns
    -------
    dict
        ``{odb_gene_id: uniprot_id}`` for every id that resolved. Ids whose
        lookup raises ``ValueError`` are reported on stdout and skipped. If
        two UniProt ids resolve to the same OrthoDB id, the later one wins.
    """
    # NOTE(review): `uniprotid_search` is not imported anywhere in the visible
    # notebook; it must be in scope before this function is called.
    labels = {} if id_labels is None else id_labels
    odb2uni = {}
    for uniprot_id in uniprot_ids:
        try:
            odb_id = uniprotid_search.uniprotid_2_odb_gene_id(uniprot_id)
        except ValueError as e:
            print(e)
            # The handler previously indexed an undefined global (`ligands`),
            # which turned a recoverable miss into a NameError/KeyError.
            label = labels.get(uniprot_id)
            if label is None:
                print(f"COULD NOT FIND: {uniprot_id}")
            else:
                print(f"COULD NOT FIND: {uniprot_id}, {label}")
            continue
        odb2uni[odb_id] = uniprot_id
    return odb2uni


def get_verif_table_custom(
    uniprot_ids: list,
    regex: str,
    idr_map: dict|None,
    seq_organism: str = "Homo sapiens",
):
    odb2uni = uni2odb_ids(uniprot_ids)
    seq_list = [ORTHODB_DATABASE.data_all_seqrecords_dict[i] for i in odb2uni.keys()]
    df_verified = regex_search(regex, seq_list, idr_map=idr_map)
    df_verified["UniprotID"] = df_verified["odb_id"].map(odb2uni)
    df_verified["verified interaction"] = True
    df_verified["Organism"] = seq_organism
    return df_verified

In [4]:
# Load the manually curated Ena/VASP binder annotation spreadsheet.
enah_table = "../data/manually_curated_interactions/ena_vasp_binders_binding_seq_verification_annotation.xlsx"
enadf = pd.read_excel(enah_table)
# Remove stray whitespace around the accessions before they are used as keys.
enadf['UniprotID'] = enadf['UniprotID'].str.strip()
verif_ids = enadf['UniprotID'].unique()
# UniProt id -> name lookup built from the distinct (UniprotID, name) pairs.
verif_id_dict = (
    enadf[['UniprotID', 'name']]
    .drop_duplicates()
    .set_index('UniprotID')['name']
    .to_dict()
)
# Sanity check: the lookup covers exactly the curated ids.
assert set(verif_ids) == set(verif_id_dict)

In [5]:
# Distinct UniProt ids present in the curated Ena/VASP table.
enadf_set = set(enadf['UniprotID'].unique())

In [7]:
# Number of distinct curated UniProt ids.
len(enadf_set)

18

In [8]:
# Benchmark table restricted to verified hits of the 'enah_LPPPP_FPPPP' motif class.
df = pd.read_csv('../benchmark/benchmark_v3/p1_table/benchmark_table_renamed.csv')
df = df.loc[
    df['ELM_motif_class'].eq('enah_LPPPP_FPPPP') & df['verified interaction']
].copy()

In [9]:
def get_flanked_hit(s: pd.Series, flank: int = 5):
    """Return the padded hit for one benchmark row.

    Looks up the sequence record for ``s['gene_id']`` in
    ``ORTHODB_DATABASE.data_all_seqrecords_dict`` and calls
    ``tools.pad_hit`` on its sequence string with the row's
    ``'hit start position'`` / ``'hit end position'`` and ``flank`` as both
    the left and the right flank.

    Returns element ``[2]`` of the ``tools.pad_hit`` result.
    """
    record = ORTHODB_DATABASE.data_all_seqrecords_dict[s['gene_id']]
    padded = tools.pad_hit(
        str(record.seq),
        s['hit start position'],
        s['hit end position'],
        l_flank=flank,
        r_flank=flank,
    )
    # presumably index 2 is the flanked sequence string - confirm against tools.pad_hit
    return padded[2]

In [10]:
# Row-wise: store get_flanked_hit's result (default flank=5) for every benchmark hit.
df['flanked_hit'] = df.apply(get_flanked_hit, axis=1)

In [11]:
# Keep only the columns that go into the exported benchmark table, in this order.
df = df.loc[
    :,
    [
        "Organism",
        "name",
        "flanked_hit",
        "UniprotID",
        "hit start position",
        "hit end position",
        "ELM_motif_class",
        "gene_id",
    ],
]

In [12]:
# Order rows by the 'name' column before export.
df = df.sort_values('name')

In [13]:
# Rows without a citation contribute nothing to the reference lookup.
temp = enadf.dropna(subset=['source for verified interaction']).copy()
# Citation strings in this notebook appear with stray trailing spaces
# (e.g. '10.1091/mbc.11.1.117 '); strip them so the exported reference column
# is clean, mirroring the .str.strip() already applied to 'UniprotID'.
# Non-string values are passed through untouched.
temp['source for verified interaction'] = temp['source for verified interaction'].map(
    lambda ref: ref.strip() if isinstance(ref, str) else ref
)
# UniProt id -> citation; if an id has several cited rows the last one wins.
citation_map = temp[['source for verified interaction','UniprotID']].set_index('UniprotID').to_dict()['source for verified interaction']

In [14]:
# Attach the curated citation for each UniProt id; ids absent from citation_map get NaN.
df['reference'] = df['UniprotID'].map(citation_map)

In [16]:
# Number of distinct UniProt ids among the verified benchmark hits.
len(df.UniprotID.unique())

17

In [29]:
# Supplementary tables are written below ./supplement/manual_TPs (created on demand).
supp_folder = Path('./supplement/')
output_folder = supp_folder.joinpath('manual_TPs')
output_folder.mkdir(parents=True, exist_ok=True)
# Export the Ena/VASP benchmark hits without the pandas row index.
df.to_csv(output_folder.joinpath('ena_vasp_benchmark.csv'), index=False)

In [17]:
# Citation strings (DOIs, some prefixed with 'ELM - '), kept verbatim,
# including the trailing spaces some of them carry.
x=[
	'10.1016/j.devcel.2004.07.021',
	'10.1016/j.devcel.2014.08.001',
	'ELM - 10.1093/nar/gkad1058',
	'10.1038/sj.emboj.7600380 ',
	'10.1074/jbc.M512107200',
	'10.1083/jcb.149.1.181',
	'10.1091/mbc.11.1.117 ',
	'10.7554/eLife.70680',
	'10.1002/cm.10173',
	'10.7554/eLife.70680',
	'10.1073/pnas.1903125117',
	'10.1038/ncomms11491',
	'10.1016/j.devcel.2004.07.024',
	'ELM - 10.1093/nar/gkad1058',
	'10.1016/s0092-8674(00)80883-1',
	'10.1083/jcb.201501003 ',
	'10.7554/eLife.70680',
	'10.1242/dev.045369',
	'ELM - 10.1093/nar/gkad1058',
	'ELM - 10.1093/nar/gkad1058',
	'10.1016/j.yexcr.2006.03.015 ',
	'ELM - 10.1093/nar/gkad1058',
	'10.1074/jbc.M001698200 ',
]
# Count distinct citations ignoring surrounding whitespace, so that an entry
# such as '10.1091/mbc.11.1.117 ' can never be counted separately from a
# clean copy of the same DOI.
print(len({ref.strip() for ref in x}))

17


In [18]:
# Show the distinct citation strings; entries are compared verbatim, so trailing spaces are kept.
set(x)

{'10.1002/cm.10173',
 '10.1016/j.devcel.2004.07.021',
 '10.1016/j.devcel.2004.07.024',
 '10.1016/j.devcel.2014.08.001',
 '10.1016/j.yexcr.2006.03.015 ',
 '10.1016/s0092-8674(00)80883-1',
 '10.1038/ncomms11491',
 '10.1038/sj.emboj.7600380 ',
 '10.1073/pnas.1903125117',
 '10.1074/jbc.M001698200 ',
 '10.1074/jbc.M512107200',
 '10.1083/jcb.149.1.181',
 '10.1083/jcb.201501003 ',
 '10.1091/mbc.11.1.117 ',
 '10.1242/dev.045369',
 '10.7554/eLife.70680',
 'ELM - 10.1093/nar/gkad1058'}

In [21]:
# Manually curated TRAF6 ligand table (includes the 'reference DOI' column used below).
tt = pd.read_excel('./supplement/manual_TPs/TRAF6_ligands.xlsx')

In [33]:
# Display the curated TRAF6 ligand table.
tt

Unnamed: 0,Name,Uniprot ID,SLiM sequence,reference DOI,ref
0,CD40,P25942,KQEPQEINF,"10.1073/pnas.96.4.1234, 10.1074/jbc.274.20.14246","1999-Tsukamoto, 1999-Pullen"
1,TIFA,Q96CG3,SSSPTEMDE,10.1002/cbic.201800436,2019-Huang
2,MAVS,Q7Z434,CHGPEENEY,10.1074/jbc.M115.666578,2015-Shi
3,TICAM1,Q8IUC6,CQEPEEMSW,"10.1073/pnas.0308496101, 10.4049/jimmunol.171....","2004-Jiang, 2003-Sato"
4,IRAK2,O43187,SNTPEETDD,10.1038/nature00888,2002-Ye
5,IRAK1,P51617,PPSPQENSY,10.1038/nature00888,2002-Ye
6,IRAK1,P51617,PNQPVESDE,10.1038/nature00888,2002-Ye
7,IRAK1,P51617,RQGPEESDE,10.1038/nature00888,2002-Ye
8,IRAK3 (IRAK-M),Q9Y616,PSIPVEDDE,10.1038/nature00888,2002-Ye
9,mouse TNFRSF11A (RANK),O35305,RKIPTEDEY,10.1038/nature00888,2002-Ye


In [32]:
# UniProt id -> reference DOI(s); for ids listed more than once the last row wins.
refmap = tt.set_index('Uniprot ID')['reference DOI'].to_dict()

In [36]:
# Verified TRAF6 hits from the benchmark table, reduced to four columns and
# renamed to the headers used in the curated ligand spreadsheet.
df = (
    pd.read_csv('../benchmark/benchmark_v3/p1_table/benchmark_table_renamed.csv')
    .loc[lambda t: (t['ELM_motif_class'] == 'TRAF6') & (t['verified interaction'])]
    .loc[:, ['name', 'UniprotID', 'hit_sequence', 'gene_id']]
    .rename(
        columns={
            'name': 'Name',
            'UniprotID': 'Uniprot ID',
            'hit_sequence': 'SLiM sequence',
            'gene_id': 'OrthoDB id',
        }
    )
)

In [38]:
# Attach the reference DOI(s) from the curated ligand table by UniProt id, then display the result.
df['reference DOI'] = df['Uniprot ID'].map(refmap)
df

Unnamed: 0,Name,Uniprot ID,SLiM sequence,OrthoDB id,reference DOI
2581,CD40,P25942,KQEPQEINF,9606_0:004882,"10.1073/pnas.96.4.1234, 10.1074/jbc.274.20.14246"
2582,TIFA,Q96CG3,SSSPTEMDE,9606_0:001440,10.1002/cbic.201800436
2583,MAVS,Q7Z434,CHGPEENEY,9606_0:00486f,10.1074/jbc.M115.666578
2584,TICAM1,Q8IUC6,CQEPEEMSW,9606_0:004368,"10.1073/pnas.0308496101, 10.4049/jimmunol.171...."
2585,IRAK2,O43187,SNTPEETDD,9606_0:000e31,10.1038/nature00888
2586,IRAK1,P51617,PPSPQENSY,9606_0:004fa3,10.1038/nature00888
2587,IRAK1,P51617,PNQPVESDE,9606_0:004fa3,10.1038/nature00888
2588,IRAK1,P51617,RQGPEESDE,9606_0:004fa3,10.1038/nature00888
2589,IRAK3 (IRAK-M),Q9Y616,PSIPVEDDE,9606_0:0031e9,10.1038/nature00888
2590,mouse TNFRSF11A (RANK),O35305,RKIPTEDEY,10090_0:000361,10.1038/nature00888


In [39]:
# Export the TRAF6 table without the pandas row index.
df.to_csv(Path('./supplement/manual_TPs') / 'traf6_table.csv', index=False)

In [25]:
# UniProt id -> gene id for the verified TRAF6 rows of the wide-form z-score table.
final_df = pd.read_csv('./z_scores_wideform.csv')
x = final_df.loc[
    final_df['ELM_motif_class'].eq('TRAF6') & final_df['verified interaction']
].copy()
x.set_index('UniprotID')['gene_id'].to_dict()

{'P25942': '9606_0:004882',
 'Q96CG3': '9606_0:001440',
 'Q7Z434': '9606_0:00486f',
 'Q8IUC6': '9606_0:004368',
 'O35305': '10090_0:000361'}

In [16]:
# Export the curated Ena/VASP interaction table, ordered by the 'name' column.
enadf = enadf.sort_values('name')
# index=False: after sorting, the row index is only the shuffled spreadsheet
# row number and would otherwise be written as an unnamed first column; the
# other exports in this notebook omit it as well.
enadf.to_csv(output_folder / 'ena_vasp_curated_interaction_table.csv', index=False)

In [32]:
# Verified 'Ena/VASP ([FL]PPPP)' rows of the wide-form z-score table.
final_df = pd.read_csv('./z_scores_wideform.csv')
final_df.loc[
    final_df['ELM_motif_class'].eq('Ena/VASP ([FL]PPPP)') & final_df['verified interaction']
]

Unnamed: 0,reference_index,ELM_motif_class,verified interaction,regex,UniprotID,name,gene_id,hit end position,hit start position,hit_sequence,...,49.0,52.0,53.0,56.0,57.0,60.0,61.0,hit_length,mask_array,kibby_hit_zscores
1647,2220,Ena/VASP ([FL]PPPP),True,[FL]PPPP,O15117,FYB1_HUMAN,9606_0:0015fb,405,401,LPPPP,...,"[0.4601748323039581, 0.7975366391592859, 0.791...","[-0.23933683165670938, 1.3873420858147183, 1.5...","[0.19855748449218152, 1.4958950037216467, 1.32...","[-0.29617105470864036, 1.0707400342002609, 1.3...","[-0.4649984294507578, 1.642470265234283, 1.244...","[-0.3267682411825402, 1.007340625683417, 1.379...","[-0.4402351860813063, 1.6663202817582232, 1.31...",5,[1 1 0 1 1],"[-0.6801897478231042, 1.4715495363361728, 1.06..."
1648,2221,Ena/VASP ([FL]PPPP),True,[FL]PPPP,O15117,FYB1_HUMAN,9606_0:0015fb,629,625,FPPPP,...,"[0.2529320727129855, 1.011611100223858, 0.8186...","[-1.1231165834135544, 0.17158162367711285, 0.9...","[0.8736613092232504, 0.9665492050356074, 0.220...","[-0.8892890481310805, 0.08513803074348787, 0.3...","[0.25500278520948383, 1.554463509671094, 1.047...","[-0.9679944692933184, 0.05413178374820016, 0.2...","[0.26822620628625743, 1.6279508524727566, 0.94...",5,[1 1 0 1 1],"[1.3663687652944667, 1.177442714574201, 0.9375..."
1649,2222,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q9Y6N7,ROBO1_HUMAN,9606_0:000e6f,1189,1185,LPPPP,...,"[2.200174804446495, 1.3981771617935435, 1.7851...","[1.8245078189890809, 1.824507259503923, 1.8245...","[1.461920181660564, 1.4619196183928522, 1.4619...","[1.6427336742362912, 1.9137226956023294, 1.642...","[1.539593901392101, 1.5395937188688669, 1.5395...","[2.705273823179586, 2.9498285916893034, 2.8763...","[1.1698834304538923, 2.200515046981443, 2.1645...",5,[1 1 0 1 1],"[2.391649075247819, 4.402481824753609, 2.98371..."
1650,2223,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q9Y6N7,ROBO1_HUMAN,9606_0:000e6f,1483,1479,LPPPP,...,"[2.200174804446495, 1.3981771617935435, 1.7851...","[1.8245078189890809, 1.824507259503923, 1.3892...","[1.461920181660564, 1.4619196183928522, 1.4619...","[1.178048223024716, 1.7575973242674046, 1.4874...","[0.9266425527954572, 1.3749387644444686, 1.253...","[0.4734618574372247, 1.8788266901031536, 1.943...","[-0.18551598272799094, 1.5258036536328947, 1.5...",5,[1 1 0 1 1],"[0.9600534714519917, 2.997465140618524, 1.5246..."
1651,2225,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q702N8,XIRP1_HUMAN,9606_0:000f47,27,23,LPPPP,...,"[0.8586300837813101, 1.063280015874193, 0.7236...","[-0.23071870077644513, 0.9256373976128015, 0.2...","[0.19145357791965245, 0.9890616627244961, 0.48...","[0.025380354445568222, 1.1447143708665275, 0.6...","[0.06725929537345476, 1.163020239920466, 0.796...","[0.025380354445568222, 1.1447143708665275, 0.6...","[0.06725929537345476, 1.163020239920466, 0.796...",5,[1 1 0 1 1],"[0.1359395564649597, 1.8676028787069843, 1.796..."
1652,2226,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q15942,ZYX_HUMAN,9606_0:001d2c,76,72,LPPPP,...,"[1.0050441466662652, 1.7512482857336302, 1.521...","[0.18622755982677208, 1.70085058382364, 1.5692...","[-0.1123751700648484, 1.5057680069293158, 1.36...","[0.2614849259550335, 2.012998717829049, 1.0859...","[-0.16294025888150615, 1.75212425279716, 0.649...","[0.2614849259550335, 2.012998717829049, 1.0859...","[-0.16294025888150615, 1.75212425279716, 0.649...",5,[1 1 0 1 1],"[-0.25413451803695647, 3.391226024105314, 3.01..."
1653,2227,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q15942,ZYX_HUMAN,9606_0:001d2c,96,92,FPPPP,...,"[1.6572748057946836, 1.3150334259596224, 1.657...","[1.5692364551783897, 1.700850656957874, 1.5692...","[1.3648100165298858, 1.364809913171284, 1.3648...","[1.1751488614909238, 2.238311134658832, 1.6625...","[1.4641593380228834, 1.6684934300008274, 1.687...","[1.1751488614909238, 2.238311134658832, 1.6625...","[1.4641593380228834, 1.6684934300008274, 1.687...",5,[1 1 0 1 1],"[1.7949820163958512, 1.6179141244066362, 1.492..."
1654,2228,Ena/VASP ([FL]PPPP),True,[FL]PPPP,O15357,SHIP2_HUMAN,9606_0:002a4b,1056,1052,FPPPP,...,"[0.15533935196884974, 0.6718310826142352, 0.35...","[-0.3467328027155194, 1.0438941108369724, 0.83...","[-0.6188058216107617, 0.9893644531899906, 0.08...","[0.21837405608166252, 1.570355461213161, 1.079...","[-0.3058883462579657, 1.2377111816970907, 0.56...","[-0.16101614968892392, 1.437170306984934, 1.31...","[-1.104397665737907, 0.7219715369433581, 0.818...",5,[1 1 0 1 1],"[1.020810705069605, 1.7699466095714549, 3.1203..."
1655,2229,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q14517,FAT1_HUMAN,9606_0:00122c,4311,4307,LPPPP,...,"[1.0811224379580868, 0.8480356804745801, 0.269...","[0.7748233655548281, 1.0349391629864464, 0.181...","[0.8861264640929396, 0.8861261522695548, 0.886...","[0.9356420895142108, 1.043502387972335, 0.6605...","[0.9185451022001526, 0.9185449796753803, 0.918...","[0.6472677436748886, 0.9857672640518274, 0.580...","[0.39680716110457526, 1.540681043445785, 1.169...",5,[1 1 0 1 1],"[-0.07331504678779194, 1.9159865847817417, 1.9..."
1656,2230,Ena/VASP ([FL]PPPP),True,[FL]PPPP,Q14517,FAT1_HUMAN,9606_0:00122c,4438,4434,FPPPP,...,"[0.4692447624397573, 0.6265760107807807, -0.07...","[0.7748232647227614, 1.0349391629864464, 1.034...","[0.8861263623339373, 0.8861261522695548, 0.886...","[0.827971750584042, 0.9356420629421068, 0.7196...","[0.5721150776850563, 0.9185449796753803, 0.803...","[-0.20364086463523087, 0.5670915259056567, 0.8...","[-0.8035131075472743, 1.7121808297796437, 1.09...",5,[1 1 0 1 1],"[3.0552602194788334, 4.573307903708205, 2.9155..."


Unnamed: 0,reference_index,ELM_motif_class,verified interaction,regex,UniprotID,name,gene_id,hit end position,hit start position,hit_sequence,...,49.0,52.0,53.0,56.0,57.0,60.0,61.0,hit_length,mask_array,kibby_hit_zscores
1922,2581,TRAF6,True,...P.E..[FYWDE],P25942,CD40,9606_0:004882,241,233,KQEPQEINF,...,"[-0.8080762781931751, -0.7126324724789362, -0....","[-1.0149073713213814, -1.0141129372102002, -0....","[-1.2577008833093166, -0.7335579114800691, -0....","[-0.8685314528771192, -0.836500393977869, 0.66...","[-0.8263577080909258, -0.032214076414633815, 0...","[-0.8332266908037984, -0.7757051245247605, 0.7...","[-0.8346588253009211, -9.279319314298994e-05, ...",9,[0 0 0 1 0 1 0 0 1],"[-1.3394181471448607, -0.708460309034063, -0.9..."
1923,2582,TRAF6,True,...P.E..[FYWDE],Q96CG3,TIFA,9606_0:001440,180,172,SSSPTEMDE,...,"[-0.48589945165445486, -0.8825300796698908, -0...","[-0.5130024785016191, -1.0622952578299139, 0.0...","[-0.562548271463643, -1.1348368647611562, -0.0...","[-0.8662019081366231, -0.6837887896309388, -0....","[-0.43446241688742115, -0.7890015070642357, -0...","[-0.8662019081366231, -0.6837887896309388, -0....","[-0.43446241688742115, -0.7890015070642357, -0...",9,[0 0 0 1 0 1 0 0 1],"[-1.1427157976162268, -1.4076785871799582, -1...."
1924,2583,TRAF6,True,...P.E..[FYWDE],Q7Z434,MAVS,9606_0:00486f,459,451,CHGPEENEY,...,"[-1.598394676954996, 0.6760616015916827, -0.36...","[-1.7847527331036095, -0.12612350336235728, -0...","[-1.618105343465473, 0.2849775408054772, -0.37...","[-1.6777735977581538, 0.020028813040949382, -0...","[-1.1906867009567785, 0.34344327761439064, 0.0...","[-1.7198310891739925, 0.017588710602354413, -0...","[-1.237536197156806, 0.27967210620298155, -0.0...",9,[0 0 0 1 0 1 0 0 1],"[1.2254762923472207, 0.6860232205917756, 0.094..."
1925,2584,TRAF6,True,...P.E..[FYWDE],Q8IUC6,TICAM1,9606_0:004368,254,246,CQEPEEMSW,...,"[-0.9332995107464855, -0.7626319298211485, -0....","[-1.1020462284303523, 0.7734405365381956, 0.28...","[-1.1305325321304667, 0.5831404664158045, 0.30...","[-1.4040121785662647, 0.14777939811325466, 0.1...","[-1.3537609791899798, -0.16152773334710355, 0....","[-1.4040121785662647, 0.14777939811325466, 0.1...","[-1.3537609791899798, -0.16152773334710355, 0....",9,[0 0 0 1 0 1 0 0 1],"[1.8437968643509062, 0.3471971121389175, -0.24..."
1926,2590,TRAF6,True,...P.E..[FYWDE],O35305,mouse TNFRSF11A (RANK),10090_0:000361,344,336,RKIPTEDEY,...,"[-0.6819805958101022, -0.8747705651134324, 0.6...","[0.35346832509261583, -0.1692365075825763, 1.4...","[0.1971370373927511, -0.2917302055045039, 1.34...","[-0.24299579005857852, -0.6698400275362825, 0....","[-0.5902006259217838, -0.4508434780026391, 0.5...","[-0.21804079574755844, -0.669093269029158, 0.4...","[-0.6293780261096196, -0.4895487489305791, 0.5...",9,[0 0 0 1 0 1 0 0 1],"[0.25273507806639733, -0.5557000071545389, -1...."
1927,2591,TRAF6,True,...P.E..[FYWDE],O35305,mouse TNFRSF11A (RANK),10090_0:000361,377,369,FQEPLEVGE,...,"[0.7203792878697312, -0.8592408050958475, 1.57...","[1.333173454939843, 0.044446724276932026, 1.37...","[1.3383387408136123, -0.07833450505274114, 1.2...","[0.7675137463986275, -0.2931343015804783, 1.79...","[0.541395140326414, -0.3922453824334708, 1.634...","[0.7518923823689242, -0.3480162593005668, 1.86...","[0.445220316986872, -0.5030329648199984, 1.731...",9,[0 0 0 1 0 1 0 0 1],"[-0.8616547217271037, 0.08747486727537739, -0...."
1928,2592,TRAF6,True,...P.E..[FYWDE],O35305,mouse TNFRSF11A (RANK),10090_0:000361,451,443,GNTPGEDHE,...,"[-0.3074567061297867, -0.9180791179457269, -1....","[-0.5028563818865133, -1.1962258028265071, -1....","[-1.0306024261683857, -1.1643661542193418, -1....","[-0.4745354856500186, -0.9598687856884902, -0....","[-0.8045673346748532, -1.295877748287464, -0.9...","[-0.42955571280826604, -0.9666848298751205, -0...","[-0.8351394966125343, -1.311515178190053, -0.9...",9,[0 0 0 1 0 1 0 0 1],"[0.1622886113496904, 0.1667551035332316, -0.25..."
