This should be run only after `../../ELM_instances/build_instance_map.ipynb` and `../../all_human_odb_sequences_removed_duplicates/get_all_human_proteins_in_odb.ipynb` have been run

Before running, set the variable `IUPRED_PATH` in `./iupred_tools.py` to the path of the IUPred2A library.

In [5]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from Bio import Align, AlignIO, Seq, SeqIO
from pyprojroot import here

import iupred_tools as iuptools
import orthodb_tools.sql_queries as sql_queries
import orthodb_tools.tools.general_utils as tools
# from local_env_variables import project_filepaths as fp

In [6]:
# Output directory for the benchmark table, resolved relative to the project
# root returned by here(); created (with parents) if it does not exist yet.
BENCHMARK_DIR = here().joinpath("benchmark", "benchmark_v4", "p1_table")
BENCHMARK_DIR.mkdir(parents=True, exist_ok=True)

# Verified interactions/regexes from the ELM database

## functions

In [None]:
def get_regex_matches(regex_pattern: str, seq_str: str):
    """Search a sequence string for every match of a regex pattern.

    Zero-width matches (e.g. from a lookahead pattern such as ``(?=(...))``,
    used to find overlapping hits) report the text of the first capturing
    group as the match sequence. Zero-width matches that carry no text (no
    capturing group, a group that did not participate, or an empty group) are
    skipped because they do not correspond to any residues.

    Parameters
    ----------
    regex_pattern : str
        regular expression pattern
    seq_str : str
        string to search for matches

    Yields
    ------
    tuple
        (match sequence, start index, end index). Indices are 0-based and the
        end index is inclusive (``start + len(match sequence) - 1``).
    """
    p = re.compile(regex_pattern)
    for m in p.finditer(seq_str):
        if m.start() == m.end():
            # even if there are groups in the lookahead, the first group should
            # be the full match b/c that group surrounds the entire regex
            groups = m.groups()
            if not groups or not groups[0]:
                # nothing was captured: there is no hit sequence to report
                continue
            match_seq = groups[0]
        else:
            match_seq = m.group(0)
        yield match_seq, m.start(), m.start() + len(match_seq) - 1


In [7]:
def in_any_idrs(idr_list: list, start: int, end: int):
    """Return True if the span [start, end] lies fully inside at least one region.

    Each entry of `idr_list` is read as ``region[0]`` (region start) and
    ``region[1]`` (region end); both bounds are treated as inclusive.
    An empty `idr_list` yields False.
    """
    return any(region[0] <= start and end <= region[1] for region in idr_list)


def regex_search(regex, seqrecord_list, idr_map: dict = None):
    """Collect every regex hit across a list of sequence records.

    Parameters
    ----------
    regex : str
        regular expression passed to `get_regex_matches`
    seqrecord_list : list
        records exposing ``.id`` and ``.seq``
    idr_map : dict, optional
        maps record id -> list of regions (as consumed by `in_any_idrs`).
        When given, only hits fully contained in one of the record's regions
        are kept, and every record id must be present in the map.

    Returns
    -------
    pd.DataFrame
        one row per hit with columns ``odb_id``, ``odb_mot_st``,
        ``odb_mot_end``, ``motif_match`` and ``regex``
    """
    restrict_to_idrs = idr_map is not None
    hits = []
    for record in seqrecord_list:
        sequence = str(record.seq)
        # looked up once per record; raises KeyError for unknown record ids
        idrs = idr_map[record.id] if restrict_to_idrs else None
        for match_seq, start_pos, end_pos in get_regex_matches(regex, sequence):
            if restrict_to_idrs and (
                len(idrs) == 0 or not in_any_idrs(idrs, start_pos, end_pos)
            ):
                continue
            hits.append((record.id, start_pos, end_pos, match_seq))

    df_fp = pd.DataFrame(
        {
            "odb_id": [hit[0] for hit in hits],
            "odb_mot_st": [hit[1] for hit in hits],
            "odb_mot_end": [hit[2] for hit in hits],
            "motif_match": [hit[3] for hit in hits],
        }
    )
    df_fp["regex"] = regex
    return df_fp


def load_verified_from_instances(
    motif_class_name,
    elm_instances_df,
    allowed_organisms=None,
):
    """Load the verified instances for 1 slim class from the table of verified instances.

    Keeps only rows whose ``InstanceLogic`` is ``"true positive"`` and whose
    ``Organism`` is in `allowed_organisms`. The input table is not modified.

    Parameters
    ----------
    motif_class_name : str
        name of the motif class (value of the ``ELMIdentifier`` column)
    elm_instances_df : pd.Dataframe
        Instances from the ELM (should be preprocessed)
    allowed_organisms : list, optional
        list of organisms to allow, by default
        ["Homo sapiens", "Mus musculus", "Rattus norvegicus"]

    Returns
    -------
    tuple
        returns the pd.Dataframe of instances and the regular expression of the motif

    Raises
    ------
    AssertionError
        if the filtered instances do not share exactly one regex (this
        includes the case where no instance survives the filters)
    """
    if allowed_organisms is None:
        # built per call so the default can never be shared or mutated
        allowed_organisms = ["Homo sapiens", "Mus musculus", "Rattus norvegicus"]
    print(f"Getting motif benchmark table for {motif_class_name}")

    # import known binders; filter first so only the class' rows are copied
    is_class = elm_instances_df["ELMIdentifier"] == motif_class_name
    df = elm_instances_df[is_class].copy()
    df["verified interaction"] = True
    df = df.rename(
        columns={
            "uniprot2map": "UniprotID",
            "Regex": "regex",
            "ProteinName": "name",
            "hit_sequence": "flanked_motif_match",
        }
    )
    df = df[df["InstanceLogic"] == "true positive"]
    # only the columns needed downstream are carried into the benchmark table
    df2 = df[
        [
            "Organism",
            "Primary_Acc",
            "Accessions",
            "UniprotID",
            "regex",
            "motif_match",
            "odb_id",
            "odb_mot_st",
            "odb_mot_end",
            "verified interaction",
        ]
    ].copy()

    print("restrict known binders to a subset of organisms (only ones close to human)")

    print(f"allowed organisms for known binding interactions {allowed_organisms}")
    df2 = df2[df2["Organism"].isin(allowed_organisms)]
    reglist = df2["regex"].unique()
    assert len(reglist) == 1, (
        f"Expected exactly one regex for {motif_class_name} among the known "
        f"binders, found {len(reglist)}"
    )
    return df2, reglist[0]


def add_clashes_with_known_instances(df_bg_in, instances_df):
    """Flag background hits that overlap a known instance on the same gene.

    A hit clashes when its inclusive span [``odb_mot_st``, ``odb_mot_end``]
    intersects the span of any row of `instances_df` with the same ``odb_id``.

    Parameters
    ----------
    df_bg_in : pd.DataFrame
        background hits with columns ``odb_id``, ``odb_mot_st``, ``odb_mot_end``
    instances_df : pd.DataFrame
        known instances with the same three columns; rows with a missing
        ``odb_id`` are ignored

    Returns
    -------
    pd.DataFrame
        copy of `df_bg_in` with an added boolean ``clash`` column
    """
    df_bg = df_bg_in.copy()

    # Index the known-instance spans by gene once instead of rescanning the
    # whole instance table for every background hit.
    spans_by_gene = {}
    for gene_id, inst_st, inst_end in zip(
        instances_df["odb_id"],
        instances_df["odb_mot_st"],
        instances_df["odb_mot_end"],
    ):
        if pd.isna(gene_id):
            continue
        spans_by_gene.setdefault(gene_id, []).append((inst_st, inst_end))

    clashes = [
        any(
            hit_st <= inst_end and hit_end >= inst_st
            for inst_st, inst_end in spans_by_gene.get(gene_id, ())
        )
        for gene_id, hit_st, hit_end in zip(
            df_bg["odb_id"], df_bg["odb_mot_st"], df_bg["odb_mot_end"]
        )
    ]
    # Assigned positionally so duplicate index labels cannot collapse onto one
    # flag, and with an explicit bool dtype so `~df["clash"]` is a valid mask
    # even when there are no rows.
    df_bg["clash"] = np.asarray(clashes, dtype=bool)
    return df_bg


def get_bg_seqs(
    bg_seq_fasta_file,
    regex,
    instance_df,
    idr_map: dict = None,
    bg_organism="Homo sapiens",
):
    """Build the table of background (non-verified) regex hits.

    Searches every sequence in `bg_seq_fasta_file` for `regex` (optionally
    restricted to the regions in `idr_map`), annotates each hit with its
    UniProt id, marks it as not verified, and drops every hit that overlaps a
    known instance in `instance_df`.

    Returns
    -------
    pd.DataFrame
        the hits from `regex_search` plus the columns ``UniprotID``,
        ``verified interaction`` (always False) and ``Organism``
    """
    bg_records = tools.FastaImporter(bg_seq_fasta_file).import_as_list()
    hits = regex_search(regex, bg_records, idr_map=idr_map)
    hits["UniprotID"] = hits["odb_id"].apply(sql_queries.odb_gene_id_2_uniprotid)
    hits["verified interaction"] = False
    hits["Organism"] = bg_organism
    flagged = add_clashes_with_known_instances(hits, instance_df)
    # the clash flag is only needed for filtering, so it is removed again
    no_clash = ~flagged["clash"]
    return flagged[no_clash].drop(columns="clash")


def merge_verified_and_fp_motifs(df_verified, df_bg, cluster_dict=None):
    """Stack verified hits on top of background hits, removing any overlap.

    Background rows are dropped when their ``odb_id`` belongs to a verified
    binder or, if `cluster_dict` is given, to any cluster that contains a
    verified binder. Neither input DataFrame is modified.

    Parameters
    ----------
    df_verified : pd.DataFrame
        verified hits; needs ``odb_id``, ``UniprotID``, ``regex`` and
        ``verified interaction`` columns
    df_bg : pd.DataFrame
        background hits; needs ``odb_id`` and ``UniprotID`` columns
    cluster_dict : dict, optional
        maps a cluster key to a dict whose ``'all_members'`` entry lists the
        ids of the sequences in that cluster

    Returns
    -------
    pd.DataFrame
        verified rows followed by the remaining background rows, with a fresh
        integer index

    Raises
    ------
    AssertionError
        if a remaining background row shares a UniProt id with a verified row
    """
    # remove overlap with verified binders
    excluded_ids = set(df_verified["odb_id"].unique())
    if cluster_dict is not None:
        # for each verified binder, add all cluster members so that they're
        # removed from the background as well
        for cluster in cluster_dict.values():
            members = cluster['all_members']
            if not excluded_ids.isdisjoint(members):
                print('added cluster members')
                excluded_ids.update(members)
    df_bg = df_bg[~df_bg["odb_id"].isin(list(excluded_ids))].copy()
    assert (
        df_bg["UniprotID"].isin(df_verified["UniprotID"].unique()).sum() == 0
    ), "False positives overlap with verified binders by uniprot id"

    # normalise dtypes on a copy so the caller's table is left untouched
    df_verified = df_verified.copy()
    df_verified["regex"] = df_verified["regex"].astype(str)
    df_verified["verified interaction"] = df_verified["verified interaction"].astype(
        bool
    )

    return pd.concat([df_verified, df_bg]).reset_index(drop=True)


def main(
    elm_class,
    elm_instances_file,
    bg_seq_fasta_file,
    idr_map,
    bg_organism,
    subsample: int|None=None,
    cluster_dict=None,
):
    """Build the benchmark table (verified + background hits) for one ELM class.

    Reads the ELM instance table from `elm_instances_file`, extracts the
    verified instances and regex of `elm_class`, searches the background
    sequences with that regex, optionally subsamples the background to at most
    `subsample` rows (fixed seed 42), and merges both sets.

    Returns
    -------
    pd.DataFrame
        merged table with an added ``ELM_motif_class`` column
    """
    instances = pd.read_csv(elm_instances_file)
    verified, regex = load_verified_from_instances(elm_class, instances)
    background = get_bg_seqs(
        bg_seq_fasta_file,
        regex,
        instances,
        idr_map=idr_map,
        bg_organism=bg_organism,
    )
    # only subsample when a cap was requested and the background exceeds it
    exceeds_cap = subsample is not None and len(background) > subsample
    if exceeds_cap:
        background = background.sample(subsample, random_state=42)
    table = merge_verified_and_fp_motifs(
        verified, background, cluster_dict=cluster_dict
    )
    table["ELM_motif_class"] = elm_class
    return table

## load idr map for all human proteins

In [8]:
human_sequences = "../../../../data/orthodb_clustered_species_proteins/9606_6/all_human_proteins_in_odb.fasta"

faimporter = tools.FastaImporter(human_sequences)
all_human_seq_dict = faimporter.import_as_dict()
# one entry per sequence id: the regions returned by the IUPred-based finder
idr_map = {
    seq_id: iuptools.main_find_idr_regions(
        str(record.seq),
        iupred_cutoff=0.4,
        gap_merge_threshold=10,
        idr_min_length=8,
    )
    for seq_id, record in all_human_seq_dict.items()
}

## generating tables for a variety of ELM motifs (classes)

In [9]:
# ELM classes included in the benchmark.
# Previously considered but currently excluded: "LIG_PDZ_Class_1", "LIG_LIR_Gen_1".
ELM_CLASSES_4_BENCHMARK = [
    "LIG_AP2alpha_2",
    "LIG_EH_1",
    "LIG_SH2_GRB2like",
    "LIG_SH3_CIN85_PxpxPR_1",
    "DOC_WW_Pin1_4",  # must be subsampled because the regex is far too general
    "LIG_14-3-3_CanoR_1",
]
# Maximum number of background hits to sample per motif class. Subsampling is
# needed when a motif regex is so general that it produces too many background
# matches. Every benchmarked class currently uses the same cap.
CLASS_BG_SAMPLING = {motif_class: 350 for motif_class in ELM_CLASSES_4_BENCHMARK}

In [10]:
# Background sequence set (clustered human proteins) and its cluster membership.
BG_SEQ_FASTA_FILE = "../../../../data/orthodb_clustered_species_proteins/9606_6/all_human_proteins_in_odb_clustered_c0_95.fasta"
BG_CLUSTER_JSON_FILE = "../../../../data/orthodb_clustered_species_proteins/9606_6/all_human_proteins_in_odb_clustered_c0_95.json"
BG_ORGANISM = "Homo sapiens"
ELM_INSTANCES_FILE = "../../../../data/ELM/2024-02-09-ELM_instances/elm_instances_with_hit_sequence.csv"
import json
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, so the file loads the same way on every machine.
with open(BG_CLUSTER_JSON_FILE, encoding="utf-8") as f:
    # each value is expected to carry an 'all_members' list (read by
    # merge_verified_and_fp_motifs)
    BG_CLUSTER_DICT = json.load(f)

In [11]:
# elm_instances_df = pd.read_csv(ELM_INSTANCES_FILE)
# dftp, regex = load_verified_from_instances(ELM_CLASSES_4_BENCHMARK[0], elm_instances_df)

In [12]:
# df = get_bg_seqs(BG_SEQ_FASTA_FILE, regex, elm_instances_df, idr_map=idr_map, bg_organism=BG_ORGANISM)
# df[df['clash']]

In [11]:
# Build one benchmark table per ELM class.
df_list = []
for motif in ELM_CLASSES_4_BENCHMARK:
    df = main(
        elm_class=motif,
        elm_instances_file=ELM_INSTANCES_FILE,
        bg_seq_fasta_file=BG_SEQ_FASTA_FILE,
        idr_map=idr_map,
        bg_organism=BG_ORGANISM,
        # None (main's default) when no sample size is configured for the class
        subsample=CLASS_BG_SAMPLING.get(motif),
        cluster_dict=BG_CLUSTER_DICT,
    )
    df_list.append(df)

Getting motif benchmark table for LIG_AP2alpha_2
restrict known binders to a subset of organisms (only ones close to human)
allowed organisms for known binding interactions ['Homo sapiens', 'Mus musculus', 'Rattus norvegicus']
added cluster members
added cluster members
added cluster members
added cluster members
Getting motif benchmark table for LIG_EH_1
restrict known binders to a subset of organisms (only ones close to human)
allowed organisms for known binding interactions ['Homo sapiens', 'Mus musculus', 'Rattus norvegicus']
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
Getting motif benchmark table for LIG_SH2_GRB2like
restrict known binders to a subset of organisms (only ones close to human)
allowed organisms for known binding interactions ['Homo sapiens', 'Mus mus

In [12]:
elms_df = pd.concat(df_list).reset_index(drop=True)
elms_df

Unnamed: 0,Organism,Primary_Acc,Accessions,UniprotID,regex,motif_match,odb_id,odb_mot_st,odb_mot_end,verified interaction,ELM_motif_class
0,Rattus norvegicus,O08838,O08838,O08838,DP[FW],DPF,10116_0:004cdf,356,358,True,LIG_AP2alpha_2
1,Rattus norvegicus,Q05140,Q05140,Q05140,DP[FW],DPF,10116_0:002e5e,399,401,True,LIG_AP2alpha_2
2,Rattus norvegicus,Q05140,Q05140,Q05140,DP[FW],DPF,10116_0:002e5e,473,475,True,LIG_AP2alpha_2
3,Homo sapiens,P98082,P98082 A6NES5 Q13598 Q9BTY0 Q9UK04,P98082,DP[FW],DPF,9606_0:0016b2,292,294,True,LIG_AP2alpha_2
4,Homo sapiens,P98082,P98082 A6NES5 Q13598 Q9BTY0 Q9UK04,P98082,DP[FW],DPF,9606_0:0016b2,297,299,True,LIG_AP2alpha_2
...,...,...,...,...,...,...,...,...,...,...,...
2376,Homo sapiens,,,Q8TD94,"R[^DE]{0,2}[^DEPG]([ST])(([FWYLMV].)|([^PRIKGN...",RRRSVT,9606_0:001dcd,183,188,False,LIG_14-3-3_CanoR_1
2377,Homo sapiens,,,A0A090N8Q6,"R[^DE]{0,2}[^DEPG]([ST])(([FWYLMV].)|([^PRIKGN...",RSSSKESSP,9606_0:001e50,127,135,False,LIG_14-3-3_CanoR_1
2378,Homo sapiens,,,Q7KZ85,"R[^DE]{0,2}[^DEPG]([ST])(([FWYLMV].)|([^PRIKGN...",RATDLPERF,9606_0:00403d,289,297,False,LIG_14-3-3_CanoR_1
2379,Homo sapiens,,,Q15560,"R[^DE]{0,2}[^DEPG]([ST])(([FWYLMV].)|([^PRIKGN...",RVGMSVN,9606_0:0048e3,47,53,False,LIG_14-3-3_CanoR_1


In [13]:
elms_df.groupby('verified interaction')['ELM_motif_class'].value_counts().unstack()

ELM_motif_class,DOC_WW_Pin1_4,LIG_14-3-3_CanoR_1,LIG_AP2alpha_2,LIG_EH_1,LIG_SH2_GRB2like,LIG_SH3_CIN85_PxpxPR_1
verified interaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,350,349,349,340,348,348
True,79,52,50,54,30,32


---

# manually curated verified interactions

## ENAH with [FL]PPPP regex

In [14]:
from orthodb_tools.orthogroup_processing import uniprotid_search

Get the regex matches for the verified binders first

import the annotated binders

In [17]:
df

Unnamed: 0,name,Uniprot ID,reference,OrthoDB id
0,AB1IP_HUMAN,Q7Z5R6,10.1016/j.devcel.2004.07.021,9606_0:00294e
1,ABI3_HUMAN,Q9P2A4,10.1016/j.devcel.2014.08.001,9606_0:003dae
2,ANK3_HUMAN,Q12955,ELM - 10.1093/nar/gkad1058,9606_0:0027f1
3,FAT1_HUMAN,Q14517,10.1038/sj.emboj.7600380,9606_0:00122c
4,FBLI1_HUMAN,Q8WUP2,10.1074/jbc.M512107200,9606_0:000661
5,FYB1_HUMAN,O15117,10.1083/jcb.149.1.181,9606_0:0015fb
6,LPP_HUMAN,Q93052,10.1091/mbc.11.1.117,9606_0:000d90
7,NHSL1_HUMAN,Q5SYE7,10.7554/eLife.70680,9606_0:001b40
8,PALLD_HUMAN,Q8WX93,10.1002/cm.10173,9606_0:00141f
9,PCARE_HUMAN,A6NGG8,10.7554/eLife.70680; 10.1073/pnas.1903125117;...,9606_0:00094d


In [18]:
enah_table = "../../../../data/manually_curated_interactions/ena_vasp_benchmark.xlsx"
df = pd.read_excel(enah_table)
# strip stray whitespace from the ids typed into the spreadsheet
df['Uniprot ID'] = df['Uniprot ID'].str.strip()
verif_ids = df['Uniprot ID'].unique()
# UniProt id -> protein name
id_name_pairs = df[['Uniprot ID', 'name']].drop_duplicates()
verif_id_dict = id_name_pairs.set_index('Uniprot ID')['name'].to_dict()
# sanity check: every id in the table has an entry in the name lookup
assert set(verif_ids) == set(verif_id_dict)

In [20]:
REGEX = '[FL]PPPP'
CLASS_NAME = 'enah_LPPPP_FPPPP'

In [21]:
import orthodb_tools.env_variables.env_variables as env
ORTHODB_DATABASE = env.orthoDBDatabase()

In [22]:
def uni2odb_ids(uniprot_ids):
    """Look up the OrthoDB gene id of each UniProt id.

    Ids for which the lookup raises ValueError are reported on stdout and
    skipped.

    Parameters
    ----------
    uniprot_ids : iterable of str
        UniProt ids to look up

    Returns
    -------
    dict
        maps OrthoDB gene id -> the UniProt id it was found for
    """
    odb2uni = {}
    for uniprot_id in uniprot_ids:
        try:
            odb_id = uniprotid_search.uniprotid_2_odb_gene_id(uniprot_id)
        except ValueError as e:
            print(e)
            # report only the id: no name lookup table is available here
            print(f"COULD NOT FIND: {uniprot_id}")
            continue
        odb2uni[odb_id] = uniprot_id
    return odb2uni


def get_verif_table_custom(
    uniprot_ids: list,
    regex: str,
    idr_map: dict|None,
    seq_organism: str = "Homo sapiens",
):
    """Build the verified-hit table for a manually curated list of binders.

    Every regex hit in the sequences of `uniprot_ids` (optionally restricted
    to the regions in `idr_map`) is labelled as a verified interaction, and
    all rows are assigned the single organism `seq_organism`.

    Returns
    -------
    pd.DataFrame
        hits from `regex_search` plus ``UniprotID``, ``verified interaction``
        (always True) and ``Organism`` columns
    """
    odb2uni = uni2odb_ids(uniprot_ids)
    records = [
        ORTHODB_DATABASE.data_all_seqrecords_dict[odb_id] for odb_id in odb2uni
    ]
    verified_hits = regex_search(regex, records, idr_map=idr_map)
    verified_hits["UniprotID"] = verified_hits["odb_id"].map(odb2uni)
    verified_hits["verified interaction"] = True
    verified_hits["Organism"] = seq_organism
    return verified_hits

In [25]:
df_verified = get_verif_table_custom(
    verif_ids,
    regex=REGEX,
    idr_map=idr_map,
    seq_organism="Homo sapiens"
)

Q8WX93 not found in gene key table, searching in xref table
Q70E73 not found in gene key table, searching in xref table
Q9Y6N7 not found in gene key table, searching in xref table
Q8TF72 not found in gene key table, searching in xref table
P18206 not found in gene key table, searching in xref table


In [26]:
df_verified['name'] = df_verified['UniprotID'].map(verif_id_dict)
df_verified

Unnamed: 0,odb_id,odb_mot_st,odb_mot_end,motif_match,regex,UniprotID,verified interaction,Organism,name
0,9606_0:00294e,127,131,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
1,9606_0:00294e,140,144,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
2,9606_0:00294e,515,519,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
3,9606_0:00294e,553,557,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
4,9606_0:00294e,567,571,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
5,9606_0:00294e,597,601,LPPPP,[FL]PPPP,Q7Z5R6,True,Homo sapiens,AB1IP_HUMAN
6,9606_0:003dae,276,280,LPPPP,[FL]PPPP,Q9P2A4,True,Homo sapiens,ABI3_HUMAN
7,9606_0:003dae,289,293,LPPPP,[FL]PPPP,Q9P2A4,True,Homo sapiens,ABI3_HUMAN
8,9606_0:0027f1,3253,3257,FPPPP,[FL]PPPP,Q12955,True,Homo sapiens,ANK3_HUMAN
9,9606_0:00122c,4307,4311,LPPPP,[FL]PPPP,Q14517,True,Homo sapiens,FAT1_HUMAN


In [27]:
elm_instances_df = pd.read_csv(ELM_INSTANCES_FILE)
df_bg = get_bg_seqs(BG_SEQ_FASTA_FILE, regex=REGEX, instance_df=elm_instances_df, idr_map=idr_map, bg_organism=BG_ORGANISM)
# Cap the sample size at the number of available hits: DataFrame.sample
# raises ValueError when asked for more rows than exist.
df_bg = df_bg.sample(min(350, len(df_bg)), random_state=42)
dfm = merge_verified_and_fp_motifs(df_verified, df_bg, cluster_dict=BG_CLUSTER_DICT)
dfm['ELM_motif_class'] = CLASS_NAME

added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members


In [31]:
print(len(dfm))
print(len(elms_df))

365
2381


In [32]:
benchmark_df = pd.concat([elms_df, dfm]).reset_index(drop=True)

## TRAF6

In [42]:
traf_table = "../../../../data/manually_curated_interactions/traf6_table.csv"
# Load the curated TRAF6 binders, strip whitespace from the ids, and rename
# the columns to the names used by the rest of the pipeline.
df = (
    pd.read_csv(traf_table)
    .assign(**{'Uniprot ID': lambda table: table['Uniprot ID'].str.strip()})
    .rename(columns={'Uniprot ID': 'UniprotID', 'SLiM sequence': 'motif_match'})
)

The regex could be made more specific to reduce the number of false positives.<br>
It might actually be better to use what we know about unfavorable residues, so that the background matches are more likely to be true non-binders. That said, it is probably safer for now to use the same regex for both TPs and FPs.
- normal regex: `...P.E..[FYWHDE]`
- numbering: `(-3)(-2)(-1)(0)(+1)(+2)(+3)(+4)(+5)`
- Removing unfavorable residues from the regex:
  - proline at any position between (+1) and (+5)
    - it binds as a beta sheet
  - a positively charged residue (RK) at positions (+3), (+4), or (+5)


**To reduce the number of false positives, I am just going to subsample the false positives**

In [38]:
# REGEX='...P[^P]E[^PRK][^PRK][FYWDE]'
REGEX='...P.E..[FYWDE]'
CLASS_NAME='TRAF6'

In [39]:
def odb_id_2_species_name(odb_gene_id: str):
    """Return the ``data_species_dict`` entry for the species of an OrthoDB gene id.

    The species id is resolved with ``sql_queries.odb_gene_id_2_species_id``
    and used as the key into ``ORTHODB_DATABASE.data_species_dict``
    (presumably species id -> species name; confirm against orthodb_tools).
    """
    species_id = sql_queries.odb_gene_id_2_species_id(odb_gene_id)
    species_lookup = ORTHODB_DATABASE.data_species_dict
    return species_lookup[species_id]


def get_verif_table_custom2(
    uniprot_ids: list,
    regex: str,
):
    """Build the verified-hit table for curated binders from mixed species.

    Unlike `get_verif_table_custom`, hits are not restricted to any regions
    (``idr_map=None``) and the organism of each row is looked up from its
    OrthoDB gene id instead of being fixed.

    Returns
    -------
    pd.DataFrame
        hits from `regex_search` plus ``UniprotID``, ``verified interaction``
        (always True) and ``Organism`` columns
    """
    odb2uni = uni2odb_ids(uniprot_ids)
    records = [
        ORTHODB_DATABASE.data_all_seqrecords_dict[odb_id] for odb_id in odb2uni
    ]
    verified_hits = regex_search(regex, records, idr_map=None)
    verified_hits["UniprotID"] = verified_hits["odb_id"].map(odb2uni)
    verified_hits["verified interaction"] = True
    verified_hits["Organism"] = verified_hits["odb_id"].apply(odb_id_2_species_name)
    return verified_hits

In [40]:
df_verified = get_verif_table_custom2(
    list(df['UniprotID'].unique()),
    regex=REGEX,
)

P25942 not found in gene key table, searching in xref table
P51617 not found in gene key table, searching in xref table


In [44]:
df

Unnamed: 0,Name,UniprotID,motif_match,OrthoDB id,reference DOI
0,CD40,P25942,KQEPQEINF,9606_0:004882,"10.1073/pnas.96.4.1234, 10.1074/jbc.274.20.14246"
1,TIFA,Q96CG3,SSSPTEMDE,9606_0:001440,10.1002/cbic.201800436
2,MAVS,Q7Z434,CHGPEENEY,9606_0:00486f,10.1074/jbc.M115.666578
3,TICAM1,Q8IUC6,CQEPEEMSW,9606_0:004368,"10.1073/pnas.0308496101, 10.4049/jimmunol.171...."
4,IRAK2,O43187,SNTPEETDD,9606_0:000e31,10.1038/nature00888
5,IRAK1,P51617,PPSPQENSY,9606_0:004fa3,10.1038/nature00888
6,IRAK1,P51617,PNQPVESDE,9606_0:004fa3,10.1038/nature00888
7,IRAK1,P51617,RQGPEESDE,9606_0:004fa3,10.1038/nature00888
8,IRAK3 (IRAK-M),Q9Y616,PSIPVEDDE,9606_0:0031e9,10.1038/nature00888
9,mouse TNFRSF11A (RANK),O35305,RKIPTEDEY,10090_0:000361,10.1038/nature00888


In [45]:
df_verified

Unnamed: 0,odb_id,odb_mot_st,odb_mot_end,motif_match,regex,UniprotID,verified interaction,Organism
0,9606_0:004882,233,241,KQEPQEINF,...P.E..[FYWDE],P25942,True,Homo sapiens
1,9606_0:001440,172,180,SSSPTEMDE,...P.E..[FYWDE],Q96CG3,True,Homo sapiens
2,9606_0:00486f,149,157,PESPGENSE,...P.E..[FYWDE],Q7Z434,True,Homo sapiens
3,9606_0:00486f,451,459,CHGPEENEY,...P.E..[FYWDE],Q7Z434,True,Homo sapiens
4,9606_0:004368,82,90,TEDPEEPPD,...P.E..[FYWDE],Q8IUC6,True,Homo sapiens
5,9606_0:004368,246,254,CQEPEEMSW,...P.E..[FYWDE],Q8IUC6,True,Homo sapiens
6,9606_0:004368,297,305,TNYPVECTE,...P.E..[FYWDE],Q8IUC6,True,Homo sapiens
7,9606_0:000e31,522,530,SNTPEETDD,...P.E..[FYWDE],O43187,True,Homo sapiens
8,9606_0:000e31,553,561,PLLPTENGE,...P.E..[FYWDE],O43187,True,Homo sapiens
9,9606_0:004fa3,538,546,PPSPQENSY,...P.E..[FYWDE],P51617,True,Homo sapiens


merge the regex matches with the annotated binders to restrict the "verified" interactions to only those specific sequences in the original excel file. For this protein, I don't just want to take the regex matches from proteins that are known to bind, I want the specific sequences that are known to bind.<br>

You could say that I should have done the same thing for enah, but I think the fact that I used the [LF]PPPP regex will make the matches very very likely to be real.

In [46]:
df_verified=pd.merge(left=df, right=df_verified, how='left', on=['UniprotID', 'motif_match'])

In [47]:
df_verified

Unnamed: 0,Name,UniprotID,motif_match,OrthoDB id,reference DOI,odb_id,odb_mot_st,odb_mot_end,regex,verified interaction,Organism
0,CD40,P25942,KQEPQEINF,9606_0:004882,"10.1073/pnas.96.4.1234, 10.1074/jbc.274.20.14246",9606_0:004882,233,241,...P.E..[FYWDE],True,Homo sapiens
1,TIFA,Q96CG3,SSSPTEMDE,9606_0:001440,10.1002/cbic.201800436,9606_0:001440,172,180,...P.E..[FYWDE],True,Homo sapiens
2,MAVS,Q7Z434,CHGPEENEY,9606_0:00486f,10.1074/jbc.M115.666578,9606_0:00486f,451,459,...P.E..[FYWDE],True,Homo sapiens
3,TICAM1,Q8IUC6,CQEPEEMSW,9606_0:004368,"10.1073/pnas.0308496101, 10.4049/jimmunol.171....",9606_0:004368,246,254,...P.E..[FYWDE],True,Homo sapiens
4,IRAK2,O43187,SNTPEETDD,9606_0:000e31,10.1038/nature00888,9606_0:000e31,522,530,...P.E..[FYWDE],True,Homo sapiens
5,IRAK1,P51617,PPSPQENSY,9606_0:004fa3,10.1038/nature00888,9606_0:004fa3,538,546,...P.E..[FYWDE],True,Homo sapiens
6,IRAK1,P51617,PNQPVESDE,9606_0:004fa3,10.1038/nature00888,9606_0:004fa3,581,589,...P.E..[FYWDE],True,Homo sapiens
7,IRAK1,P51617,RQGPEESDE,9606_0:004fa3,10.1038/nature00888,9606_0:004fa3,700,708,...P.E..[FYWDE],True,Homo sapiens
8,IRAK3 (IRAK-M),Q9Y616,PSIPVEDDE,9606_0:0031e9,10.1038/nature00888,9606_0:0031e9,474,482,...P.E..[FYWDE],True,Homo sapiens
9,mouse TNFRSF11A (RANK),O35305,RKIPTEDEY,10090_0:000361,10.1038/nature00888,10090_0:000361,336,344,...P.E..[FYWDE],True,Mus musculus


In [48]:
elm_instances_df = pd.read_csv(ELM_INSTANCES_FILE)
df_bg = get_bg_seqs(
    BG_SEQ_FASTA_FILE,
    regex=REGEX,
    instance_df=elm_instances_df,
    idr_map=idr_map,
    bg_organism=BG_ORGANISM,
)
# Cap the sample size at the number of available hits: DataFrame.sample
# raises ValueError when asked for more rows than exist.
df_bg = df_bg.sample(min(300, len(df_bg)), random_state=42)
dfm = merge_verified_and_fp_motifs(df_verified, df_bg, cluster_dict=BG_CLUSTER_DICT)
dfm["ELM_motif_class"] = CLASS_NAME

added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members
added cluster members


In [49]:
print(len(dfm))

312


In [50]:
benchmark_df = pd.concat([benchmark_df, dfm]).reset_index(drop=True)

In [51]:
benchmark_df

Unnamed: 0,Organism,Primary_Acc,Accessions,UniprotID,regex,motif_match,odb_id,odb_mot_st,odb_mot_end,verified interaction,ELM_motif_class,name,Name,OrthoDB id,reference DOI
0,Rattus norvegicus,O08838,O08838,O08838,DP[FW],DPF,10116_0:004cdf,356,358,True,LIG_AP2alpha_2,,,,
1,Rattus norvegicus,Q05140,Q05140,Q05140,DP[FW],DPF,10116_0:002e5e,399,401,True,LIG_AP2alpha_2,,,,
2,Rattus norvegicus,Q05140,Q05140,Q05140,DP[FW],DPF,10116_0:002e5e,473,475,True,LIG_AP2alpha_2,,,,
3,Homo sapiens,P98082,P98082 A6NES5 Q13598 Q9BTY0 Q9UK04,P98082,DP[FW],DPF,9606_0:0016b2,292,294,True,LIG_AP2alpha_2,,,,
4,Homo sapiens,P98082,P98082 A6NES5 Q13598 Q9BTY0 Q9UK04,P98082,DP[FW],DPF,9606_0:0016b2,297,299,True,LIG_AP2alpha_2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3053,Homo sapiens,,,A0A5F9ZHS8,...P.E..[FYWDE],IRDPSEFEY,9606_0:00386f,1011,1019,False,TRAF6,,,,
3054,Homo sapiens,,,P22087,...P.E..[FYWDE],NLVPGESVY,9606_0:004480,109,117,False,TRAF6,,,,
3055,Homo sapiens,,,Q92791,...P.E..[FYWDE],TEPPLEPED,9606_0:003e0d,387,395,False,TRAF6,,,,
3056,Homo sapiens,,,,...P.E..[FYWDE],NSSPEEVQF,9606_0:00123d,2623,2631,False,TRAF6,,,,


# filter out really long sequences

In [52]:
MAX_SEQ_LENGTH = 5000
# change to 4000 in the next iteration
# MAX_SEQ_LENGTH = 4000

In [53]:
def get_seq_length(odb_gene_id: str) -> int:
    """Return the length of the sequence stored for an OrthoDB gene id.

    Raises KeyError if the id is not in
    ``ORTHODB_DATABASE.data_all_seqrecords_dict``.
    """
    record = ORTHODB_DATABASE.data_all_seqrecords_dict[odb_gene_id]
    return len(record.seq)

In [54]:
# Annotate every hit with the length of its parent sequence, then keep only
# hits from sequences strictly shorter than MAX_SEQ_LENGTH.
seq_lengths = benchmark_df['odb_id'].apply(get_seq_length)
benchmark_df = benchmark_df.assign(seq_length=seq_lengths)
benchmark_df = benchmark_df.loc[benchmark_df['seq_length'] < MAX_SEQ_LENGTH]

In [55]:
print(len(benchmark_df))

3002


In [56]:
benchmark_df.groupby('ELM_motif_class')['verified interaction'].value_counts().unstack().fillna(0)

verified interaction,False,True
ELM_motif_class,Unnamed: 1_level_1,Unnamed: 2_level_1
DOC_WW_Pin1_4,332,79
LIG_14-3-3_CanoR_1,342,52
LIG_AP2alpha_2,337,50
LIG_EH_1,336,54
LIG_SH2_GRB2like,342,30
LIG_SH3_CIN85_PxpxPR_1,346,32
TRAF6,294,12
enah_LPPPP_FPPPP,328,36


In [57]:
len(benchmark_df['odb_id'].unique())

2231

In [58]:
# Column names written to the final benchmark table (old name -> new name).
rn_dict = dict(
    motif_match='hit_sequence',
    odb_id='gene_id',
    odb_mot_st='hit start position',
    odb_mot_end='hit end position',
)

benchmark_df = benchmark_df.rename(columns=rn_dict)

In [59]:
BENCHMARK_DIR / "benchmark_table.csv"

PosixPath('/home/jch/Documents/08-benchmark/benchmark/benchmark_v4/p1_table/benchmark_table.csv')

In [60]:
benchmark_df.to_csv(BENCHMARK_DIR / "benchmark_table.csv", index=False)