# run with [pipeline 1](https://github.com/jacksonh1/orthogroup_generation) environment
`mamba activate odb_groups_x86`

need access to the orthoDB tools

In [2]:
import pandas as pd
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from Bio import AlignIO, Seq, SeqIO, Align
import numpy as np
import pandas as pd
from pyprojroot import here

import local_seqtools.general_utils as tools
# from local_env_variables import project_filepaths as fp

%load_ext autoreload
%autoreload 2

plan
1. import instance table
2. import fasta file
3. parse fasta file to get uniprot ids -> make a dictionary of uniprot_id:sequence
    - make sure that all of the uniprot_ids are unique (no duplicates)
4. map fasta id from instance table to the sequence id in the fasta
    - make sure that all of the table entries have a corresponding fasta entry
5. import elm classes table
6. map regex from elm classes table to the instance table - merge the tables
7. use the start and end positions to get the sequence from the fasta entries
8. Check that the regex matches the sequence pulled out from the positions in the instance table
9. Filter or fix any problems with the regex matching the sequence (~29 sequences total)
10. get a "hit sequence" from the fasta entry where it's the motif surrounded by some flanking sequence 
    - This should be the hit that you search for in the orthodb sequence. Ideally, there should be only one hit sequence found in the orthodb sequence
11. get the offset where the motif starts within the hit sequence
12. map the uniprot id to the orthodb database to retrieve the orthodb sequence
    - check if there are any that didn't map (~4 sequences total)
13. find the hit sequence in the orthodb sequence

# download the data

In [None]:
!mkdir "../../../data/ELM/2024-02-09-ELM_instances"
!curl "http://elm.eu.org/instances.tsv?q=*&instance_logic=true+positive" -o "../../../data/ELM/2024-02-09-ELM_instances/TPinstances.tsv"
!curl "http://elm.eu.org/instances.fasta?q=*&instance_logic=true+positive" -o "../../../data/ELM/2024-02-09-ELM_instances/TPinstances.fasta"
!curl "http://elm.eu.org/elms/elms_index.tsv" -o "../../../data/ELM/2024-02-09-ELM_instances/elms_index.tsv"

In [3]:
elm_instance_dir = here() / "data" / "ELM" / "2024-02-09-ELM_instances"

elm_instance_tsv = elm_instance_dir / "TPinstances.tsv"
elm_instance_fasta = elm_instance_dir / "TPinstances.fasta"
elm_classes_tsv = elm_instance_dir / "elms_index.tsv"

elm_instance_hit_seq_table = elm_instance_dir / "elm_instances_with_hit_sequence.csv"

## import instances

In [4]:
inst_df = pd.read_csv(elm_instance_tsv, sep='\t', skiprows=5)
for i in inst_df.columns: print(i)
inst_df.head()

Accession
ELMType
ELMIdentifier
ProteinName
Primary_Acc
Accessions
Start
End
References
Methods
InstanceLogic
PDB
Organism


Unnamed: 0,Accession,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,InstanceLogic,PDB,Organism
0,ELMI003774,CLV,CLV_C14_Caspase3-7,A0A0H3NIK3_SALTS,A0A0H3NIK3,A0A0H3NIK3,483,487,20947770,enzymatic reaction; mutation analysis; proteas...,true positive,,Salmonella enterica subsp. enterica serovar Ty...
1,ELMI002256,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,103,107,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
2,ELMI001933,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,106,110,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
3,ELMI001914,CLV,CLV_C14_Caspase3-7,BCAR1_RAT,Q63767,Q63767 Q63766,413,417,10712510,classical fluorescence spectroscopy; cleavage ...,true positive,,Rattus norvegicus
4,ELMI001915,CLV,CLV_C14_Caspase3-7,BCAR1_RAT,Q63767,Q63767 Q63766,745,749,10712510,classical fluorescence spectroscopy; cleavage ...,true positive,,Rattus norvegicus


In [5]:
inst_df['Organism'].value_counts()

Organism
Homo sapiens                                 2247
Mus musculus                                  381
Saccharomyces cerevisiae S288c                209
Rattus norvegicus                             169
Drosophila melanogaster                       108
                                             ... 
Orf virus                                       1
Hepatitis B virus adw2/Rutter/1979              1
Human adenovirus 9                              1
African swine fever virus Malawi LIL 20/1       1
Leishmania mexicana                             1
Name: count, Length: 259, dtype: int64

## import fasta, parse uniprot ids, map to table

In [6]:
faimporter = tools.FastaImporter(elm_instance_fasta)
seqlist = faimporter.import_as_list()
# parse the fasta id to get the uniprot id
ids = set([tools.split_uniprot(i.id)[1] for i in seqlist])
assert len(ids) == len(seqlist), "not as many parsed uniprot ids as fasta entries"
# name_map = {tools.split_uniprot(i.id)[1]:tools.split_uniprot(i.id)[0] for i in seqlist}
# make a dictionary of uniprot_id:sequence (I could change the fasta ids to just uniprot id but I won't for now)
id_map = {tools.split_uniprot(i.id)[1]:i.id for i in seqlist}
# map ids onto the instance table
inst_df['fasta_id'] = inst_df['Primary_Acc'].map(id_map)
# check that all table rows have a fasta entry
assert inst_df['fasta_id'].isna().sum() == 0, "not all table entries have a fasta entry"

## import elm classes

In [7]:
# import classes table
classes_df = pd.read_csv(elm_classes_tsv, sep='\t', skiprows=5)
for i in classes_df.columns: print(i)
classes_df.head()
c_cols = set(classes_df.columns)
i_cols = set(inst_df.columns)
print(c_cols.intersection(i_cols))

Accession
ELMIdentifier
FunctionalSiteName
Description
Regex
Probability
#Instances
#Instances_in_PDB
{'ELMIdentifier', 'Accession'}


`Accession` columns are different between the classes and instances. Map on the `ELMIdentifier` column

## merge classes and instance tables 

In [8]:
# merge the tables
dfm = pd.merge(inst_df, classes_df, on='ELMIdentifier', how='left', suffixes=('_inst', '_class'))
assert len(dfm) == len(inst_df), "not all instances were mapped to a class"
dfm.head()

Unnamed: 0,Accession_inst,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,...,PDB,Organism,fasta_id,Accession_class,FunctionalSiteName,Description,Regex,Probability,#Instances,#Instances_in_PDB
0,ELMI003774,CLV,CLV_C14_Caspase3-7,A0A0H3NIK3_SALTS,A0A0H3NIK3,A0A0H3NIK3,483,487,20947770,enzymatic reaction; mutation analysis; proteas...,...,,Salmonella enterica subsp. enterica serovar Ty...,sp|A0A0H3NIK3|A0A0H3NIK3_SALTS,ELME000321,Caspase cleavage motif,Caspase-3 and Caspase-7 cleavage site.,[DSTE][^P][^DEWHFYC]D[GSAN],0.003094,41,0
1,ELMI002256,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,103,107,10085113 9535906,cleavage reaction; mutation analysis; western ...,...,,Homo sapiens,sp|P54259|ATN1_HUMAN,ELME000321,Caspase cleavage motif,Caspase-3 and Caspase-7 cleavage site.,[DSTE][^P][^DEWHFYC]D[GSAN],0.003094,41,0
2,ELMI001933,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,106,110,10085113 9535906,cleavage reaction; mutation analysis; western ...,...,,Homo sapiens,sp|P54259|ATN1_HUMAN,ELME000321,Caspase cleavage motif,Caspase-3 and Caspase-7 cleavage site.,[DSTE][^P][^DEWHFYC]D[GSAN],0.003094,41,0
3,ELMI001914,CLV,CLV_C14_Caspase3-7,BCAR1_RAT,Q63767,Q63767 Q63766,413,417,10712510,classical fluorescence spectroscopy; cleavage ...,...,,Rattus norvegicus,sp|Q63767|BCAR1_RAT,ELME000321,Caspase cleavage motif,Caspase-3 and Caspase-7 cleavage site.,[DSTE][^P][^DEWHFYC]D[GSAN],0.003094,41,0
4,ELMI001915,CLV,CLV_C14_Caspase3-7,BCAR1_RAT,Q63767,Q63767 Q63766,745,749,10712510,classical fluorescence spectroscopy; cleavage ...,...,,Rattus norvegicus,sp|Q63767|BCAR1_RAT,ELME000321,Caspase cleavage motif,Caspase-3 and Caspase-7 cleavage site.,[DSTE][^P][^DEWHFYC]D[GSAN],0.003094,41,0


## get hit sequence from start/end positions

In [9]:
# function to pad the hit sequence
def pad_hit(seq: str, st_pos: int, end_pos: int, l_flank: int=0, r_flank: int=0):
    st = max(0, st_pos - l_flank)
    end = min(len(seq)-1, end_pos + r_flank)
    return st, end, seq[st:end+1]

# test this function
seq = "ABCDEFGHIJK"
st_pos = 3
end_pos = 7
l_flank = 2
r_flank = 2
pad_hit(seq, st_pos, end_pos, l_flank, r_flank)
print(seq[st_pos:end_pos+1])
print(seq[st_pos:len(seq)-1+1])

DEFGH
DEFGHIJK


In [10]:
# apply the function to the dataframe - get just the exact hit (no flanking)
seqdict = {i.id:str(i.seq) for i in seqlist}
dfm2 = dfm.copy()
dfm2['fl_sequence'] = dfm2['fasta_id'].map(seqdict)
# check that all of the sequences were mapped
assert dfm2['fl_sequence'].isna().sum() == 0, "not all fasta ids were mapped to a sequence"
dfm2['motif_match'] = dfm2.apply(lambda x: pad_hit(x['fl_sequence'], x['Start']-1, x['End']-1), axis=1)
dfm2[['Start', 'End', 'motif_match', 'Regex']].head()

Unnamed: 0,Start,End,motif_match,Regex
0,483,487,"(482, 486, DCTDG)",[DSTE][^P][^DEWHFYC]D[GSAN]
1,103,107,"(102, 106, SDLDS)",[DSTE][^P][^DEWHFYC]D[GSAN]
2,106,110,"(105, 109, DSLDG)",[DSTE][^P][^DEWHFYC]D[GSAN]
3,413,417,"(412, 416, DVPDG)",[DSTE][^P][^DEWHFYC]D[GSAN]
4,745,749,"(744, 748, DSPDG)",[DSTE][^P][^DEWHFYC]D[GSAN]


given positions are 1-indexed, so subtract 1 to get the correct position in the sequence

### see if the sequence matches the regex - fullmatch

In [11]:
dfm2 = dfm.copy()
dfm2['fl_sequence'] = dfm2['fasta_id'].map(seqdict)
dfm2[['mot_start_0','mot_end_0','motif_match']] = dfm2.apply(lambda x: pad_hit(x['fl_sequence'], x['Start']-1, x['End']-1), axis=1, result_type='expand')
# check that the extracted sequences matches the regex
dfm2['hit_matches_regex'] = dfm2.apply(lambda x: re.fullmatch(x['Regex'], x['motif_match']) is not None, axis=1)
# check that all of the regexes match the extracted sequences
assert dfm2['hit_matches_regex'].sum() == len(dfm2), "not all regexes match the extracted sequences"

AssertionError: not all regexes match the extracted sequences

In [12]:
dfm2['hit_matches_regex'].value_counts()

hit_matches_regex
True     3955
False      55
Name: count, dtype: int64

In [17]:
# dfm2[~dfm2['hit_matches_regex']][['Start', 'End', 'motif_match', 'Regex', 'ELMIdentifier']]

not all the regexes match the extracted sequences. 55 do not match

try with re.search instead of re.fullmatch

### see if the sequence matches the regex - search

In [14]:
# keep creating a copy of df and overwriting the old version to avoid some confusion and modifying the original df
dfm2 = dfm.copy()
dfm2['fl_sequence'] = dfm2['fasta_id'].map(seqdict)
dfm2[['mot_start_0','mot_end_0','motif_match']] = dfm2.apply(lambda x: pad_hit(x['fl_sequence'], x['Start']-1, x['End']-1), axis=1, result_type='expand')
# check that the extracted sequences matches the regex
dfm2['hit_matches_regex'] = dfm2.apply(lambda x: re.search(x['Regex'], x['motif_match']) is not None, axis=1)
dfm2['hit_matches_regex'].value_counts()

hit_matches_regex
True     3995
False      15
Name: count, dtype: int64

In [15]:
dfm2[~dfm2['hit_matches_regex']][['Start', 'End', 'motif_match', 'Regex']].head()

Unnamed: 0,Start,End,motif_match,Regex
144,184,188,SSGGU,...G[GA]$
687,1336,1340,SRFSI,RF[^P][IV].
688,1346,1350,SDAQS,RF[^P][IV].
717,668,669,TP,...([ST])P.
1396,459,463,ELVKH,.NPF.


it seems that most of the time it's from incorrect Start/End positions. check out dfm2.loc[687], 20-38 are given for a regex that is at most 17 positions<br><br>
for dfm2.loc[438], the positions just are not inclusive enough (TP vs. ...([ST])P.)<br><br>
There is one that has a regex that just doesn't match at all - dfm2.loc[800] (ELVKH vs. .NPF.)



new plan:
1. Use the positions to slice the sequence with some flanking sequence - add new positions
2. use the regex to exact the match and positions of the match
    - raise an error if there is more than one match
3. correct for the true positions of the match in the full length sequence

OR

1. just ignore the entries that are messed up

**I will just ignore the entries that are messed up**

### I am going to remove instances that don't have a fullmatch

### inspect entries where the regex does not match the sequence

In [23]:
dfm2 = dfm.copy()
dfm2['fl_sequence'] = dfm2['fasta_id'].map(seqdict)
dfm2[['mot_start_0','mot_end_0','motif_match']] = dfm2.apply(lambda x: pad_hit(x['fl_sequence'], x['Start']-1, x['End']-1), axis=1, result_type='expand')
dfm2['motif_matches_regex'] = dfm2.apply(lambda x: re.fullmatch(x['Regex'], x['motif_match']) is not None, axis=1)
dfm2['motif_matches_regex'].value_counts()

motif_matches_regex
True     3955
False      55
Name: count, dtype: int64

In [24]:
# dfm2[~dfm2['hit_matches_regex']]['ELMIdentifier'].value_counts()
dfm2[~dfm2['motif_matches_regex']][['ELMIdentifier', 'ProteinName', 'Start', 'End', 'motif_match', 'Regex']]

Unnamed: 0,ELMIdentifier,ProteinName,Start,End,motif_match,Regex
144,DEG_Cend_KLHDC2_1,SELS_HUMAN,184,188,SSGGU,...G[GA]$
687,DOC_SPAK_OSR1_1,A6BLY8_MOUSE,1336,1340,SRFSI,RF[^P][IV].
688,DOC_SPAK_OSR1_1,A6BLY8_MOUSE,1346,1350,SDAQS,RF[^P][IV].
717,DOC_WW_Pin1_4,A4_HUMAN,668,669,TP,...([ST])P.
1110,LIG_CaM_IQ_9,ADCY8_RAT,1191,1209,AVVLGLVQSLNRQRQKQLL,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...
1112,LIG_CaM_IQ_9,CAC1C_HUMAN,1666,1685,FYATFLIQEYFRKFKKRKEQ,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...
1113,LIG_CaM_IQ_9,CAC1C_RABIT,1648,1667,FYATFLIQEYFRKFKKRKEQ,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...
1114,LIG_CaM_IQ_9,CAC1D_RAT,1650,1669,FYATFLIQDYFRKFKKRKEQ,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...
1115,LIG_CaM_IQ_9,CAC1S_HUMAN,1523,1542,FYATFLIQEHFRKFMKRQEE,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...
1116,LIG_CaM_IQ_9,CAVPT_BRALA,34,52,ISAATRIQASFRMHKNRMA,[ACLIVTM][^P][^P][ILVMFCT]Q[^P][^P][^P][RK][^P...


## filter out instances that don't fully match the regex

In [25]:
print(len(dfm2))
dfm2 = dfm2[dfm2['motif_matches_regex']]
print(len(dfm2))

4010
3955


## Get the hit sequence with some flank

In [26]:
FLANK = 15

In [27]:
dfm2[['hit_start_0','hit_end_0','hit_sequence']] = dfm2.apply(lambda x: pad_hit(x['fl_sequence'], x['Start']-1, x['End']-1, l_flank=15, r_flank=15), axis=1, result_type='expand')
dfm2['mot_start_pos_in_hit'] = dfm2['mot_start_0'] - dfm2['hit_start_0']
dfm2['mot_start_pos_in_hit'].value_counts()

mot_start_pos_in_hit
15    3676
0       87
6       19
1       18
2       17
3       17
11      17
8       16
4       16
9       14
7       13
5       12
12      11
14       9
13       9
10       4
Name: count, dtype: int64

In [28]:
dfm2 = dfm2.drop(columns=['fl_sequence'])

# map to the orthoDB

In [29]:
from local_orthoDB_group_pipeline import uniprotid_search

In [30]:
def uni2odb(uni_id):
    try:
        odb_id = uniprotid_search.uniprotid_2_odb_gene_id(uni_id)
        return odb_id
    except ValueError as e:
        print(e)
        print(f'COULD NOT FIND: {uni_id}')
        return False

## have to convert uniprot ids from isoforms like this: 'Q9Y2R2-2' to this: 'Q9Y2R2'

In [31]:
re.sub(r'(.+)-\d+', r'\1', 'Q8TEV9-2')

'Q8TEV9'

In [32]:
# dfm2['uniprot2map'] = dfm2['Primary_Acc'].map(lambda x: re.sub('(.+)-\d+', r'\1', x))
dfm2['uniprot2map'] = dfm2['Primary_Acc'].apply(lambda x: re.sub(r'(.+)-\d+', r'\1', x))
# dfm2['uniprot2map'] = dfm2['Primary_Acc'].str.replace('(.+)-\d+', r'\1')
# dfm2['Primary_Acc'].str.replace('(.+)-\d+', r'\1')

In [33]:
dfm2[dfm2['uniprot2map'] == 'P03070']

Unnamed: 0,Accession_inst,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,...,#Instances_in_PDB,mot_start_0,mot_end_0,motif_match,motif_matches_regex,hit_start_0,hit_end_0,hit_sequence,mot_start_pos_in_hit,uniprot2map
229,ELMI001402,DEG,DEG_SCF_FBW7_2,LT_SV40,P03070,P03070,699,705,15611062,classical fluorescence spectroscopy; coimmunop...,...,0,698,704,PPTPPPE,True,683,707,NQPYHICRGFTCFKKPPTPPPEPET,15,P03070
2193,ELMI003110,LIG,LIG_RB_LxCxE_1,LT_SV40,P03070,P03070,101,119,11226179 2839300 16118215 24371076 8676470,coimmunoprecipitation; glutathione s tranferas...,...,9,100,118,ENLFCSEEMPSSDDEATAD,True,85,133,YGTDEWEQWWNAFNEENLFCSEEMPSSDDEATADSQHSTPPKKKRKVED,15,P03070
3921,ELMI001354,TRG,TRG_NLS_MonoCore_2,LT_SV40,P03070,P03070,127,132,6096007 12852786,colocalization; mutation analysis; x-ray cryst...,...,1,126,131,KKKRKV,True,111,146,SDDEATADSQHSTPPKKKRKVEDPKDFPSELLSFLS,15,P03070
3941,ELMI001358,TRG,TRG_NLS_MonoExtC_3,LT_SV40,P03070,P03070,127,133,6096007 12852786,colocalization; mutation analysis; x-ray cryst...,...,2,126,132,KKKRKVE,True,111,147,SDDEATADSQHSTPPKKKRKVEDPKDFPSELLSFLSH,15,P03070
3962,ELMI001362,TRG,TRG_NLS_MonoExtN_4,LT_SV40,P03070,P03070,126,132,6096007 12852786,colocalization; mutation analysis; x-ray cryst...,...,2,125,131,PKKKRKV,True,110,146,SSDDEATADSQHSTPPKKKRKVEDPKDFPSELLSFLS,15,P03070
3963,ELMI002397,TRG,TRG_NLS_MonoExtN_4,LT_SV40,P03070,P03070,127,132,6096007,colocalization; mutation analysis,...,2,126,131,KKKRKV,True,111,146,SDDEATADSQHSTPPKKKRKVEDPKDFPSELLSFLS,15,P03070


In [34]:
dfm2['odb_id'] = dfm2['uniprot2map'].apply(uni2odb)

A0A0H3NIK3 not found in gene key table, searching in xref table
not found in xref key table
Uniprot id `A0A0H3NIK3` not found in human gene key or xref tables
COULD NOT FIND: A0A0H3NIK3
P12830 not found in gene key table, searching in xref table
P55211 not found in gene key table, searching in xref table
Q99741 not found in gene key table, searching in xref table
P52566 not found in gene key table, searching in xref table
O35254 not found in gene key table, searching in xref table
not found in xref key table
Uniprot id `O35254` not found in human gene key or xref tables
COULD NOT FIND: O35254
O35254 not found in gene key table, searching in xref table
not found in xref key table
Uniprot id `O35254` not found in human gene key or xref tables
COULD NOT FIND: O35254
O35254 not found in gene key table, searching in xref table
not found in xref key table
Uniprot id `O35254` not found in human gene key or xref tables
COULD NOT FIND: O35254
Q9UQF2 not found in gene key table, searching in xre

How many uniprot ids failed to map to odb_id

In [35]:
(dfm2['odb_id']==False).sum()

584

# Get the hit sequence positions within the orthodb version of the sequence

In [36]:
import local_env_variables.env_variables as env
data_all_seqrecords_dict = env.load_data_all_odb_seqs()

entries with a odb_id but not a odb_seq are probably sequences that are not in the fasta file b/c the fasta file is only sequences that made it into ortholog groups.

In [37]:
## map the odb ids to the sequences
dfm2['odb_seq'] = dfm2['odb_id'].apply(lambda x: str(data_all_seqrecords_dict[x].seq) if x in data_all_seqrecords_dict.keys() else False)

In [38]:
(dfm2['odb_seq']==False).sum()

600

In [39]:
# find all occurences of a string in a string
def find_all(string: str, substring):
    start = 0
    while True:
        start = string.find(substring, start)
        if start == -1: return
        yield start
        start += len(substring)

list(find_all('ABDABCABC', 'ABC'))
# list(find_all('ABDDDDDDDC', 'ABC'))

[3, 6]

In [40]:
## search for the hit sequence in the odb sequence and return the start and end positions
def find_hit_in_odb_seq(hit_seq, odb_seq):
    match_positions = list(find_all(odb_seq, hit_seq))
    if len(match_positions) == 0:
        print(f'could not find {hit_seq} in {odb_seq}')
        return False
    elif len(match_positions) > 1:
        raise ValueError(f'found more than one match for {hit_seq} in {odb_seq}')
    elif len(match_positions) == 1:
        return match_positions[0], match_positions[0]+len(hit_seq)-1
    

def apply_find_hit_in_odb_seq(row: pd.Series):
    try:
        return find_hit_in_odb_seq(row['hit_sequence'], row['odb_seq'])
    except ValueError as e:
        print(e)
        return False

In [41]:
dfm3 = dfm2[dfm2['odb_seq'] != False].copy()
dfm3[['odb_hit_start', 'odb_hit_end']] = dfm3.apply(apply_find_hit_in_odb_seq, axis=1, result_type='expand')

could not find SIKNYIVDKTNEALAPRRTLKVIQQSASGCLVGRTKEPA in MRNGLGLKRRTKKGKKIATWCPRRSTLDPDIRLKMNSNMKQRSDVENPSMSIKNYIVDKTNEALAPRRTLKVIQQSASGCLVGRAKEPAKNSTKRKLWNDQLTSKKAKVEVAVDPENKDCPSEAYDLMVKETPTCLYWKDVAEERRKALYEALQENEKLHQEIELKDEEIARLKQENDELMELAGHVQYMANMIERLTGNAPQSLEDLKNLDLEEARFEDEAESRIEDETDMTQPSSSDQNMDKQTV
could not find SELNSSQSESAKAADDPENGERESHTPVSIQEEIVG in MRGRRGRPPKQPAAPAAERCAPAPPPPPPPPTSGPIGGLRSRHRGSSRGRWAAAQAEVAPKTRLSSPRGGSSSRRKPPPPPPAPPSTSAPGRGGRGGGGGRTGGGGGGGHLARTTAARRAVNKVVYDDHESEEEEEEEDMVSEEEEEEDGDAEETQDSEDDEEDEMEEDDDDSDYPEEMEDDDDDASYCTESSFRSHSTYSSTPGRRKPRVHRPRSPILEEKDIPPLEFPKSSEDLMVPNEHIMNVIAIYEVLRNFGTVLRLSPFRFEDFCAALVSQEQCTLMAEMHVVLLKAVLREEDTSNTTFGPADLKDSVNSTLYFIDGMTWPEVLRVYCESDKEYHHVLPYQEAEDYPYGPVENKIKVLQFLVDQFLTTNIAREELMSEGVIQYDDHCRVCHKLGDLLCCETCSAVYHLECVKPPLEEVPEDEWQCEVCVAHKVPGVTDCVAEIQKNKPYIRHEPIGYDRSRRKYWFLNRRLIIEEDTENENEKKIWYYSTKVQLAELIDCLDKDYWEAELCKILEEMREEIHRHMDITEDLTNKARGSNKSFLAAANEEILESIRAKKGDIDNVKSPEETEKDKNETENDSKDAEKNREEFEDQSLEKDSDDKTPDDDPEQGKSEVGDFKSEKSNGELSESPG

In [42]:
(dfm3['odb_hit_start']==False).sum()

332

In [43]:
dfm3.loc[2244]

Accession_inst                                                 ELMI004218
ELMType                                                               LIG
ELMIdentifier                                         LIG_RuBisCO_WRxxL_1
ProteinName                                              A0A2K3DA85_CHLRE
Primary_Acc                                                    A0A2K3DA85
Accessions                                                     A0A2K3DA85
Start                                                                 243
End                                                                   251
References                                                       33177094
Methods                 Identification by mass spectrometry; coimmunop...
InstanceLogic                                               true positive
PDB                                                                   NaN
Organism                                        Chlamydomonas reinhardtii
fasta_id                              

In [44]:
dfm3 = dfm3[dfm3['odb_hit_start'] != False]

In [45]:
dfm3['odb_hit_start'].apply(lambda x: type(x)).value_counts()

odb_hit_start
<class 'int'>    3023
Name: count, dtype: int64

In [46]:
dfm3['Organism'] = dfm3['Organism'].str.strip()

In [47]:
def get_odb_mot_pos(row: pd.Series):
    mot_st = row['odb_hit_start'] + row['mot_start_pos_in_hit']
    mot_end = mot_st + len(row['motif_match']) - 1
    return mot_st, mot_end

dfm3[['odb_mot_st', 'odb_mot_end']] = dfm3.apply(get_odb_mot_pos, axis=1, result_type='expand')

In [48]:
dfm3.to_csv(elm_instance_hit_seq_table, index=False)

In [49]:
# dfm3.groupby('ELMIdentifier').count().value_counts()
temp = dfm3['ELMIdentifier'].value_counts()
temp = temp.reset_index()
temp = pd.merge(temp, classes_df[['ELMIdentifier', 'Probability','Regex']], on='ELMIdentifier', how='left')
temp.to_csv('./new_instance_count_table.csv', index=False)

---

# random stuff

In [65]:
vcounts = dfm3['Organism'].value_counts()
for i in vcounts.index:
    print(i, vcounts[i])

Homo sapiens 1997
Mus musculus 305
Saccharomyces cerevisiae S288c 195
Rattus norvegicus 123
Saccharomyces cerevisiae 97
Drosophila melanogaster 75
Arabidopsis thaliana 44
Caenorhabditis elegans 23
Chlamydomonas reinhardtii 20
Xenopus laevis 18
Plasmodium falciparum 3D7 16
Legionella pneumophila subsp. pneumophila str. Philadelphia 1 9
Danio rerio 7
Simian virus 40 6
Bos taurus 6
Nipah virus 5
Haemophilus ducreyi 35000HP 4
Sus scrofa 4
Listeria monocytogenes 4
Tursiops truncatus 3
Solanum lycopersicum 3
Oryctolagus cuniculus 3
Chlamydophila caviae GPIC 3
Phytophthora infestans T30-4 2
Human papillomavirus type 16 2
Chlamydia trachomatis D/UW-3/CX 2
Candida albicans SC5314 2
Agrobacterium tumefaciens 2
Helicobacter pylori 26695 2
Toxoplasma gondii ME49 2
Gallus gallus 2
Phytophthora sojae 2
Oreochromis niloticus 2
Human T-lymphotropic virus 2 2
Aspergillus nidulans 2
Harvey murine sarcoma virus 1
Bovine papillomavirus type 1 1
Strongylocentrotus purpuratus 1
Listeria monocytogenes EGD-e 

In [52]:
dfm3[dfm3['Organism']=='Plasmodium falciparum 3D7']

Unnamed: 0,Accession_inst,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,...,motif_matches_regex,hit_start_0,hit_end_0,hit_sequence,mot_start_pos_in_hit,uniprot2map,odb_id,odb_seq,odb_hit_start,odb_hit_end
3983,ELMI003624,TRG,TRG_Pf-PMV_PEXEL_1,GBP_PLAF7,Q8I6U8,Q8I6U8,84,88,25850860 15591203,Identification by mass spectrometry; mutation ...,...,True,68,102,GDKYEKAVDYGFRESRILAEGEDTCARKEKTTLRK,15,Q8I6U8,36329_0:000ad3,MRLSKVSDIKSTGVSNYKNFNSKNSSKYSLMEVSKKNEKKNSLGAF...,68,102
3987,ELMI003635,TRG,TRG_Pf-PMV_PEXEL_1,O97336_PLAF7,O97336,O97336,86,90,19055692 25850860,Identification by mass spectrometry; mutation ...,...,True,70,104,GRRNKGKKILGIRINKSLAEMDHTKYHPEYYDEVQ,15,O97336,36329_0:00019e,MAKDSQKNLNVSNNNNVQCTMGRSSQNINKSDSKGKIKRCTYAYKI...,70,104
3988,ELMI003643,TRG,TRG_Pf-PMV_PEXEL_1,Q8I0U6_PLAF7,Q8I0U6,Q8I0U6,86,91,23387285,Identification by mass spectrometry; affinity ...,...,True,70,105,NGSSSSGVQFTDRCSRNLYGETLPVNPYADSENPIV,15,Q8I0U6,36329_0:000006,MRPFHAYSWIFSQQYMDTKNVKEKNPTIYSFDDEEKRNENKSFLKV...,70,105
3989,ELMI003630,TRG,TRG_Pf-PMV_PEXEL_1,Q8I202_PLAF7,Q8I202,Q8I202,59,63,19055692,Identification by mass spectrometry; western blot,...,True,43,77,NGKLDFKRSQRLKEYRILVEFSNSYYYDEPKVRII,15,Q8I202,36329_0:0002fe,MIYIRKDKLRFCFSFYFYVQLFIIYLFIWTENKYNEYGGDKNLNGK...,43,77
3990,ELMI003629,TRG,TRG_Pf-PMV_PEXEL_1,Q8I298_PLAF7,Q8I298,Q8I298,61,65,19055692,Identification by mass spectrometry; western blot,...,True,45,79,INKIGDNVYKNKIKSRILKENKEESLETAAVNENT,15,Q8I298,36329_0:00004b,MSLKRRKFILLSFVFSVVELIFGYDNIINNYGGLSSVVVYNKNVGI...,45,79
3991,ELMI003623,TRG,TRG_Pf-PMV_PEXEL_1,Q8I2C7_PLAF7,Q8I2C7,Q8I2C7,36,40,23387285 15591202,Identification by mass spectrometry; affinity ...,...,True,20,54,SKNKPSITPHHTQTNRSLCECDTQSTNYNNDEDIK,15,Q8I2C7,36329_0:000065,MKLHYTKILLFFFPLYILVYSKNKPSITPHHTQTNRSLCECDTQST...,20,54
3992,ELMI003632,TRG,TRG_Pf-PMV_PEXEL_1,Q8I2F2_PLAF7,Q8I2F2,Q8I2F2,57,61,19055692 25850860,Identification by mass spectrometry; mutation ...,...,True,41,75,NNNNNYGFHCNKRHFKSLAEASPEEHNNLRSHSTS,15,Q8I2F2,36329_0:00093e,MAVSTYNNTRRNGLRYVLKRRTILSVFAVICMLSLNLSIFENNNNN...,41,75
3993,ELMI003642,TRG,TRG_Pf-PMV_PEXEL_1,Q8I2F7_PLAF7,Q8I2F7,Q8I2F7,46,50,25850860,Identification by mass spectrometry; mutation ...,...,True,30,64,TFNYKYTTSYEGSSFRQLSEPVVEEQDLKKTNAES,15,Q8I2F7,36329_0:0008bc,MQTRKYNKMLSKVETKQFIYILFFLCLYLNTFNYKYTTSYEGSSFR...,30,64
3994,ELMI003626,TRG,TRG_Pf-PMV_PEXEL_1,Q8I489_PLAF7,Q8I489,Q8I489,60,64,15591203,tag visualisation by fluorescence,...,True,44,78,NEIFKNTKVFDFTSLRSLAEFNSGSSRESSKTDET,15,Q8I489,36329_0:0003e0,MSILNKYEGKKNKIFLFIINIILFYTLEYVLIGSNYDKHNQSFGNE...,44,78
3995,ELMI003631,TRG,TRG_Pf-PMV_PEXEL_1,Q8I490_PLAF7,Q8I490,Q8I490,89,93,19055692,Identification by mass spectrometry; western blot,...,True,73,107,YNKNKFHNTFNRRDTRVLAEQEDQYIRNPNNSNYP,15,Q8I490,36329_0:00041a,MMNKKSMQTKNFLSERNYGSIDQNVRTKNKRRLMKFQSKSKAKSFL...,73,107


In [81]:
from local_orthoDB_group_pipeline import sql_queries
from local_env_variables import env_variables as env
database = env.orthoDB_database()
og_ids = sql_queries.odb_gene_id_2_ogid_list('36329_0:000ad3')
for i in og_ids:
    info = sql_queries.get_ogid_info(i)
    print(info[0], database.data_levels_taxid_name_dict[int(info[1])], len(sql_queries.ogid_2_odb_gene_id_list(info[0])))

248648at2759 Eukaryota 8
4804at418107 Plasmodium (Laverania) 8
72006at33630 Alveolata 8
22705at422676 Aconoidasida 8
10733at5820 Plasmodium 8
52675at5794 Apicomplexa 8


In [87]:
dfm2['InstanceLogic'].value_counts()
dfm2['ELMType'].value_counts()

ELMType
LIG    1069
MOD     517
DOC     338
TRG     150
DEG     115
CLV      45
Name: count, dtype: int64