In [33]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np
import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional
from pathlib import Path

# Turn off pandas' chained-assignment check for the whole notebook.
# NOTE(review): with this set to None, assigning a column on a filtered
# sub-frame no longer emits a warning, so such assignments go unnoticed.
pd.options.mode.chained_assignment = None  # default='warn'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from phosphosite.utils import aa1to3, aa3to1
from phosphosite.utils.graphs import get_seq_distance

# NOTE(review): presumably the neighbourhood radius (in Å?) behind the "R6A"
# motif file loaded further down — confirm. No visible cell reads this value.
radius = 6.0

In [10]:
# Load the motif table. Judging by the displayed rows, the `prev`, `site`,
# `next` and `nearest_node` columns hold residue labels such as "A:THR:119".
motif_df = pd.read_csv(
    "data/motif_dataframe/all_R6A.csv",
    header=0,
    delimiter=",",
)
# Residue name of the site = second ":"-separated field of `site`.
# Uses the vectorised string accessor rather than a row-wise `apply`.
motif_df["site_res"] = motif_df["site"].str.split(":").str[1]

# Keep only the motifs that have a nearest node. `.copy()` makes this an
# independent frame, so the column added below is written to `nearest_node`
# itself and not to a slice of `motif_df`.
nearest_node = motif_df[motif_df["nearest_node"].notna()].copy()
percentage = len(nearest_node) / len(motif_df) * 100
print(f"{percentage:.1f}% of motifs have a nearest node. ({len(nearest_node)} / {len(motif_df)})")

# Sequence distance between each site and its nearest node. `absolute=False`
# presumably keeps the sign (the printed range below includes negatives).
nearest_node["sequence_distance"] = nearest_node.apply(
    lambda row: get_seq_distance(row["site"], row["nearest_node"], absolute=False),
    axis=1,
)
# min and max sequence distance
print("Sequence distance range of 3rd nearest: ", nearest_node["sequence_distance"].min(), "--", nearest_node["sequence_distance"].max())
print()
nearest_node.head(2)

57.9% of motifs have a nearest node. (126760 / 218940)
Sequence distance range of 3rd nearest:  -2409 -- 2381



Unnamed: 0,uniprot_id,prev,site,next,nearest_node,site_res,sequence_distance
0,A0A0B4J2A2,A:LYS:118,A:THR:119,A:GLU:120,A:HIS:92,THR,27
1,A0A0B4J2A2,A:GLY:109,A:SER:110,A:GLN:111,A:LYS:76,SER,34


In [34]:
def filter_by_combo(
    df: pd.DataFrame,
    combo: Tuple[str, str]
) -> pd.DataFrame:
    """Select motif rows whose residue triplet and nearest node match `combo`.

    Every one-letter code in `combo` is translated with `aa1to3`, and a row is
    kept when the translated code occurs as a literal substring of the
    corresponding column value (e.g. ``"HIS"`` inside ``"A:HIS:386"``).

    Assumes dataframe has columns
    - `prev`
    - `site`
    - `next`
    - `nearest_node`

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to filter.
    combo : Tuple[str, str]
        ``(triplet, nearest)``: `triplet` is three one-letter residue codes in
        prev/site/next order, `nearest` is the one-letter code of the nearest
        node residue.

    Returns
    -------
    pd.DataFrame
        The rows of `df` for which all four columns match. Rows with a missing
        value in any of the four columns are never selected.

    Raises
    ------
    ValueError
        If `combo` does not unpack into two items, or `triplet` into three.
    """
    triplet, nearest_code = combo
    # Local names avoid shadowing the builtin `next`.
    prev_res, site_res, next_res = (aa1to3[aa] for aa in triplet)
    nearest_res = aa1to3[nearest_code]

    # regex=False: residue codes are plain text, not patterns.
    # na=False: a missing value counts as "no match", which keeps the combined
    # mask strictly boolean.
    mask = (
        df["prev"].str.contains(prev_res, regex=False, na=False)
        & df["site"].str.contains(site_res, regex=False, na=False)
        & df["next"].str.contains(next_res, regex=False, na=False)
        & df["nearest_node"].str.contains(nearest_res, regex=False, na=False)
    )
    return df[mask]


In [None]:
# Candidate combo: (prev/site/next triplet, nearest-node residue), one-letter
# codes. This cell has no execution count and the next cell reassigns `combo`.
combo = ("PYK", "F")

In [37]:
# Combo under inspection: (prev/site/next triplet, nearest-node residue), given
# as one-letter codes; `filter_by_combo` maps them through `aa1to3`.
combo = ("HTG", "R")

In [38]:


# Rows of `nearest_node` matching `combo`, ordered by `uniprot_id`, with the
# helper `site_res` column removed.
df = (
    filter_by_combo(nearest_node, combo)
    .sort_values(by="uniprot_id")
    .drop(columns=["site_res"])
)
df

Unnamed: 0,uniprot_id,prev,site,next,nearest_node,sequence_distance
200715,A1YPR0,A:HIS:386,A:THR:387,A:GLY:388,A:ARG:384,3
156778,A2RRD8,A:HIS:267,A:THR:268,A:GLY:269,A:ARG:265,3
156787,A2RRD8,A:HIS:323,A:THR:324,A:GLY:325,A:ARG:321,3
156784,A2RRD8,A:HIS:239,A:THR:240,A:GLY:241,A:ARG:237,3
156781,A2RRD8,A:HIS:379,A:THR:380,A:GLY:381,A:ARG:377,3
...,...,...,...,...,...,...
103894,Q9Y6Q3,A:HIS:539,A:THR:540,A:GLY:541,A:ARG:537,3
103896,Q9Y6Q3,A:HIS:343,A:THR:344,A:GLY:345,A:ARG:341,3
103899,Q9Y6Q3,A:HIS:567,A:THR:568,A:GLY:569,A:ARG:565,3
103907,Q9Y6Q3,A:HIS:315,A:THR:316,A:GLY:317,A:ARG:313,3


In [39]:
# Echo the selection as tab-separated text (index omitted).
print(df.to_csv(sep="\t", index=False))

# Write the same tab-separated table to a file named after the combo.
df.to_csv(
    path_or_buf=f"SANITY_CHECK_{combo[0]}_{combo[1]}_.tsv",
    index=False,
    sep="\t",
)

uniprot_id	prev	site	next	nearest_node	sequence_distance
A1YPR0	A:HIS:386	A:THR:387	A:GLY:388	A:ARG:384	3
A2RRD8	A:HIS:267	A:THR:268	A:GLY:269	A:ARG:265	3
A2RRD8	A:HIS:323	A:THR:324	A:GLY:325	A:ARG:321	3
A2RRD8	A:HIS:239	A:THR:240	A:GLY:241	A:ARG:237	3
A2RRD8	A:HIS:379	A:THR:380	A:GLY:381	A:ARG:377	3
A2RRD8	A:HIS:155	A:THR:156	A:GLY:157	A:ARG:153	3
A2RRD8	A:HIS:351	A:THR:352	A:GLY:353	A:ARG:349	3
A6NDX5	A:HIS:327	A:THR:328	A:GLY:329	A:ARG:325	3
A6NFI3	A:HIS:741	A:THR:742	A:GLY:743	A:ARG:739	3
A6NFI3	A:HIS:769	A:THR:770	A:GLY:771	A:ARG:767	3
A6NFI3	A:HIS:909	A:THR:910	A:GLY:911	A:ARG:907	3
A6NFI3	A:HIS:881	A:THR:882	A:GLY:883	A:ARG:879	3
A6NFI3	A:HIS:853	A:THR:854	A:GLY:855	A:ARG:851	3
A6NFI3	A:HIS:797	A:THR:798	A:GLY:799	A:ARG:795	3
A6NFI3	A:HIS:395	A:THR:396	A:GLY:397	A:ARG:393	3
A6NHJ4	A:HIS:336	A:THR:337	A:GLY:338	A:ARG:334	3
A6NHJ4	A:HIS:448	A:THR:449	A:GLY:450	A:ARG:446	3
A6NHJ4	A:HIS:532	A:THR:533	A:GLY:534	A:ARG:530	3
A6NHJ4	A:HIS:308	A:THR:309	A:GLY:310	A:ARG:306	3
A6NHJ4	A:HIS

In [40]:
# Summary statistics of `sequence_distance` for the rows selected by `combo`.
df["sequence_distance"].describe()

count    1914.0
mean        3.0
std         0.0
min         3.0
25%         3.0
50%         3.0
75%         3.0
max         3.0
Name: sequence_distance, dtype: float64

### Sanity check some weird combos.