- create conda env from file. 
- download PSP data into separate repository. 
- try loading structures in with cif.gz
- proteome class for iterating over list of sites and a given structure directory (i.e. filter out sites with no structure)
- motif class for dealing with phosphosite

In [1]:
# Try and construct graphs per site from structure dir (cif.gz files)

# Try and use structuremap to do the same. 

In [2]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional
from pathlib import Path

### Load phosphosite dataset

In [163]:
from phosphosite.dataset import phosphorylation # Filtered out isoforms
df = phosphorylation[phosphorylation["ORGANISM"] == "human"]
print(len(df[df["ACC_ID"].str.contains("-")]) / len(df) * 100 )
len(df)

0.0


232587

In [164]:
# Sort by ACC_ID
df = df.sort_values("ACC_ID")

# Filter by residue type, first character in MOD_RSD
allowed_residues = "STY"
df = df[df["MOD_RSD"].str[0].isin(list(allowed_residues))]
df

Unnamed: 0,GENE,PROTEIN,ACC_ID,HU_CHR_LOC,MOD_RSD,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,MS_LIT,MS_CST,CST_CAT#,Ambiguous_Site
42799,C19orf48,C19orf48,A0A024R4G9,19q13.33,S20-p,23088226,human,13.09,DUF5535,ItGsRLLsMVPGPAR,1.0,,1.0,,0
42798,C19orf48,C19orf48,A0A024R4G9,19q13.33,S16-p,23088224,human,13.09,DUF5535,EIQAItGsRLLsMVP,,,1.0,,0
42797,C19orf48,C19orf48,A0A024R4G9,19q13.33,T14-p,23088228,human,13.09,DUF5535,VLEIQAItGsRLLsM,,,1.0,,0
249210,PPIAL4E,PPIAL4E,A0A075B759,1q21.1,Y79-p,50687016,human,18.20,Pro_isomerase,GTGDKSIyGEKFDDE,,,2.0,,1
249212,PPIAL4E,PPIAL4E,A0A075B759,1q21.1,T119-p,50729117,human,18.20,Pro_isomerase,FFICAAKtEWLDGkH,,,1.0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69495,COLEC10,COLEC10,Q9Y6Z7,8q24.12,T155-p,15418691,human,30.71,,VIAGIREtEEKFYYI,,,1.0,,0
69494,COLEC10,COLEC10,Q9Y6Z7,8q24.12,S141-p,25287884,human,30.71,,SIARLKTsMKFVKNV,,,1.0,,0
102808,ERVK-19,ERVK-19,Q9YNA8,19q11,S465-p,31094301,human,74.18,Gag_p24_C,LQDVAQKsIAIEKAR,,1.0,,,0
236227,PHF10,PHF10 iso5,S5FZ81,6q27,S376-p,25170312,human,42.61,,GYKVEEKsP______,,5.0,,,0


### Load structure data

#### Define paths to structural data.

In [5]:
uniprot_id = "P51786"

af_version = 3
filename_template = "AF-{uniprot_id}-F1-model_v{af_version}.cif.gz"
filename = filename_template.format(uniprot_id=uniprot_id, af_version=af_version)

structure_dir = Path.home() / "STRUCTURAL_MOTIFS/DATA/"
af_cif_dir = structure_dir / "AF_HUMAN_CIF" 
af_pdb_dir = structure_dir / "AF_HUMAN_PDB"

# Assert that the cif and pdb directories exist.
assert af_cif_dir.exists()
assert af_pdb_dir.exists()

#### Initialise structure loader object.

In [6]:
from phosphosite.structure import StructureLoader
cif_loader = StructureLoader(af_cif_dir, extension="cif.gz")
filepath = cif_loader.get_structure("Q8WUY3")
filepath

PosixPath('/home/cim/STRUCTURAL_MOTIFS/DATA/AF_HUMAN_CIF/AF-Q8WUY3-F1-model_v3.cif.gz')

In [7]:
pdb_loader = StructureLoader(af_pdb_dir, extension="pdb")
filepath = pdb_loader.get_structure("Q8WUY3")
filepath

PosixPath('/home/cim/STRUCTURAL_MOTIFS/DATA/AF_HUMAN_PDB/AF-Q8WUY3-F1-model_v3.pdb')

In [8]:
protein_ids = list(df["ACC_ID"].unique())
len(protein_ids)

17581

In [9]:
existing_ids = cif_loader.get_existing_ids(protein_ids)
len(existing_ids)

17338

In [10]:
for protein_id in existing_ids:
    # Check that equivalent PDB file exists.
    if not pdb_loader.protein_id_exists(protein_id):
        print(f"Missing PDB file for {protein_id}")

In [11]:
# only use existing 
df = df[df["ACC_ID"].isin(existing_ids)]

## Annotate AlphaFold structures 

In [12]:
# Trim down uniprot ids for now. 
uniprot_ids = existing_ids[0:100]

In [13]:
from phosphosite.structure.processing import process_af_data
structure_df = process_af_data(
    af_cif_dir, 
    out_format="AF-{uniprot_id}-F1-model_v3.cif.gz",
    #protein_ids=existing_ids, 
    protein_ids=uniprot_ids,
)
structure_df[0:2]

100%|██████████| 100/100 [00:24<00:00,  4.06it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A0A075B759,1,M,1,66.72,-19.205,-19.235,-17.818,-20.083,6.188,...,6.506,7.091,7.169,unstructured,unstructured,0,0,0,0,1
1,A0A075B759,1,V,2,83.11,-17.532,-18.934,-19.228,-19.068,6.568,...,2.733,2.005,4.191,unstructured,unstructured,0,0,0,0,1


In [14]:
structure_df

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A0A075B759,1,M,1,66.72,-19.205,-19.235,-17.818,-20.083,6.188,...,6.506,7.091,7.169,unstructured,unstructured,0,0,0,0,1
1,A0A075B759,1,V,2,83.11,-17.532,-18.934,-19.228,-19.068,6.568,...,2.733,2.005,4.191,unstructured,unstructured,0,0,0,0,1
2,A0A075B759,1,N,3,88.43,-15.452,-16.160,-16.404,-17.439,6.076,...,1.051,0.356,1.501,unstructured,unstructured,0,0,0,0,1
3,A0A075B759,1,S,4,93.22,-12.711,-13.566,-12.681,-14.405,6.888,...,-0.110,0.812,0.661,unstructured,unstructured,0,0,0,0,1
4,A0A075B759,1,V,5,97.80,-10.210,-11.417,-11.992,-12.403,8.060,...,-3.217,-4.645,-2.232,STRN,STRN,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65999,A4D126,100,Q,447,98.10,-25.830,-25.390,-23.946,-25.528,-8.684,...,15.235,15.547,13.819,unstructured,unstructured,0,0,0,0,1
66000,A4D126,100,L,448,96.78,-25.579,-26.684,-28.069,-26.439,-8.257,...,18.368,18.994,17.268,STRN,STRN,0,0,1,0,0
66001,A4D126,100,L,449,97.54,-24.192,-23.644,-22.400,-24.681,-6.427,...,20.492,19.880,19.472,STRN,STRN,0,0,1,0,0
66002,A4D126,100,I,450,96.53,-22.755,-23.977,-24.619,-23.561,-5.226,...,24.107,25.092,22.863,STRN,STRN,0,0,1,0,0


#### Graphein subgraph method.

In [15]:
pd.options.mode.chained_assignment = None  # default='warn'

In [22]:
from graphein.protein.config import ProteinGraphConfig
config = ProteinGraphConfig()

from graphein.protein.graphs import construct_graph
from graphein.protein.visualisation import plotly_protein_structure_graph



In [27]:
uniprot_id = "Q8WUY3"

In [29]:
pdb_path = pdb_loader.get_structure(uniprot_id)
g = construct_graph(config=config, path=pdb_path)

Output()

In [31]:
# Get sites for this uniprot_id 

site_df = df[df["ACC_ID"] == uniprot_id]
len(site_df)


49

In [32]:
sites = list(site_df["MOD_RSD"].unique())
len(sites)

49

In [25]:
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [66]:
from phosphosite.graphs import get_motif_subgraph
from phosphosite.utils import aa1to3

chain_id = "A"
g.graph[f"sequence_{chain_id}"]

'MEEFLQRAKSKLNRSKRLEKVHVVIGPKSCDLDSLISTFTYAYFLDKVSPPGVLCLPVLNIPRTEFNYFTETRFILEELNISESFHIFRDEINLHQLNDEGKLSITLVGSSVLASEDKTLESAVVKVINPVEQSDANVEFRESSSSLVLKEILQEAPELITEQLAHRLRGSILFKWMTMESEKISEKQEEILSILEEKFPNLPPREDIINVLQETQFSAQGLSIEQTMLKDLKELSDGEIKVAISTVSMNLENCLFHSNITSDLKAFTDKFGFDVLILFSSYLSEEQQPRRQIAVYSENMELCSQICCELEECQNPCLELEPFDCGCDEILVYQQEDPSVTCDQVVLVVKEVINRRCPEMVSNSRTSSTEAVAGSAPLSQGSSGIMELYGSDIEPQPSSVNFIENPPDLNDSNQAQVDANVDLVSPDSGLATIRSSRSSKESSVFLSDDSPVGEGAGPHHTLLPGLDSYSPIPEGAVAEEHAWSGEHGEHFDLFNFDPAPMASGQSQQSSHSADYSPADDFFPNSDLSEGQLPAGPEGLDGMGTNMSNYSSSSLLSGAGKDSLVEHDEEFVQRQDSPRDNSERNLSLTDFVGDESPSPERLKNTGKRIPPTPMNSLVESSPSTEEPASLYTEDMTQKATDTGHMGPPQTHARCSSWWGGLEIDSKNIADAWSSSEQESVFQSPESWKEHKPSSIDRRASDSVFQPKSLEFTKSGPWESEFGQPELGSNDIQDKNEESLPFQNLPMEKSPLPNTSPQGTNHLIEDFASLWHSGRSPTAMPEPWGNPTDDGEPAAVAPFPAWSAFGKEDHDEALKNTWNLHPTSSKTPSVRDPNEWAMAKSGFAFSSSELLDNSPSEINNEAAPEIWGKKNNDSRDHIFAPGNPSSDLDHTWTNSKPPKEDQNGLVDPKTRGKVYEKVDSWNLFEENMKKGGSDVLVPWEDSFLSYKCSDYSASNLGEDSVPSPLDTNYSTSDSYTSPTFAGDEKETEHKPFAKEEGFESK

In [123]:
s_g.nodes

NodeView(('A:GLY:80', 'A:TYR:79', 'A:ILE:78', 'A:SER:77'))

In [115]:
node_id

'A:TYR:79'

In [116]:
s_g[node_id]

AtlasView({'A:GLY:80': {'kind': {'peptide_bond'}, 'distance': 3.8600312174903455}, 'A:ILE:78': {'kind': {'peptide_bond'}, 'distance': 3.8591601417925117}})

In [109]:
type(s_g[node_id])

networkx.classes.coreviews.AtlasView

In [107]:
list(s_g[node_id])

['A:GLY:80', 'A:ILE:78']

In [128]:
site_coords = s_g.nodes[node_id]["coords"]
type(site_coords)



numpy.ndarray

In [129]:
from phosphosite.structure.processing import get_3d_dist

In [147]:
candidate_nodes = [
            s_g.nodes[i]
            for i in s_g.nodes
            if i not in [prev_node_id]
        ]

type(candidate_nodes[0])

dict

In [133]:
for i in s_g.nodes(data=True):
    s_g.nodes

In [134]:
site_coords = s_g.nodes[node_id]["coords"]

array([-3.653,  8.871, 15.889])

In [144]:
def get_euc_dist(
    arr1: np.ndarray, arr2: np.ndarray
): 
    """Get euclidean distance between two arrays."""
    return np.sqrt(np.sum((arr1 - arr2) ** 2))
    

In [148]:
[
    get_euc_dist(site_coords, node["coords"])
    for node in candidate_nodes
]


[3.8600312174903455, 0.0, 5.231354509111383]

In [152]:
node_dict = candidate_nodes[0]
node_dict

{'chain_id': 'A',
 'residue_name': 'GLY',
 'residue_number': 80,
 'atom_type': 'CA',
 'element_symbol': 'C',
 'coords': array([-1.3000e-02,  8.7510e+00,  1.7168e+01]),
 'b_factor': 95.24,
 'meiler': dim_1    0.00
 dim_2    0.00
 dim_3    0.00
 dim_4    0.00
 dim_5    6.07
 dim_6    0.13
 dim_7    0.15
 Name: GLY, dtype: float64}

['A', 'GLY', 80]

'A:GLY:80'

In [199]:
def get_node_id(
    site: str, 
    chain_id: str = "A",
) -> str: 
    mod_rsd, modification = site.split("-")
    aa = aa1to3[mod_rsd[0]]
    position = mod_rsd[1:]
    node_id = f"{chain_id}:{aa}:{position}"
    return node_id

def generate_node_id(
    node_dict: Dict[str, Union[str, int]],
    delimiter: str = ":",
) -> str: 
    return delimiter.join([str(node_dict[s]) for s in ["chain_id", "residue_name", "residue_number"]])

In [200]:
failed = []
sequence_dict = {}

radius = 6.0
chain_id = "A"
force = False

out_dir = Path(f"data/processed/motifs")
out_file_format = "{uniprot_id}-R{radius}Å.csv", 



skip_empty_motifs = False # if False, still write NaN to file.

for uniprot_id in uniprot_ids:

    # Check that .csv file exists for this uniprot_id.
    if not force:
        out_file = out_dir / out_file_format.format(
            uniprot_id=uniprot_id, 
            radius=radius,
        )
        if out_file.exists():
            print(f"Skipping {uniprot_id}. File exists.")
            continue

    # Construct graph. 
    pdb_path = pdb_loader.get_structure(uniprot_id)
    g = construct_graph(config=config, path=pdb_path)

    sequence_dict[uniprot_id] = g.graph[f"sequence_{chain_id}"]

    # Get sites for this uniprot_id
    site_df = df[df["ACC_ID"] == uniprot_id]
    sites = list(site_df["MOD_RSD"].unique())

    dict_list = []
    for site in sites:
        node_id = get_node_id(site)
        if node_id in g.nodes:
            s_g = get_motif_subgraph(
                g, 
                node_id, 
                radius = radius, # 5.0
            )
        else: 
            print(f"Node {node_id} not in graph ({uniprot_id}).")
            failed.append((uniprot_id, node_id))
            continue

        # Get nearest node to site.
        
        # Extract out adjacent residues (prev and next in sequence) 
        pos = int(node_id.split(":")[2])
        next_pos = pos + 1
        prev_pos = pos - 1
        seq = g.graph[f'sequence_{chain_id}']
        # sequence is 0 indexed 
        
        # Check if out of range. 
        if next_pos > len(seq):
            next_node_id = np.nan
        else:   
            next_node_id = f"{chain_id}:{aa1to3[seq[next_pos - 1]]}:{next_pos}"
        
        if prev_pos < 1:
            prev_node_id = np.nan
        else:
            prev_node_id = f"{chain_id}:{aa1to3[seq[prev_pos - 1]]}:{prev_pos}"

        site_coords = s_g.nodes[node_id]["coords"]

        candidate_nodes = [
            s_g.nodes[i]
            for i in s_g.nodes
            if i not in [prev_node_id, node_id, next_node_id] # Not the modified residue, or sequence adjacent residues.
        ]
        if len(candidate_nodes) == 0:
            print(f"No candidate nodes for {uniprot_id} {node_id}.")
            nearest_node = np.nan

            if skip_empty_motifs: 
                continue # don't create data for this motif.
        else:
            # Pick node with smallest euclidean distance to site.
            nearest_node = min(
                candidate_nodes,
                key=lambda node: get_euc_dist(site_coords, node["coords"]),
            )
            nearest_node = generate_node_id(nearest_node)
    
        dict_list.append(dict(
            uniprot_id=uniprot_id,
            prev=prev_node_id,
            site=node_id,
            next=next_node_id,
            nearest_node=nearest_node, # nearest that's prev or next. NaN if none. 
        ))
    
    
    motif_df = pd.DataFrame.from_dict(dict_list)
    motif_df.to_csv(
        out_dir / out_file_format.format(uniprot_id, radius),
        index=False,
        # Tab delimited for easy reading.
        sep="\t",
        # Include column names.
        header=True,
    )

Output()

Output()

Node A:SER:2804 not in graph (A0A087WUL8).
Node A:SER:3048 not in graph (A0A087WUL8).
Node A:SER:2316 not in graph (A0A087WUL8).
Node A:SER:2560 not in graph (A0A087WUL8).
Node A:SER:3536 not in graph (A0A087WUL8).
Node A:SER:2072 not in graph (A0A087WUL8).
Node A:SER:1828 not in graph (A0A087WUL8).
Node A:SER:1584 not in graph (A0A087WUL8).


No candidate nodes for A0A087WUL8 A:SER:1340.


No candidate nodes for A0A087WUL8 A:SER:1096.


No candidate nodes for A0A087WUL8 A:SER:852.


Output()

No candidate nodes for A0A087WUL8 A:SER:364.
Node A:SER:3292 not in graph (A0A087WUL8).


No candidate nodes for A0A096LP49 A:SER:566.


No candidate nodes for A0A096LP49 A:SER:519.


Output()

No candidate nodes for A0A096LP49 A:SER:478.


Output()

Output()

Output()

Output()

Output()

No candidate nodes for A0AUZ9 A:SER:105.


No candidate nodes for A0AUZ9 A:SER:205.


No candidate nodes for A0AUZ9 A:SER:526.


No candidate nodes for A0AUZ9 A:SER:714.


No candidate nodes for A0AUZ9 A:THR:797.


Output()

No candidate nodes for A0AV02 A:SER:665.


No candidate nodes for A0AV02 A:THR:485.


Output()

No candidate nodes for A0AV02 A:SER:488.


No candidate nodes for A0AV96 A:THR:519.


No candidate nodes for A0AV96 A:SER:508.


No candidate nodes for A0AV96 A:THR:554.


No candidate nodes for A0AV96 A:SER:396.


No candidate nodes for A0AV96 A:TYR:541.


No candidate nodes for A0AV96 A:SER:540.


No candidate nodes for A0AV96 A:TYR:522.


Output()

No candidate nodes for A0AVF1 A:SER:553.


Output()

Output()

Output()

No candidate nodes for A0AVK6 A:SER:417.


No candidate nodes for A0AVK6 A:SER:664.


No candidate nodes for A0AVK6 A:SER:767.


No candidate nodes for A0AVK6 A:THR:43.


No candidate nodes for A0AVK6 A:THR:46.


No candidate nodes for A0AVK6 A:SER:52.


No candidate nodes for A0AVK6 A:SER:416.


No candidate nodes for A0AVK6 A:THR:863.


No candidate nodes for A0AVK6 A:SER:862.


No candidate nodes for A0AVK6 A:THR:777.


No candidate nodes for A0AVK6 A:THR:812.


No candidate nodes for A0AVK6 A:SER:102.


Output()

Output()

No candidate nodes for A0FGR8 A:THR:701.


No candidate nodes for A0FGR8 A:SER:699.


No candidate nodes for A0FGR8 A:THR:705.


No candidate nodes for A0FGR8 A:SER:711.


No candidate nodes for A0FGR8 A:SER:736.


No candidate nodes for A0FGR8 A:SER:743.


No candidate nodes for A0FGR8 A:SER:739.


No candidate nodes for A0FGR8 A:SER:704.


No candidate nodes for A0FGR8 A:SER:737.


No candidate nodes for A0FGR8 A:SER:691.


No candidate nodes for A0FGR8 A:SER:685.


No candidate nodes for A0FGR8 A:SER:688.


No candidate nodes for A0FGR8 A:THR:921.


No candidate nodes for A0FGR8 A:SER:693.


No candidate nodes for A0FGR8 A:SER:738.


No candidate nodes for A0FGR8 A:THR:684.


Output()

No candidate nodes for A0FGR9 A:THR:647.


No candidate nodes for A0FGR9 A:SER:697.


Output()

No candidate nodes for A0FGR9 A:SER:850.


No candidate nodes for A0JLT2 A:SER:226.


No candidate nodes for A0JLT2 A:SER:234.


No candidate nodes for A0JLT2 A:SER:235.


No candidate nodes for A0JLT2 A:SER:196.


No candidate nodes for A0JLT2 A:SER:194.


Output()

No candidate nodes for A0JLT2 A:THR:192.


No candidate nodes for A0JNW5 A:THR:427.


No candidate nodes for A0JNW5 A:SER:1083.


No candidate nodes for A0JNW5 A:SER:1108.


No candidate nodes for A0JNW5 A:SER:423.


No candidate nodes for A0JNW5 A:THR:420.


No candidate nodes for A0JNW5 A:SER:891.


No candidate nodes for A0JNW5 A:THR:441.


No candidate nodes for A0JNW5 A:SER:754.


No candidate nodes for A0JNW5 A:SER:1081.


No candidate nodes for A0JNW5 A:SER:1078.


No candidate nodes for A0JNW5 A:THR:1070.


No candidate nodes for A0JNW5 A:SER:1066.


No candidate nodes for A0JNW5 A:SER:989.


No candidate nodes for A0JNW5 A:SER:987.


No candidate nodes for A0JNW5 A:TYR:985.


No candidate nodes for A0JNW5 A:SER:976.


No candidate nodes for A0JNW5 A:SER:968.


No candidate nodes for A0JNW5 A:SER:962.


No candidate nodes for A0JNW5 A:SER:414.


No candidate nodes for A0JNW5 A:THR:955.


No candidate nodes for A0JNW5 A:SER:953.


No candidate nodes for A0JNW5 A:TYR:966.


No candidate nodes for A0JNW5 A:THR:677.


No candidate nodes for A0JNW5 A:SER:1402.


Output()

No candidate nodes for A0JNW5 A:THR:1367.


No candidate nodes for A0JP26 A:SER:383.


Output()

No candidate nodes for A0MZ66 A:THR:537.


No candidate nodes for A0MZ66 A:SER:534.


No candidate nodes for A0MZ66 A:SER:532.


No candidate nodes for A0MZ66 A:SER:619.


No candidate nodes for A0MZ66 A:SER:384.


No candidate nodes for A0MZ66 A:SER:494.


No candidate nodes for A0MZ66 A:SER:493.


No candidate nodes for A0MZ66 A:SER:512.


No candidate nodes for A0MZ66 A:SER:515.


No candidate nodes for A0MZ66 A:THR:496.


No candidate nodes for A0MZ66 A:SER:473.


No candidate nodes for A0MZ66 A:SER:467.


No candidate nodes for A0MZ66 A:SER:386.


Output()

No candidate nodes for A0MZ66 A:SER:506.


No candidate nodes for A0PJK1 A:SER:234.


Output()

Output()

No candidate nodes for A0PJW8 A:THR:35.


No candidate nodes for A0PJW8 A:THR:86.


Output()

Output()

Output()

No candidate nodes for A0PJX4 A:SER:137.


No candidate nodes for A0PJX4 A:THR:153.


No candidate nodes for A0PJX4 A:SER:154.


No candidate nodes for A0PJX4 A:SER:159.


No candidate nodes for A0PJX4 A:SER:162.


No candidate nodes for A0PJX4 A:SER:169.


Output()

No candidate nodes for A0PJX4 A:TYR:216.


Output()

Output()

Output()

Output()

No candidate nodes for A0PK05 A:SER:209.


No candidate nodes for A1A4G5 A:TYR:88.


No candidate nodes for A1A4G5 A:SER:89.


No candidate nodes for A1A4G5 A:SER:100.


Output()

No candidate nodes for A1A4S6 A:SER:668.


No candidate nodes for A1A4S6 A:SER:643.


No candidate nodes for A1A4S6 A:SER:600.


No candidate nodes for A1A4S6 A:SER:591.


No candidate nodes for A1A4S6 A:SER:589.


Output()

No candidate nodes for A1A4V9 A:SER:225.


No candidate nodes for A1A4V9 A:THR:227.


Output()

No candidate nodes for A1A4V9 A:SER:38.


Output()

No candidate nodes for A1A519 A:THR:171.


No candidate nodes for A1A519 A:SER:200.


No candidate nodes for A1A519 A:SER:174.


No candidate nodes for A1A519 A:SER:183.


Output()

No candidate nodes for A1A519 A:THR:198.


Output()

No candidate nodes for A1A5C7 A:THR:673.


Output()

No candidate nodes for A1A5D9 A:THR:348.


No candidate nodes for A1A5D9 A:SER:330.


No candidate nodes for A1A5D9 A:SER:36.


Output()

No candidate nodes for A1A5D9 A:SER:349.


No candidate nodes for A1E959 A:SER:256.


Output()

No candidate nodes for A1IGU5 A:SER:600.


No candidate nodes for A1IGU5 A:SER:672.


Output()

No candidate nodes for A1IGU5 A:THR:590.


No candidate nodes for A1KXE4 A:THR:57.


Output()

Output()

No candidate nodes for A1L020 A:THR:501.


No candidate nodes for A1L020 A:SER:453.


No candidate nodes for A1L020 A:SER:432.


No candidate nodes for A1L020 A:SER:308.


No candidate nodes for A1L020 A:THR:431.


No candidate nodes for A1L020 A:SER:428.


No candidate nodes for A1L020 A:SER:338.


Output()

Output()

Output()

No candidate nodes for A1L168 A:SER:14.


No candidate nodes for A1L170 A:SER:249.


No candidate nodes for A1L170 A:SER:47.


No candidate nodes for A1L170 A:SER:35.


No candidate nodes for A1L170 A:SER:30.


No candidate nodes for A1L170 A:SER:23.


No candidate nodes for A1L170 A:SER:18.


No candidate nodes for A1L170 A:SER:222.


No candidate nodes for A1L170 A:SER:207.


No candidate nodes for A1L170 A:SER:225.


No candidate nodes for A1L170 A:SER:240.


No candidate nodes for A1L170 A:SER:223.


Output()

No candidate nodes for A1L170 A:SER:244.


Output()

Output()

No candidate nodes for A1L390 A:SER:61.


No candidate nodes for A1L390 A:SER:42.


No candidate nodes for A1L390 A:THR:782.


No candidate nodes for A1L390 A:SER:576.


No candidate nodes for A1L390 A:SER:827.


No candidate nodes for A1L390 A:SER:865.


No candidate nodes for A1L390 A:SER:656.


No candidate nodes for A1L390 A:SER:962.


No candidate nodes for A1L390 A:THR:964.


No candidate nodes for A1L390 A:SER:1000.


No candidate nodes for A1L390 A:SER:1011.


No candidate nodes for A1L390 A:SER:1020.


No candidate nodes for A1L390 A:SER:1023.


No candidate nodes for A1L390 A:SER:1028.


No candidate nodes for A1L390 A:SER:1034.


No candidate nodes for A1L390 A:SER:1037.


No candidate nodes for A1L390 A:SER:870.


No candidate nodes for A1L390 A:SER:65.


No candidate nodes for A1L390 A:SER:1040.


No candidate nodes for A1L390 A:SER:537.


No candidate nodes for A1L390 A:SER:533.


No candidate nodes for A1L390 A:SER:577.


No candidate nodes for A1L390 A:SER:488.


No candidate nodes for A1L390 A:SER:485.


No candidate nodes for A1L390 A:SER:475.


No candidate nodes for A1L390 A:SER:618.


No candidate nodes for A1L390 A:SER:631.


No candidate nodes for A1L390 A:SER:76.


No candidate nodes for A1L390 A:SER:636.


No candidate nodes for A1L390 A:SER:639.


No candidate nodes for A1L390 A:SER:643.


No candidate nodes for A1L390 A:SER:647.


No candidate nodes for A1L390 A:SER:448.


No candidate nodes for A1L390 A:SER:433.


No candidate nodes for A1L390 A:THR:1044.


No candidate nodes for A1L390 A:SER:783.


No candidate nodes for A1L390 A:SER:1102.


No candidate nodes for A1L390 A:SER:1100.


No candidate nodes for A1L390 A:SER:1092.


No candidate nodes for A1L390 A:SER:1081.


No candidate nodes for A1L390 A:SER:1169.


No candidate nodes for A1L390 A:SER:1115.


No candidate nodes for A1L390 A:SER:1046.


Output()

No candidate nodes for A1L390 A:SER:1079.


Output()

No candidate nodes for A1L443 A:THR:567.


Output()

No candidate nodes for A1L443 A:SER:524.


Output()

No candidate nodes for A1L4H1 A:SER:930.


No candidate nodes for A1L4H1 A:THR:928.


No candidate nodes for A1L4H1 A:SER:919.


No candidate nodes for A1L4H1 A:SER:726.


No candidate nodes for A1L4H1 A:THR:718.


No candidate nodes for A1L4H1 A:SER:917.


Output()

No candidate nodes for A1L4H1 A:SER:174.


No candidate nodes for A1L4K1 A:SER:5.


Output()

No candidate nodes for A1X283 A:SER:142.


No candidate nodes for A1X283 A:SER:679.


No candidate nodes for A1X283 A:SER:293.


No candidate nodes for A1X283 A:SER:319.


No candidate nodes for A1X283 A:SER:512.


No candidate nodes for A1X283 A:TYR:508.


No candidate nodes for A1X283 A:SER:499.


No candidate nodes for A1X283 A:SER:491.


No candidate nodes for A1X283 A:SER:339.


No candidate nodes for A1X283 A:THR:353.


No candidate nodes for A1X283 A:SER:291.


No candidate nodes for A1X283 A:THR:147.


No candidate nodes for A1X283 A:THR:829.


No candidate nodes for A1X283 A:SER:772.


No candidate nodes for A1X283 A:THR:630.


No candidate nodes for A1X283 A:SER:811.


No candidate nodes for A1X283 A:SER:649.


No candidate nodes for A1X283 A:THR:735.


No candidate nodes for A1X283 A:SER:720.


No candidate nodes for A1X283 A:SER:528.


No candidate nodes for A1X283 A:SER:699.


No candidate nodes for A1X283 A:SER:823.


Output()

No candidate nodes for A1X283 A:THR:818.


Output()

No candidate nodes for A1YPR0 A:SER:201.


No candidate nodes for A1YPR0 A:SER:215.


Output()

Output()

No candidate nodes for A1Z1Q3 A:SER:258.
Node A:SER:426 not in graph (A1Z1Q3).


No candidate nodes for A2A288 A:TYR:468.


No candidate nodes for A2A288 A:SER:345.


No candidate nodes for A2A288 A:THR:462.


Output()

No candidate nodes for A2A288 A:SER:428.


No candidate nodes for A2A2Y4 A:THR:14.


No candidate nodes for A2A2Y4 A:THR:480.


Output()

No candidate nodes for A2A2Y4 A:SER:416.


No candidate nodes for A2A2Z9 A:SER:655.


Output()

No candidate nodes for A2A2Z9 A:THR:654.


Output()

No candidate nodes for A2A3K4 A:SER:537.


No candidate nodes for A2A3K4 A:SER:534.


No candidate nodes for A2A3K4 A:SER:505.


No candidate nodes for A2A3K4 A:SER:472.


No candidate nodes for A2A3K4 A:THR:544.


No candidate nodes for A2A3K4 A:SER:457.


No candidate nodes for A2A3K4 A:SER:553.


No candidate nodes for A2A3K4 A:THR:34.


No candidate nodes for A2A3K4 A:THR:410.


No candidate nodes for A2A3K4 A:THR:413.


No candidate nodes for A2A3K4 A:SER:438.


No candidate nodes for A2A3K4 A:SER:547.


Output()

No candidate nodes for A2A3L6 A:THR:436.


No candidate nodes for A2A3L6 A:SER:3.


Output()

No candidate nodes for A2A3L6 A:SER:2.


No candidate nodes for A2A3N6 A:SER:3.


Output()

No candidate nodes for A2AJT9 A:SER:78.


No candidate nodes for A2AJT9 A:SER:312.


No candidate nodes for A2AJT9 A:SER:80.


No candidate nodes for A2AJT9 A:SER:17.


No candidate nodes for A2AJT9 A:SER:100.


No candidate nodes for A2AJT9 A:SER:139.


No candidate nodes for A2AJT9 A:SER:162.


No candidate nodes for A2AJT9 A:SER:192.


No candidate nodes for A2AJT9 A:TYR:74.


Output()

No candidate nodes for A2AJT9 A:SER:31.


No candidate nodes for A2CJ06 A:TYR:414.


No candidate nodes for A2CJ06 A:SER:500.


Output()

Output()

No candidate nodes for A2PYH4 A:THR:1124.


No candidate nodes for A2PYH4 A:THR:1132.


No candidate nodes for A2PYH4 A:THR:1133.


No candidate nodes for A2PYH4 A:SER:1135.


No candidate nodes for A2PYH4 A:SER:1123.


No candidate nodes for A2PYH4 A:THR:1112.


No candidate nodes for A2PYH4 A:SER:1307.


Output()

Output()

Output()

No candidate nodes for A2RRH5 A:TYR:757.


No candidate nodes for A2RRP1 A:SER:2052.


No candidate nodes for A2RRP1 A:SER:11.


No candidate nodes for A2RRP1 A:SER:1388.


No candidate nodes for A2RRP1 A:SER:1259.


Output()

No candidate nodes for A2RTX5 A:SER:453.


Output()

Output()

No candidate nodes for A2RU30 A:SER:356.


No candidate nodes for A2RU30 A:SER:138.


No candidate nodes for A2RU30 A:TYR:358.


No candidate nodes for A2RU30 A:THR:294.


No candidate nodes for A2RU30 A:THR:355.


No candidate nodes for A2RU30 A:SER:454.


No candidate nodes for A2RU30 A:SER:478.


Output()

Output()

Output()

No candidate nodes for A2RU54 A:SER:48.


No candidate nodes for A2RU54 A:SER:51.


Output()

No candidate nodes for A2RU54 A:SER:128.


No candidate nodes for A2RU67 A:THR:26.


No candidate nodes for A2RU67 A:SER:62.


No candidate nodes for A2RU67 A:SER:33.


No candidate nodes for A2RU67 A:SER:30.


No candidate nodes for A2RU67 A:THR:79.


No candidate nodes for A2RU67 A:TYR:22.


No candidate nodes for A2RU67 A:SER:16.


No candidate nodes for A2RU67 A:TYR:82.


No candidate nodes for A2RU67 A:SER:52.


Output()

No candidate nodes for A2RU67 A:SER:75.


No candidate nodes for A2RUB1 A:THR:385.


No candidate nodes for A2RUB1 A:SER:508.


No candidate nodes for A2RUB1 A:THR:393.


No candidate nodes for A2RUB1 A:SER:170.


No candidate nodes for A2RUB1 A:THR:169.


Output()

No candidate nodes for A2RUB1 A:SER:502.


No candidate nodes for A2RUB6 A:THR:121.


No candidate nodes for A2RUB6 A:THR:115.


No candidate nodes for A2RUB6 A:SER:64.


No candidate nodes for A2RUB6 A:SER:594.


No candidate nodes for A2RUB6 A:THR:60.


No candidate nodes for A2RUB6 A:THR:57.


No candidate nodes for A2RUB6 A:THR:50.


No candidate nodes for A2RUB6 A:THR:48.


No candidate nodes for A2RUB6 A:THR:599.


No candidate nodes for A2RUB6 A:SER:669.


No candidate nodes for A2RUB6 A:SER:603.


No candidate nodes for A2RUB6 A:SER:796.


No candidate nodes for A2RUB6 A:SER:794.


No candidate nodes for A2RUB6 A:THR:691.


No candidate nodes for A2RUB6 A:SER:764.


No candidate nodes for A2RUB6 A:SER:797.


No candidate nodes for A2RUB6 A:THR:812.


Output()

No candidate nodes for A2RUB6 A:SER:763.


Output()

No candidate nodes for A2RUG3 A:TYR:113.


Output()

No candidate nodes for A2RUQ5 A:SER:5.


Output()

No candidate nodes for A2RUQ5 A:THR:8.


No candidate nodes for A2RUR9 A:SER:176.


No candidate nodes for A2RUR9 A:TYR:466.


Output()

No candidate nodes for A2RUR9 A:THR:105.


No candidate nodes for A2RUS2 A:SER:490.


No candidate nodes for A2RUS2 A:THR:494.


No candidate nodes for A2RUS2 A:SER:512.


No candidate nodes for A2RUS2 A:SER:472.


Output()

Output()

No candidate nodes for A2VCK2 A:SER:102.


No candidate nodes for A2VCK2 A:THR:247.


No candidate nodes for A2VCK2 A:SER:260.


No candidate nodes for A2VCK2 A:TYR:263.


Output()

Output()

No candidate nodes for A2VDJ0 A:TYR:983.


No candidate nodes for A2VDJ0 A:SER:1036.


No candidate nodes for A2VDJ0 A:THR:1059.


No candidate nodes for A2VDJ0 A:SER:981.


No candidate nodes for A2VDJ0 A:SER:1007.


No candidate nodes for A2VDJ0 A:THR:989.


No candidate nodes for A2VDJ0 A:SER:990.


No candidate nodes for A2VDJ0 A:THR:1096.


No candidate nodes for A2VDJ0 A:TYR:12.


No candidate nodes for A2VDJ0 A:THR:368.


No candidate nodes for A2VDJ0 A:THR:369.


No candidate nodes for A2VDJ0 A:THR:991.


No candidate nodes for A2VDJ0 A:SER:1122.


No candidate nodes for A2VDJ0 A:TYR:1289.


No candidate nodes for A2VDJ0 A:SER:1212.


No candidate nodes for A2VDJ0 A:SER:1600.


No candidate nodes for A2VDJ0 A:THR:1328.


No candidate nodes for A2VDJ0 A:SER:1327.


No candidate nodes for A2VDJ0 A:TYR:1125.


No candidate nodes for A2VDJ0 A:SER:1321.


No candidate nodes for A2VDJ0 A:SER:1310.


No candidate nodes for A2VDJ0 A:SER:1306.


No candidate nodes for A2VDJ0 A:SER:1302.


No candidate nodes for A2VDJ0 A:THR:955.


No candidate nodes for A2VDJ0 A:TYR:1288.


Output()

No candidate nodes for A2VDJ0 A:SER:1242.


Node A:THR:3842 not in graph (A2VEC9).
Node A:SER:3782 not in graph (A2VEC9).
Node A:THR:3766 not in graph (A2VEC9).
Node A:THR:3076 not in graph (A2VEC9).
Node A:SER:3068 not in graph (A2VEC9).
Node A:SER:2435 not in graph (A2VEC9).
Node A:THR:3785 not in graph (A2VEC9).
Node A:SER:2326 not in graph (A2VEC9).
Node A:SER:2420 not in graph (A2VEC9).
Node A:TYR:533 not in graph (A2VEC9).


Output()

No candidate nodes for A2VEC9 A:SER:68.
Node A:THR:1075 not in graph (A2VEC9).


No candidate nodes for A3KMH1 A:TYR:1402.


No candidate nodes for A3KMH1 A:SER:3.


Output()

No candidate nodes for A3KN83 A:THR:819.


No candidate nodes for A3KN83 A:SER:693.


No candidate nodes for A3KN83 A:SER:689.


No candidate nodes for A3KN83 A:SER:692.


No candidate nodes for A3KN83 A:SER:1326.


No candidate nodes for A3KN83 A:THR:1328.


No candidate nodes for A3KN83 A:SER:817.


No candidate nodes for A3KN83 A:SER:212.


No candidate nodes for A3KN83 A:SER:148.


No candidate nodes for A3KN83 A:SER:794.


No candidate nodes for A3KN83 A:SER:815.


No candidate nodes for A3KN83 A:SER:811.


No candidate nodes for A3KN83 A:SER:697.


No candidate nodes for A3KN83 A:SER:214.


No candidate nodes for A3KN83 A:SER:823.


No candidate nodes for A3KN83 A:SER:828.


No candidate nodes for A3KN83 A:THR:829.


No candidate nodes for A3KN83 A:THR:713.


No candidate nodes for A3KN83 A:SER:1062.


Output()

No candidate nodes for A3KN83 A:SER:818.


Output()

Output()

Output()

No candidate nodes for A4D0V7 A:SER:318.


Output()

No candidate nodes for A4D0V7 A:THR:42.


In [198]:
len(seq)

921

In [173]:
failed

[('A0A087WUL8', 'A:SER:2804'),
 ('A0A087WUL8', 'A:SER:3048'),
 ('A0A087WUL8', 'A:SER:2316'),
 ('A0A087WUL8', 'A:SER:2560'),
 ('A0A087WUL8', 'A:SER:3536'),
 ('A0A087WUL8', 'A:SER:2072'),
 ('A0A087WUL8', 'A:SER:1828'),
 ('A0A087WUL8', 'A:SER:1584'),
 ('A0A087WUL8', 'A:SER:3292')]

In [176]:
for f in Path("data/processed/motifs").glob("*.csv"):
    f

In [190]:
radius

5.0

In [193]:
# Load dataframes 

# For each file that ends with .csv, load it into a dataframe.
motif_dfs = [
    pd.read_csv(
        f, 
        header=0,
        delimiter="\t",
    )
    for f in Path("data/processed/motifs").glob(f"*-R{radius}Å.csv")
]
motif_df = pd.concat(motif_dfs)
motif_df

Unnamed: 0,uniprot_id,prev,site,next,nearest_node
0,A0A096LP55,A:LEU:10,A:THR:11,A:GLU:12,
0,A0A096LP49,A:PRO:565,A:SER:566,A:PRO:567,
1,A0A096LP49,A:GLY:518,A:SER:519,A:PRO:520,
2,A0A096LP49,A:GLU:477,A:SER:478,A:PRO:479,
0,A0A0B4J2A2,A:LYS:118,A:THR:119,A:GLU:120,
1,A0A0B4J2A2,A:GLY:109,A:SER:110,A:GLN:111,
2,A0A0B4J2A2,A:LEU:98,A:SER:99,A:MET:100,A:ALA:128
3,A0A0B4J2A2,A:GLY:94,A:SER:95,A:GLY:96,A:THR:116
4,A0A0B4J2A2,A:ILE:78,A:TYR:79,A:GLY:80,
5,A0A0B4J2A2,A:LEU:39,A:SER:40,A:THR:41,A:GLY:162


In [188]:
nearest_node = motif_df[~motif_df["nearest_node"].isna()]
nearest_node

Unnamed: 0,uniprot_id,prev,site,next,nearest_node
2,A0A0B4J2A2,A:LEU:98,A:SER:99,A:MET:100,A:ALA:128
3,A0A0B4J2A2,A:GLY:94,A:SER:95,A:GLY:96,A:THR:116
5,A0A0B4J2A2,A:LEU:39,A:SER:40,A:THR:41,A:GLY:162
3,A0A075B759,A:LEU:39,A:SER:40,A:THR:41,A:GLY:162


In [187]:
nearest_node["uniprot_id"].unique()

array(['A0A0B4J2A2', 'A0A075B759'], dtype=object)