In [26]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np
import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional
from pathlib import Path

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# Code below that assigns columns onto filtered frames will therefore not warn.
pd.options.mode.chained_assignment = None  # default='warn'

from phosphosite.utils import aa1to3, aa3to1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
"""Initialise phosphosite dataset."""
from phosphosite.dataset import phosphorylation # Filtered out isoforms

# Residue letters to keep; matched against the first character of MOD_RSD.
allowed_residues = "STY"

# Human entries only, ordered by accession id.
is_human = phosphorylation["ORGANISM"] == "human"
df = phosphorylation[is_human].sort_values("ACC_ID")

# Drop every site whose modified residue is not one of the allowed letters.
has_allowed_residue = df["MOD_RSD"].str[0].isin(list(allowed_residues))
df = df[has_allowed_residue]
phosphosite_df = df

In [5]:

from phosphosite import DATA_DIR

annotation_dir = DATA_DIR / "structure_annotations"

# One-off conversion of the tab-separated structure table into an HDF5 store.
# Disabled by default; set the flag to True to run the conversion again.
REBUILD_STRUCTURE_HDF = False
if REBUILD_STRUCTURE_HDF:
    N = 10000
    filepath = annotation_dir / f"structure_df_{N}.csv"
    structure_df = pd.read_csv(filepath, sep="\t")
    structure_df.to_hdf(annotation_dir / "structure_df.h5", key="structure_df")

In [8]:
# Read the structure annotations back from the HDF5 store.
structure_h5_path = annotation_dir / "structure_df.h5"
structure_df = pd.read_hdf(structure_h5_path, key="structure_df")

In [156]:
# Keep only the rows of the first 100 distinct protein ids
# (ids are taken in order of first appearance in structure_df).
unique_ids = structure_df["protein_id"].unique()
all_protein_ids = list(unique_ids)
protein_ids = all_protein_ids[:100]
in_subset = structure_df["protein_id"].isin(protein_ids)
subset_df = structure_df[in_subset]

In [157]:
# Number of distinct protein ids present in structure_df.
len(all_protein_ids)

17336

In [25]:
def get_euc_dist(arr1: np.ndarray, arr2: np.ndarray):
    """Return the Euclidean (L2) distance between two coordinate arrays.

    The element-wise differences are squared, summed over every element and
    the square root of that total is returned.
    """
    delta = arr1 - arr2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)
    

def get_node_id(
    site: str, 
    chain_id: str = "A",
) -> str: 
    """Build a ``chain:RESIDUE:position`` node id from a site string.

    Parameters
    ----------
    site : str
        One-letter residue code followed by its position, optionally followed
        by ``-<modification>``; e.g. ``"S15-p"`` or ``"S15"``. Everything after
        the first ``-`` is ignored.
    chain_id : str, optional
        Chain identifier placed in front of the node id, by default "A"

    Returns
    -------
    str
        Node id of the form ``f"{chain_id}:{three_letter_residue}:{position}"``.

    Raises
    ------
    ValueError
        If there is no residue/position text before the first ``-``.

    Notes
    -----
    The one-letter code is translated through ``aa1to3``; a failed lookup
    (presumably a ``KeyError`` — confirm against ``phosphosite.utils``)
    propagates to the caller.
    """
    # Only the part before the first "-" is needed; partition() also accepts
    # sites that carry no modification suffix at all.
    mod_rsd = site.partition("-")[0]
    if not mod_rsd:
        raise ValueError(f"Invalid site: {site!r}")
    aa = aa1to3[mod_rsd[0]]
    position = mod_rsd[1:]
    return f"{chain_id}:{aa}:{position}"

def generate_node_id(
    node_dict: Dict[str, Union[str, int]],
    delimiter: str = ":",
) -> str: 
    """Join a node's chain id, residue name and residue number into one id string.

    The three values are read from ``node_dict`` under the keys ``"chain_id"``,
    ``"residue_name"`` and ``"residue_number"`` (in that order), converted to
    ``str`` and joined with ``delimiter``.
    """
    fields = ("chain_id", "residue_name", "residue_number")
    parts = [str(node_dict[field]) for field in fields]
    return delimiter.join(parts)

In [185]:
from tqdm import tqdm

"""Process motifs for a given set of protein ids."""
def _flanking_residues(protein_df, pos, offsets, protein_id, verbose):
    """Map each sequence offset to the label (e.g. "G12") of the residue at ``pos + offset``, or NaN if absent."""
    labels = {}
    for offset in offsets:
        # Column names follow the "-2", "-1", "+1", "+2" convention.
        key = f"+{offset}" if offset > 0 else f"{offset}"
        matches = protein_df[protein_df["position"] == pos + offset]
        if matches.empty:
            if verbose:
                tqdm.write(f"[{protein_id}] Could not find residue at position {pos + offset}")
            labels[key] = np.nan
        else:
            neighbour = matches.iloc[0]
            labels[key] = neighbour["AA"] + str(neighbour["position"])
    return labels


def _nearest_neighbours(protein_df, coords, site_coords, pos, radius, adjacent_range, next_nearest):
    """Describe the ``next_nearest`` closest residues within ``radius`` of the site, skipping sequence-adjacent ones."""
    # One vectorised distance computation for the whole protein.
    euc_dist = np.sqrt(((coords - site_coords) ** 2).sum(axis=1))
    excluded = list(range(pos - adjacent_range, pos + adjacent_range + 1))
    keep = (euc_dist <= radius) & ~protein_df["position"].isin(excluded).to_numpy()
    # assign() builds a new frame, so the per-protein slice is never written to.
    # A stable sort keeps sequence order between residues at equal distance.
    candidates = (
        protein_df[keep]
        .assign(euc_dist=euc_dist[keep], seq_dist=lambda d: d["position"] - pos)
        .sort_values("euc_dist", kind="mergesort")
    )

    nearest = {}
    for rank in range(1, next_nearest + 1):
        if rank <= len(candidates):
            hit = candidates.iloc[rank - 1]
            nearest[f"{rank}_res"] = hit["AA"] + str(hit["position"])
            nearest[f"{rank}_euc_dist"] = hit["euc_dist"]
            nearest[f"{rank}_seq_dist"] = hit["seq_dist"]
        else:
            # Fewer than `rank` residues lie within the radius.
            nearest[f"{rank}_res"] = np.nan
            nearest[f"{rank}_euc_dist"] = np.nan
            nearest[f"{rank}_seq_dist"] = np.nan
    return nearest


def process_sites(
    phosphosite_df: pd.DataFrame, 
    structure_df: pd.DataFrame,
    protein_ids: List[str],
    ref_atom: str = "ca", # ["c", "ca", "cb", "n"]
    radius: float = 6.0,
    adjacent_range: int = 1,
    next_nearest: int = 2,
    to_process: str = "all", # "p" for just phosphorylated.
    residue_type: str = "S", # just serine for now.
    verbose: bool = False,
    filepath: Optional[Union[str, Path]] = None,
) -> pd.DataFrame:
    """Describe the sequence and spatial neighbourhood of sites in the given proteins.

    One output row is produced per considered site. Each row holds the labels
    of the sequence-adjacent residues, and the ``next_nearest`` spatially
    closest residues that lie within ``radius`` and are not sequence-adjacent.

    Parameters
    ----------
    phosphosite_df : pd.DataFrame
        Phosphosite df. The columns ``ACC_ID`` and ``MOD_RSD`` are read;
        ``MOD_RSD`` values are parsed as ``<letter><position>[-<suffix>]``.
    structure_df : pd.DataFrame
        Structure df. The columns ``protein_id``, ``AA``, ``position``,
        ``quality`` and ``{x,y,z}_coord_<ref_atom>`` are read. It is not
        modified.
    protein_ids : List[str]
        List of protein ids to process. A single id given as a ``str`` is
        accepted as well.
    ref_atom : str, optional
        Reference atom to use for calculating distances, by default "ca".
        One of "c", "ca", "cb", "n".
    radius : float, optional
        Radius to filter by (inclusive), by default 6.0
    adjacent_range : int, optional
        Range of adjacent residues to exclude from "next-nearest spatial
        neighbour" calculation, by default 1
        For example, if adjacent_range = 1, then residues -1 and +1 will be
        excluded from the candidates (relative to the site of interest's position).
        The same range determines which "-i" / "+i" columns are reported.
    next_nearest : int, optional
        Number of next-nearest spatial neighbours to calculate, by default 2. 
        For example, if next_nearest = 2, then the 2 closest residues to the site
        of interest will be calculated.  If there is no residue within the radius, 
        then the value will be NaN.
    to_process : str, optional
        "all" considers every residue of the protein whose ``AA`` is in
        ``residue_type``; "p" considers only the sites listed in
        ``phosphosite_df``. By default "all".
    residue_type : str, optional
        String of one-letter residue codes to consider, by default "S"
    verbose : bool, optional
        If True, report residues that could not be found, by default False
    filepath : Optional[Union[str, Path]], optional
        If given, the rows collected so far are written there as a
        tab-separated file after every 100th protein (starting with the
        first), and the final table once at the end. An existing file is
        overwritten. By default None (nothing is written).

    Returns
    -------
    pd.DataFrame
        Processed sites df with the columns ``phosphosite``, ``site_qual``,
        ``protein_id``, ``-i`` ..., ``site``, ``+i`` ..., and
        ``{k}_res`` / ``{k}_euc_dist`` / ``{k}_seq_dist`` for each rank ``k``.
        Empty if no site was processed.

    Raises
    ------
    ValueError
        If ``ref_atom`` or ``to_process`` is not one of the accepted values.

    """
    if ref_atom not in ["c", "ca", "cb", "n"]:
        raise ValueError(f"Invalid reference atom: {ref_atom}")
    # Fail before any work is done rather than inside the loop.
    if to_process not in ("p", "all"):
        raise ValueError(f"Invalid value for to_process: {to_process}")

    coord_cols = [f"{axis}_coord_{ref_atom}" for axis in ("x", "y", "z")]

    if isinstance(protein_ids, str):
        protein_ids = [protein_ids]

    checkpoint_every = 100  # proteins between intermediate saves

    records = []
    pbar = tqdm(protein_ids)
    for counter, protein_id in enumerate(pbar):
        pbar.set_description(protein_id)

        protein_df = structure_df[structure_df["protein_id"] == protein_id]  # structure rows of this protein
        site_df = phosphosite_df[phosphosite_df["ACC_ID"] == protein_id]     # known sites of this protein

        # All known phosphosites of this protein as (residue letter, position).
        known_sites = []
        for mod_rsd in site_df["MOD_RSD"]:
            residue = mod_rsd.split("-")[0]
            known_sites.append((residue[0], int(residue[1:])))
        known_lookup = set(known_sites)

        if to_process == "p":
            # Only the known phosphosites of an allowed residue type.
            to_consider = [site for site in known_sites if site[0] in residue_type]
        else:
            # Every residue in the structure that is of an allowed residue type.
            to_consider = [
                (res, pos)
                for res in residue_type
                for pos in protein_df[protein_df["AA"] == res]["position"].unique()
            ]

        # Reference-atom coordinates of the whole protein, extracted once.
        coords = protein_df[coord_cols].to_numpy(dtype=float)

        for res, pos in to_consider:
            centre = protein_df[(protein_df["AA"] == res) & (protein_df["position"] == pos)]
            if centre.empty:
                if verbose:
                    tqdm.write(f"[{protein_id}] Could not find centre residue {res} at position {pos}")
                continue
            row = centre.iloc[0]  # first matching row is used
            site_coords = np.array([row[col] for col in coord_cols], dtype=float)

            records.append({
                "phosphosite": (res, pos) in known_lookup,
                "site_qual": row["quality"],  # quality value stored for the centre residue
                "protein_id": protein_id,
                **_flanking_residues(protein_df, pos, range(-adjacent_range, 0), protein_id, verbose),
                "site": f"{res}{pos}",
                **_flanking_residues(protein_df, pos, range(1, adjacent_range + 1), protein_id, verbose),
                **_nearest_neighbours(
                    protein_df, coords, site_coords, pos, radius, adjacent_range, next_nearest
                ),
            })

        # Intermediate save so that a long run can be inspected or resumed.
        if filepath is not None and counter % checkpoint_every == 0:
            tqdm.write(f"Saving dataframe to {filepath}")
            pd.DataFrame(records).to_csv(filepath, sep="\t", index=False)

    final_df = pd.DataFrame(records)
    if filepath is not None:
        final_df.to_csv(filepath, sep="\t", index=False)
    return final_df


In [183]:
# Run configuration; these values are passed to process_sites and encoded in the output file name.
to_process = "all"
residue_type = "STY"
radius = 6.0
residue_adjacent = 2
next_nearest = 3
ref_atom = "ca"

# `:g` prints 6.0 as "6" but keeps fractional radii apart (6.5 -> "6.5").
# int(radius) would truncate, so two different radii could share one file name.
out_filename = f"{to_process}-{residue_type}-{radius:g}A{residue_adjacent}R{next_nearest}N-{ref_atom}.csv"

outfile = DATA_DIR / "motif" / out_filename
print(f"Running {outfile} ...")


Running /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv ...


In [191]:
# Load the results of a previous run so that already-processed proteins can be skipped.
try:
    prev_processed_df = pd.read_csv(outfile, sep="\t")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run, or a checkpoint file without any rows: nothing processed yet.
    prev_processed_df = pd.DataFrame(columns=["protein_id"])
processed_ids = list(prev_processed_df["protein_id"].unique())
len(processed_ids)

100

In [194]:
# Proteins that still need processing, kept in their original order.
# A set makes each membership test O(1) instead of scanning the list for every id.
processed_lookup = set(processed_ids)
unprocessed_ids = [x for x in all_protein_ids if x not in processed_lookup]
len(unprocessed_ids), len(all_protein_ids)

(17236, 17336)

In [195]:
# NOTE: process_sites() overwrites `outfile` at each checkpoint and at the end.
# That is the file `prev_processed_df` was loaded from, so while this cell runs
# the file on disk only contains rows of the current batch.
processed_df = process_sites(
    phosphosite_df, 
    structure_df, 
    protein_ids=unprocessed_ids, 
    adjacent_range=residue_adjacent,
    next_nearest=next_nearest,
    radius=radius,
    to_process=to_process, # p
    residue_type=residue_type,
    ref_atom=ref_atom,
    verbose=False,
    filepath=outfile,
)

# Write the previously processed rows back together with the new ones, so the
# file covers every processed protein and a later resume does not redo them.
combined_df = pd.concat([prev_processed_df, processed_df], ignore_index=True)
combined_df.to_csv(outfile, sep="\t", index=False)

A4D1B5: : 1it [00:01,  1.08s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


A6NFE2: : 101it [05:34,  2.13s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


A6NKN8: : 200it [09:02,  2.62s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


A8MTI9: : 300it [15:01,  1.96s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


B7ZAQ6: : 400it [19:26,  2.97s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O00193: : 500it [24:05,  2.76s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O00548: : 600it [28:47,  4.05s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O14672: : 700it [34:55,  1.97s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O15013: : 800it [39:29,  3.34s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O15287: : 900it [47:24,  1.68s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O43172: : 1000it [52:35,  5.77s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O43521: : 1100it [57:45,  3.71s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O43865: : 1200it [1:00:38,  3.39s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O60479: : 1300it [1:08:41,  5.12s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O60880: : 1400it [1:13:56,  2.85s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O75251: : 1500it [1:22:12,  2.51s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O75526: : 1600it [1:27:36,  1.45s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O75881: : 1700it [1:31:42,  1.36s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O94818: : 1800it [1:36:24,  3.98s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O95164: : 1900it [1:43:23,  3.42s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O95470: : 2000it [1:48:04,  3.04s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


O95858: : 2100it [1:51:58,  1.11s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P00748: : 2200it [1:55:24,  2.14s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P02655: : 2300it [1:58:31,  2.15s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P04818: : 2400it [2:02:20,  1.55s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P06748: : 2500it [2:05:17,  1.60s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P08263: : 2600it [2:08:38,  1.72s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P09871: : 2700it [2:12:08,  5.80s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P0CG40: : 2800it [2:15:20,  2.78s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P10243: : 2900it [2:18:20,  1.51s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P11717: : 3000it [2:22:42,  1.66s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P13807: : 3100it [2:27:59,  1.22s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P15812: : 3200it [2:31:02,  1.60s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P17181: : 3300it [2:35:45,  1.64s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P19419: : 3400it [2:39:13,  1.22s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P21291: : 3500it [2:42:41,  1.42s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P23142: : 3600it [2:47:00,  2.38s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P25116: : 3700it [2:51:53,  1.77s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P28161: : 3800it [2:55:14,  1.13s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P30154: : 3900it [2:58:59,  1.22s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P31997: : 4000it [3:02:24,  1.09s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P35251: : 4100it [3:05:38,  1.14s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P36915: : 4200it [3:10:49,  2.10s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P41182: : 4300it [3:14:25,  1.91s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P43251: : 4400it [3:19:10,  2.44s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P47755: : 4500it [3:23:35,  2.27s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P49006: : 4600it [3:27:01,  2.31s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P49910: : 4700it [3:30:52,  1.52s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P51587: : 4800it [3:34:07,  1.47s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P52741: : 4900it [3:38:15,  2.35s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P54727: : 5000it [3:42:44,  1.96s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P56182: : 5100it [3:46:28,  1.17s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P58743: : 5200it [3:49:35,  1.35s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P61313: : 5300it [3:52:07,  1.26it/s]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P62906: : 5400it [3:53:28,  1.43it/s]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P78412: : 5500it [3:56:29,  1.62s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


P98095: : 5600it [4:00:02,  4.30s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q01780: : 5700it [4:04:32,  2.79s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q03591: : 5800it [4:09:32,  3.46s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q07002: : 5900it [4:14:21,  1.77s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q0JRZ9: : 6000it [4:19:35,  3.10s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q12899: : 6100it [4:25:49,  3.61s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q13185: : 6200it [4:32:08,  1.88s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q13454: : 6300it [4:37:44,  2.33s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q13769: : 6400it [4:43:16,  3.95s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q14191: : 6500it [4:49:17,  3.35s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q14653: : 6600it [4:54:47,  3.34s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q14CZ8: : 6700it [5:03:19,  2.48s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q15326: : 6800it [5:07:58,  2.04s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q15700: : 6900it [5:12:49,  2.03s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q16531: : 7000it [5:17:49,  1.49s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q16842: : 7100it [5:21:25,  2.70s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q2MV58: : 7200it [5:27:41,  1.67s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q3KQZ1: : 7300it [5:33:12,  2.14s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q49AN0: : 7400it [5:37:52,  3.74s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q53FP2: : 7500it [5:43:57,  1.56s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5DJT8: : 7600it [5:49:21, 14.38s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5JTB6: : 7700it [5:56:02,  5.18s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5SSG8: : 7800it [6:01:53,  2.97s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5T601: : 7900it [6:08:54,  9.34s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5TH74: : 8000it [6:14:35,  8.71s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q5VW38: : 8100it [6:22:37,  5.73s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q63ZY6: : 8200it [6:28:49,  4.64s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6BCY4: : 8300it [6:36:30,  2.36s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6IQ22: : 8400it [6:40:40,  1.69s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6NXP6: : 8500it [6:45:13,  2.07s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6P6C2: : 8600it [6:49:55,  1.44s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6Q788: : 8700it [6:54:55,  8.24s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6UX07: : 8800it [7:00:12,  1.89s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6ZMB5: : 8900it [7:06:50,  3.17s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6ZRT6: : 9000it [7:12:37,  6.49s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q6ZWB6: : 9100it [7:19:16,  2.87s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q7L2E3: : 9200it [7:26:13,  1.57s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q7Z2W4: : 9300it [7:30:28,  2.52s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q7Z591: : 9400it [7:39:06,  5.89s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q7Z7M8: : 9500it [7:44:53,  4.13s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q86UP8: : 9600it [7:50:28,  5.35s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q86W50: : 9700it [7:56:36,  1.74s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q86Y22: : 9800it [8:01:22,  3.65s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8IUX8: : 9900it [8:07:07,  2.82s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8IWE5: : 10000it [8:13:06,  2.42s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8IY22: : 10100it [8:20:11,  6.84s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8IZ81: : 10200it [8:25:00,  2.22s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8N138: : 10300it [8:31:25,  2.36s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8N398: : 10400it [8:36:58,  2.71s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8N5A5: : 10500it [8:41:45,  2.41s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8N7A1: : 10600it [8:45:12,  2.41s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8N9I0: : 10700it [8:49:03,  2.18s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8NBN7: : 10800it [8:53:22,  1.65s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8NDM7: : 10900it [8:58:00,  6.34s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8NFF5: : 11000it [9:04:58,  7.79s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8NGK5: : 11100it [9:09:15,  1.21s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8NHH9: : 11200it [9:11:41,  1.76s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8TB03: : 11300it [9:15:34,  1.27s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8TCT9: : 11400it [9:19:10,  2.00s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8TE77: : 11500it [9:25:20,  5.15s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8WUA4: : 11600it [9:31:48,  1.59s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8WW36: : 11700it [9:35:08,  1.06s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q8WY54: : 11800it [9:41:35,  3.16s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q92599: : 11900it [9:48:16,  3.75s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q92859: : 12000it [9:56:16,  2.15s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q969K7: : 12100it [10:01:18,  1.41s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96AV8: : 12200it [10:04:07,  1.08s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96CM8: : 12300it [10:08:09,  1.43s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96EI5: : 12400it [10:11:26,  1.41s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96GA3: : 12500it [10:14:38,  1.48s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96IC2: : 12600it [10:18:01,  1.40s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96K76: : 12700it [10:24:09,  1.91s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96LZ2: : 12800it [10:29:01,  1.60s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96NA2: : 12900it [10:33:39,  3.11s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96PP4: : 13000it [10:38:23,  3.59s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96RI9: : 13100it [10:44:51,  2.29s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q96T68: : 13200it [10:50:51,  2.34s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q99666: : 13300it [10:56:03,  3.02s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BPW4: : 13400it [11:00:45,  2.55s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BRP8: : 13500it [11:04:02,  1.62s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BTP7: : 13600it [11:07:04,  1.39s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BVK6: : 13700it [11:10:06,  2.01s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BXJ3: : 13800it [11:14:38,  1.18s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BYJ0: : 13900it [11:19:53,  3.17s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9BZX2: : 14000it [11:26:11,  3.00s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9GZT3: : 14100it [11:34:05,  1.67s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9H0R8: : 14200it [11:37:56,  1.58s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9H2A3: : 14300it [11:42:19,  2.16s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9H3S5: : 14400it [11:47:13,  2.20s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9H628: : 14500it [11:52:03,  1.50s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9H814: : 14600it [11:57:02,  1.92s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9HA82: : 14700it [12:00:36,  1.85s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9HBX8: : 14800it [12:05:00,  2.56s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9HD45: : 14900it [12:13:24,  3.33s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NQ25: : 15000it [12:17:05,  3.09s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NR55: : 15100it [12:21:49,  3.57s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NS56: : 15200it [12:26:39,  3.51s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NUP9: : 15300it [12:31:46,  1.39s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NWF9: : 15400it [12:35:20,  1.35s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NXT0: : 15500it [12:38:40,  1.93s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9NZ52: : 15600it [12:43:57,  1.42s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9P0L9: : 15700it [12:48:57,  2.89s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9P2F6: : 15800it [12:58:23,  6.11s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UBM8: : 15900it [13:04:55,  1.83s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UGF6: : 16000it [13:09:15,  1.63s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UI09: : 16100it [13:15:02,  2.13s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UJW2: : 16200it [13:20:00,  1.88s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UKV3: : 16300it [13:25:43,  3.16s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9ULS5: : 16400it [13:34:44,  2.15s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UNN4: : 16500it [13:40:41,  1.31s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9UQE7: : 16600it [13:49:17,  7.31s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y2E5: : 16700it [13:53:37,  5.12s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y315: : 16800it [14:00:16,  1.56s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y487: : 16900it [14:04:19,  5.40s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y5F2: : 17000it [14:12:53,  3.74s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y5Z7: : 17100it [14:18:07,  2.98s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y6Q1: : 17200it [14:23:33,  3.82s/it]

Saving dataframe to /home/cim/STRUCTURAL_MOTIFS/phosphosite/data/motif/all-STY-6A2R3N-ca.csv


Q9Y6Z7: : 17236it [14:26:44,  3.02s/it]


In [187]:
# Number of rows in processed_df.
processed_df.shape[0]

10405

In [177]:
# Fraction of rows whose `phosphosite` flag is True versus False.
(
    processed_df["phosphosite"]
    .value_counts(normalize=True)
)

False    0.905815
True     0.094185
Name: phosphosite, dtype: float64

In [178]:
# Residue composition over all rows: the residue is the first letter of `site`.
# The vectorised `.str[0]` accessor replaces the per-row Python lambda and
# yields NaN (rather than raising) if a `site` value is missing.
processed_df["site"].str[0].value_counts(normalize=True)

S    0.508313
T    0.329938
Y    0.161749
Name: site, dtype: float64

In [141]:
# Alias only (no copy): `df` and `processed_df` refer to the same object from
# here on. NOTE(review): this rebinds `df`, which the dataset-initialisation
# cell at the top of the notebook also used for the phosphosite table.
df = processed_df

In [142]:
# Rows flagged as phosphosites, and their residue composition.
# `.eq(True)` keeps the original `== True` semantics (only values equal to
# True are selected, whatever the column dtype).
psite = df[df["phosphosite"].eq(True)]
# First letter of `site` is the residue; `.str[0]` is the vectorised form of
# `apply(lambda x: x[0])` and does not raise on missing values.
psite["site"].str[0].value_counts(normalize=True)

S    0.598980
T    0.242857
Y    0.158163
Name: site, dtype: float64

In [143]:
# Rows flagged as non-phosphosites, and their residue composition.
# `.eq(False)` keeps the original `== False` semantics (missing values are
# not selected, unlike a `~mask` negation would do on a nullable column).
notpsite = df[df["phosphosite"].eq(False)]
# First letter of `site` is the residue; `.str[0]` is the vectorised form of
# `apply(lambda x: x[0])` and does not raise on missing values.
notpsite["site"].str[0].value_counts(normalize=True)

S    0.498886
T    0.338992
Y    0.162122
Name: site, dtype: float64

In [161]:
# Residue composition of phosphosite rows whose `1_res` column is populated.
# A single `.loc[mask, "site"]` selection replaces the chained
# `psite[mask]["site"]` indexing, and `.str[0]` replaces the per-row lambda.
psite.loc[psite["1_res"].notna(), "site"].str[0].value_counts(normalize=True)

S    0.479452
Y    0.264840
T    0.255708
Name: site, dtype: float64

In [162]:
# For n = 1, 2, 3: count the phosphosite rows whose `{n}_res` column is
# populated (non-NaN) and report the total.
for n in (1, 2, 3):
    length = int(psite[f"{n}_res"].notna().sum())
    print(f"Number of phosphosites with >= {n} nearest residues: {length}")

Number of phosphosites with >= 1 nearest residues: 438
Number of phosphosites with >= 2 nearest residues: 307
Number of phosphosites with >= 3 nearest residues: 164


In [174]:
# Show the `euc_dist` / `seq_dist` columns for ranks 1..n, restricted to
# phosphosite rows whose `{n}_res` column is populated.
n = 3
cols = [
    f"{rank}_{metric}"
    for rank in range(1, n + 1)
    for metric in ("euc_dist", "seq_dist")
]
print(psite.loc[psite[f"{n}_res"].notna(), cols])

       1_euc_dist  1_seq_dist  2_euc_dist  2_seq_dist  3_euc_dist  3_seq_dist
2        4.818600       122.0    5.002838        -3.0    5.410340       121.0
426      4.805697       122.0    4.999018        -3.0    5.429360       121.0
429      4.190240        21.0    4.771475        22.0    5.035984        20.0
430      4.079906        29.0    5.360338        28.0    5.411511        30.0
431      5.217619       -34.0    5.288498       -45.0    5.296298       -35.0
...           ...         ...         ...         ...         ...         ...
10142    5.411226      -142.0    5.498299        -3.0    5.719013      -143.0
10143    4.967875      -144.0    5.666538      -146.0    5.919139      -143.0
10145    4.317854      -153.0    5.605041      -165.0    5.647070      -123.0
10170    4.263599         8.0    4.519881         9.0    5.526067        -4.0
10278    5.020814         3.0    5.176321        -3.0    5.895667        -4.0

[164 rows x 6 columns]


In [155]:
# Write `df` to DATA_DIR/motif as CSV (without the index). The filename is
# `out_filename` suffixed with the number of distinct protein ids in `df`.
N = len(df["protein_id"].unique())
fn = out_filename + f"-{N}P.csv"
outfile = DATA_DIR.joinpath("motif", fn)
df.to_csv(path_or_buf=outfile, index=False)