In [1]:
# Autoreload 
%load_ext autoreload
%autoreload 2

In [2]:
# Load PTM data, annotate alphafold structures 
# in order to get distribution of pLDDT (or IDR annotations)

# Then, randomly select same number of STY locations with same distribution from 
# the same set of proteins.  (negative examples)

# Dataloader class to provide data to model (graphs with label) 
# Model class to train model on data



In [3]:
import yaml
from pathlib import Path

import pandas as pd 
import numpy as np

#### Load phosphorylation sites from PhosphoSitePlus

In [4]:
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

# Get the path to the DATASET_DIR
DATASET_DIR = Path(cfg['DATASET_DIR']).expanduser()

# PhosphoSitePlus datasets 
PSP_DIR = DATASET_DIR / 'PSP' 
assert PSP_DIR.is_dir()

P_PATH          = PSP_DIR / "Phosphorylation_site_dataset"  # Phosphorylation sites dataset
REG_PATH        = PSP_DIR / "Regulatory_sites"              # Regulatory sites dataset
PTM_SEQ_PATH    = PSP_DIR / "Phosphosite_PTM_seq.fasta"     # FASTA formatted sequences of all substrate proteins.  Lowercase letters indicate the phosphorylation sites.

# Assert that the paths are existing files
assert P_PATH.is_file()
assert REG_PATH.is_file()
assert PTM_SEQ_PATH.is_file()

In [19]:
# Data dir 
from phosphosite import CIF_DIR, PAE_DIR, PDB_DIR
from phosphosite.dataset import PhosphoSequenceList

ptm_seq = PhosphoSequenceList(
    PTM_SEQ_PATH, 
    handle_isoforms = "remove",
    organism = "human",
    
)
print(type(ptm_seq))
ptm_seq[0:2]

<class 'phosphosite.dataset.psp.PhosphoSequenceList'>


[SeqRecord(seq=Seq('MEEYHRHCDEVGFNAEEAHNIVKECVDGVLGGEDYNHNNINQWTASIVEQSLTH...IVL'), id='P51808', name='DYNLT3', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('MsEPGGGGGEDGsAGLEVSAVQNVADVSVLQKHLRkLVPLLLEDGGEAPAALEA...CTE'), id='Q14204', name='DNCH1', description='<unknown description>', dbxrefs=[])]

In [23]:
# Get some proteins for testing 
uniprot_ids = [p.id for p in ptm_seq[:10]]
print(uniprot_ids)

['P51808', 'Q14204', 'P68431', 'Q52LA3', 'Q9Y237', 'P52298', 'Q6P5R6', 'Q9NRG0', 'Q14CS0', 'P60866']


In [7]:
# Use structuremap to download AF models for the sites we have;
# convert sites to `p` and `p_reg` dataframe

In [8]:
# Function that generates matching distribution of STY sites 

"""
Generate negative examples
"""
def get_negative_examples(
    df: pd.DataFrame, 
    possible_residues: str = "STY",
):
    if isinstance(possible_residues, str):
        possible_residues = list(possible_residues)
    
    

### Structural annotations

In [9]:
# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization


In [11]:
valid_proteins_cif, invalid_proteins_cif, existing_proteins_cif = download_alphafold_cif(
    proteins=uniprot_ids,
    out_folder=CIF_DIR)

100%|██████████| 10/10 [00:07<00:00,  1.32it/s]

2023-05-05 15:18:09> Valid proteins: 9
2023-05-05 15:18:09> Invalid proteins: 1
2023-05-05 15:18:09> Existing proteins: 0





In [12]:
valid_proteins_pae, invalid_proteins_pae, existing_proteins_pae = download_alphafold_pae(
    proteins=uniprot_ids,
    out_folder=PAE_DIR, 
    )

100%|██████████| 10/10 [00:11<00:00,  1.19s/it]

2023-05-05 15:18:32> Valid proteins: 9
2023-05-05 15:18:32> Invalid proteins: 1
2023-05-05 15:18:32> Existing proteins: 0





### Phosphosite database (dbPTM)

In [15]:
from baked import dbPTM, Field
import baked.modifications as mod

dbptm = dbPTM(
    "general", 
    filter_dict={
        Field.ptm_type: mod.p,  # TODO; ONLY INCLUDE PHOSPHORYLATION FOR NOW
    },
)
# Convert to format for structuremap
ptm_df = dbptm.structuremap_df
ptm_df[0:2]

Loading dataset from /Users/cameronmcmenamie/RewiredBio/PTMBakery/Datasets/dbPTM/general_data/dbPTM_general_sites.csv... skippping rows 1
Done loading dataset.


Unnamed: 0,protein_id,AA,position,p
0,A0A023T787,S,56,1
1,A0A024QYT6,S,150,1


In [17]:
len(ptm_df.protein_id.unique())

32343

In [None]:
# Generate negative examples; should be same number per protein 
# i.e. set of all protein ids that have structure and sequence available. pick same number of STY sites
# that there are phosphorylation sites for.  (negative examples), per protein

# first pass: don't pay attention to IDR or structural qualities; just pick randomly. 

In [None]:
# Use PhosphoSequenceList to get all STY sites (will be lowercase).  

# TODO : first, do inner join of dbPTM sites (i.e. make STY lowercase if they aren't already if they show up in dbPTM)
# Count the number of dbPTM sites that aren't lowercase. 


In [None]:
# TODO: add dbPTM sites to seqrecords 

In [None]:
for u in uniprot_ids:
    seq = ptm_seq.to_str(u)
    print(seq)

In [29]:
u = uniprot_ids[0]
ptm_seq.get_sequence(u)



'MEEYHRHCDEVGFNAEEAHNIVKECVDGVLGGEDYNHNNINQWTASIVEQSLTHLVkLGKAYKYIVTCAVVQKSAYGFHTASSCFWDTTSDGtCtVRWENRTMNCIVNVFAIAIVL'

In [53]:
dbptm_df = dbptm.dataset
print(len(dbptm_df))
dbptm_df[0:2]

506457


Unnamed: 0,entry_name,acc_id,pos,ptm_type,pmids,seq_window,res,species_code
44,IMA3_HUMAN,O00629,24,Phosphorylation,['18212344'],NFKNKGRDLETMRRQRNEVVV,T,human
45,IMA3_HUMAN,O00629,60,Phosphorylation,"['29255136', '22167270', '23927012', '18707149...",NVPHEDICEDSDIDGDYRVQN,S,human


In [68]:
# Per acc_id, count the number of sites in df that are not lowercase in ptm_seq
# (matching with same protein_id, position, residue)

seq_ids = ptm_seq.get_uniprot_ids()
print(len(seq_ids))

# Assert no seq_id has a "-" char (since we removed isoforms)
assert not any([s.find("-") != -1 for s in seq_ids])

# How many ids in seq_ids are in df.acc_id?
print(len(set(seq_ids).intersection(set(dbptm_df.acc_id)))) 

# Filter dbptm_df to only include those in seq_ids
dbptm_df = dbptm_df[dbptm_df.acc_id.isin(seq_ids)]
print(len(dbptm_df))

20583
18776
438523


In [76]:
len(dbptm_df)
all_sites = dbptm_df

filtered = all_sites[dbptm_df.res.isin(["S", "T", "Y"])]
dbptm_df = filtered[0:10_000] # USE FIRST 10,000 SITES FOR NOW
len(dbptm_df.acc_id.unique())


409

In [87]:


def get_annotation(
    acc_id: str, 
    pos: int,
    res: str,
    verbose: bool = False,
) -> bool:
    seq = ptm_seq.get_sequence(acc_id)
    idx = pos - 1
    # Check that idx is in range
    if not (idx >= 0 and idx < len(seq)):
        if verbose: print(f"POS {pos} is out of range for {acc_id} ({len(seq)})")
        return None
    # Check that residue matches
    res_match = True
    if seq[idx].upper() != res.upper():
        if verbose: print(f"RES {res} does not match {seq[idx]} at POS {pos} for {acc_id}")
        res_match = False
    return dict(
        res_match = res_match,
        lowercase = seq[idx].islower(),
    )
dbptm_df[["res_match", "lowercase"]] = dbptm_df.apply(
    lambda x: get_annotation(x.acc_id, int(x.pos), x.res),
    result_type="expand",
    axis=1,
)

In [84]:
for i in [True, False]: #, None]:
    for j in [True, False, None]:
        print(f"same_letter={i}, lowercase={j}: "
              + str(len(dbptm_df[(dbptm_df.same_annotation == i) & (dbptm_df.lowercase == j)])))

same_letter=True, lowercase=True: 8535
same_letter=True, lowercase=False: 1262
same_letter=True, lowercase=None: 0
same_letter=False, lowercase=True: 14
same_letter=False, lowercase=False: 189
same_letter=False, lowercase=None: 0


In [86]:
dbptm_df

Unnamed: 0,entry_name,acc_id,pos,ptm_type,pmids,seq_window,res,species_code,same_annotation,lowercase
44,IMA3_HUMAN,O00629,24,Phosphorylation,['18212344'],NFKNKGRDLETMRRQRNEVVV,T,human,True,True
45,IMA3_HUMAN,O00629,60,Phosphorylation,"['29255136', '22167270', '23927012', '18707149...",NVPHEDICEDSDIDGDYRVQN,S,human,True,True
46,IMA3_HUMAN,O00629,66,Phosphorylation,"['23927012', '28674151', '25852190', '28796482...",ICEDSDIDGDYRVQNTSLEAI,Y,human,True,True
47,IMA3_HUMAN,O00629,71,Phosphorylation,"['30278072', '20068231', '27050516', '28176443...",DIDGDYRVQNTSLEAIVQNAS,T,human,True,False
48,IMA3_HUMAN,O00629,72,Phosphorylation,"['28464451', '30278072', '20068231', '25159151...",IDGDYRVQNTSLEAIVQNASS,S,human,True,False
...,...,...,...,...,...,...,...,...,...,...
10039,RGS14_HUMAN,O43566,338,Phosphorylation,"['22985185', '24247654', '27251275', '28348404']",LAGICEKRGLSLPDIKVYLVG,S,human,True,True
10040,RGS14_HUMAN,O43566,472,Phosphorylation,"['26552605', '26074081', '23312004', '23186163...",GCPPRTQDKATHPPPASPSSL,T,human,True,True
10041,RGS14_HUMAN,O43566,478,Phosphorylation,"['28787133', '28348404', '28122231', '30108239...",QDKATHPPPASPSSLVKVPSS,S,human,True,True
10042,RGS14_HUMAN,O43566,480,Phosphorylation,"['28787133', '28122231', '30108239', '26552605...",KATHPPPASPSSLVKVPSSAT,S,human,True,True


In [58]:
# find number of rows in df that match to the same residue (at position) in ptm_seq
matches = {}
for i, acc_id in enumerate(seq_ids):
    df = dbptm_df[dbptm_df.acc_id == acc_id]
    window_match = 0
    for j, row in df.iterrows():
        acc_id = row.acc_id
        pos = row.pos
        res = row.res
        seq = ptm_seq.get_sequence(acc_id)
        if acc_id not in matches:
            matches[acc_id] = {}
        window = row.seq_window 
        l = len(window)
        # trim down to +/- N residues from midpoint
        n = 4
        #print(window)
        row_window = window[l//2-n:l//2+n+1]
        print(row_window)

        # print window of sequence (sequence is 1-indexed)
        seq_window = seq[pos-n-1:pos+n]
        print(seq_window)
        print()
    d = dict(
        window_match=row_window == seq_window.upper(),
        res_match=
        lower
        total=len(df),
        
    )
    matches[acc_id] = d 

---MSEPGG


GEDGSAGLE
GEDGsAGLE

RKFLSDPQV
RkFLsDPQV

VEDPTFLNQ
VEDPtFLNQ

EKRESPEVL
EKRESPEVL

EVLLTLDIL
EVLLTLDIL

GLDVSKEGT
GLDVskEGT

QDGDSFRMK
QDGDsFRMk

TIESTRVRG
TIEStRVRG

IITLSKEVR
IITLSkEVR

LEVRSLETC
LEVRSLETC

RSLETCMYD
RSLETCMYD

ETCMYDHKT
ETCMYDHkT

YDHKTFSEI
YDHkTFSEI

NQVIYLNPP
NQVIyLNPP

MVVLSLPRI
MVVLsLPRI

VGVHYELTE
VGVHyELTE

QMLGSNMTE
QMLGsNMTE

TASTSDAVT
TASTsDAVt

SDAVTFITY
sDAVtFITy

TFITYVQSL
tFITyVQsL

TYVQSLKRK
TyVQsLKRK

RRKDSAIQQ
RRkDsAIQQ

ALTIYEGKF
ALTIyEGkF

ALELTDTGL
ALELtDTGL

TGLLSGSEE
TGLLsGSEE

ALGEYLERE
ALGEyLERE

ERERSSFPR
ERERSSFPR

DVTRSLIKS
DVtRsLIKS

DNAKSFEWL
DNAksFEWL

FEWLSQMRF
FEWLsQMRF

DPKQTDVLQ
DPKQtDVLQ

RLGGSPFGP
RLGGsPFGP

SNPNYDKTS
SNPNYDkTS

YDKTSAPIT
YDkTSAPIT

SLAMTKPDR
SLAMtKPDR

QVMLYSQGF
QVMLYSQGF

RALKSVLVS
RALksVLVs

SVLVSAGNV
sVLVsAGNV

KDHLYGTLD
kDHLyGtLD

HLYGTLDPN
HLyGtLDPN

LDPNTREWT
LDPNtrEWt

TREWTDGLF
trEWtDGLF

DGLFTHVLR
DGLFtHVLR

LKYATLATV
LkYATLATV

EEAASPMLQ
EEAAsPMLQ

PYFTSNGLV
PYFTsNGLV

LWSLSGDSR
LWSLsGDSR

E

KeyboardInterrupt: 

In [None]:
pd.DataFrame.from_dict(matches, orient='index').reset_index()