- create conda env from file. 
- download PSP data into separate repository. 
- try loading structures in with cif.gz
- proteome class for iterating over list of sites and a given structure directory (i.e. filter out sites with no structure)
- motif class for dealing with phosphosite

In [None]:
# Try and construct graphs per site from structure dir (cif.gz files)

# Try and use structuremap to do the same. 

### Load phosphosite dataset

In [None]:
# TODO

### Load structure data

In [2]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np

# Import structuremap functions
import structuremap.utils
structuremap.utils.set_logger()
from structuremap.processing import download_alphafold_cif, download_alphafold_pae, format_alphafold_data, annotate_accessibility, get_smooth_score, annotate_proteins_with_idr_pattern, get_extended_flexible_pattern, get_proximity_pvals, perform_enrichment_analysis, perform_enrichment_analysis_per_protein, evaluate_ptm_colocalization, extract_motifs_in_proteome
from structuremap.plotting import plot_enrichment, plot_ptm_colocalization

import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Define paths to structural data.

In [3]:
uniprot_id = "P51786"

af_version = 3
filename_template = "AF-{uniprot_id}-F1-model_v{af_version}.cif.gz"
filename = filename_template.format(uniprot_id=uniprot_id, af_version=af_version)

structure_dir = Path.home() / "STRUCTURAL_MOTIFS/DATA/"
af_cif_dir = structure_dir / "AF_HUMAN_CIF" 
af_pdb_dir = structure_dir / "AF_HUMAN_PDB"

# Assert that the cif and pdb directories exist.
assert af_cif_dir.exists()
assert af_pdb_dir.exists()

#### Initialise structure loader object.

In [6]:
from phosphosite.structure import StructureLoader
loader = StructureLoader(af_cif_dir)
filepath = loader.get_structure("Q8WUY3")
filepath

PosixPath('/home/cim/STRUCTURAL_MOTIFS/DATA/AF_HUMAN_CIF/AF-Q8WUY3-F1-model_v3.cif.gz')

### Test loading .gz files

In [18]:
uniprot_ids = ["Q8WUY3", "P50336"]
for uniprot_id in uniprot_ids:
    filepath = loader.get_structure("Q8WUY3")
    print(filepath)

    with gzip.open(filepath, "rb") as f_in:
        pass
        
        #with open(filepath.with_suffix(".cif"), "wb") as f_out:
        #    shutil.copyfileobj(f_in, f_out)

<class 'pathlib.PosixPath'>
<class 'pathlib.PosixPath'>


In [35]:
import tqdm
directory = "structures"
for file in tqdm.tqdm(sorted(os.listdir(directory))):
    print(type(file))
    if file.endswith("cif.gz"):
        print(file, "is cif.gz")

100%|██████████| 3/3 [00:00<00:00, 32430.19it/s]

<class 'str'>
AF-P50336-F1-model_v3.cif.gz is cif.gz
<class 'str'>
AF-Q8WUY3-F1-model_v3.cif.gz is cif.gz
<class 'str'>





In [None]:
# Retrieve uniprot_id from out_format f-string  
out_format = "AF-{uniprot_id}-F1-model_v{af_version}.{extension}"

## Annotate AlphaFold structures 

In [7]:
from phosphosite.structure.processing import process_af_data

In [9]:
process_af_data(structure_dir, 
    out_format="AF-{uniprot_id}-F1-model_v3.cif.gz",
    protein_ids=["Q8WUY3", "P50336", "NON_EXISTING_UNIPROT_ID"],
)

100%|██████████| 2/2 [00:00<00:00,  3.20it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,Q8WUY3,1,M,1,80.45,-20.961,-20.280,-18.839,-21.091,-4.446,...,6.406,6.157,7.105,unstructured,unstructured,0,0,0,0,1
1,Q8WUY3,1,E,2,84.42,-23.205,-21.900,-22.137,-21.272,-4.120,...,2.886,2.080,4.164,HELX_RH_AL_P,HELX,0,1,0,0,0
2,Q8WUY3,1,E,3,88.89,-25.078,-25.357,-26.080,-24.139,-2.443,...,4.267,5.426,3.895,HELX_RH_AL_P,HELX,0,1,0,0,0
3,Q8WUY3,1,F,4,92.30,-23.028,-23.527,-22.518,-23.934,-0.032,...,5.977,7.130,5.466,HELX_RH_AL_P,HELX,0,1,0,0,0
4,Q8WUY3,1,L,5,92.29,-22.996,-21.789,-20.788,-22.225,0.637,...,2.715,1.888,3.898,HELX_RH_AL_P,HELX,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1872,P50336,2,T,473,79.48,-26.432,-27.779,-28.905,-27.788,2.417,...,-3.645,-3.703,-2.375,BEND,BEND,1,0,0,0,0
1873,P50336,2,E,474,72.19,-25.159,-24.885,-24.409,-26.053,5.024,...,-5.288,-6.755,-5.074,unstructured,unstructured,0,0,0,0,1
1874,P50336,2,P,475,60.80,-24.319,-24.326,-23.123,-24.225,7.923,...,-3.614,-2.717,-3.987,unstructured,unstructured,0,0,0,0,1
1875,P50336,2,N,476,59.06,-23.383,-24.858,-25.721,-24.958,10.469,...,-5.866,-5.548,-4.794,unstructured,unstructured,0,0,0,0,1


In [26]:
with gzip.open(filepath, "rt") as f_in:
    structure = Bio.PDB.MMCIF2Dict.MMCIF2Dict(f_in)

In [27]:
structure

{'data_': 'AF-Q8WUY3-F1',
 '_entry.id': ['AF-Q8WUY3-F1'],
 '_atom_type.symbol': ['C', 'N', 'O', 'S'],
 '_audit_author.name': ['Jumper, John',
  'Evans, Richard',
  'Pritzel, Alexander',
  'Green, Tim',
  'Figurnov, Michael',
  'Ronneberger, Olaf',
  'Tunyasuvunakool, Kathryn',
  'Bates, Russ',
  'Zidek, Augustin',
  'Potapenko, Anna',
  'Bridgland, Alex',
  'Meyer, Clemens',
  'Kohl, Simon A. A.',
  'Ballard, Andrew J.',
  'Cowie, Andrew',
  'Romera-Paredes, Bernardino',
  'Nikolov, Stanislav',
  'Jain, Rishub',
  'Adler, Jonas',
  'Back, Trevor',
  'Petersen, Stig',
  'Reiman, David',
  'Clancy, Ellen',
  'Zielinski, Michal',
  'Steinegger, Martin',
  'Pacholska, Michalina',
  'Berghammer, Tamas',
  'Silver, David',
  'Vinyals, Oriol',
  'Senior, Andrew W.',
  'Kavukcuoglu, Koray',
  'Kohli, Pushmeet',
  'Hassabis, Demis'],
 '_audit_author.pdbx_ordinal': ['1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18'