#### Bubble processing. 

See if there are any *qualitative* differences in characteristics such as 
1. structure of nearest neighbour (HELX, UNSTRUCTURED, etc.) 
between phosphoS and S (proteome-wide)

i.e. if the heatmap does not show a significant difference between S and pS across "Nearest residue frequency for sequence-adjacent triplets"; maybe there are other dimensions to explore on which there *is* a difference.  

In [1]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np
import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional
from pathlib import Path
from tqdm import tqdm


pd.options.mode.chained_assignment = None  # default='warn'

from phosphosite.utils import aa1to3, aa3to1

In [165]:
from phosphosite import AF_HUMAN_CIF
AF_HUMAN_CIF

PosixPath('/home/cim/STRUCTURAL_MOTIFS/DATA/AF_HUMAN_CIF')

In [2]:
structure_dir = Path.home() / "STRUCTURAL_MOTIFS/DATA/"
af_cif_dir = structure_dir / "AF_HUMAN_CIF" 
af_pdb_dir = structure_dir / "AF_HUMAN_PDB"

# Assert that the cif and pdb directories exist.
assert af_cif_dir.exists()
assert af_pdb_dir.exists()


from phosphosite.structure import StructureLoader
cif_loader = StructureLoader(af_cif_dir, extension="cif.gz")
pdb_loader = StructureLoader(af_pdb_dir, extension="pdb")

from phosphosite import DATA_DIR
annotation_dir = DATA_DIR / "structure_annotations"
structure_df = pd.read_hdf(annotation_dir / f"structure_df.h5", key="structure_df")

In [3]:
structure_df.head()

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,A0A075B759,1,M,1,66.72,-19.205,-19.235,-17.818,-20.083,6.188,...,6.506,7.091,7.169,unstructured,unstructured,0,0,0,0,1
1,A0A075B759,1,V,2,83.11,-17.532,-18.934,-19.228,-19.068,6.568,...,2.733,2.005,4.191,unstructured,unstructured,0,0,0,0,1
2,A0A075B759,1,N,3,88.43,-15.452,-16.16,-16.404,-17.439,6.076,...,1.051,0.356,1.501,unstructured,unstructured,0,0,0,0,1
3,A0A075B759,1,S,4,93.22,-12.711,-13.566,-12.681,-14.405,6.888,...,-0.11,0.812,0.661,unstructured,unstructured,0,0,0,0,1
4,A0A075B759,1,V,5,97.8,-10.21,-11.417,-11.992,-12.403,8.06,...,-3.217,-4.645,-2.232,STRN,STRN,0,0,1,0,0


In [4]:
uniprot_ids = list(structure_df["protein_id"].unique())
uniprot_ids

['A0A075B759',
 'A0A087WUL8',
 'A0A096LP49',
 'A0A096LP55',
 'A0A0B4J2A2',
 'A0A0B4J2F2',
 'A0A0J9YWL9',
 'A0A1B0GTR3',
 'A0AUZ9',
 'A0AV02',
 'A0AV96',
 'A0AVF1',
 'A0AVI2',
 'A0AVI4',
 'A0AVK6',
 'A0AVT1',
 'A0FGR8',
 'A0FGR9',
 'A0JLT2',
 'A0JNW5',
 'A0JP26',
 'A0MZ66',
 'A0PJK1',
 'A0PJW6',
 'A0PJW8',
 'A0PJX0',
 'A0PJX2',
 'A0PJX4',
 'A0PJY2',
 'A0PJZ3',
 'A0PK00',
 'A0PK05',
 'A1A4G5',
 'A1A4S6',
 'A1A4V9',
 'A1A4Y4',
 'A1A519',
 'A1A5B4',
 'A1A5C7',
 'A1A5D9',
 'A1E959',
 'A1IGU5',
 'A1KXE4',
 'A1KZ92',
 'A1L020',
 'A1L0T0',
 'A1L168',
 'A1L170',
 'A1L188',
 'A1L190',
 'A1L390',
 'A1L429',
 'A1L443',
 'A1L453',
 'A1L4H1',
 'A1L4K1',
 'A1X283',
 'A1XBS5',
 'A1YPR0',
 'A1Z1Q3',
 'A2A288',
 'A2A2Y4',
 'A2A2Z9',
 'A2A368',
 'A2A3K4',
 'A2A3L6',
 'A2A3N6',
 'A2AJT9',
 'A2CJ06',
 'A2IDD5',
 'A2PYH4',
 'A2RRD8',
 'A2RRH5',
 'A2RRP1',
 'A2RTX5',
 'A2RTY3',
 'A2RU30',
 'A2RU48',
 'A2RU49',
 'A2RU54',
 'A2RU67',
 'A2RUB1',
 'A2RUB6',
 'A2RUC4',
 'A2RUG3',
 'A2RUQ5',
 'A2RUR9',
 'A2RUS2',


In [5]:
# First uniprot id that starts with 'O' 
uniprot_id = [i for i in uniprot_ids if i.startswith("O")][0]
uniprot_id

'O00110'

In [6]:
df = structure_df[structure_df["protein_id"] == uniprot_id]
df

Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
327100,O00110,575,M,1,42.15,-45.183,-45.111,-44.412,-46.483,-21.310,...,-0.253,-1.574,-0.191,unstructured,unstructured,0,0,0,0,1
327101,O00110,575,P,2,43.86,-43.729,-44.881,-44.849,-44.849,-18.601,...,1.146,2.646,1.025,unstructured,unstructured,0,0,0,0,1
327102,O00110,575,R,3,45.46,-42.228,-43.153,-43.959,-44.072,-15.995,...,-1.343,-2.336,-0.474,unstructured,unstructured,0,0,0,0,1
327103,O00110,575,A,4,50.12,-39.935,-39.915,-38.549,-40.921,-13.973,...,0.101,-0.073,-0.578,unstructured,unstructured,0,0,0,0,1
327104,O00110,575,F,5,48.50,-38.746,-40.218,-41.091,-40.328,-11.175,...,0.119,1.090,0.407,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327309,O00110,575,L,210,90.47,25.673,24.842,24.966,23.429,-10.263,...,-18.444,-16.908,-18.839,HELX_RH_AL_P,HELX,0,1,0,0,0
327310,O00110,575,H,211,90.32,25.901,25.767,25.029,25.087,-12.384,...,-19.954,-19.637,-19.287,HELX_RH_AL_P,HELX,0,1,0,0,0
327311,O00110,575,R,212,81.89,26.092,25.105,23.706,25.069,-10.204,...,-23.555,-24.058,-22.099,HELX_RH_AL_P,HELX,0,1,0,0,0
327312,O00110,575,A,213,72.68,28.971,27.486,27.028,26.583,-8.648,...,-23.263,-22.376,-23.022,unstructured,unstructured,0,0,0,0,1


In [7]:
cols = list(df.columns)
cols

['protein_id',
 'protein_number',
 'AA',
 'position',
 'quality',
 'x_coord_c',
 'x_coord_ca',
 'x_coord_cb',
 'x_coord_n',
 'y_coord_c',
 'y_coord_ca',
 'y_coord_cb',
 'y_coord_n',
 'z_coord_c',
 'z_coord_ca',
 'z_coord_cb',
 'z_coord_n',
 'secondary_structure',
 'structure_group',
 'BEND',
 'HELX',
 'STRN',
 'TURN',
 'unstructured']

In [8]:
cif_loader.parse_structure(uniprot_id=uniprot_id)

{'data_': 'AF-O00110-F1',
 '_entry.id': ['AF-O00110-F1'],
 '_atom_type.symbol': ['C', 'N', 'O', 'S'],
 '_audit_author.name': ['Jumper, John',
  'Evans, Richard',
  'Pritzel, Alexander',
  'Green, Tim',
  'Figurnov, Michael',
  'Ronneberger, Olaf',
  'Tunyasuvunakool, Kathryn',
  'Bates, Russ',
  'Zidek, Augustin',
  'Potapenko, Anna',
  'Bridgland, Alex',
  'Meyer, Clemens',
  'Kohl, Simon A. A.',
  'Ballard, Andrew J.',
  'Cowie, Andrew',
  'Romera-Paredes, Bernardino',
  'Nikolov, Stanislav',
  'Jain, Rishub',
  'Adler, Jonas',
  'Back, Trevor',
  'Petersen, Stig',
  'Reiman, David',
  'Clancy, Ellen',
  'Zielinski, Michal',
  'Steinegger, Martin',
  'Pacholska, Michalina',
  'Berghammer, Tamas',
  'Silver, David',
  'Vinyals, Oriol',
  'Senior, Andrew W.',
  'Kavukcuoglu, Koray',
  'Kohli, Pushmeet',
  'Hassabis, Demis'],
 '_audit_author.pdbx_ordinal': ['1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18'

In [9]:
from phosphosite.structure.alphafold import make_af_dataframe, make_atomic_dataframe
make_af_dataframe(loader=cif_loader, protein_ids=[uniprot_id])

100%|██████████| 1/1 [00:00<00:00,  2.53it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,x_coord_c,x_coord_ca,x_coord_cb,x_coord_n,y_coord_c,...,z_coord_ca,z_coord_cb,z_coord_n,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O00110,1,M,1,42.15,-45.183,-45.111,-44.412,-46.483,-21.310,...,-0.253,-1.574,-0.191,unstructured,unstructured,0,0,0,0,1
1,O00110,1,P,2,43.86,-43.729,-44.881,-44.849,-44.849,-18.601,...,1.146,2.646,1.025,unstructured,unstructured,0,0,0,0,1
2,O00110,1,R,3,45.46,-42.228,-43.153,-43.959,-44.072,-15.995,...,-1.343,-2.336,-0.474,unstructured,unstructured,0,0,0,0,1
3,O00110,1,A,4,50.12,-39.935,-39.915,-38.549,-40.921,-13.973,...,0.101,-0.073,-0.578,unstructured,unstructured,0,0,0,0,1
4,O00110,1,F,5,48.50,-38.746,-40.218,-41.091,-40.328,-11.175,...,0.119,1.090,0.407,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,O00110,1,L,210,90.47,25.673,24.842,24.966,23.429,-10.263,...,-18.444,-16.908,-18.839,HELX_RH_AL_P,HELX,0,1,0,0,0
210,O00110,1,H,211,90.32,25.901,25.767,25.029,25.087,-12.384,...,-19.954,-19.637,-19.287,HELX_RH_AL_P,HELX,0,1,0,0,0
211,O00110,1,R,212,81.89,26.092,25.105,23.706,25.069,-10.204,...,-23.555,-24.058,-22.099,HELX_RH_AL_P,HELX,0,1,0,0,0
212,O00110,1,A,213,72.68,28.971,27.486,27.028,26.583,-8.648,...,-23.263,-22.376,-23.022,unstructured,unstructured,0,0,0,0,1


In [11]:
uniprot_id

'O00110'

In [16]:
make_atomic_dataframe(loader=cif_loader, protein_ids=["O00110", "A0FGR8"])

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00,  4.19it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O00110,1,M,1,42.15,N,-46.483,-23.385,-0.191,unstructured,unstructured,0,0,0,0,1
1,O00110,1,M,1,42.15,CA,-45.111,-22.830,-0.253,unstructured,unstructured,0,0,0,0,1
2,O00110,1,M,1,42.15,C,-45.183,-21.310,-0.124,unstructured,unstructured,0,0,0,0,1
3,O00110,1,M,1,42.15,CB,-44.412,-23.218,-1.574,unstructured,unstructured,0,0,0,0,1
4,O00110,1,M,1,42.15,O,-45.638,-20.675,-1.072,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8878,A0FGR8,2,T,921,24.09,CB,-61.444,-39.027,-17.794,unstructured,unstructured,0,0,0,0,1
8879,A0FGR8,2,T,921,24.09,O,-59.319,-41.069,-18.247,unstructured,unstructured,0,0,0,0,1
8880,A0FGR8,2,T,921,24.09,CG2,-62.124,-37.828,-17.137,unstructured,unstructured,0,0,0,0,1
8881,A0FGR8,2,T,921,24.09,OG1,-60.466,-38.571,-18.699,unstructured,unstructured,0,0,0,0,1


#### Include 'nearest atom' checks within bubble. 

Include all atom types (excluding hydrogen) within `make_af_dataframe` function.

In [None]:
# Implement 'process' func for one site

# create function for process multiple sites

In [13]:
from phosphosite.utils.structure import generate_sequence_from_df
protein_id = uniprot_id
res, pos = "S", 9
verbose = True

atomic_df = make_atomic_dataframe(loader=cif_loader, protein_ids=[protein_id]) 

100%|██████████| 1/1 [00:00<00:00,  9.55it/s]


In [14]:
atomic_df

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
0,O00110,1,M,1,42.15,N,-46.483,-23.385,-0.191,unstructured,unstructured,0,0,0,0,1
1,O00110,1,M,1,42.15,CA,-45.111,-22.830,-0.253,unstructured,unstructured,0,0,0,0,1
2,O00110,1,M,1,42.15,C,-45.183,-21.310,-0.124,unstructured,unstructured,0,0,0,0,1
3,O00110,1,M,1,42.15,CB,-44.412,-23.218,-1.574,unstructured,unstructured,0,0,0,0,1
4,O00110,1,M,1,42.15,O,-45.638,-20.675,-1.072,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1666,O00110,1,A,214,60.02,CA,30.600,-10.458,-22.508,unstructured,unstructured,0,0,0,0,1
1667,O00110,1,A,214,60.02,C,30.873,-11.371,-23.708,unstructured,unstructured,0,0,0,0,1
1668,O00110,1,A,214,60.02,CB,30.693,-11.210,-21.173,unstructured,unstructured,0,0,0,0,1
1669,O00110,1,A,214,60.02,O,32.038,-11.391,-24.159,unstructured,unstructured,0,0,0,0,1


In [163]:
from phosphosite.bubble import process
            

phosphosite_df = None
uniprot_ids = [protein_id, "A0FGR8"]
adjacency_range = 2
radius = 8.0#6.0
#radius = 5
kwargs = {
    "protein_ids": uniprot_ids,
    "phosphosite_df": phosphosite_df,
    "adjacency_range": adjacency_range,
    "radius": radius,
    "verbose": True,
}
result = process(**kwargs)
result


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  8.72it/s]


S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
T CA
[O00110] No nearest neighbour for T at position 36
T OG1
[O00110] No nearest neighbour for T at position 36
T CA
T OG1
[O00110] No nearest neighbour for T at position 49
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
T CA
T OG1
Y CA
Y OH
Y CA
Y OH
Y CA
Y OH
Y CA
Y OH


100%|██████████| 1/1 [00:00<00:00,  2.30it/s]


S CA
[A0FGR8] No nearest neighbour for S at position 10
S OG
[A0FGR8] No nearest neighbour for S at position 10
S CA
[A0FGR8] No nearest neighbour for S at position 11
S OG
[A0FGR8] No nearest neighbour for S at position 11
S CA
S OG
[A0FGR8] No nearest neighbour for S at position 28
S CA
S OG
S CA
S OG
S CA
S OG
S CA
[A0FGR8] No nearest neighbour for S at position 50
S OG
[A0FGR8] No nearest neighbour for S at position 50
S CA
[A0FGR8] No nearest neighbour for S at position 55
S OG
[A0FGR8] No nearest neighbour for S at position 55
S CA
[A0FGR8] No nearest neighbour for S at position 82
S OG
[A0FGR8] No nearest neighbour for S at position 82
S CA
[A0FGR8] No nearest neighbour for S at position 86
S OG
[A0FGR8] No nearest neighbour for S at position 86
S CA
[A0FGR8] No nearest neighbour for S at position 91
S OG
[A0FGR8] No nearest neighbour for S at position 91
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA
S OG
S CA


Unnamed: 0,protein_id,prev,res,pos,next,ref_atom,ref_atom_dist,nn_res,nn_pos,nn_atom
0,O00110,R,S,9,R,CA,7.633196,P,12,CD
1,O00110,R,S,9,R,OG,7.882656,P,12,CD
0,O00110,R,S,47,V,CA,7.524759,I,50,N
1,O00110,R,S,47,V,OG,6.707845,I,50,N
0,O00110,F,S,54,S,CA,7.165023,W,51,O
...,...,...,...,...,...,...,...,...,...,...
1,A0FGR8,P,Y,824,V,OH,3.745005,K,875,CB
0,A0FGR8,M,Y,828,L,CA,3.983246,D,871,O
1,A0FGR8,M,Y,828,L,OH,3.107621,K,833,O
0,A0FGR8,W,Y,908,D,CA,3.700436,I,790,O


In [158]:
atomic_df = make_atomic_dataframe(loader=cif_loader, protein_ids=["O00110"])
atomic_df[atomic_df.position == 49]

100%|██████████| 1/1 [00:00<00:00, 10.82it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
375,O00110,1,T,49,59.7,N,-12.791,-0.357,39.991,unstructured,unstructured,0,0,0,0,1
376,O00110,1,T,49,59.7,CA,-11.674,-0.6,40.893,unstructured,unstructured,0,0,0,0,1
377,O00110,1,T,49,59.7,C,-10.356,-0.5,40.131,unstructured,unstructured,0,0,0,0,1
378,O00110,1,T,49,59.7,CB,-11.849,-2.001,41.497,unstructured,unstructured,0,0,0,0,1
379,O00110,1,T,49,59.7,O,-10.059,-1.319,39.261,unstructured,unstructured,0,0,0,0,1
380,O00110,1,T,49,59.7,CG2,-10.805,-2.351,42.551,unstructured,unstructured,0,0,0,0,1
381,O00110,1,T,49,59.7,OG1,-13.115,-2.068,42.119,unstructured,unstructured,0,0,0,0,1


In [None]:
sites = atomic_df[atomic_df["AA"].isin(list(residues_to_consider))][["AA", "position"]]
# Get first occurrence of each "AA" and "position" pair
sites = sites.drop_duplicates(subset=["AA", "position"])

In [155]:
atomic_df = make_atomic_dataframe(loader=cif_loader, protein_ids=["O00110"])
atomic_df[atomic_df.AA == "S"]
res_to_consider = "STY"
# Filter atomic df 
atomic_df = make_atomic_dataframe(loader=cif_loader, protein_ids=uniprot_ids)


sites = atomic_df[atomic_df["AA"].isin(list(res_to_consider))][["AA", "position"]]
# Get first occurrence of each "AA" and "position" pair
sites = sites.drop_duplicates(subset=["AA", "position"])
#[tuple(r) for r in sites.to_numpy()]



100%|██████████| 1/1 [00:00<00:00, 10.25it/s]
100%|██████████| 1/1 [00:00<00:00,  9.50it/s]


Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
68,O00110,1,S,9,48.78,N,-32.438,-4.513,1.010,unstructured,unstructured,0,0,0,0,1
69,O00110,1,S,9,48.78,CA,-31.564,-3.854,1.976,unstructured,unstructured,0,0,0,0,1
70,O00110,1,S,9,48.78,C,-31.119,-2.501,1.410,unstructured,unstructured,0,0,0,0,1
71,O00110,1,S,9,48.78,CB,-30.390,-4.776,2.341,unstructured,unstructured,0,0,0,0,1
72,O00110,1,S,9,48.78,O,-30.387,-2.416,0.421,unstructured,unstructured,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1541,O00110,1,S,199,86.57,CA,10.876,-13.176,-8.295,BEND,BEND,1,0,0,0,0
1542,O00110,1,S,199,86.57,C,10.211,-12.195,-9.264,BEND,BEND,1,0,0,0,0
1543,O00110,1,S,199,86.57,CB,10.412,-12.889,-6.868,BEND,BEND,1,0,0,0,0
1544,O00110,1,S,199,86.57,O,8.993,-12.247,-9.445,BEND,BEND,1,0,0,0,0


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 11.10it/s]


In [130]:
atomic_df[atomic_df["position"] == 178]

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured
1363,O00110,1,S,178,72.68,N,-11.756,-12.0,-3.802,BEND,BEND,1,0,0,0,0
1364,O00110,1,S,178,72.68,CA,-12.722,-11.338,-4.694,BEND,BEND,1,0,0,0,0
1365,O00110,1,S,178,72.68,C,-12.043,-10.717,-5.917,BEND,BEND,1,0,0,0,0
1366,O00110,1,S,178,72.68,CB,-13.495,-10.266,-3.916,BEND,BEND,1,0,0,0,0
1367,O00110,1,S,178,72.68,O,-12.579,-10.722,-7.023,BEND,BEND,1,0,0,0,0
1368,O00110,1,S,178,72.68,OG,-12.704,-9.138,-3.55,BEND,BEND,1,0,0,0,0


In [123]:
seq[178]

'Y'

In [115]:
result

Unnamed: 0,protein_id,prev,res,pos,next,ref_atom,ref_atom_dist,nn_res,nn_pos,nn_atom
0,O00110,F,S,54,S,OG,7.930757,W,51,O


In [100]:
res = result.iloc[0]

In [102]:
res["AA"]

'W'

In [88]:
df = pd.DataFrame(columns=["protein_id", "prev", "res", "pos", "next", "ref_atom", "ref_atom_dist", "nn_res", "nn_pos", "nn_atom"])

In [89]:
# Add fake data for example. 
df = df.append({
    "protein_id": "O00110",
    "prev": "A",
    "res": "S",
    "pos": 9,
    "next": "A",
    "ref_atom": "OG",
    "ref_atom_dist": 0.0,
    "nn_res": "A",
    "nn_pos": 10,
    "nn_atom": "CA",
}, ignore_index=True)
df

  df = df.append({


Unnamed: 0,protein_id,prev,res,pos,next,ref_atom,ref_atom_dist,nn_res,nn_pos,nn_atom
0,O00110,A,S,9,A,OG,0.0,A,10,CA


In [103]:
new_df = pd.DataFrame.from_records([{
    "protein_id": "O00110",
    "prev": "A",
    "res": "S",
    "pos": 9,
    "next": "A",
    "ref_atom": "OG",
}])
new_df



Unnamed: 0,protein_id,prev,res,pos,next,ref_atom
0,O00110,A,S,9,A,OG


In [104]:
original_df = pd.DataFrame()

In [105]:
pd.concat([original_df, new_df])

Unnamed: 0,protein_id,prev,res,pos,next,ref_atom
0,O00110,A,S,9,A,OG


In [None]:
# Phosphosite S8 

# For each Serine residue  

# Get dataframe centred around the residue  
# Get the first row that matches the residue and position, unless there are none
df = atomic_df
res, pos = "S", 9
try:
    row = df[(df["AA"] == res) & (df["position"] == pos)].iloc[0]   
except IndexError:
    if verbose: tqdm.write(f"[{protein_id}] Could not find centre residue {res} at position {pos}")

In [58]:
reference_atom_dict = {
    "S": "OG", 
    "T": "OG1",
    "Y": "OH",
}
residues_to_consider: str = "STY", 
reference_atoms: List[str] = ["CA"]

In [59]:
reference_atoms.append(reference_atom_dict[res])
for ref_atom in reference_atoms:
    ref_coords = np.array(df[(df["AA"] == res) & (df["position"] == pos) & (df["atom_id"] == ref_atom)][["x_coord", "y_coord", "z_coord"]])

    # Filter by radius. 
    # Only include atoms (rows) whose distance from the reference atom is less than the radius.

    # Filter rows 
    df[f"{ref_atom}_dist"] = np.linalg.norm(df[["x_coord", "y_coord", "z_coord"]].values - ref_coords, axis=1)



In [60]:
df

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
0,O00110,1,M,1,42.15,N,-46.483,-23.385,-0.191,unstructured,unstructured,0,0,0,0,1,24.672503,24.532642
1,O00110,1,M,1,42.15,CA,-45.111,-22.830,-0.253,unstructured,unstructured,0,0,0,0,1,23.421747,23.207875
2,O00110,1,M,1,42.15,C,-45.183,-21.310,-0.124,unstructured,unstructured,0,0,0,0,1,22.239584,22.131739
3,O00110,1,M,1,42.15,CB,-44.412,-23.218,-1.574,unstructured,unstructured,0,0,0,0,1,23.508256,23.173217
4,O00110,1,M,1,42.15,O,-45.638,-20.675,-1.072,unstructured,unstructured,0,0,0,0,1,22.143031,22.081763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1666,O00110,1,A,214,60.02,CA,30.600,-10.458,-22.508,unstructured,unstructured,0,0,0,0,1,67.137486,65.121092
1667,O00110,1,A,214,60.02,C,30.873,-11.371,-23.708,unstructured,unstructured,0,0,0,0,1,67.930495,65.895504
1668,O00110,1,A,214,60.02,CB,30.693,-11.210,-21.173,unstructured,unstructured,0,0,0,0,1,66.827547,64.796933
1669,O00110,1,A,214,60.02,O,32.038,-11.391,-24.159,unstructured,unstructured,0,0,0,0,1,69.174121,67.141267


In [29]:
dff = df[df[f"{ref_atom}_dist"] < radius]

In [41]:
# Gamma Oxygen's distance to itself should be zero
for test_atom in ["OG", "CA"]:
    assert (dff[(dff.position == pos) & (dff.atom_id == test_atom)][f"{test_atom}_dist"] == 0).bool(), "yikes" 

In [27]:
print(f"position: {pos}")
print("to exclude: " + str(list(range(pos - adjacency_range, pos + adjacency_range + 1))))

position: 9
to exclude: [7, 8, 9, 10, 11]


In [48]:
# Threshold by radius.
dff = df[df[f"CA_dist"] < radius]

# Exclude residue positions. 
to_exclude = list(range(pos - adjacency_range, pos + adjacency_range + 1))
dff = dff[~dff["position"].isin(to_exclude)]
dff

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
35,O00110,1,F,5,48.5,O,-38.132,-11.164,1.257,unstructured,unstructured,0,0,0,0,1,9.853511,10.089666
43,O00110,1,L,6,55.98,CA,-36.861,-10.18,-1.036,unstructured,unstructured,0,0,0,0,1,8.783429,8.774414
44,O00110,1,L,6,55.98,C,-36.962,-8.758,-0.47,unstructured,unstructured,0,0,0,0,1,7.692239,8.041911
45,O00110,1,L,6,55.98,CB,-36.379,-10.185,-2.5,unstructured,unstructured,0,0,0,0,1,9.126903,8.899121
46,O00110,1,L,6,55.98,O,-37.363,-7.825,-1.166,unstructured,unstructured,0,0,0,0,1,7.698663,8.245068
49,O00110,1,L,6,55.98,CD2,-34.533,-11.925,-2.433,unstructured,unstructured,0,0,0,0,1,9.664123,8.809672
96,O00110,1,P,12,42.79,N,-26.569,2.005,3.079,HELX_LH_PP_P,HELX,0,1,0,0,0,7.777822,8.322854
97,O00110,1,P,12,42.79,CA,-26.323,3.254,3.785,HELX_LH_PP_P,HELX,0,1,0,0,0,9.014667,9.704766
99,O00110,1,P,12,42.79,CB,-25.342,2.915,4.916,HELX_LH_PP_P,HELX,0,1,0,0,0,9.652784,10.169279
101,O00110,1,P,12,42.79,CG,-25.494,1.406,5.1,HELX_LH_PP_P,HELX,0,1,0,0,0,8.618113,8.978522


In [55]:
# Pick row with smallest CA_dist
atom = "CA"
smallest = dff[dff[f"{atom}_dist"] == dff[f"{atom}_dist"].min()]
smallest

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
102,O00110,1,P,12,42.79,CD,-25.855,0.908,3.707,HELX_LH_PP_P,HELX,0,1,0,0,0,7.633196,7.882656


In [56]:
# Pick row with smallest oxygen dist
atom = "OG"
smallest = dff[dff[f"{atom}_dist"] == dff[f"{atom}_dist"].min()]
smallest

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
102,O00110,1,P,12,42.79,CD,-25.855,0.908,3.707,HELX_LH_PP_P,HELX,0,1,0,0,0,7.633196,7.882656


In [49]:
# Order by distance 
og_dff = dff.sort_values(by="OG_dist", ascending=True)
og_dff

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
102,O00110,1,P,12,42.79,CD,-25.855,0.908,3.707,HELX_LH_PP_P,HELX,0,1,0,0,0,7.633196,7.882656
44,O00110,1,L,6,55.98,C,-36.962,-8.758,-0.47,unstructured,unstructured,0,0,0,0,1,7.692239,8.041911
46,O00110,1,L,6,55.98,O,-37.363,-7.825,-1.166,unstructured,unstructured,0,0,0,0,1,7.698663,8.245068
96,O00110,1,P,12,42.79,N,-26.569,2.005,3.079,HELX_LH_PP_P,HELX,0,1,0,0,0,7.777822,8.322854
43,O00110,1,L,6,55.98,CA,-36.861,-10.18,-1.036,unstructured,unstructured,0,0,0,0,1,8.783429,8.774414
49,O00110,1,L,6,55.98,CD2,-34.533,-11.925,-2.433,unstructured,unstructured,0,0,0,0,1,9.664123,8.809672
45,O00110,1,L,6,55.98,CB,-36.379,-10.185,-2.5,unstructured,unstructured,0,0,0,0,1,9.126903,8.899121
101,O00110,1,P,12,42.79,CG,-25.494,1.406,5.1,HELX_LH_PP_P,HELX,0,1,0,0,0,8.618113,8.978522
97,O00110,1,P,12,42.79,CA,-26.323,3.254,3.785,HELX_LH_PP_P,HELX,0,1,0,0,0,9.014667,9.704766
35,O00110,1,F,5,48.5,O,-38.132,-11.164,1.257,unstructured,unstructured,0,0,0,0,1,9.853511,10.089666


In [51]:
N = 1

# Pick top N positions (keep first if duplicates)
og_dff = og_dff.drop_duplicates(subset=["position"]).iloc[:N]
og_dff

Unnamed: 0,protein_id,protein_number,AA,position,quality,atom_id,x_coord,y_coord,z_coord,secondary_structure,structure_group,BEND,HELX,STRN,TURN,unstructured,CA_dist,OG_dist
102,O00110,1,P,12,42.79,CD,-25.855,0.908,3.707,HELX_LH_PP_P,HELX,0,1,0,0,0,7.633196,7.882656


In [89]:
# Given a residue, find nearest atom (not itself) within radius. 
seq = generate_sequence_from_df(df)
seq.index("S")


8

In [None]:

def process_bubbles(
    phosphosite_df: pd.DataFrame,
    cif_loader: StructureLoader,
):
    """Process bubbles for a given dataframe."""

    for uniprot_id in uniprot_ids:

        atomic_df = make_atomic_dataframe(loader=cif_loader, protein_ids=[uniprot_id]) 
        return atoms


        
    