In [7]:
# Autoreload 
%load_ext autoreload
%autoreload 2

"""Set random seed"""
import torch 
torch.manual_seed(0) 
import numpy as np
np.random.seed(0)

import torch
import glob 

import pandas as pd
import numpy as np

from tqdm import tqdm
from graphein.ml import ProteinGraphDataset
from pathlib import Path






The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Config

In [9]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import add_distance_threshold

from functools import partial

long_interaction_threshold = 0 # seq positions 
edge_threshold_distance = 6.0 # Å
new_edge_funcs = {"edge_construction_functions": [
    partial(
    add_distance_threshold, long_interaction_threshold=long_interaction_threshold, threshold=edge_threshold_distance)
]}
config = ProteinGraphConfig(
    granularity="CA",
    **new_edge_funcs,
)

from graphein.ml.conversion import GraphFormatConvertor

columns = [
    "b_factor",
    "name",
    "edge_index",
    "x", # T5 per-residue embedding
]
convertor = GraphFormatConvertor(
    src_format="nx", dst_format="pyg", verbose="gnn",
    columns=columns,
)

# List of functions that consume a nx.Graph and return a nx.Graph. Applied to graphs after construction but before conversion to pyg
from phosphosite.graphs.pyg import add_per_residue_embedding
graph_transforms = [
    add_per_residue_embedding,
]

# TODO: copy across code that adds the residue embedding from `compot`. 

In [27]:
from phosphosite import PHOSPHOSITE_PREDICT_DIR
old_dir = PHOSPHOSITE_PREDICT_DIR / "protein_graph_dataset"
root_dir = old_dir

processed_filenames = [Path(a).stem for a in glob.glob(str(root_dir / "processed" / "*.pt"))]
downloaded_filenames = [Path(a).stem for a in glob.glob(str(root_dir / "raw" / "*.pdb"))]

len(downloaded_filenames), len(processed_filenames)

(17223, 17067)

#### Actual path for dataset

In [30]:
actual_root_dir = PHOSPHOSITE_PREDICT_DIR / "actual_protein_graph_dataset"
control_root_dir = PHOSPHOSITE_PREDICT_DIR / "control_protein_graph_dataset"
root_dir = actual_root_dir

In [13]:
# Load index dict 
# Save indexes_dict
from phosphosite.utils.io import save_index_dict, load_index_dict

filepath = Path("./indexes_dict.json")

# Load indexes_dict
indexes_dict = load_index_dict(filepath=filepath)

from phosphosite.ml.graph_dataset import PhosphositeGraphDataset 
kwargs = dict(
    root=root_dir,
    graphein_config=config, 
    graph_transformation_funcs=graph_transforms,
    graph_format_convertor=convertor,
    pre_transform=None, # before saved to disk , after PyG conversion 
    pre_filter=None,    # whether it will be in final dataset
)



In [14]:
root_dir

PosixPath('/home/cim/STRUCTURAL_MOTIFS/phosphosite/notebooks/phosphosite_prediction/actual_protein_graph_dataset')

In [20]:
processed_filenames = [Path(a).stem for a in glob.glob(str(root_dir / "processed" / "*.pt"))]
ds = PhosphositeGraphDataset(
    uniprot_ids=processed_filenames,
    y_label_map=indexes_dict,
    **kwargs,
)

In [21]:
g = ds[0]

In [23]:
g.y_index

tensor([ 345,  429,  430,  719,  789,  884,  899,  900,  917,  930, 1219,  340,
         344,  396,  400,  516,  974, 1123, 1126, 1135,  354, 1193, 1124, 1290,
        1295, 1250, 1129, 1163,  198, 1051,  659,  651,  621,  184,  542,  351,
         415,  145,  508,  448,  383,  372,  154,  992,   60,   59,  287,  523,
        1067,  538,  968,  366,  369, 1054])

### Generate control dataset (sequence adjacent)

In [26]:
len(processed_filenames) # Actual number of proteins in dataset

17066

In [31]:
from phosphosite.utils.pt import reset_pt_edge_indexes

reset_pt_edge_indexes(
    uniprot_ids = processed_filenames[0:10],
    from_dir=old_dir, 
    to_dir=control_root_dir, 
    sequence_adjacency_range=2, 
)

Q9H6L5: 100%|██████████| 10/10 [00:00<00:00, 11.56it/s]


In [34]:
# Load in to inspect 
g = torch.load(control_root_dir / "processed" / f"{processed_filenames[0]}.pt")

In [36]:
g.edge_index

tensor([[   0,    0,    1,  ..., 1308, 1308, 1309],
        [   1,    2,    2,  ..., 1309, 1310, 1310]])

In [39]:
# Get all the sequence_distance values for all edge indexes 
edge_index = g.edge_index
# absolute value 

diff = edge_index[1] - edge_index[0]

In [44]:
# Count each unique value in tensor 
unique, counts = torch.unique(diff, return_counts=True)
unique, counts

(tensor([1, 2]), tensor([1310, 1309]))

#### Send it

In [45]:
reset_pt_edge_indexes(
    uniprot_ids = processed_filenames,
    from_dir=old_dir, 
    to_dir=control_root_dir, 
    sequence_adjacency_range=2, 
)

pre_transform: 100%|██████████| 17067/17067 [31:04<00:00,  9.15it/s]


In [54]:
# arange tensor
# Random values  
t1 = torch.rand(10)
t2 = torch.rand(10)

In [55]:
# concatenate 10 times t1 
stacked1 = torch.stack([t1 for _ in range(10)], dim=0)
stacked2 = torch.stack([t1 for _ in range(10)], dim=1)

tensor([[0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323],
        [0.4963, 0.7682, 0.0885, 0.1320, 0.3074, 0.6341, 0.4901, 0.8964, 0.4556,
         0.6323]])