In [2]:
import os
import sys

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from functools import partial
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import (add_distance_threshold,
                                             add_peptide_bonds,
                                             add_hydrogen_bond_interactions,
                                             add_disulfide_interactions,
                                             add_ionic_interactions,
                                             add_aromatic_interactions,
                                             add_aromatic_sulphur_interactions,
                                             add_cation_pi_interactions
                                            )
from graphein.protein.graphs import construct_graph
from graphein.protein.features.sequence.embeddings import esm_residue_embedding
from graphein.protein.visualisation import plotly_protein_structure_graph
from src.processing.graph import convert_nx_to_pyg_data

import torch
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint

# sys.path.insert(0, os.path.abspath(os.path.join('..')))

from src.dataset import PPIDataModule
from src.models import LightningGCNN


To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


In [2]:
# NOTE: disabled one-off cleanup, kept for reference only (nothing in this cell runs).
# If re-enabled it would load the array stored at `npy_file`, drop every row whose
# column 2 or column 5 value does not match the part before the first '.' of some
# filename in `processed_dir`, print the shape before and after, and overwrite
# `npy_file` in place with the filtered array.
# `npy_file` and `processed_dir` are assigned in a later cell, so that cell would have
# to run first. Presumably columns 2 and 5 hold the identifiers of the two proteins in
# a pair -- TODO confirm against the preprocessing code.
# data = np.load(npy_file)
# print(data.shape)
# processed_pdb_codes = [path.split('.')[0] for path in os.listdir(processed_dir)]

# idx_to_del = []
# for i in range(len(data)):
#     if data[i, 2] not in processed_pdb_codes or data[i, 5] not in processed_pdb_codes:
#         idx_to_del.append(i)
# data = np.delete(data, idx_to_del, 0)
# print(data.shape)
# np.save(npy_file, data)

In [6]:
# --- Run configuration ------------------------------------------------------
SEED = 42         # forwarded to the train/test split below
EPOCHS = 50       # not used in this cell; read by the Trainer in the training cell
BATCH_SIZE = 4

# Input locations, relative to the notebook's working directory.
processed_dir = '../data/graphs/pan_human'
npy_file = '../data/preprocessed/human_data.npy'

# --- Data module ------------------------------------------------------------
ppi_data = PPIDataModule(
    npy_file=npy_file,
    processed_dir=processed_dir,
    batch_size=BATCH_SIZE,
)
# train_size=0.8 is presumably the fraction of samples assigned to training
# (the recorded output, 17553 train / 4389 test, is consistent with that).
ppi_data.setup(train_size=0.8, random_seed=SEED)

# Build each loader, then report the size of the matching split.
train_loader = ppi_data.train_dataloader()
print("Train len:", len(ppi_data.train))

test_loader = ppi_data.test_dataloader()
print("Test len:", len(ppi_data.test))



Train len: 17553
Test len: 4389


In [16]:
# Train the PPI graph network, logging to TensorBoard and keeping the single best
# checkpoint as ranked by the monitored validation accuracy.
#
# NOTE(review): the output saved with this cell ends in `KeyError: 'sequence_A'` during
# the sanity check; the cause is not visible from this cell.
# NOTE(review): the saved log reports "GPU available: True (mps), used: False"; pass
# `accelerator=...` to the Trainer if training on that device is intended.

# Seed the Python, NumPy and PyTorch RNGs so that weight initialisation, dropout and
# shuffling repeat across runs (SEED was otherwise only applied to the data split).
pl.seed_everything(SEED, workers=True)

ppigcnn = LightningGCNN()

# ModelCheckpoint defaults to mode='min', which would retain the checkpoint with the
# LOWEST 'val_acc'. Accuracy is higher-is-better, so request 'max' explicitly.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join('checkpoint', 'pan-human-data', 'ppi_gnn'),
    save_top_k=1,
    monitor='val_acc',
    mode='max',
)
tb_logger = pl_loggers.TensorBoardLogger(
    save_dir=os.path.join('logs', 'pan-human-data'),
    name='ppi_gnn',
)
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    logger=tb_logger,
    callbacks=[checkpoint_callback],
    log_every_n_steps=10,
    check_val_every_n_epoch=1,
)

# NOTE(review): the test loader is used as the validation set here, so checkpoint
# selection is driven by test data; consider a separate validation split.
trainer.fit(ppigcnn, train_dataloaders=train_loader, val_dataloaders=test_loader)



GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

   | Name       | Type           | Params
-----------------------------------------------
0  | pro1_conv1 | GCNConv        | 1.0 M 
1  | pro1_fc1   | Linear         | 131 K 
2  | pro2_conv1 | GCNConv        | 1.0 M 
3  | pro2_fc1   | Linear         | 131 K 
4  | relu       | LeakyReLU      | 0     
5  | dropout    | Dropout        | 0     
6  | sigmoid    | Sigmoid        | 0     
7  | fc1        | Linear         | 65.8 K
8  | fc2        | Linear         | 16.4 K
9  | out        | Linear         | 65    
10 | loss_fn    | MSELoss        | 0     
11 | accuracy   | BinaryAccuracy | 0     
-----------------------------------------------
2.4 M     Trainable params
0         Non-trainable params
2.4 M     Total params
9.776     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


KeyError: 'sequence_A'