`conda activate CASM_TORCH2`

# Install torch if necessary. See https://pytorch.org/get-started/locally/
# pip install torch==1.11.0

# Install torch geometric if necessary. See: https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
# pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cpu.html


In [4]:
import sys
sys.path.append('../')

import logging
logging.getLogger('matplotlib').setLevel(logging.CRITICAL)
logging.getLogger('graphein').setLevel(logging.INFO)

In [30]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import pytorch_lightning as pl
from tqdm.notebook import tqdm
import networkx as nx
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score

import math
import glob

import warnings
warnings.filterwarnings("ignore")

In [18]:
from CASM.kin_sub_pairs import load_kin_sub_list
df = load_kin_sub_list(
    "../datasets/Kinase_Substrate_Dataset"
)
#print(df)

# Filter for only ``str`` rows
def filter_fn(row):
    if type(row['KIN_ACC_ID']) == str: 
        return True
    else: return False
    
df.apply(filter_fn, axis=1)

#y = torch.randn([5, 10])



tensor([[-0.1810, -0.1420,  0.4877, -0.8634,  0.2543,  0.3596, -0.4206, -1.6959,
          0.5501,  0.2975],
        [-0.1548, -1.1922, -0.4439,  0.0369, -0.3331,  0.2836,  0.7073,  0.7566,
          0.2760,  0.8032],
        [-0.0613, -0.9581,  0.1742,  0.2459, -0.9061,  0.9821,  0.9204, -0.7289,
         -0.2647, -0.2588],
        [ 1.0710, -0.3850,  1.0271,  0.7643,  1.6322,  0.5264,  0.1415, -0.3935,
         -0.2695,  0.9044],
        [-0.6352,  2.1997,  1.0185,  0.4732,  0.0593, -2.2480,  1.1534,  1.0344,
         -0.7669, -2.0264]])


In [53]:
npy_file = "../../DATA/PPI/npy_file_human_dataset.npy"
processed_dir = "../../DATA/processed"
npy_ar = np.load(npy_file)

print(npy_ar.shape)

from torch.utils.data import Dataset as Dataset_n
from torch_geometric.data import DataLoader as DataLoader_n

class LabelledDataset(Dataset_n):
    def __init__(self, npy_file, processed_dir):
      self.npy_ar = np.load(npy_file)
      self.processed_dir = processed_dir
      self.protein_1 = self.npy_ar[:,2]
      self.protein_2 = self.npy_ar[:,5]
      self.label = self.npy_ar[:,6].astype(float)
      self.n_samples = self.npy_ar.shape[0]

      print(f"P1: {self.protein_1[0:10]}")

    def __len__(self):
      return(self.n_samples)

    def __getitem__(self, index):
      prot_1 = os.path.join(self.processed_dir, self.protein_1[index]+".pt")
      prot_2 = os.path.join(self.processed_dir, self.protein_2[index]+".pt")
      #print(f'Second prot is {prot_2}')
      prot_1 = torch.load(glob.glob(prot_1)[0])
      #print(f'Here lies {glob.glob(prot_2)}')
      prot_2 = torch.load(glob.glob(prot_2)[0])
      return prot_1, prot_2, torch.tensor(self.label[index])


dataset = LabelledDataset(npy_file = npy_file ,processed_dir= processed_dir)

print(dataset[0])

final_pairs =  np.load(npy_file)
size = final_pairs.shape[0]

seed = 42
torch.manual_seed(seed)

trainset, testset = torch.utils.data.random_split(dataset, [math.floor(0.8 * size), size - math.floor(0.8 * size) ])


# Dataloaders
trainloader = DataLoader_n(dataset= trainset, batch_size= 4, num_workers = 0)
testloader = DataLoader_n(dataset= testset, batch_size= 4, num_workers = 0)
print("Length")
print(len(trainloader))
print(len(testloader))





(22217, 7)
P1: ['1FLK' '2JVX' '3ALN' '1J0W' '5FQD' '4EGC' '2CYX' '1CFH' '3UXG' '1P9C']


IndexError: list index out of range

In [54]:

import torch
from graphein.ml import ProteinGraphDataset
import graphein.protein as gp

To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


In [None]:
"""
Kinase dataset
"""

npy_file = "../CASM/alphafold_structures" # TODO:SAVED ARRAY OF INTERACTING PROTEIN PAIRS
processed_dir = "../../"
npy_ar = np.load(npy_file)


print(npy_ar.shape)

from torch.utils.data import Dataset 
from torch_geometric.data import DataLoader 

class KinaseInteractionDataset(Dataset):
    def __init__(self, npy_file, processed_dir):

      # TODO
      self.npy_ar = np.load(npy_file)
      self.processed_dir = processed_dir
      self.kinase = self.npy_ar[:,2]
      self.substrate = self.npy_ar[:,5]

      self.mod_rsd = self.npy_ar[:,6]

      self.label = self.npy_ar[:,6].astype(float)
      self.n_samples = self.npy_ar.shape[0]

      print(f"kinase: {self.kinase[0:10]}")

    def __len__(self):
      return(self.n_samples)

    def __getitem__(self, index):
      

      kinase = os.path.join(self.processed_dir, self.kinase[index]+".pt")
      site = os.path.join(self.processed_dir, self.substrate[index] + "-" + self.mod_rsd[index] + ".pt") # Filenames are {acc_id}-{mod_rsd}.pt


      kinase = torch.load(glob.glob(kinase)[0])

      psite = torch.load(glob.glob(psite)[0])

      return kinase, psite, mod_rsd, torch.tensor(self.label[index])

      # TODO: alternative to keeping labels stored:  in __getitem__ , we lookup from another source whether or not 
      # the two things interact (i.e. which label); since this will change based on how we create negative examples etc. 
      # so we should abstract away into a function that returns what the label is
      # this means we don't regenerate the .npy array everytime we modify our dataset

dataset = KinaseInteractionDataset(npy_file=npy_file ,processed_dir=processed_dir)

print(dataset[0])

final_pairs =  np.load(npy_file)
size = final_pairs.shape[0]

seed = 42
torch.manual_seed(seed)

trainset, testset = torch.utils.data.random_split(dataset, [math.floor(0.8 * size), size - math.floor(0.8 * size) ])


# Dataloaders
trainloader = DataLoader(dataset=trainset, batch_size=4, num_workers = 0)
testloader = DataLoader(dataset=testset, batch_size=4, num_workers = 0)
print("Length")
print(len(trainloader))
print(len(testloader))


In [52]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from tqdm import tqdm
import pathlib
import math
import sklearn
#import torch_optimizer as optim
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau
#from metrics import *



for count, (p1, p2, label) in enumerate(trainloader):

    print(label)

IndexError: list index out of range