In [1]:
import os
import pickle

import pandas as pd
import numpy as np
import torch
import networkx as nx

from biopandas.pdb import PandasPdb
from torch_geometric.data import Data
from torch_geometric.utils import convert
from rdkit import Chem
from rdkit.Chem import AllChem
from torchdrug import data, utils
from graphein.protein.graphs import construct_graph
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_protein_graph(i):
    """Build an atom-level PyTorch Geometric graph from a single PDB file.

    The file is read twice: with biopandas (per-atom table of the ``ATOM``
    records) and with RDKit (chemistry: charges, hybridization, bonds).

    Parameters
    ----------
    i : str
        Path to the PDB file.

    Returns
    -------
    torch_geometric.data.Data
        * ``x``         -- float32 ``[num_atoms, 5]``; columns are formal charge,
          hybridization code, residue_number, occupancy, b_factor.
        * ``edge_index`` -- int64 ``[2, num_bonds]``; one column per RDKit bond
          (begin atom index, end atom index).
        * ``edge_attr`` -- int64 ``[num_bonds, 2]``; bond type code, stereo code.
        * ``pos``       -- float32 ``[num_atoms, 3]``; x/y/z coordinates.
        * ``atom_type`` -- float32 ``[num_atoms]``; atomic numbers.

    Raises
    ------
    ValueError
        If RDKit cannot build a molecule from the file, or if RDKit and
        biopandas disagree on the number of atoms.
    """
    atom_df = PandasPdb().read_pdb(i).df['ATOM']
    node_feature = atom_df[['residue_number', 'occupancy', 'b_factor']]
    position = atom_df[['x_coord', 'y_coord', 'z_coord']]

    mol = Chem.MolFromPDBFile(i)
    if mol is None:
        # RDKit logs the reason (e.g. "Explicit valence ... greater than
        # permitted") and returns None instead of raising; fail explicitly
        # rather than with an AttributeError on the next line.
        raise ValueError('RDKit could not build a molecule from {}'.format(i))

    atoms = list(mol.GetAtoms())
    if len(atoms) != len(atom_df):
        # The per-atom rows of both readers are stacked side by side below, so
        # they must describe the same atoms.
        # NOTE(review): presumably holds only when the file contains heavy-atom
        # ATOM records exclusively (no hydrogens / HETATM) -- confirm.
        raise ValueError(
            'Atom count mismatch for {}: RDKit has {}, ATOM table has {}'.format(
                i, len(atoms), len(atom_df)
            )
        )

    # RDKit enums are converted to plain ints explicitly so the feature matrix
    # is purely numeric; reshape keeps the array 2-D even when there are no atoms.
    rdkit_features = np.array(
        [(atom.GetFormalCharge(), int(atom.GetHybridization())) for atom in atoms],
        dtype=float,
    ).reshape(-1, 2)
    # Stack positionally (no index alignment). Values are kept as floats so
    # occupancy and b_factor are not truncated to integers.
    nodes = np.hstack([rdkit_features, node_feature.to_numpy(dtype=float)])

    atom_type = np.array([atom.GetAtomicNum() for atom in atoms], dtype=float)

    # One row per bond: begin index, end index, bond type code, stereo code.
    # reshape(-1, 4) yields a well-formed (0, 4) array for a bond-less molecule.
    bonds = np.array(
        [
            (
                bond.GetBeginAtomIdx(),
                bond.GetEndAtomIdx(),
                int(bond.GetBondType()),
                int(bond.GetStereo()),
            )
            for bond in mol.GetBonds()
        ],
        dtype=np.int64,
    ).reshape(-1, 4)

    edges = bonds[:, :2]
    edges_attr = bonds[:, 2:]

    node_features = torch.tensor(nodes, dtype=torch.float)
    # Kept as float32 like the original implementation; cast with .long()
    # before using it as an embedding index.
    atom_type = torch.tensor(atom_type, dtype=torch.float)
    # NOTE: each bond is stored once (begin -> end); the reverse direction is
    # not added here.
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    edges_attr = torch.tensor(edges_attr, dtype=torch.long)
    # Coordinates are real-valued; an integer dtype would truncate them.
    position = torch.tensor(position.to_numpy(dtype=float), dtype=torch.float)

    # Named `graph` (not `data`) to avoid shadowing the imported torchdrug module.
    graph = Data(
        x=node_features,
        edge_index=edge_index,
        edge_attr=edges_attr,
        pos=position,
        atom_type=atom_type,
    )

    return graph

In [3]:
# List the entries of pdb/ in pure Python instead of shelling out to `ls`:
# portable, and a missing directory raises instead of putting the text of the
# `ls` error message into the list. Dot-files are skipped, as `ls` does.
proteins = sorted(name for name in os.listdir('pdb/') if not name.startswith('.'))

In [4]:
# Number of entries listed from pdb/.
len(proteins)

9624

In [5]:
# Convert every listed file into a graph. Results are keyed by the file name
# up to its first '.'; names of files that failed are collected in `errors`.
protein_graphs = {}
errors = []
for pdb_file in proteins:
    try:
        protein_graphs[pdb_file.split('.')[0]] = get_protein_graph('pdb/' + pdb_file)
    except Exception as exc:
        # Best-effort batch: skip the file but say which one failed and why.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop.
        print('###', pdb_file, repr(exc))
        errors.append(pdb_file)

[14:11:05] Explicit valence for atom # 17910 O, 3, is greater than permitted


###


[14:11:24] Explicit valence for atom # 1081 O, 3, is greater than permitted


###


[14:11:25] Explicit valence for atom # 7102 O, 3, is greater than permitted


###


[14:11:33] Explicit valence for atom # 1086 O, 3, is greater than permitted


###


[14:12:04] Explicit valence for atom # 2315 O, 3, is greater than permitted


###


[14:12:29] Explicit valence for atom # 1995 O, 3, is greater than permitted


###


[14:13:10] Explicit valence for atom # 5116 O, 3, is greater than permitted


###


[14:13:12] Explicit valence for atom # 1894 C, 6, is greater than permitted


###


[14:13:35] Explicit valence for atom # 563 O, 3, is greater than permitted


###


[14:13:38] Explicit valence for atom # 2569 O, 3, is greater than permitted


###


[14:15:45] Explicit valence for atom # 1075 O, 3, is greater than permitted


###


[14:15:53] Explicit valence for atom # 2956 O, 3, is greater than permitted


###


[14:16:45] Explicit valence for atom # 3146 O, 3, is greater than permitted


###


[14:17:04] Explicit valence for atom # 1157 O, 3, is greater than permitted


###


[14:17:09] Explicit valence for atom # 1001 O, 3, is greater than permitted


###


[14:17:18] Explicit valence for atom # 10837 O, 3, is greater than permitted


###


[14:17:53] Explicit valence for atom # 237 C, 6, is greater than permitted


###


[14:18:13] Explicit valence for atom # 5454 O, 3, is greater than permitted


###


[14:18:17] Explicit valence for atom # 477 O, 3, is greater than permitted


###


[14:18:22] Explicit valence for atom # 11378 O, 3, is greater than permitted


###


[14:18:32] Explicit valence for atom # 13417 O, 3, is greater than permitted


###


[14:19:01] Explicit valence for atom # 1891 O, 3, is greater than permitted


###


[14:19:36] Explicit valence for atom # 2705 O, 3, is greater than permitted


###


[14:19:39] Explicit valence for atom # 10334 O, 3, is greater than permitted


###


[14:19:53] Explicit valence for atom # 8959 O, 3, is greater than permitted


###


[14:20:51] Explicit valence for atom # 13252 O, 3, is greater than permitted


###


[14:21:07] Explicit valence for atom # 4617 O, 3, is greater than permitted


###


[14:22:28] Explicit valence for atom # 990 O, 3, is greater than permitted


###


[14:22:41] Explicit valence for atom # 4581 O, 3, is greater than permitted


###


[14:22:42] Explicit valence for atom # 13830 O, 3, is greater than permitted


###


[14:22:54] Explicit valence for atom # 2940 C, 5, is greater than permitted


###


[14:23:06] Explicit valence for atom # 3994 O, 3, is greater than permitted
[14:23:06] Explicit valence for atom # 3849 O, 3, is greater than permitted


###
###


[14:23:07] Explicit valence for atom # 9828 O, 3, is greater than permitted


###


[14:23:36] Explicit valence for atom # 15230 O, 3, is greater than permitted


###


[14:23:49] Explicit valence for atom # 244 O, 3, is greater than permitted


###


[14:23:53] Explicit valence for atom # 4825 O, 3, is greater than permitted


###


[14:24:25] Explicit valence for atom # 4981 O, 3, is greater than permitted


###


[14:24:46] Explicit valence for atom # 946 O, 3, is greater than permitted


###


[14:24:54] Explicit valence for atom # 4665 O, 3, is greater than permitted


###


[14:25:48] Explicit valence for atom # 6846 O, 3, is greater than permitted


###


[14:25:55] Explicit valence for atom # 534 O, 3, is greater than permitted


###


[14:26:44] Explicit valence for atom # 8737 O, 3, is greater than permitted


###


In [6]:
# Number of files whose conversion raised and was therefore skipped.
len(errors)

43

In [10]:
import numpy as np
# !mkdir protein_graphs

In [11]:
# Persist each graph as protein_graphs/<key>.pt.
# Create the output directory first (idempotent) so a fresh run does not fail
# on the first torch.save.
os.makedirs('protein_graphs', exist_ok=True)
for name, graph in protein_graphs.items():
    torch.save(graph, 'protein_graphs/{}.pt'.format(name))

In [15]:
# Display the graph stored under the key 'A0A024RBG1'.
protein_graphs['A0A024RBG1']

Data(x=[1437, 5], edge_index=[2, 1467], edge_attr=[1467, 2], pos=[1437, 3], atom_type=[1437])