In [2]:
import pandas as pd
import numpy as np

# Load the training table from the CSV file under ../NOVO (path is relative to this notebook).
df= pd.read_csv("../NOVO/train.csv")


# Show column dtypes and non-null counts. In the recorded output, `pH` and
# `data_source` have fewer non-null entries than the 31390 rows, i.e. they contain
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31390 entries, 0 to 31389
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            31390 non-null  int64  
 1   protein_sequence  31390 non-null  object 
 2   pH                31104 non-null  float64
 3   data_source       28043 non-null  object 
 4   tm                31390 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [3]:
# Summary statistics for the numeric columns (seq_id, pH, tm).
# NOTE(review): the recorded output shows a pH maximum of 64.9, which is outside the
# 0-14 pH scale, and a tm minimum of -1.0 -- these look like bad rows (possibly
# pH and tm swapped for some entries); verify and clean before modeling.
df.describe()

Unnamed: 0,seq_id,pH,tm
count,31390.0,31104.0,31390.0
mean,15694.5,6.892339,49.147337
std,9061.656811,1.612225,14.010089
min,0.0,1.99,-1.0
25%,7847.25,7.0,42.1
50%,15694.5,7.0,48.0
75%,23541.75,7.0,53.8
max,31389.0,64.9,130.0


In [1]:
import numpy as np
from biopandas.pdb import PandasPdb
import torch
import dgl

In [2]:
def get_distance_matrix(coords):
    """Return the pairwise Euclidean distance matrix for a set of points.

    Args:
        coords: Array-like of points, one point per row. The caller in this
            notebook passes an (N, 3) array of residue coordinates.

    Returns:
        numpy.ndarray whose entry [i, j] is the Euclidean distance between
        point i and point j. For N input points the result is (N, N), it is
        symmetric, and its diagonal is zero.
    """
    points = np.asanyarray(coords)
    # Broadcasting points[i] against points[j] yields every pairwise offset at once.
    offsets = points[:, np.newaxis] - points[np.newaxis, :]
    squared_distances = np.square(offsets).sum(axis=-1)
    return np.sqrt(squared_distances)

def pdb_to_graph(pdb_path, distance_threshold=6.0, contain_b_factor=True):
    """Build a residue-level contact graph from the ATOM records of a PDB file.

    Each residue becomes one node, positioned at the mean of its atoms'
    coordinates. Two residues are connected when their mean positions are
    closer than ``distance_threshold``.

    Args:
        pdb_path: Path to the PDB file to read.
        distance_threshold: Cut-off below which two residues are connected, in
            the same units as the PDB coordinates (angstroms for standard PDB
            files).
        contain_b_factor: If True, store each residue's mean atom B-factor as
            the node feature ``graph.ndata['b_factor']``.

    Returns:
        A ``dgl`` graph with one node per residue. Nodes are ordered by
        (chain_id, residue_number, insertion code). Because the distance matrix
        is symmetric, every contact is present as an edge in both directions;
        because a residue is at distance 0 from itself, every node also has a
        self-loop whenever ``distance_threshold`` is positive.
    """
    atom_df = PandasPdb().read_pdb(pdb_path).df['ATOM']

    # A residue is only unique per chain + residue number + insertion code.
    # Grouping on residue_number alone would average together unrelated residues
    # that share a number in multi-chain structures (or with insertion codes).
    # For a single-chain file without insertion codes this yields the same rows,
    # in the same order, as grouping by residue_number.
    residue_keys = ['chain_id', 'residue_number', 'insertion']
    feature_columns = ['x_coord', 'y_coord', 'z_coord', 'b_factor']
    # groupby sorts by the keys, which fixes the node ordering.
    residue_df = atom_df.groupby(residue_keys, as_index=False)[feature_columns].mean()

    coords = residue_df[['x_coord', 'y_coord', 'z_coord']].values
    distance_matrix = get_distance_matrix(coords)

    # The diagonal (distance 0) passes the threshold, so self-loops are included.
    adj = distance_matrix < distance_threshold
    u, v = np.nonzero(adj)
    u, v = torch.from_numpy(u), torch.from_numpy(v)

    # num_nodes is passed explicitly so the node count always matches the
    # residue table, independent of which node ids appear in the edge list.
    graph = dgl.graph((u, v), num_nodes=len(coords))
    if contain_b_factor:
        b_factor = torch.from_numpy(residue_df['b_factor'].values)
        graph.ndata['b_factor'] = b_factor
    return graph

In [3]:
# Build the residue contact graph for the structure file, using the function's
# default distance threshold (6.0) and keeping the per-residue b_factor feature.
graph = pdb_to_graph('../NOVO/wildtype_structure_prediction_af2.pdb')
# Bare last expression so the notebook displays the graph summary
# (node count, edge count, and feature schemes).
graph

Graph(num_nodes=221, num_edges=1175,
      ndata_schemes={'b_factor': Scheme(shape=(), dtype=torch.float64)}
      edata_schemes={})

In [4]:
# Round-trip the graph through DGL's on-disk format: write it to 'dgl_graph.bin'
# in the current working directory as a one-element list of graphs.
dgl.save_graphs('dgl_graph.bin', [graph])

# load_graphs returns the stored graphs together with a label dictionary; no
# labels were passed to save_graphs above, so only the graph list is used.
graph_list, label_dict = dgl.load_graphs('dgl_graph.bin')
# From here on `graph` refers to the copy loaded back from disk.
graph = graph_list[0]

In [5]:
import networkx as nx
import matplotlib.pyplot as plt

In [6]:
# Convert the DGL graph to a NetworkX graph so it can be drawn with matplotlib.
nx_graph = dgl.to_networkx(graph)
plt.figure(figsize=(4, 3), dpi=200)
# Node positions come from the Kamada-Kawai layout. arrows=False draws edges as
# plain lines: the contact graph built by pdb_to_graph contains each contact in
# both directions, so arrowheads would add no information.
nx.draw(nx_graph, pos=nx.kamada_kawai_layout(nx_graph), node_size=50, arrows=False)
plt.show()

: 

: 

In [None]:
# Same drawing as the previous cell, but with nodes colored by their b_factor
# feature. node_attrs copies 'b_factor' from graph.ndata onto the NetworkX nodes.
nx_graph = dgl.to_networkx(graph, node_attrs=['b_factor'])
plt.figure(figsize=(4, 3), dpi=200)
# The color array is built by iterating nx_graph in its node order, which is the
# same order nx.draw uses for its default node list.
# NOTE(review): the per-node attribute values presumably come back as 0-d torch
# tensors that np.array converts to floats; this cell has no recorded execution
# (In [None]), so confirm it runs as written.
nx.draw(nx_graph, pos=nx.kamada_kawai_layout(nx_graph), node_size=50, arrows=False,
        node_color=np.array([nx_graph.nodes[v]['b_factor'] for v in nx_graph]))
plt.show()