In [None]:
import numpy as np
import glob
import pickle
from scipy import sparse

In [None]:
%%time
# Scan every trajectory pickle once to collect the ligand names, labels and
# the node/edge count extremes needed to size the padded arrays below.
# NOTE: the absolute path is machine-specific; edit DATA_GLOB to point at the local dataset.
DATA_GLOB = '/Users/sjiang87/data_weight/gnnatv/data/amide/*.pickle'

# glob.glob returns matches in arbitrary order; sort so that files[0] and the
# order of the collected lists are reproducible across runs and machines.
files = sorted(glob.glob(DATA_GLOB))
if not files:
    # Fail here with a clear message instead of an opaque "max() arg is an empty sequence" below.
    raise FileNotFoundError(f'No pickle files matched {DATA_GLOB}')

ligands, HFEs, MinNodes, MaxNodes, MinEdges, MaxEdges = [], [], [], [], [], []

for file in files:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as f:
        data = pickle.load(f)
    ligands.append(data['ligand'])
    HFEs.append(data['label'][0])

    # Row count of every per-frame position array, and the sum of every
    # per-frame adjacency matrix, each computed once and reused for min and max.
    node_counts = [pos.shape[0] for pos in data['pos']]
    edge_sums = [frame_adj.sum() for frame_adj in data['adj']]
    MinNodes.append(min(node_counts))
    MaxNodes.append(max(node_counts))
    MinEdges.append(min(edge_sums))
    MaxEdges.append(max(edge_sums))

MaxEdge = int(max(MaxEdges))
# +2 leaves room for two extra rows, presumably the two virtual nodes that
# add_node_features appends to each frame (TODO confirm).
MaxNode = int(max(MaxNodes) + 2)
print('Max node:', MaxNode)
print('Max edge:', MaxEdge)

In [8]:
def add_node_features(x, adj):
    """Append two virtual nodes and four extra feature columns to a node matrix.

    Two rows are stacked under ``x``:
      * a "bulk SAM site" row: the column-wise mean of ``x`` with column 2
        replaced by the minimum of column 2;
      * a "bulk water" row: the column-wise mean of ``x``.

    Four columns are then appended to every row: the column sums of ``adj``,
    and three 0/1 indicator columns marking original rows, the bulk SAM site
    row and the bulk water row respectively.

    Args:
        x: 2-D array of per-node features with at least 3 columns.
        adj: array whose ``sum(axis=0)`` has one entry per output row, i.e.
            ``x.shape[0] + 2`` entries (otherwise ``np.hstack`` raises).

    Returns:
        Array with ``x.shape[0] + 2`` rows and ``x.shape[1] + 4`` columns.
    """
    bulk_water = x.mean(axis=0)
    bulk_sam = np.copy(bulk_water)
    bulk_sam[2] = x[..., 2].min()
    nodes = np.vstack((x, bulk_sam, bulk_water))

    # One indicator column per node type; the last two rows are the virtual nodes.
    n_rows = nodes.shape[0]
    n_real = n_rows - 2
    flags = np.zeros((n_rows, 3), dtype=int)
    flags[:n_real, 0] = 1      # original nodes
    flags[n_real, 1] = 1       # bulk SAM site row
    flags[n_real + 1, 2] = 1   # bulk water row

    col_sums = adj.sum(axis=0)[..., np.newaxis]
    return np.hstack((nodes, col_sums, flags))

In [11]:
# Load the first trajectory file so its structure can be inspected interactively.
file = files[0]
with open(file, 'rb') as f:
    data = pickle.load(f)

# Displayed as the cell output: the top-level keys of one trajectory dict.
data.keys()

dict_keys(['label', 'adj', 'ligand', 'pos'])

In [15]:
node_feature_previous, adj_previous = data['pos'], data['adj']
n_time = len(node_feature_previous)  # number of frames (8000 for this file)

# Zero-padded node-feature tensor: one (MaxNode, 7) matrix per frame, where the
# 7 columns are whatever add_node_features returns for that frame.
node_feature = np.zeros((n_time, MaxNode, 7))
for t in range(n_time):
    frame_features = add_node_features(node_feature_previous[t], adj_previous[t])
    # Frames have different node counts; rows past the frame's own count stay zero.
    node_feature[t, :len(frame_features), :] = frame_features

node_feature.shape

# There are 8000 time samples; each sample is the node matrix of one water-environment snapshot, zero-padded to MaxNode = 58 rows (roughly 40-58 rows are occupied per snapshot).
# Each row has 7 features: the 3 position columns (x, y, z), 1 column holding adj.sum(axis=0), and 3 indicator columns (original node, appended "bulk SAM site" node, appended "bulk water" node).

(8000, 58, 7)

In [16]:
# Zero-pad every frame's adjacency matrix to a common (MaxNode, MaxNode) square
# so all frames stack into one array. The last two axes are square because the
# matrix relates every node to every other node.
adj = np.zeros((n_time, MaxNode, MaxNode))
for t in range(n_time):
    frame_adj = adj_previous[t]
    k = len(frame_adj)
    # Copy into the top-left corner; the padding rows/columns stay zero.
    adj[t, :k, :k] = frame_adj

adj.shape

(8000, 58, 58)

In [20]:
# Temporal grouping of node features.
# Instead of one sample per frame, every `window` consecutive frames are merged
# into a single sample by stacking their padded node matrices vertically, giving
# (n_frames / window) samples of (MaxNode * window) rows each.
window = 40
n1, n2, n3 = node_feature.shape
node_temporal = np.zeros((int(n1 / window), int(n2 * window), n3))

for w in range(len(node_temporal)):
    # Frames w*window .. (w+1)*window-1, flattened frame after frame along the row axis.
    frames = node_feature[w * window : (w + 1) * window]
    node_temporal[w] = frames.reshape(int(n2 * window), n3)

node_temporal.shape



(200, 2320, 7)

In [22]:
# Temporal grouping of adjacency matrices.
# Every `window` consecutive frames are merged into one block-diagonal matrix:
# frame j of a window fills the j-th (MaxNode x MaxNode) diagonal block, and all
# off-diagonal blocks stay zero.
window = 40
n1, n2, n3 = adj.shape
side = int(n2 * window)
adj_temporal = np.zeros((int(n1 / window), side, side))

for w in range(len(adj_temporal)):
    for j in range(window):
        diag = slice(j * n2, (j + 1) * n2)
        adj_temporal[w, diag, diag] = adj[w * window + j]

adj_temporal.shape

(200, 2320, 2320)

In [29]:
# Wrap the temporal node-feature construction from the cells above in a function.
def organize_node(data, window=40):
    """Build windowed, zero-padded node features for one trajectory.

    Each frame's features come from ``add_node_features`` and are zero-padded
    to ``MaxNode`` rows. Every ``window`` consecutive frames are then stacked
    vertically into one sample; trailing frames that do not fill a whole
    window are dropped.

    Args:
        data: dict with per-frame sequences under ``'pos'`` and ``'adj'``.
        window: number of consecutive frames merged into one sample.

    Returns:
        Float array of shape ``(n_frames // window, MaxNode * window, 7)``.
    """
    positions = data['pos']
    adjacency = data['adj']
    n_frames = len(positions)

    # Per-frame features, padded to the notebook-level MaxNode row count.
    padded = np.zeros((n_frames, MaxNode, 7))
    for t in range(n_frames):
        feats = add_node_features(positions[t], adjacency[t])
        padded[t, :len(feats), :] = feats

    total, rows, width = padded.shape
    stacked = np.zeros((int(total / window), int(rows * window), width))
    for w in range(len(stacked)):
        # Flatten the window's frames, one after another, along the row axis.
        chunk = padded[w * window : (w + 1) * window]
        stacked[w] = chunk.reshape(int(rows * window), width)

    return stacked

In [37]:
# Wrap the temporal adjacency-matrix construction from the cells above in a function.
def organize_adj(data, window=40, max_node=None):
    """Build windowed, block-diagonal adjacency matrices for one trajectory.

    Each frame's adjacency matrix is zero-padded to ``(max_node, max_node)``.
    Every ``window`` consecutive frames are then placed on the diagonal of one
    ``(max_node * window, max_node * window)`` matrix; off-diagonal blocks stay
    zero. Trailing frames that do not fill a whole window are dropped.

    The frames are read from ``data['adj']``. Earlier versions read the
    notebook globals ``n_time`` and ``adj_previous`` instead, which silently
    reused the adjacency of whichever file was last explored interactively.

    Args:
        data: dict with a sequence of square per-frame adjacency matrices
            under ``'adj'``.
        window: number of consecutive frames merged into one sample.
        max_node: padded side length of a single frame. Defaults to the
            notebook-level ``MaxNode``.

    Returns:
        Float array of shape
        ``(n_frames // window, max_node * window, max_node * window)``.

    Raises:
        ValueError: if a frame is larger than ``max_node`` (raised by NumPy
            when the frame does not fit the padded slot).
    """
    frames = data['adj']
    n_frames = len(frames)
    if max_node is None:
        max_node = MaxNode  # padding size computed in the dataset-scan cell

    # Zero-pad each frame into the top-left corner of a common square.
    padded = np.zeros((n_frames, max_node, max_node))
    for t in range(n_frames):
        frame = frames[t]
        k = len(frame)
        padded[t, :k, :k] = frame

    n_windows = n_frames // window
    side = max_node * window
    adj_temporal = np.zeros((n_windows, side, side))
    for w in range(n_windows):
        for j in range(window):
            diag = slice(j * max_node, (j + 1) * max_node)
            adj_temporal[w, diag, diag] = padded[w * window + j]
    return adj_temporal

In [39]:
# Repeat the label once per temporal window so the labels line up with the windowed node and adjacency arrays.
def organize_label(data, window=40):
    """Repeat a trajectory's label once per temporal window.

    The number of windows is derived from the trajectory itself
    (``len(data['pos']) // window``) so the labels stay aligned with the
    windowed node/adjacency arrays for any trajectory length or window size.
    Previously the count was hard-coded as ``8000 / 40`` and ``window`` was
    ignored.

    Args:
        data: dict with the label under ``data['label'][0]`` and, normally,
            the per-frame sequence under ``'pos'``.
        window: number of consecutive frames merged into one sample.

    Returns:
        Array holding ``n_frames // window`` copies of ``data['label'][0]``.
    """
    # Fallback keeps the historical frame count for dicts that carry only a label.
    legacy_n_frames = 8000
    n_frames = len(data['pos']) if 'pos' in data else legacy_n_frames
    y = np.array([data['label'][0]] * (n_frames // window))
    return y

In [None]:
%%time
# Convert every file to its windowed (temporal) form and save the organized arrays.
# Convert each trajectory to windowed node features, adjacency and labels, then
# write the three arrays to "<ligand>.pickle" in the current working directory.
for file in files:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(file, 'rb') as f:
        data = pickle.load(f)
    node = organize_node(data, window=40)
    adj = organize_adj(data, window=40)
    y = organize_label(data, window=40)
    # The f-string uses double quotes outside and single quotes inside: reusing
    # the same quote character inside an f-string is a SyntaxError before Python 3.12.
    with open(f"{data['ligand']}.pickle", 'wb') as f:
        # Three consecutive dumps: read back with three pickle.load calls in the same order.
        pickle.dump(node, f)
        pickle.dump(adj, f)
        pickle.dump(y, f)