In [10]:
import pickle
import os
import numpy as np
import scipy.sparse as sp
from collections import Counter, defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [3]:
prefix = 'iYO844'

path = '../data/{}'.format(prefix)
path = path

In [6]:
def load_nodes():
    """
    return nodes dict
        total: total number of nodes
        count: a dict of int, number of nodes for each type
        attr: a dict of np.array (or None), attribute matrices for each type of nodes
        shift: node_id shift for each type. You can get the id range of a type by 
                    [ shift[node_type], shift[node_type]+count[node_type] )
    """
    nodes = {'total': 0, 'count': Counter(), 'attr': {}, 'shift': {}}
    
    with open(os.path.join(path, 'node.dat'), 'r', encoding='utf-8') as f:
        for line in f:
            th = line.split('\t')
            if len(th) == 4:
                # Then this line of node has attribute
                node_id, node_name, node_type, node_attr = th
                node_id = int(node_id)
                node_type = int(node_type)
                node_attr = list(map(float, node_attr.split(',')))
                nodes['count'][node_type] += 1
                nodes['attr'][node_id] = node_attr
                nodes['total'] += 1
            elif len(th) == 3:
                # Then this line of node doesn't have attribute
                node_id, node_name, node_type = th
                node_id = int(node_id)
                node_type = int(node_type)
                nodes['count'][node_type] += 1
                nodes['total'] += 1
            else:
                raise Exception("Too few information to parse!")
    shift = 0
    attr = {}
    for i in range(len(nodes['count'])):
        nodes['shift'][i] = shift
        if shift in nodes['attr']:
            mat = []
            for j in range(shift, shift+nodes['count'][i]):
                mat.append(nodes['attr'][j])
            attr[i] = np.array(mat)
        else:
            attr[i] = None
        shift += nodes['count'][i]
    nodes['attr'] = attr
    return nodes

load_nodes()

{'total': 1000,
 'count': Counter({0: 384, 1: 616}),
 'attr': {0: array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.71546936, ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.7172692 , ..., 0.        , 0.        ,
          0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.7081847 , ..., 0.        , 0.        ,
          0.        ]]),
  1: array([[0.        , 0.        , 0.        , ..., 0.07333138, 0.01263912,
          0.01970579],
         [0.        , 0.        , 0.        , ..., 0.07504918, 0.00998594,
          0.03200828],
         [0.        , 0.        , 0.        , ..., 0.06466646, 0.00431759,
          0.01475397],
         ...,
         [0.        ,

In [12]:
data = pickle.load(open(os.path.join(path, 'node.pkl'), 'rb'))

In [16]:
enzyme_data = data[data['node_type_id'] == 0]
enzyme_data['node_feature']

0      {'logits': [[19.581642, -15.330734, -12.3731, ...
1      {'logits': [[37.44548, -5.164447, -2.6643798, ...
2      {'logits': [[29.771925, -7.539098, -5.8310566,...
3      {'logits': [[48.52335, 8.085651, 7.698195, 6.7...
4      {'logits': [[14.981688, -19.190962, -10.923376...
                             ...                        
379    {'logits': [[20.193998, -18.742842, -16.95475,...
380    {'logits': [[4.1331844, -13.998642, -9.914898,...
381    {'logits': [[17.642204, -13.6244755, -11.74872...
382    {'logits': [[13.764095, -18.00769, -11.966258,...
383    {'logits': [[38.041904, 0.94647855, 1.873882, ...
Name: node_feature, Length: 384, dtype: object

In [35]:
enzyme_data['logits'] = enzyme_data['node_feature'].apply(lambda x: x['logits'])
enzyme_data['single'] = enzyme_data['node_feature'].apply(lambda x: x['single'])
enzyme_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enzyme_data['logits'] = enzyme_data['node_feature'].apply(lambda x: x['logits'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enzyme_data['single'] = enzyme_data['node_feature'].apply(lambda x: x['single'])


Unnamed: 0,node_id,node_name,node_type_id,node_feature,logits,single
0,0,BSU00090,0,"{'logits': [[19.581642, -15.330734, -12.3731, ...","[[19.581642, -15.330734, -12.3731, -11.610533,...","[-2.6357093, 11.089254, 0.9175112, -2.2136374,..."
1,1,BSU00140,0,"{'logits': [[37.44548, -5.164447, -2.6643798, ...","[[37.44548, -5.164447, -2.6643798, -4.42679, -...","[14.018574, 11.753677, 13.034294, 11.752765, 1..."
2,2,BSU00150,0,"{'logits': [[29.771925, -7.539098, -5.8310566,...","[[29.771925, -7.539098, -5.8310566, -7.5676093...","[15.961157, 12.483061, 6.694125, 2.3762176, -1..."
3,3,BSU00180,0,"{'logits': [[48.52335, 8.085651, 7.698195, 6.7...","[[48.52335, 8.085651, 7.698195, 6.7147527, 5.3...","[10.051447, 2.882035, 6.5983353, -8.677919, 2...."
4,4,BSU00270,0,"{'logits': [[14.981688, -19.190962, -10.923376...","[[14.981688, -19.190962, -10.923376, -14.69154...","[12.062081, 6.2870865, -1.5195656, -1.3795595,..."
...,...,...,...,...,...,...
379,379,BSU40190,0,"{'logits': [[20.193998, -18.742842, -16.95475,...","[[20.193998, -18.742842, -16.95475, -19.164597...","[11.878231, 10.247673, 11.635625, 10.802475, 1..."
380,380,BSU40320,0,"{'logits': [[4.1331844, -13.998642, -9.914898,...","[[4.1331844, -13.998642, -9.914898, -11.5434, ...","[-2.6511145, 1.0494355, -0.45718634, 4.1283865..."
381,381,BSU40340,0,"{'logits': [[17.642204, -13.6244755, -11.74872...","[[17.642204, -13.6244755, -11.748726, -12.7664...","[17.341208, 9.80417, 8.620768, 13.817736, 7.69..."
382,382,BSU40420,0,"{'logits': [[13.764095, -18.00769, -11.966258,...","[[13.764095, -18.00769, -11.966258, -13.81245,...","[2.231642, 7.7314253, 8.377593, 4.8179655, 8.0..."


In [42]:
e_attr = enzyme_data['node_feature']
e_attr_logits = np.stack(enzyme_data['logits'])

In [32]:
mol_data = data[data['node_type_id'] == 1]
meta_attr = np.stack(mol_data['node_feature'])
meta_attr

array([[-2.44304930e+02,  4.38309270e-02,  7.58186800e-02, ...,
         7.33313800e-02,  1.26391180e-02,  1.97057910e-02],
       [-2.91624900e+02,  5.69625500e-02,  8.98336540e-02, ...,
         7.50491840e-02,  9.98593800e-03,  3.20082830e-02],
       [-2.06304800e+02,  4.74621170e-02,  7.07104950e-02, ...,
         6.46664600e-02,  4.31759100e-03,  1.47539680e-02],
       ...,
       [-1.01981970e+02,  5.90845720e-02,  1.00022756e-01, ...,
         8.50683900e-02,  3.57312900e-03,  3.69745270e-02],
       [-1.57029740e+02,  4.24194800e-02,  5.50481830e-02, ...,
         6.43701100e-02,  1.19667780e-02,  2.39740460e-02],
       [-1.67703100e+02,  6.42560000e-02,  8.51914000e-02, ...,
         7.71485050e-02,  1.29328640e-02,  2.92799660e-02]])

In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(meta_attr)
print(scaler.transform(meta_attr))

[[-0.5687702  -0.1305593   0.09840174 ...  0.23551064  0.99755411
  -0.51713933]
 [-0.77894811  0.65220058  0.63384409 ...  0.307462    0.49826178
   0.78800047]
 [-0.3999876   0.08589147 -0.09675657 ... -0.12742531 -0.56844392
  -1.04246552]
 ...
 [ 0.06337606  0.7786917   1.02311892 ...  0.72712312 -0.70854153
   1.31485655]
 [-0.1811259  -0.21469392 -0.6951355  ... -0.13983813  0.87102888
  -0.06433111]
 [-0.22853304  1.08695415  0.45648668 ...  0.39539346  1.05283311
   0.49856033]]


In [19]:
data

Unnamed: 0,node_id,node_name,node_type_id,node_feature
0,0,BSU00090,0,"{'logits': [[19.581642, -15.330734, -12.3731, ..."
1,1,BSU00140,0,"{'logits': [[37.44548, -5.164447, -2.6643798, ..."
2,2,BSU00150,0,"{'logits': [[29.771925, -7.539098, -5.8310566,..."
3,3,BSU00180,0,"{'logits': [[48.52335, 8.085651, 7.698195, 6.7..."
4,4,BSU00270,0,"{'logits': [[14.981688, -19.190962, -10.923376..."
...,...,...,...,...
611,995,msa_c,1,"[-81.3787, 0.026036164, 0.046005037, -0.039838..."
612,996,2pcpgc_c,1,"[-162.07196, 0.058406197, 0.10162693, -0.07970..."
613,997,quc_c,1,"[-101.98197, 0.059084572, 0.100022756, -0.0800..."
614,998,glcn__D_c,1,"[-157.02974, 0.04241948, 0.055048183, -0.05639..."


In [23]:
Counter(data['node_type_id'])
    

Counter({0: 384, 1: 616})

In [44]:
def load_nodes():
    """
    return nodes dict
        total: total number of nodes
        count: a dict of int, number of nodes for each type
        attr: a dict of np.array (or None), attribute matrices for each type of nodes
        shift: node_id shift for each type. You can get the id range of a type by 
                    [ shift[node_type], shift[node_type]+count[node_type] )
    """
    nodes = {'total': 0, 'count': Counter(), 'attr': {}, 'shift': {}}
    
    # load node.pkl
    data = pickle.load(open(os.path.join(path, 'node.pkl'), 'rb'))
    nodes['total'] = data.shape[0]
    nodes['count'] = Counter(data['node_type_id'])
    
    e_data = data[data['node_type_id'] == 0]
    m_data = data[data['node_type_id'] == 1]
    
    # attr of enzyme
    # e_data['single'] = e_data['node_feature'].apply(lambda x: x['single'])
    e_attr_logits = np.stack(e_data['node_feature'].apply(lambda x: x['logits']))
    
    # attr of molecule
    m_attr = np.stack(m_data['node_feature'])
    scaler = StandardScaler()
    scaler.fit(m_attr)
    m_attr = scaler.transform(m_attr)
    
    nodes['attr'] = {0:e_attr_logits, 1:m_attr}
    nodes['shift'] = {0:0, 1:len(e_data)}

    return nodes

nodes = load_nodes()
nodes

{'total': 1000,
 'count': Counter({0: 384, 1: 616}),
 'attr': {0: array([[[ 19.581642  , -15.330734  , -12.3731    , ...,   0.        ,
             0.        ,   0.        ],
          [-15.330734  ,  41.468864  , -14.037407  , ...,   0.        ,
             0.        ,   0.        ],
          [-12.3731    , -14.037407  ,  34.07164   , ...,   0.        ,
             0.        ,   0.        ],
          ...,
          [  0.        ,   0.        ,   0.        , ...,   0.        ,
             0.        ,   0.        ],
          [  0.        ,   0.        ,   0.        , ...,   0.        ,
             0.        ,   0.        ],
          [  0.        ,   0.        ,   0.        , ...,   0.        ,
             0.        ,   0.        ]],
  
         [[ 37.44548   ,  -5.164447  ,  -2.6643798 , ...,   0.        ,
             0.        ,   0.        ],
          [ -5.164447  ,  50.874325  ,  -5.5486393 , ...,   0.        ,
             0.        ,   0.        ],
          [ -2.664379

In [69]:
def load_labels(name):
    """
    return labels dict
        num_labels: total number of labels
        total: total number of labeled data
        count: number of labeled data for each node type
        data: a numpy matrix with shape (self.nodes['total'], self.labels['num_labels'])
        mask: to indicate if that node is labeled, if False, that line of data is masked
    """
    labels = {'num_labels': 0, 'total': 0,
                'count': Counter(), 'data': None, 'mask': None}
    nl = 2
    mask = np.zeros(nodes['total'], dtype=bool)
    data = [[0.0, 0.0] for i in range(nodes['total'])]
    with open(os.path.join(path, name), 'r', encoding='utf-8') as f:
        for line in f:
            th = line.split('\t')
            node_id, node_type, node_label = int(th[0]), int(
                th[1]), list(map(float, th[2].split(',')))
            # print(node_label)
            mask[node_id] = True
            data[node_id] = node_label
            labels['count'][node_type] += 1
            labels['total'] += 1
    labels['num_labels'] = nl
    # print(data)
    labels['data'] = np.array(data)
    labels['mask'] = mask
    return labels

In [70]:
data_t = load_labels('label.dat')
data_t

{'num_labels': 2,
 'total': 54,
 'count': Counter({0: 54}),
 'data': array([[0., 0.],
        [0., 0.],
        [0., 0.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]]),
 'mask': array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False,  True, False,  True, False, False,
        False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False,  True, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False,  True,
        False,  True, False,  True, False, False, False, False, False,
        False, False

In [74]:
data_t['data']

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [68]:
t = np.array(data_t)
t[31]

array([30.,  0.])