In [30]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
import torch.sparse as tsp

In [31]:
!ls ../data/processed

disease-drug-sparse-adj.npz  final_data_dict.pkl  ppi-sparse-adj.npz
disease-prot-sparse-adj.npz  graph_info.pkl
drug-protein-sparse-adj.npz  index_map


In [32]:
path = "../data/processed/"

In [33]:
 with open(path + 'graph_info.pkl', 'rb') as f:
        protein_num, drug_num, disease_num = pickle.load(f)

In [34]:
def remove_bidirection(edge_index, edge_type):
    mask = edge_index[0] > edge_index[1]
    keep_set = mask.nonzero().view(-1)

    if edge_type is None:
        return edge_index[:, keep_set]
    else:
        return edge_index[:, keep_set], edge_type[keep_set]


def to_bidirection(edge_index, edge_type=None):
    tmp = edge_index.clone()
    tmp[0, :], tmp[1, :] = edge_index[1, :], edge_index[0, :]
    if edge_type is None:
        return torch.cat([edge_index, tmp], dim=1)
    else:
        return torch.cat([edge_index, tmp], dim=1), torch.cat([edge_type, edge_type])
    
def sparse_id(n):
    idx = [[i for i in range(n)], [i for i in range(n)]]
    val = [1 for i in range(n)]
    i = torch.LongTensor(idx)
    v = torch.FloatTensor(val)
    shape = (n, n)

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

In [35]:
# ########################################
# protein-protein
# ########################################
pp_adj = sp.load_npz(path + "ppi-sparse-adj.npz")
pp_adj.shape

(18505, 18505)

In [36]:
# ########################################
# drug-protein
# ########################################
dp_adj = sp.load_npz(path + "drug-protein-sparse-adj.npz").tocsr()

dp_adj[np.where(dp_adj.toarray()>1)]=1
            
dp_adj.shape

(6158, 18505)

In [37]:
# ########################################
# disease-drug
# ########################################
dd_adj = sp.load_npz(path + "disease-drug-sparse-adj.npz").tocsr()

dd_adj[np.where(dd_adj.toarray()>1)]=1

dd_adj.shape

(1448, 6158)

In [38]:
# ########################################
# disease-protein
# ########################################
disp_adj = sp.load_npz(path + "disease-prot-sparse-adj.npz").tocsr()

disp_adj[np.where(disp_adj.toarray()>1)]=1

disp_adj.shape

(1448, 18505)

In [39]:
np.sum(disp_adj.toarray()[1446,:])

332.0

In [40]:
# ########################################
# protein feature matrix
# ########################################
# protein_feat = sp.identity(protein_num)
ind = torch.LongTensor([range(protein_num), range(protein_num)])
val = torch.FloatTensor([1] * protein_num)
protein_feat = torch.sparse.FloatTensor(ind, val,torch.Size([protein_num, protein_num]))

In [41]:
# ########################################
# drug feature matrix
# ########################################
row = np.array(range(drug_num), dtype=np.long)
col = np.array(range(drug_num), dtype=np.long)

ind = torch.LongTensor([row, col])
val = torch.FloatTensor([1] * len(row))

drug_feat = torch.sparse.FloatTensor(ind, val,
                                     torch.Size([drug_num, drug_num]))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  row = np.array(range(drug_num), dtype=np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  col = np.array(range(drug_num), dtype=np.long)


In [42]:
# ########################################
# disease feature matrix
# ########################################
row = np.array(range(disease_num), dtype=np.long)
col = np.array(range(disease_num), dtype=np.long)

ind = torch.LongTensor([row, col])
val = torch.FloatTensor([1] * len(row))

disease_feat = torch.sparse.FloatTensor(ind, val,
                                        torch.Size([disease_num, disease_num]))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  row = np.array(range(disease_num), dtype=np.long)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  col = np.array(range(disease_num), dtype=np.long)


In [43]:
data = {'n_drug': drug_num,
        'n_prot': protein_num,
        'n_dis': disease_num,
        'd_feat': drug_feat,
        'p_feat': protein_feat,
        'dis_feat': disease_feat,
        'dp_adj': dp_adj.tocoo(),
        'pp_adj': pp_adj.tocoo(),
        'dd_adj': dd_adj.tocoo(),
        'disp_adj': disp_adj.tocoo()}

In [44]:
data['d_feat'] = sparse_id(data['n_drug'])
data['p_feat'] = sparse_id(data['n_prot'])
data['dis_feat'] = sparse_id(data['n_dis'])

data['n_drug_feat'] = data['d_feat'].shape[1]
data['d_norm'] = torch.ones(data['n_drug_feat'])

data['n_dis_feat'] = data['dis_feat'].shape[1]
data['dis_norm'] = torch.ones(data['n_dis_feat'])

In [45]:
# ###################################
# dp_edge_index and range index
# ###################################

data['dp_edge_index'] = np.array([data['dp_adj'].col-1, data['dp_adj'].row-1])
# print(data['dp_edge_index'].shape)

count_drug = np.zeros(data['n_drug'], dtype=np.int32)
for i in data['dp_edge_index'][1, :]:
    count_drug[i] += 1
# print(count_drug)

range_list = []
start = 0
end = data['n_drug']
for i in count_drug:
    end += i
    range_list.append((start, end))
    start = end
    
# print(range_list)

data['dp_edge_index'] = torch.from_numpy(data['dp_edge_index'] + np.array([[0], [data['n_prot']]]))
data['dp_range_list'] = torch.Tensor(range_list)

In [46]:
data['dp_edge_index']

tensor([[   56,   253,  2531,  ...,  2926,    49,  3000],
        [18504, 18505, 18505,  ..., 24660, 24661, 24661]])

In [47]:
data['dp_range_list']

tensor([[    0.,  6170.],
        [ 6170.,  6173.],
        [ 6173.,  6187.],
        ...,
        [28154., 28155.],
        [28155., 28157.],
        [28157., 28158.]])

In [48]:
# ###################################
# disp_edge_index and range index
# ###################################

data['disp_edge_index'] = np.array([data['disp_adj'].col-1, data['disp_adj'].row-1])

count_dis = np.zeros(data['n_dis'], dtype=np.int32)
for i in data['disp_edge_index'][1, :]:
    count_dis[i] += 1
range_list = []
start = 0
end = 0
for i in count_dis:
    end += i
    range_list.append((start, end))
    start = end

data['disp_edge_index'] = torch.from_numpy(data['disp_edge_index'] + np.array([[0], [data['n_prot']]]))
data['disp_range_list'] = torch.Tensor(range_list)

In [49]:
# # data['disp_edge_index'] = np.array([data['disp_adj'].col, data['disp_adj'].row])
# data['disp_edge_index'] = torch.LongTensor(np.concatenate((data['disp_adj'].row.reshape(1, -1),
#                                                            data['disp_adj'].col.reshape(1, -1)),
#                                                           axis=0))
list(data['disp_edge_index'][0]).count(1446)

0

In [50]:
data['disp_range_list']

tensor([[    0.,   436.],
        [  436.,   578.],
        [  578.,   580.],
        ...,
        [29623., 29955.],
        [29955., 29956.],
        [29956., 29957.]])

In [51]:
# ###################################
# pp_edge_index and range index
# ###################################
data['pp_edge_index'] = np.array([data['pp_adj'].col-1, data['pp_adj'].row-1])

count_dis = np.zeros(data['n_prot'], dtype=np.int32)
for i in data['pp_edge_index'][1, :]:
    count_dis[i] += 1
range_list = []
start = 0
end = 0
for i in count_dis:
    end += i
    range_list.append((start, end))
    start = end

data['pp_edge_index'] = torch.from_numpy(data['pp_edge_index'] + np.array([[0], [data['n_prot']]]))
data['pp_range_list'] = torch.Tensor(range_list)

In [52]:
# ###################################
# drug disease_edge_index and range index
# ###################################
num = [0]
edge_index_list = []
edge_type_list = []

adj = dd_adj.tocoo()
edge_index_list.append(torch.tensor([adj.row, adj.col], dtype=torch.long))
edge_type_list.append(torch.tensor([i] * adj.nnz, dtype=torch.long))
num.append(num[-1] + adj.nnz)

    # if i % 100 == 0:
    #     print(i)

# data['dd_edge_index'] = t.cat(edge_index_list, 1)
# data['dd_edge_type'] = t.cat(edge_type_list, 0)
data['dd_edge_index'] = edge_index_list[0]
data['dd_edge_type'] = edge_type_list
data['dd_edge_type_num'] = num
data['dd_y_pos'] = torch.ones(num[-1])
data['dd_y_neg'] = torch.zeros(num[-1])



# data['dd_edge_index'] = np.array([data['dd_adj'].col-1, data['dd_adj'].row-1])

# count_drug = np.zeros(data['n_drug'], dtype=np.int32)
# for i in data['dd_edge_index'][1, :]:
#     count_drug[i] += 1
# range_list = []
# start = 0
# end = 0
# for i in count_drug:
#     end += ia['n_pro
#     range_list.append((start, end))
#     start = end

# data['dd_edge_index'] = torch.from_numpy(data['dd_edge_index'] + np.array([[0], [data['n_prot']]]))
# data['dd_range_list'] = torch.Tensor(range_list)

In [53]:
len(data['dd_edge_index'][0])

195811

In [54]:
def process_prot_edge(pp_net):
    indices = torch.LongTensor(np.concatenate((pp_net.col.reshape(1, -1),
                                               pp_net.row.reshape(1, -1)),
                                              axis=0))
    indices = remove_bidirection(indices, None)
    n_edge = indices.shape[1]

    rd = np.random.binomial(1, 0.9, n_edge)
    train_mask = rd.nonzero()[0]
    test_mask = (1 - rd).nonzero()[0]

    train_indices = indices[:, train_mask]
    train_indices = to_bidirection(train_indices, None)

    test_indices = indices[:, test_mask]
    test_indices = to_bidirection(test_indices, None)

    return train_indices, test_indices

data['pp_train_indices'], data['pp_test_indices'] = process_prot_edge(data['pp_adj'])

In [55]:
data.keys()

dict_keys(['n_drug', 'n_prot', 'n_dis', 'd_feat', 'p_feat', 'dis_feat', 'dp_adj', 'pp_adj', 'dd_adj', 'disp_adj', 'n_drug_feat', 'd_norm', 'n_dis_feat', 'dis_norm', 'dp_edge_index', 'dp_range_list', 'disp_edge_index', 'disp_range_list', 'pp_edge_index', 'pp_range_list', 'dd_edge_index', 'dd_edge_type', 'dd_edge_type_num', 'dd_y_pos', 'dd_y_neg', 'pp_train_indices', 'pp_test_indices'])

In [56]:
file = path + 'final_data_dict.pkl'
with open(file, 'wb') as f:   # the whole dataset
    pickle.dump(data, f)

In [57]:
file = path + 'final_data_dict.pkl'
with open(file, 'rb') as f:   # the whole dataset
    data = pickle.load(f)
#     print(pickle.load(f))

In [58]:
data.keys()

dict_keys(['n_drug', 'n_prot', 'n_dis', 'd_feat', 'p_feat', 'dis_feat', 'dp_adj', 'pp_adj', 'dd_adj', 'disp_adj', 'n_drug_feat', 'd_norm', 'n_dis_feat', 'dis_norm', 'dp_edge_index', 'dp_range_list', 'disp_edge_index', 'disp_range_list', 'pp_edge_index', 'pp_range_list', 'dd_edge_index', 'dd_edge_type', 'dd_edge_type_num', 'dd_y_pos', 'dd_y_neg', 'pp_train_indices', 'pp_test_indices'])

In [None]:
def get_max(edge_index):
    maxr=maxc=0

    for i in range(len(edge_index)):

        valr = edge_index[i][0]
        valc = edge_index[i][1]

        if(valr > maxr):
            maxr = valr

        if(valc > maxc):
            maxc = valc

    return (maxr, maxc)

def get_max_t(edge_index):
    edge_index = torch.t(edge_index)
    maxr=maxc=0

    for i in range(len(edge_index)):

        valr = edge_index[i][0]
        valc = edge_index[i][1]

        if(valr > maxr):
            maxr = valr

        if(valc > maxc):
            maxc = valc

    return (maxr, maxc)

def auprc_auroc_ap(target_tensor, score_tensor):
    y = target_tensor.detach().cpu().numpy()
    pred = score_tensor.detach().cpu().numpy()
    auroc, ap = metrics.roc_auc_score(y, pred), metrics.average_precision_score(y, pred)
    y, xx, _ = metrics.precision_recall_curve(y, pred)
    auprc = metrics.auc(xx, y)

    return auprc, auroc, ap

def get_range_list(edge_list):
    tmp = []
    s = 0
    for i in edge_list:
        tmp.append((s, s + i.shape[1]))
        s += i.shape[1]
    return torch.tensor(tmp)

def negative_sampling(pos_edge_index, num_nodes, num_dis):
#     print("POS:",pos_edge_index.shape)
    idx = (pos_edge_index[0] * num_nodes + pos_edge_index[1])
#     print("IDX:",idx.shape)
    idx = idx.to(torch.device('cpu'))

    perm = torch.tensor(np.random.choice(num_nodes*num_dis, idx.size(0)))
#     print("PERM:", perm.shape)
    mask = torch.from_numpy(np.isin(perm, idx).astype(np.uint8))
#     print("MASK:", mask.shape)
    rest = mask.nonzero().view(-1)
#     print("REST:", rest.shape)
    while rest.numel() > 0:  # pragma: no cover
        tmp = torch.tensor(np.random.choice(num_nodes*num_dis, rest.size(0)))
        mask = torch.from_numpy(np.isin(tmp, idx).astype(np.uint8))
        perm[rest] = tmp
        rest = mask.nonzero().view(-1)
#     print("PERM:", perm.shape)
    row, col = perm / num_nodes, perm % num_nodes
    returnable = torch.stack([row, col], dim=0).long().to(pos_edge_index.device)
#     print("RETURNABLE:", returnable.shape)
    return returnable


def typed_negative_sampling(pos_edge_index, num_nodes, num_dis, range_list):
    tmp = []
    for start, end in range_list:
        tmp.append(negative_sampling(pos_edge_index[:, start: end], num_nodes, num_dis))
        
    value = torch.cat(tmp, dim=1)
#     print("VALUE:", value.shape)
    return value


In [None]:
typed_negative_sampling(v1, 6158, 5529, v3)

In [None]:
def process_edges(raw_edge_list, p=0.9):
    train_list = []
    test_list = []
    train_label_list = []
    test_label_list = []

#     for i, idx in enumerate(raw_edge_list):

    train_mask = np.random.binomial(1, p, raw_edge_list.shape[1])
    test_mask = 1 - train_mask
    train_set = train_mask.nonzero()[0]
    test_set = test_mask.nonzero()[0]

    train_list.append(raw_edge_list[:, train_set])
    test_list.append(raw_edge_list[:, test_set])

    train_label_list.append(torch.ones(2 * train_set.size, dtype=torch.long))
    test_label_list.append(torch.ones(2 * test_set.size, dtype=torch.long))

#     train_list = [to_bidirection(idx) for idx in train_list]
#     test_list = [to_bidirection(idx) for idx in test_list]

    train_range = get_range_list(train_list)
    test_range = get_range_list(test_list)

    train_edge_idx = torch.cat(train_list, dim=1)
    test_edge_idx = torch.cat(test_list, dim=1)

    train_et = torch.cat(train_label_list)
    test_et = torch.cat(test_label_list)
    
    print("train_edge_idx.shape", train_edge_idx.shape)
#     print("train_edge_idx", train_edge_idx)
    print("get_max(train_edge_idx)", get_max_t(train_edge_idx))
#     print("train_range", train_range)
    
    print("\n\n")
    
    print("test_edge_idx.shape", test_edge_idx.shape)
#     print("test_edge_idx", test_edge_idx)
    print("get_max(test_edge_idx)", get_max_t(test_edge_idx))
#     print("test_range", test_range)

    return train_edge_idx, train_et, train_range, test_edge_idx, test_et, test_range

v1, v2, v3, v4, v5, v6 = process_edges(data['dd_edge_index'])

In [None]:
v6