In [1]:
import argparse

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from model import Optimizer, nihgcn
from myutils import *
from sampler import BalancedSampler
from sklearn.model_selection import KFold

In [3]:
class Args:
    """Plain settings container handed to ``load_data`` (and, presumably, the model code).

    Stands in for an ``argparse`` namespace inside the notebook. Every setting is
    a keyword argument whose default reproduces the value previously hard-coded
    here, so ``Args()`` behaves exactly as before while ``Args(lr=0.01)`` allows
    a one-off override without editing the class.

    Parameters
    ----------
    device : str
        ``"cuda:<number>"`` or ``"cpu"``.
    data : str
        Dataset name. This notebook uses ``"nci"``; the original note also
        listed ``"gdsc"`` and ``"ccle"``.
    lr : float
        Learning rate.
    wd : float
        Weight decay (L2 penalty).
    layer_size : sequence of int, optional
        Output size of every layer. ``None`` means ``[1024, 1024]``.
    alpha : float
        Scale balancing the GCN and NI parts (per the original note).
    gamma : float
        Scale applied to the sigmoid (per the original note).
    epochs : int
        Number of training epochs.
    """

    def __init__(
        self,
        device="cpu",
        data="nci",
        lr=0.001,
        wd=1e-5,
        layer_size=None,
        alpha=0.25,
        gamma=8,
        epochs=1000,
    ):
        self.device = device  # cuda:<number> or cpu
        self.data = data  # dataset name, e.g. nci, gdsc or ccle
        self.lr = lr  # learning rate
        self.wd = wd  # weight decay (L2 penalty)
        # Build a fresh list per instance so instances never share one mutable default.
        self.layer_size = [1024, 1024] if layer_size is None else list(layer_size)
        self.alpha = alpha  # scale balancing the GCN and NI parts
        self.gamma = gamma  # scale for the sigmoid
        self.epochs = epochs  # number of training epochs


args = Args()

In [4]:
# load_data receives the whole args object and returns four objects, unpacked
# here as res, drug_finger, exprs and null_mask (the run printed "load nci").
# NOTE(review): presumably res is the binary response matrix and null_mask flags
# entries to exclude — confirm against load_data.
res, drug_finger, exprs, null_mask = load_data(args)

load nci


In [5]:
# Inspect the shape of res; the same shape is later passed to BalancedSampler as adj_shape.
res.shape

(59, 976)

In [6]:
# null_mask is combined element-wise with the dense form of res further down,
# so its shape is expected to match res.shape.
null_mask.shape

(59, 976)

In [7]:
# Sparse (COO) and dense views of the matrix returned by load_data.
# (The earlier pos_edges built from adj_coo.row / adj_coo.col was dropped: it
# ignored null_mask and was overwritten before anything read it.)
adj_coo = sp.coo_matrix(res)
adj_dense = adj_coo.toarray()

# Positive edges: entry equals 1 and null_mask == 0.
pos_mask = np.logical_and(adj_dense == 1, null_mask == 0)
pos_row, pos_col = np.where(pos_mask)
pos_edges = np.vstack((pos_row, pos_col)).T  # one (row, col) pair per line
pos_labels = np.ones(len(pos_edges), dtype=int)

# Negative edges: entry equals 0 and null_mask == 0.
neg_mask = np.logical_and(adj_dense == 0, null_mask == 0)
neg_row, neg_col = np.where(neg_mask)
neg_edges = np.vstack((neg_row, neg_col)).T
neg_labels = np.zeros(len(neg_edges), dtype=int)

# Combined set: positives first, then negatives, with labels in the same order.
all_edges = np.vstack((pos_edges, neg_edges))
all_labels = np.concatenate((pos_labels, neg_labels))

In [39]:
# Number of cross-validation folds; KFold now reads it instead of a second literal 5.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Empty frames created for later use; nothing in this cell fills them.
true_datas = pd.DataFrame()
predict_datas = pd.DataFrame()

# For every fold, build a BalancedSampler from the train/test edge split and
# print the size and class balance of the entries selected by its masks.
for train_idx, test_idx in kf.split(all_edges):
    train_edges = all_edges[train_idx]
    train_labels = all_labels[train_idx]
    test_edges = all_edges[test_idx]
    test_labels = all_labels[test_idx]

    sampler = BalancedSampler(
        edge_train=train_edges,
        label_train=train_labels,
        edge_test=test_edges,
        label_test=test_labels,
        adj_shape=res.shape,
    )

    # Keep only the entries selected by each mask.
    # NOTE(review): assumes train_data / test_data are tensors (or arrays) of
    # 0/1 labels that support .sum() — confirm in sampler.py.
    train = sampler.train_data[sampler.train_mask.to(bool)]
    test = sampler.test_data[sampler.test_mask.to(bool)]

    # Count once per split with the vectorised .sum() rather than the builtin
    # sum(), which walks the data element by element.
    n_train, n_test = len(train), len(test)
    n_train_pos = int(train.sum())
    n_test_pos = int(test.sum())
    n_total = n_train + n_test
    n_total_pos = n_train_pos + n_test_pos

    print('total:', n_total)
    print('train:', n_train)
    print('test:', n_test)
    print('total pos:', n_total_pos)
    print('total neg:', n_total - n_total_pos)
    print('train pos:', n_train_pos)
    print('train neg:', n_train - n_train_pos)
    # These two lines report the test split; they were previously labelled 'train'.
    print('test pos:', n_test_pos)
    print('test neg:', n_test - n_test_pos)

total: 14312
train: 11449
test: 2863
total pos: 7050
total neg: 7262
train pos: 5644
train neg: 5805
train pos: 1406
train neg: 1457
total: 14312
train: 11449
test: 2863
total pos: 7050
total neg: 7262
train pos: 5643
train neg: 5806
train pos: 1407
train neg: 1456
total: 14312
train: 11450
test: 2862
total pos: 7050
total neg: 7262
train pos: 5615
train neg: 5835
train pos: 1435
train neg: 1427
total: 14312
train: 11450
test: 2862
total pos: 7050
total neg: 7262
train pos: 5632
train neg: 5818
train pos: 1418
train neg: 1444
total: 14312
train: 11450
test: 2862
total pos: 7050
total neg: 7262
train pos: 5666
train neg: 5784
train pos: 1384
train neg: 1478
