In [1]:
import argparse

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from model import Optimizer, nihgcn
from myutils import *
from sampler import BalancedSampler
from sklearn.model_selection import KFold

In [3]:
class Args:
    """Plain-attribute run configuration used throughout this notebook.

    The instance is handed to ``load_data`` and its attributes are read when
    building the ``nihgcn`` model and the ``Optimizer``.  ``Args()`` with no
    arguments reproduces the notebook's default settings; any setting can be
    overridden by keyword, e.g. ``Args(device="cuda:0", epochs=200)``.

    Parameters
    ----------
    device : str
        Compute device, ``"cuda:<index>"`` or ``"cpu"``.
    data : str
        Dataset name passed on to ``load_data``.  This notebook uses ``"nci"``;
        the original note also listed ``gdsc`` and ``ccle``.
    lr : float
        Learning rate.
    wd : float
        Weight decay (L2 regularization).
    layer_size : sequence of int, optional
        Output size of every layer.  ``None`` selects ``[1024, 1024]``.
    alpha : float
        Scale balancing the GCN and NI parts (per the original note).
    gamma : float
        Scale applied to the sigmoid (per the original note).
    epochs : int
        Number of training epochs.
    """

    def __init__(
        self,
        device="cpu",
        data="nci",
        lr=0.001,
        wd=1e-5,
        layer_size=None,
        alpha=0.25,
        gamma=8,
        epochs=1000,
    ):
        self.device = device
        self.data = data
        self.lr = lr
        self.wd = wd
        # Build a fresh list per instance so that mutating one configuration's
        # layer sizes can never leak into another instance.
        self.layer_size = [1024, 1024] if layer_size is None else list(layer_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epochs = epochs


args = Args()

In [10]:
# Load the dataset selected by args.data.  In the recorded run this printed
# "load nci", and `res` and `null_mask` both had shape (59, 976).
# NOTE(review): judging by how later cells use them, `res` looks like a 0/1
# response matrix and `null_mask` flags entries to exclude (0 = usable);
# confirm against load_data.
res, drug_finger, exprs, null_mask = load_data(args)

load nci


In [11]:
# Show the dimensions of `res`; the same shape is later passed to
# BalancedSampler as adj_shape.
res.shape

(59, 976)

In [12]:
# Show the dimensions of `null_mask`; it is combined element-wise with the
# dense form of `res` in the next cell, so the two shapes are expected to agree.
null_mask.shape

(59, 976)

In [13]:
# Build the labelled (row, col) edge list that the cross-validation cell splits.
# NOTE(review): `sp` and `np` are not imported explicitly in this notebook;
# presumably they are provided by `from myutils import *` — confirm.
adj_coo = sp.coo_matrix(res)

# Dense copy of the adjacency, used to pick out candidate edges by value.
adj_dense = adj_coo.toarray()

# Only entries with null_mask == 0 are eligible, for positives and negatives alike.
usable_mask = null_mask == 0

# Positive edges: entry equals 1 and null_mask == 0.
# (An earlier, unmasked `pos_edges` built straight from the COO matrix was
# overwritten here without ever being read, so it has been dropped.)
pos_mask = np.logical_and(adj_dense == 1, usable_mask)
pos_row, pos_col = np.where(pos_mask)
pos_edges = np.vstack((pos_row, pos_col)).T
pos_labels = np.ones(len(pos_edges), dtype=int)

# Negative edges: entry equals 0 and null_mask == 0.
neg_mask = np.logical_and(adj_dense == 0, usable_mask)
neg_row, neg_col = np.where(neg_mask)
neg_edges = np.vstack((neg_row, neg_col)).T
neg_labels = np.zeros(len(neg_edges), dtype=int)

# Combine: positives first, then negatives; labels line up with edges by position.
all_edges = np.vstack((pos_edges, neg_edges))
all_labels = np.concatenate((pos_labels, neg_labels))

In [14]:
# K-fold cross-validation over the labelled edges.  For every fold a sampler,
# a fresh model and an optimizer are built; the (true, predicted) results of
# each fold are collected and stacked into two DataFrames at the end.
k = 5
# Use `k` for the number of splits so that changing it actually takes effect.
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold frames are gathered in lists and concatenated once after the loop.
# This avoids repeatedly re-copying a growing DataFrame and avoids
# concatenating onto an empty DataFrame.
true_frames = []
predict_frames = []

for i, (train_idx, test_idx) in enumerate(kf.split(all_edges)):
    # Split edges and labels with the same index arrays so they stay aligned.
    train_edges = all_edges[train_idx]
    train_labels = all_labels[train_idx]
    test_edges = all_edges[test_idx]
    test_labels = all_labels[test_idx]

    # The fold index doubles as the sampler seed, so each fold is sampled
    # differently but reproducibly.
    sampler = BalancedSampler(
        edge_train=train_edges,
        label_train=train_labels,
        edge_test=test_edges,
        label_test=test_labels,
        adj_shape=res.shape,
        seed=i,
    )

    # A new model per fold, built from the sampler's training data only.
    model = nihgcn(
        adj_mat=sampler.train_data,
        cell_exprs=exprs,
        drug_finger=drug_finger,
        layer_size=args.layer_size,
        alpha=args.alpha,
        gamma=args.gamma,
        device=args.device,
    ).to(args.device)

    # NOTE(review): the masks are passed positionally as (test_mask, train_mask);
    # confirm this matches the parameter order of Optimizer.
    opt = Optimizer(
        model,
        sampler.train_data,
        sampler.test_data,
        sampler.test_mask,
        sampler.train_mask,
        roc_auc,
        lr=args.lr,
        wd=args.wd,
        epochs=args.epochs,
        device=args.device,
    ).to(args.device)

    # Calling the optimizer yields the fold's true values and predictions; the
    # recorded output suggests it also logs loss/AUC periodically while it runs.
    true_data, predict_data = opt()
    true_frames.append(translate_result(true_data))
    predict_frames.append(translate_result(predict_data))

true_datas = pd.concat(true_frames, ignore_index=True)
predict_datas = pd.concat(predict_frames, ignore_index=True)

epoch:   0 loss:0.698576 auc:0.5037
epoch:  20 loss:0.156055 auc:0.9566
epoch:  40 loss:0.098736 auc:0.9597
epoch:  60 loss:0.074795 auc:0.9604
epoch:  80 loss:0.064980 auc:0.9605
epoch: 100 loss:0.057224 auc:0.9612
epoch: 120 loss:0.053597 auc:0.9614
epoch: 140 loss:0.050390 auc:0.9615
epoch: 160 loss:0.048226 auc:0.9619
epoch: 180 loss:0.046774 auc:0.9619
epoch: 200 loss:0.045474 auc:0.9616
epoch: 220 loss:0.044665 auc:0.9620
epoch: 240 loss:0.043468 auc:0.9619
epoch: 260 loss:0.044214 auc:0.9626
epoch: 280 loss:0.042040 auc:0.9622
epoch: 300 loss:0.043298 auc:0.9605
epoch: 320 loss:0.041027 auc:0.9618
epoch: 340 loss:0.040442 auc:0.9618
epoch: 360 loss:0.040315 auc:0.9622
epoch: 380 loss:0.040355 auc:0.9617
epoch: 400 loss:0.039341 auc:0.9615
epoch: 420 loss:0.040102 auc:0.9623
epoch: 440 loss:0.038658 auc:0.9615
epoch: 460 loss:0.038767 auc:0.9615
epoch: 480 loss:0.038507 auc:0.9611
epoch: 500 loss:0.038562 auc:0.9619
epoch: 520 loss:0.037495 auc:0.9612
epoch: 540 loss:0.037779 auc

In [15]:
# Persist the stacked cross-validation results as two CSV files whose names
# carry the dataset name; the paths are relative to the working directory.
true_csv_path = f"true_{args.data}.csv"
pred_csv_path = f"pred_{args.data}.csv"

true_datas.to_csv(true_csv_path)
predict_datas.to_csv(pred_csv_path)