In [1]:
import argparse

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import KFold

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from model import GModel
from myutils import roc_auc, translate_result
from optimizer import Optimizer
from sampler import BalancedSampler

In [3]:
class Args:
    """Plain settings holder handed to ``load_data`` in this notebook.

    Attributes:
        data: Name of the dataset to load.
        device: Device string used for the model and optimizer.
    """

    def __init__(self) -> None:
        # Dataset key; this notebook runs with "nci" (the original note also
        # listed gdsc / ccle as options).
        self.data = "nci"
        # Either "cpu" or a CUDA device of the form "cuda:<number>".
        self.device = "cpu"


# Single shared settings instance, plus a shortcut to its device string.
args = Args()
device = args.device

In [4]:
# Load the dataset selected by args.data. Per the shapes printed for "nci":
# res and null_mask are (59, 976), exprs / mut / cna each have 59 rows,
# and drug_feature is (976, 920).
res, drug_feature, exprs, mut, cna, null_mask = load_data(args)

load nci
response matrix (res) shape: (977, 59)
exprs shape: (60, 23059)
mut shape: (60, 9307)
cna shape: (60, 23232)
59
exprs shape: (59, 23059)
mut shape: (59, 9307)
cna shape: (59, 23232)
drug_feature shape: (976, 920)
response matrix (res) shape: (59, 976)
null_mask shape: (59, 976)


In [5]:
# Display the response matrix returned by load_data; the visible entries are
# 0.0, 1.0 or NaN.
res

Unnamed: 0,740,752,755,757,762,1390,1895,3053,3061,3088,...,808790,808792,809693,810341,810717,811429,812926,812927,813488,820919
786_0,1.0,,,,,,1.0,,,1.0,...,0.0,,,,,,,,1.0,
A498,0.0,0.0,0.0,0.0,,0.0,,,,0.0,...,,,,,,1.0,,,1.0,0.0
A549,,,,,,,,,,,...,,,,,,,,,,
ACHN,1.0,,,,1.0,,1.0,,,1.0,...,,,,,,1.0,1.0,,1.0,
BT_549,0.0,0.0,0.0,0.0,,,,,,,...,0.0,0.0,,,,,,0.0,0.0,0.0
CAKI_1,,,,,,1.0,,0.0,,,...,,,,1.0,,1.0,1.0,0.0,,
CCRF_CEM,1.0,1.0,,,1.0,,,0.0,,1.0,...,,0.0,,1.0,1.0,0.0,,,,
COLO205,,,,,,,,,,,...,1.0,1.0,,,1.0,,,,,1.0
DU_145,,,,,,,,,,,...,0.0,,,,,,,,,
EKVX,,0.0,0.0,,,0.0,,,0.0,0.0,...,,,,,,,1.0,,0.0,


In [6]:
# Build the labelled edge list (row index, column index) from the response matrix.
# The sparse round-trip is kept so that `adj_coo` and `adj_dense` stay available
# under their original names.
adj_coo = sp.coo_matrix(res)
adj_dense = adj_coo.toarray()

# Only cells whose null_mask value is 0 are eligible as samples. The comparison
# is computed once and shared by the positive and negative selections.
observed = np.asarray(null_mask) == 0

# Positive edges: response == 1 and not masked out.
# NaN cells compare False against both 0 and 1, so they land in neither set.
pos_mask = np.logical_and(adj_dense == 1, observed)
pos_edges = np.argwhere(pos_mask)  # (n_pos, 2) array of [row, col], row-major order
pos_labels = np.ones(len(pos_edges), dtype=int)

# Negative edges: response == 0 and not masked out.
neg_mask = np.logical_and(adj_dense == 0, observed)
neg_edges = np.argwhere(neg_mask)  # (n_neg, 2) array of [row, col], row-major order
neg_labels = np.zeros(len(neg_edges), dtype=int)

# Combine: positives first, then negatives; labels follow the same order.
all_edges = np.vstack((pos_edges, neg_edges))
all_labels = np.concatenate((pos_labels, neg_labels))

# Every edge must carry exactly one label.
assert len(all_edges) == len(all_labels)

In [7]:
# Sanity check: dimensions of the expression matrix after loading.
exprs.shape

(59, 23059)

In [8]:
# Total number of labelled edges (positives + negatives) available for cross-validation.
len(all_labels)

14312

In [9]:
# K-fold cross-validation over the labelled edges.
# `k` is the number of folds and drives KFold directly, so the two cannot drift apart.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold results are collected in lists and concatenated once after the loop;
# this avoids re-copying the accumulated frame on every fold and avoids seeding
# the concat with an empty DataFrame.
# assumes translate_result returns a DataFrame — TODO confirm
true_parts = []
predict_parts = []

for i, (train_idx, test_idx) in enumerate(kf.split(all_edges)):
    # Split edges and labels for this fold.
    train_edges = all_edges[train_idx]
    train_labels = all_labels[train_idx]
    test_edges = all_edges[test_idx]
    test_labels = all_labels[test_idx]

    # The sampler exposes train_data / test_data / train_mask / test_mask,
    # which are consumed by the model and optimizer below.
    sampler = BalancedSampler(
        edge_train=train_edges,
        label_train=train_labels,
        edge_test=test_edges,
        label_test=test_labels,
        adj_shape=res.shape,
    )

    # NOTE: `k=11` here is a GModel hyperparameter and is unrelated to the
    # fold count `k` defined above.
    model = GModel(
        adj_mat=sampler.train_data,
        gene=exprs,
        cna=cna,
        mutation=mut,
        sigma=2,
        k=11,
        iterates=3,
        feature_drug=drug_feature,
        n_hid1=192,
        n_hid2=36,
        alpha=5.74,
        device=device,
    )

    # NOTE(review): the masks are passed positionally as (test_mask, train_mask),
    # the reverse of the data order — confirm this matches Optimizer's signature.
    opt = Optimizer(
        model,
        sampler.train_data,
        sampler.test_data,
        sampler.test_mask,
        sampler.train_mask,
        roc_auc,
        lr=5e-4,
        epochs=1000,
        device=device,
    ).to(device)

    # Calling the optimizer runs training and returns three values; only the
    # true and predicted data are kept for export.
    epoch, true_data, predict_data = opt()
    true_parts.append(translate_result(true_data))
    predict_parts.append(translate_result(predict_data))

# Stack the per-fold results in fold order.
true_datas = pd.concat(true_parts)
predict_datas = pd.concat(predict_parts)

epoch:   0 loss:0.806833 auc:0.4870
epoch:  20 loss:0.227045 auc:0.9195
epoch:  40 loss:0.133597 auc:0.9334
epoch:  60 loss:0.102908 auc:0.9417
epoch:  80 loss:0.089995 auc:0.9459
epoch: 100 loss:0.083118 auc:0.9484
epoch: 120 loss:0.078900 auc:0.9502
epoch: 140 loss:0.076082 auc:0.9514
epoch: 160 loss:0.074096 auc:0.9523
epoch: 180 loss:0.072639 auc:0.9530
epoch: 200 loss:0.071530 auc:0.9535
epoch: 220 loss:0.070665 auc:0.9540
epoch: 240 loss:0.069975 auc:0.9544
epoch: 260 loss:0.069415 auc:0.9547
epoch: 280 loss:0.068957 auc:0.9549
epoch: 300 loss:0.068579 auc:0.9551
epoch: 320 loss:0.068267 auc:0.9552
epoch: 340 loss:0.068008 auc:0.9553
epoch: 360 loss:0.067792 auc:0.9554
epoch: 380 loss:0.067611 auc:0.9555
epoch: 400 loss:0.067458 auc:0.9555
epoch: 420 loss:0.067329 auc:0.9555
epoch: 440 loss:0.067219 auc:0.9555
epoch: 460 loss:0.067125 auc:0.9555
epoch: 480 loss:0.067043 auc:0.9555
epoch: 500 loss:0.066972 auc:0.9555
epoch: 520 loss:0.066909 auc:0.9554
epoch: 540 loss:0.066855 auc

In [10]:
# Write the collected ground-truth and prediction tables to CSV files whose
# names carry the dataset key.
outputs = {
    f"true_{args.data}.csv": true_datas,
    f"pred_{args.data}.csv": predict_datas,
}
for csv_path, frame in outputs.items():
    frame.to_csv(csv_path)