In [1]:
import torch
import numpy as np
import pandas as pd
import scipy.sparse as sp

In [2]:
# Copy-number and mutation tables: transposed after loading, missing values replaced by 0.
cna = pd.read_csv("nci_data/nci60GeneCop.csv", index_col=0).T.fillna(0)
mutation = pd.read_csv("nci_data/nci60GeneMut.csv", index_col=0).T.fillna(0)
# Expression table, transposed so that its index can label the response columns below.
gene = pd.read_csv('nci_data/nci60_gene_exp.csv', index_col=0).T
drug_response = pd.read_csv("nci_data/nci60Act.csv", index_col=0)
# Relabel the response columns with the expression table's index.
# NOTE(review): this relabelling is positional — it assumes both files list the
# same entries in the same order; confirm against the raw files.
drug_response.columns = gene.index

In [3]:
# Per-gene standard deviation of expression, ordered from most to least variable.
gene_std = gene.std().sort_values(ascending=False)
# Cut-off at the 90th percentile of those standard deviations.
std_cutoff = np.percentile(gene_std, 90)
# Names of the genes lying strictly above the cut-off (despite the name, this
# list is selected on standard deviation, not variance).
variance = list(gene_std[gene_std > std_cutoff].index)
len(variance)

2383

In [4]:
# Presumably a drug-target interaction table (going by the file name); its
# column labels are merged into the gene list in the next cell.
dti = pd.read_csv('nci_data/dti_drugbank.csv', index_col=0)
dti.shape

(100, 403)

In [5]:
# Sorted union of the high-variability genes and the column labels of `dti`.
genes = sorted(set(variance).union(dti.columns))

In [6]:
# Selected genes that are actually present in the expression table, in sorted order.
measured_genes = sorted(set(genes).intersection(gene.columns))

# Keep those columns and convert to a float32 matrix.
# Note: `gene` is rebound from a DataFrame to an ndarray here, so this cell
# cannot be re-run without first reloading `gene`.
gene = np.array(gene[measured_genes], dtype=np.float32)

In [7]:
# Drug annotation table; keep the NSC id and mechanism columns for the rows
# whose mechanism is exactly "DNA".
drug_annotations = pd.read_csv("nci_data/nsc_cid_smiles_class_name.csv", index_col=0)
is_dna_drug = drug_annotations["MECHANISM"] == "DNA"
nsc_class = drug_annotations.loc[is_dna_drug, ["NSC", "MECHANISM"]]
nsc_class.shape

(269, 2)

In [8]:
# Restrict the response table to rows whose index appears among the selected NSC ids.
dna_nsc_ids = list(nsc_class["NSC"])
drug_response = drug_response.loc[drug_response.index.isin(dna_nsc_ids)]
drug_response

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
740,0.703626,-1.219032,-1.892792,-0.877267,-1.156158,0.510978,0.536910,0.589970,-0.356387,-1.460314,...,0.626838,0.507493,0.682664,-0.891882,0.499337,0.541612,-1.400771,0.602244,-1.641942,0.231533
752,0.475296,-0.312852,-1.089067,-0.441030,-0.058619,0.057507,0.125700,0.111693,-3.285729,-0.114051,...,-0.321691,0.507798,0.384102,-1.314527,-0.318444,0.557175,0.345056,-0.047731,0.155244,-0.160223
755,0.704027,-0.438857,-0.548744,-1.441942,0.496864,0.096265,-0.082186,0.417634,-1.927502,-0.372021,...,0.123647,0.543639,0.623318,-1.374212,-0.173024,0.314436,-1.002183,-0.881252,0.491364,-0.183200
762,0.547964,-1.033803,-1.399273,-0.538268,1.137432,0.135942,-0.094460,0.562628,-1.398911,-1.050409,...,-0.294845,0.934592,0.591263,-0.552673,1.797227,1.260987,0.172806,0.869675,-0.529810,-0.634413
1390,0.517269,0.960399,-1.710657,-0.260192,-0.428596,-2.369012,0.224249,-1.481654,-2.369012,-0.950229,...,0.534589,0.819455,0.091197,-0.188655,0.800745,1.468891,0.182377,-0.100527,0.519999,-0.167908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772992,-0.672839,-0.009522,1.593912,-0.015442,-0.336637,0.263434,0.524381,1.450999,-0.073388,1.877515,...,0.140564,-0.863786,0.409866,2.250914,0.739791,1.910757,2.324851,0.728314,1.252344,0.959549
783107,0.780763,0.353024,-1.901341,-1.812769,0.402564,0.004460,0.329454,-0.072378,-2.294693,1.159211,...,0.031677,0.463469,0.134452,0.537280,0.483558,0.484789,0.790988,0.112736,-0.212100,0.516311
784722,1.544864,-0.722438,-0.591117,-0.292344,0.491538,0.250683,0.672894,0.135474,0.555915,0.754361,...,-0.565436,0.387815,0.702974,0.713895,0.149978,-1.265221,-0.104946,0.714125,-0.574573,-1.220123
789797,-0.561025,-0.217302,-0.638256,-0.638256,1.112001,-0.638256,-0.674451,-0.638256,-0.638256,0.861387,...,-0.266452,-0.638256,-0.638256,1.989888,-0.521750,2.481620,-0.331090,1.672402,-0.602061,2.006186


In [9]:
# Train / validation / test split tables (each is later read via its "Drug" and "Cell" columns).
train, val, test = map(
    pd.read_csv,
    ("nci_data/train.csv", "nci_data/val.csv", "nci_data/test.csv"),
)

In [10]:
# Measured response for every (Drug, Cell) pair listed in the training split.
tmp = [
    drug_response.loc[pair['Drug'], pair['Cell']]
    for _, pair in train.iterrows()
]

In [11]:
# Count how many binarised responses (> 0 -> 1) agree with the stored training labels.
train_labels_derived = (np.array(tmp) > 0).astype(int)
train_labels_stored = np.load('nci_data/train_labels.npy')
np.sum(train_labels_derived == train_labels_stored)

9684

In [12]:
# Measured response for every (Drug, Cell) pair listed in the validation split.
tmp = [
    drug_response.loc[pair['Drug'], pair['Cell']]
    for _, pair in val.iterrows()
]

In [13]:
# Count how many binarised responses (> 0 -> 1) agree with the stored validation labels.
val_labels_derived = (np.array(tmp) > 0).astype(int)
val_labels_stored = np.load('nci_data/val_labels.npy')
np.sum(val_labels_derived == val_labels_stored)

3228

In [14]:
# Measured response for every (Drug, Cell) pair listed in the test split.
tmp = [
    drug_response.loc[pair['Drug'], pair['Cell']]
    for _, pair in test.iterrows()
]

# Count how many binarised responses (> 0 -> 1) agree with the stored test labels.
test_labels_derived = (np.array(tmp) > 0).astype(int)
test_labels_stored = np.load('nci_data/test_labels.npy')
np.sum(test_labels_derived == test_labels_stored)

3228

In [15]:
# Count how many binarised responses (> 0 -> 1) agree with the stored test labels.
test_labels_derived = (np.array(tmp) > 0).astype(int)
test_labels_stored = np.load('nci_data/test_labels.npy')
np.sum(test_labels_derived == test_labels_stored)

3228

In [16]:
# Binarised view of the most recently built `tmp` list: 1 where the response is > 0, else 0.
(np.array(tmp) > 0).astype(int)

array([1, 0, 1, ..., 1, 0, 0])

In [17]:
# Stored validation labels, displayed for visual inspection.
np.load('nci_data/val_labels.npy')

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
# Per-drug feature table, also kept as a float32 matrix (passed to the model
# as `drug_finger` later in the notebook).
# NOTE(review): the rows are not re-indexed against the filtered `drug_response`
# here; presumably mfp.csv already lists the same drugs in the same order — confirm.
drug_feature = pd.read_csv("nci_data/mfp.csv", index_col=0)
feature_drug = np.array(drug_feature, dtype=np.float32)
feature_drug

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [19]:
# Row labels (drugs) and column labels (cell lines) of the response table as
# plain lists, used below to turn labels into integer positions.
idxs = list(drug_response.index)
cols = list(drug_response.columns)

In [20]:
def mask(data, rows, cols):
    """Return a copy of `data` with the given entries set to 0.

    The original implementation wrote into `data` itself, so the caller's
    unmasked array was silently altered as well. This version leaves the
    input untouched and returns a modified copy.

    Args:
        data: 2-D array indexed as ``data[col_index, row_index]``.
        rows: Iterable of second-axis positions (paired element-wise with `cols`).
        cols: Iterable of first-axis positions.

    Returns:
        A copy of `data` in which every paired ``[cols[k], rows[k]]`` entry is 0.
    """
    masked = data.copy()
    for i, j in zip(rows, cols):
        # Note the swapped order: `rows` addresses the second axis of `data`.
        masked[j, i] = 0
    return masked

In [21]:
# Response table transposed (cell lines become rows), missing values set to 0,
# plus a float32 array version of it.
cell_drug_ = drug_response.transpose().fillna(0)
cell_drug = cell_drug_.values.astype(np.float32)

In [22]:
# Zero out the test pairs: translate each Drug / Cell label into its position
# in `idxs` / `cols`, then mask those entries.
rows_indices_test = list(map(idxs.index, test["Drug"]))
cols_indices_test = list(map(cols.index, test["Cell"]))
cell_drug_ = mask(cell_drug, rows_indices_test, cols_indices_test)

In [23]:
# Zero out the validation pairs in the same way as the test pairs.
rows_indices_val = list(map(idxs.index, val["Drug"]))
cols_indices_val = list(map(cols.index, val["Cell"]))
cell_drug_ = mask(cell_drug_, rows_indices_val, cols_indices_val)

In [24]:
# Binary training matrix: 1.0 where a remaining response is positive, 0.0
# elsewhere; the displayed value is the number of positives.
train_data = np.where(cell_drug_ > 0, 1.0, 0.0).astype(np.float32)
train_data.sum()

4562.0

In [25]:
def get_mask(data, rows, cols):
    """Build a boolean tensor that is True at the paired positions.

    Args:
        data: Any object exposing ``.shape``; only the shape is used.
        rows: Iterable of first-axis positions.
        cols: Iterable of second-axis positions, paired element-wise with `rows`.

    Returns:
        A ``torch.bool`` tensor of shape ``data.shape`` with True at every
        ``[rows[k], cols[k]]`` and False elsewhere.
    """
    flags = np.zeros(data.shape, dtype=bool)
    for row, col in zip(rows, cols):
        flags[row, col] = True
    return torch.tensor(flags)

In [26]:
# Test-pair mask, built in drug x cell layout and then transposed.
test_mask = get_mask(drug_response, rows_indices_test, cols_indices_test).transpose(0, 1)

In [27]:
# Binarised responses (1 where > 0), transposed and zeroed outside the test mask.
positive_response = torch.tensor(drug_response.values > 0, dtype=int)
test_data = positive_response.transpose(0, 1) * test_mask

In [28]:
# Validation-pair mask, built in drug x cell layout and then transposed.
val_mask = get_mask(drug_response, rows_indices_val, cols_indices_val).transpose(0, 1)

In [29]:
# Binarised responses (1 where > 0), transposed and zeroed outside the validation mask.
positive_response = torch.tensor(drug_response.values > 0, dtype=int)
val_data = positive_response.transpose(0, 1) * val_mask

In [30]:
# Integer positions of each training pair's drug and cell line.
rows_indices_train = list(map(idxs.index, train["Drug"]))
cols_indices_train = list(map(cols.index, train["Cell"]))

In [31]:
# Training-pair mask, built in drug x cell layout and then transposed.
train_mask = get_mask(drug_response, rows_indices_train, cols_indices_train).transpose(0, 1)

In [32]:
# Validation labels picked out by the boolean mask.
val_data[val_mask]

tensor([1, 1, 0,  ..., 1, 1, 1])

In [33]:
# Stored validation labels, shown for comparison with the masked values above.
(np.load('nci_data/val_labels.npy'))

array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
# Number of positive entries in the stored test labels.
np.load('nci_data/test_labels.npy').sum()

1504.0

In [35]:
# Number of positive entries among the masked test positions; should equal the stored-label count.
test_data[test_mask].sum()

tensor(1504)

In [36]:
# Prepare the data: transposed response table with missing values set to 0, as float32.
cell_drug = drug_response.transpose().fillna(0).values.astype(np.float32)
# Separate working copy that will have the held-out entries zeroed.
cell_drug_masked = cell_drug.copy()

# Explicit masking helper that never modifies its input.
def mask_data(data, rows, cols):
    """Return a copy of `data` with every ``[cols[k], rows[k]]`` entry set to 0.

    Args:
        data: 2-D array; it is copied, not modified.
        rows: Iterable of second-axis positions.
        cols: Iterable of first-axis positions, paired element-wise with `rows`.
    """
    zeroed = data.copy()
    for second_axis_pos, first_axis_pos in zip(rows, cols):
        # Mind the index order: `cols` addresses the first axis.
        zeroed[first_axis_pos, second_axis_pos] = 0
    return zeroed

# Exclude the test and validation entries.
rows_indices_test = list(map(idxs.index, test["Drug"]))
cols_indices_test = list(map(cols.index, test["Cell"]))
cell_drug_masked = mask_data(cell_drug_masked, rows_indices_test, cols_indices_test)

rows_indices_val = list(map(idxs.index, val["Drug"]))
cols_indices_val = list(map(cols.index, val["Cell"]))
cell_drug_masked = mask_data(cell_drug_masked, rows_indices_val, cols_indices_val)

# Build the training matrix: 1.0 where a remaining response is positive, 0.0 elsewhere.
train_data = np.where(cell_drug_masked > 0, 1.0, 0.0).astype(np.float32)

# Mask construction using the same index order as `mask_data`.
def get_mask(shape, rows, cols):
    """Build a boolean tensor of the given shape, True at ``[cols[k], rows[k]]``.

    Args:
        shape: Shape of the mask to create.
        rows: Iterable of second-axis positions.
        cols: Iterable of first-axis positions, paired element-wise with `rows`.

    Returns:
        A ``torch.bool`` tensor with True at the paired positions and False elsewhere.
    """
    flags = np.zeros(shape, dtype=bool)
    for second_axis_pos, first_axis_pos in zip(rows, cols):
        # Mind the index order: `cols` addresses the first axis.
        flags[first_axis_pos, second_axis_pos] = True
    return torch.tensor(flags)

# Create the masks, all in the cell x drug layout of `cell_drug`.
matrix_shape = cell_drug.shape
test_mask = get_mask(matrix_shape, rows_indices_test, cols_indices_test)
val_mask = get_mask(matrix_shape, rows_indices_val, cols_indices_val)
rows_indices_train = list(map(idxs.index, train["Drug"]))
cols_indices_train = list(map(cols.index, train["Cell"]))
train_mask = get_mask(matrix_shape, rows_indices_train, cols_indices_train)

# Create the held-out label matrices: binarised responses, zeroed outside each split's mask.
positive_response = torch.tensor(cell_drug > 0, dtype=int)
test_data = positive_response * test_mask
val_data = positive_response * val_mask


In [37]:
# Test labels picked out by the boolean mask.
test_data[test_mask]

tensor([1, 1, 1,  ..., 1, 1, 0])

In [38]:
# Stored test labels, shown for comparison with the masked values above.
np.load('nci_data/test_labels.npy')

array([1., 0., 1., ..., 1., 0., 0.])

In [39]:
import os
import sys

import inspect
# Add the directory two levels above this code's source file to the import path
# so the project modules below can be found.
# NOTE(review): inspect.getfile on a lambda reports the file name the interpreter
# assigned to the code; inside a notebook that is whatever name IPython gives the
# cell, which may not sit in the notebook's directory — confirm this resolves to
# the intended project root.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(inspect.getfile(lambda: None)), "..", "..")))
from load_data import load_data
from nci_model import Optimizer, nihgcn
# NOTE(review): `roc_auc`, used when building the optimizer, is not imported by
# name anywhere visible; presumably it arrives through this star import — confirm.
from myutils import *
from metrics import print_binary_classification_metrics

In [40]:
# Settings defined as plain variables instead of going through argparse
device = "cpu"  # cuda:number or cpu
lr = 0.001      # the learning rate
wd = 1e-5       # the weight decay for L2 normalization
layer_size = [1024, 1024]  # Output sizes of every layer
alpha = 0.25    # the scale for balancing gcn and ni
gamma = 8       # the scale for the sigmoid
epochs = 1000   # the number of training epochs

def get_results():
    """Train a freshly initialised model once and return its test outputs.

    Takes no arguments; everything is read from notebook-level variables:
    `train_data`, `gene`, `feature_drug`, `val_data`, `test_data`, the three
    masks, and the hyper-parameters (`layer_size`, `alpha`, `gamma`, `lr`,
    `wd`, `epochs`, `device`).

    Returns:
        Tuple ``(test_true, test_pred)``: the first two of the three values
        produced by calling the optimizer. The third value (bound as a test
        AUC) is discarded.
    """
    # Model initialisation.
    network = nihgcn(
        adj_mat=torch.tensor(train_data),
        cell_exprs=gene,
        drug_finger=feature_drug,
        layer_size=layer_size,
        alpha=alpha,
        gamma=gamma,
        device=device,
    ).to(device)

    # Training driver; invoking it runs the optimisation and yields the test outputs.
    trainer = Optimizer(
        model=network,
        train_data=torch.tensor(train_data),
        val_data=val_data,
        test_data=test_data,
        train_mask=train_mask,
        val_mask=val_mask,
        test_mask=test_mask,
        evaluate_fun=roc_auc,
        lr=lr,
        wd=wd,
        epochs=epochs,
        device=device,
    ).to(device)

    test_true, test_pred, _unused_test_auc = trainer()
    return test_true, test_pred

In [41]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# CSV file that accumulates one row of test metrics per run.
results_file = 'experiment_results.csv'
threshold = 0.5  # predicted scores strictly above this count as positive
n_runs = 100     # number of independent train/evaluate repetitions
metric_columns = ['Run', 'Accuracy', 'Precision', 'Recall', 'F1']

# Resume from an existing results file if there is one; otherwise start empty.
if os.path.exists(results_file):
    previous_results = pd.read_csv(results_file)
    # Keep any extra columns the existing file already has, then add ours.
    result_columns = list(previous_results.columns) + [
        name for name in metric_columns if name not in previous_results.columns
    ]
    result_records = previous_results.to_dict('records')
else:
    result_columns = list(metric_columns)
    result_records = []

# Rows are collected as plain dicts and the frame is rebuilt from them, instead
# of growing a DataFrame with pd.concat (concatenating onto an empty frame
# triggers a pandas FutureWarning).
results_df = pd.DataFrame(result_records, columns=result_columns)

# Repeat the experiment `n_runs` times.
# NOTE: 'Run' restarts at 1 on every invocation, so a resumed file will contain
# repeated Run values.
for run in range(n_runs):
    # Train a fresh model and get its test predictions.
    test_true, test_pred = get_results()

    predictions_binary = (test_pred > threshold).numpy()
    test_true = test_true.numpy()

    result_records.append({
        'Run': run + 1,
        'Accuracy': accuracy_score(test_true, predictions_binary),
        'Precision': precision_score(test_true, predictions_binary),
        'Recall': recall_score(test_true, predictions_binary),
        'F1': f1_score(test_true, predictions_binary),
    })

    # Rebuild the frame and overwrite the CSV after every run, so partial
    # progress survives an interrupted session.
    results_df = pd.DataFrame(result_records, columns=result_columns)
    results_df.to_csv(results_file, index=False)

Epoch:    0 | Train Loss: 0.699535 | Train AUC: 0.8239 | Val AUC: 0.7540
Epoch:   10 | Train Loss: 0.274748 | Train AUC: 0.9185 | Val AUC: 0.8326
Epoch:   20 | Train Loss: 0.221208 | Train AUC: 0.9694 | Val AUC: 0.8680
Epoch:   30 | Train Loss: 0.195521 | Train AUC: 0.9932 | Val AUC: 0.8961
Epoch:   40 | Train Loss: 0.181169 | Train AUC: 0.9975 | Val AUC: 0.9053
Epoch:   50 | Train Loss: 0.172122 | Train AUC: 0.9983 | Val AUC: 0.9062
Epoch:   60 | Train Loss: 0.165843 | Train AUC: 0.9987 | Val AUC: 0.9054
Epoch:   70 | Train Loss: 0.161047 | Train AUC: 0.9991 | Val AUC: 0.9041
Epoch:   80 | Train Loss: 0.157195 | Train AUC: 0.9993 | Val AUC: 0.9028
Epoch:   90 | Train Loss: 0.154985 | Train AUC: 0.9994 | Val AUC: 0.9009
Epoch:  100 | Train Loss: 0.152084 | Train AUC: 0.9995 | Val AUC: 0.9007
Epoch:  110 | Train Loss: 0.149563 | Train AUC: 0.9997 | Val AUC: 0.8991
Epoch:  120 | Train Loss: 0.147814 | Train AUC: 0.9998 | Val AUC: 0.8981
Epoch:  130 | Train Loss: 0.146216 | Train AUC: 0.9

  results_df = pd.concat([results_df, new_row], ignore_index=True)


Epoch:   10 | Train Loss: 0.271443 | Train AUC: 0.9200 | Val AUC: 0.8308
Epoch:   20 | Train Loss: 0.219848 | Train AUC: 0.9689 | Val AUC: 0.8658
Epoch:   30 | Train Loss: 0.194958 | Train AUC: 0.9927 | Val AUC: 0.8931
Epoch:   40 | Train Loss: 0.180932 | Train AUC: 0.9975 | Val AUC: 0.9040
Epoch:   50 | Train Loss: 0.171991 | Train AUC: 0.9982 | Val AUC: 0.9053
Epoch:   60 | Train Loss: 0.165806 | Train AUC: 0.9986 | Val AUC: 0.9048
Epoch:   70 | Train Loss: 0.161104 | Train AUC: 0.9989 | Val AUC: 0.9033
Epoch:   80 | Train Loss: 0.157271 | Train AUC: 0.9992 | Val AUC: 0.9024
Epoch:   90 | Train Loss: 0.154270 | Train AUC: 0.9995 | Val AUC: 0.9008
Epoch:  100 | Train Loss: 0.151860 | Train AUC: 0.9996 | Val AUC: 0.9004
Epoch:  110 | Train Loss: 0.149697 | Train AUC: 0.9997 | Val AUC: 0.8994
Epoch:  120 | Train Loss: 0.147591 | Train AUC: 0.9997 | Val AUC: 0.8987
Epoch:  130 | Train Loss: 0.146709 | Train AUC: 0.9998 | Val AUC: 0.8972
Epoch:  140 | Train Loss: 0.144890 | Train AUC: 0.9

In [42]:
# Per-run test metrics accumulated by the experiment loop.
results_df

Unnamed: 0,Run,Accuracy,Precision,Recall,F1,Threshold
0,1,0.788724,0.772185,0.775266,0.773723,
1,2,0.793061,0.779412,0.775266,0.777333,
2,3,0.791202,0.777036,0.773936,0.775483,
3,4,0.789343,0.772487,0.776596,0.774536,
4,5,0.790582,0.776000,0.773936,0.774967,
...,...,...,...,...,...,...
95,96,0.791822,0.779195,0.771941,0.775551,
96,97,0.792751,0.779264,0.774601,0.776926,
97,98,0.793371,0.779933,0.775266,0.777593,
98,99,0.789653,0.774086,0.774601,0.774344,
