In [1]:
import argparse
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from DeepDSC.DeepDSC import (
    AE,
    DF,
    GeneExpressionDataset,
    calculate_morgan_fingerprints,
    prepare_data,
    prepare_drug_data,
    prepare_train_val_test_data,
    train_autoencoder,
    train_df_model,
)
from load_data import load_data
from sampler import Sampler

In [3]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fix the seed of every random number generator used by the notebook
# (Python's `random`, PyTorch, NumPy) so repeated runs start from the same state.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [4]:
class Args:
    """Minimal settings object handed to ``load_data``.

    Stands in for a parsed command-line namespace so the notebook can call
    ``load_data(args)`` without going through ``argparse``.

    Parameters
    ----------
    device : str
        Device string, either ``"cpu"`` or ``"cuda:<index>"``.
    data : str
        Name of the dataset to load. This notebook uses ``"nci"``; an earlier
        comment also listed ``"gdsc"`` and ``"ccle"`` -- the accepted values
        are defined by ``load_data``, so confirm there before changing it.
    """

    def __init__(self, device="cpu", data="nci"):
        self.device = device  # "cuda:<index>" or "cpu"
        self.data = data  # dataset name understood by load_data


args = Args()

# load_data returns the response matrix `res` together with the feature
# matrices, the null mask and `pos_num`, which the cross-validation cell
# below uses as the number of items to split into folds.
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Positional index -> label lookups for the rows and columns of `res`.
# NOTE(review): rows are treated as cell lines and columns as drugs based on
# the variable names only -- confirm against load_data.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load nci


In [5]:
def main(PATH, train, val):
    """Run the DeepDSC pipeline for a single train/validation split.

    Steps: load and normalise gene expression, train an autoencoder on it,
    encode every sample into compressed features, compute Morgan fingerprints
    for the drugs, then train the DF model on the supplied split.

    Note that the autoencoder is retrained from scratch on every call, so
    calling this once per cross-validation fold repeats that work.

    Parameters
    ----------
    PATH : str
        Directory containing ``gene_exp_part1.csv.gz`` and
        ``gene_exp_part2.csv.gz``. A trailing slash is optional.
    train : pandas.DataFrame
        Training triples. Columns ``0`` and ``1`` are passed on as the sample
        identifiers and column ``2`` is used as the label.
    val : pandas.DataFrame
        Validation triples with the same column layout as ``train``.

    Returns
    -------
    tuple
        ``(val_labels, best_val_out)`` exactly as returned by
        ``train_df_model``.

    Notes
    -----
    Reads the module-level ``device`` to place the models.
    """
    # Function-scope import keeps this cell self-contained.
    import os

    print("Loading gene expression data...")
    # Build both paths the same way; the original concatenation only worked
    # for the second file when PATH ended with a slash.
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder.
    print("Training autoencoder...")
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)
    print("Autoencoder training completed.")

    # Extract the compressed features.
    print("Extracting compressed features...")
    compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )
    print(f"Compressed features shape: {compressed_features.shape}")
    drug_response, nsc_sm = prepare_drug_data(is_nsc=True)
    mfp = calculate_morgan_fingerprints(drug_response.T, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Use the `val` argument; the original read the notebook-global `test`
    # here and silently ignored the parameter.
    train_labels = train[2]
    val_labels = val[2]
    train_data = train[[0, 1]]
    val_data = val[[0, 1]]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    # Convert through NumPy so the result does not depend on the Series index.
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels.to_numpy()).double(),
        torch.tensor(val_labels.to_numpy()).double(),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [6]:
# ---- Cross-validation configuration ----
epochs = []  # not used in the cells shown here; kept in case another cell reads it
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
# main() reads this global. Forcing "cpu" here overrides the CUDA
# auto-detection done in the earlier setup cell.
device = "cpu"
PATH = "../nci_data/"


def _mask_to_triples(data, mask):
    """Return one (row label, column label, value) row per non-zero mask entry.

    Column 0 holds the label from ``cells``, column 1 the label from ``drugs``
    and column 2 the integer-cast entry of ``data`` at that position.
    """
    row_idx, col_idx = mask.values.nonzero()
    return pd.DataFrame(
        {
            0: [cells[i] for i in row_idx],
            1: [drugs[j] for j in col_idx],
            2: data.values[row_idx, col_idx].astype(int),
        }
    )


# Collect per-fold results and concatenate once after the loop instead of
# growing a DataFrame on every iteration.
fold_true = []
fold_pred = []

for train_index, test_index in kfold.split(np.arange(pos_num)):
    sampler = Sampler(res, train_index, test_index, null_mask)

    train_data = pd.DataFrame(sampler.train_data, index=res.index, columns=res.columns)
    test_data = pd.DataFrame(sampler.test_data, index=res.index, columns=res.columns)

    train_mask = pd.DataFrame(sampler.train_mask, index=res.index, columns=res.columns)
    test_mask = pd.DataFrame(sampler.test_mask, index=res.index, columns=res.columns)

    # Both frames keep columns 0, 1, 2 because main() reads the label from
    # column 2. The removed `train = train[[1, 0]]` dropped that column and
    # left `train` laid out differently from `test`.
    train = _mask_to_triples(train_data, train_mask)
    test = _mask_to_triples(test_data, test_mask)

    val_labels, best_val_out = main(PATH, train, test)
    fold_true.append(pd.DataFrame(val_labels.cpu().numpy()))
    fold_pred.append(pd.DataFrame(best_val_out.cpu().numpy()))

# Folds sit side by side as columns; shorter folds are padded with NaN.
true_datas = pd.concat(fold_true, axis=1)
predict_datas = pd.concat(fold_pred, axis=1)

Loading gene expression data...
Training autoencoder...


100%|█████████████████████████████████████████████████████████████████████████████████| 800/800 [03:57<00:00,  3.36it/s]


Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 47243, Validation data size: 10776
Epoch 1 Loss: 0.778 Val Loss: 2.153
Accuracy: 0.500
Epoch 2 Loss: 1.970 Val Loss: 1.520
Accuracy: 0.500
Epoch 3 Loss: 1.394 Val Loss: 0.780
Accuracy: 0.500
Epoch 4 Loss: 0.744 Val Loss: 0.791
Accuracy: 0.500
Epoch 5 Loss: 0.827 Val Loss: 0.908
Accuracy: 0.499
Epoch 6 Loss: 0.964 Val Loss: 0.843
Accuracy: 0.500
Epoch 7 Loss: 0.890 Val Loss: 0.729
Accuracy: 0.504
Epoch 8 Loss: 0.750 Val Loss: 0.693
Accuracy: 0.535
Epoch 9 Loss: 0.686 Val Loss: 0.745
Accuracy: 0.501
Epoch 10 Loss: 0.715 Val Loss: 0.806
Accuracy: 0.500
Epoch 11 Loss: 0.763 Val Loss: 0.824
Accuracy: 0.500
Epoch 12 Loss: 0.777 Val Loss: 0.798
Accuracy: 0.500
Epoch 13 Loss: 0.755 Val Loss: 0.752
Accuracy: 0.500
Epoch 14 Loss: 0.718 Val Loss: 0.712
Accuracy: 0.514
Epoch 15 Loss: 0.689 Val Loss: 0.691
Accuracy: 0.539
Epoch 16 Loss: 0.

100%|█████████████████████████████████████████████████████████████████████████████████| 800/800 [06:11<00:00,  2.15it/s]


Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 47243, Validation data size: 10776
Epoch 1 Loss: 0.777 Val Loss: 2.157
Accuracy: 0.500
Epoch 2 Loss: 1.970 Val Loss: 1.522
Accuracy: 0.500
Epoch 3 Loss: 1.395 Val Loss: 0.778
Accuracy: 0.500
Epoch 4 Loss: 0.745 Val Loss: 0.788
Accuracy: 0.500
Epoch 5 Loss: 0.827 Val Loss: 0.905
Accuracy: 0.500
Epoch 6 Loss: 0.965 Val Loss: 0.841
Accuracy: 0.501
Epoch 7 Loss: 0.891 Val Loss: 0.727
Accuracy: 0.504
Epoch 8 Loss: 0.751 Val Loss: 0.691
Accuracy: 0.538
Epoch 9 Loss: 0.687 Val Loss: 0.743
Accuracy: 0.501
Epoch 10 Loss: 0.715 Val Loss: 0.805
Accuracy: 0.501
Epoch 11 Loss: 0.763 Val Loss: 0.823
Accuracy: 0.501
Epoch 12 Loss: 0.777 Val Loss: 0.796
Accuracy: 0.501
Epoch 13 Loss: 0.755 Val Loss: 0.750
Accuracy: 0.501
Epoch 14 Loss: 0.718 Val Loss: 0.709
Accuracy: 0.512
Epoch 15 Loss: 0.690 Val Loss: 0.688
Accuracy: 0.543
Epoch 16 Loss: 0.

100%|█████████████████████████████████████████████████████████████████████████████████| 800/800 [05:19<00:00,  2.51it/s]


Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 47245, Validation data size: 10774
Epoch 1 Loss: 0.776 Val Loss: 2.156
Accuracy: 0.500
Epoch 2 Loss: 1.970 Val Loss: 1.522
Accuracy: 0.500
Epoch 3 Loss: 1.394 Val Loss: 0.781
Accuracy: 0.501
Epoch 4 Loss: 0.744 Val Loss: 0.795
Accuracy: 0.501
Epoch 5 Loss: 0.827 Val Loss: 0.912
Accuracy: 0.499
Epoch 6 Loss: 0.964 Val Loss: 0.847
Accuracy: 0.499
Epoch 7 Loss: 0.890 Val Loss: 0.731
Accuracy: 0.506
Epoch 8 Loss: 0.750 Val Loss: 0.694
Accuracy: 0.528
Epoch 9 Loss: 0.686 Val Loss: 0.745
Accuracy: 0.502
Epoch 10 Loss: 0.715 Val Loss: 0.806
Accuracy: 0.501
Epoch 11 Loss: 0.763 Val Loss: 0.824
Accuracy: 0.501
Epoch 12 Loss: 0.777 Val Loss: 0.797
Accuracy: 0.501
Epoch 13 Loss: 0.755 Val Loss: 0.751
Accuracy: 0.502
Epoch 14 Loss: 0.718 Val Loss: 0.712
Accuracy: 0.514
Epoch 15 Loss: 0.689 Val Loss: 0.691
Accuracy: 0.536
Epoch 16 Loss: 0.

100%|█████████████████████████████████████████████████████████████████████████████████| 800/800 [07:13<00:00,  1.84it/s]


Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 47245, Validation data size: 10774
Epoch 1 Loss: 0.778 Val Loss: 2.162
Accuracy: 0.500
Epoch 2 Loss: 1.969 Val Loss: 1.527
Accuracy: 0.500
Epoch 3 Loss: 1.395 Val Loss: 0.782
Accuracy: 0.500
Epoch 4 Loss: 0.745 Val Loss: 0.788
Accuracy: 0.503
Epoch 5 Loss: 0.827 Val Loss: 0.904
Accuracy: 0.500
Epoch 6 Loss: 0.964 Val Loss: 0.840
Accuracy: 0.502
Epoch 7 Loss: 0.890 Val Loss: 0.727
Accuracy: 0.508
Epoch 8 Loss: 0.751 Val Loss: 0.693
Accuracy: 0.533
Epoch 9 Loss: 0.686 Val Loss: 0.747
Accuracy: 0.502
Epoch 10 Loss: 0.715 Val Loss: 0.809
Accuracy: 0.501
Epoch 11 Loss: 0.763 Val Loss: 0.827
Accuracy: 0.501
Epoch 12 Loss: 0.777 Val Loss: 0.800
Accuracy: 0.501
Epoch 13 Loss: 0.755 Val Loss: 0.753
Accuracy: 0.501
Epoch 14 Loss: 0.718 Val Loss: 0.712
Accuracy: 0.515
Epoch 15 Loss: 0.690 Val Loss: 0.690
Accuracy: 0.541
Epoch 16 Loss: 0.

100%|█████████████████████████████████████████████████████████████████████████████████| 800/800 [11:40<00:00,  1.14it/s]


Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 47245, Validation data size: 10774
Epoch 1 Loss: 0.778 Val Loss: 2.158
Accuracy: 0.500
Epoch 2 Loss: 1.970 Val Loss: 1.523
Accuracy: 0.500
Epoch 3 Loss: 1.396 Val Loss: 0.778
Accuracy: 0.499
Epoch 4 Loss: 0.746 Val Loss: 0.786
Accuracy: 0.503
Epoch 5 Loss: 0.827 Val Loss: 0.903
Accuracy: 0.501
Epoch 6 Loss: 0.964 Val Loss: 0.839
Accuracy: 0.501
Epoch 7 Loss: 0.891 Val Loss: 0.726
Accuracy: 0.504
Epoch 8 Loss: 0.751 Val Loss: 0.691
Accuracy: 0.540
Epoch 9 Loss: 0.687 Val Loss: 0.744
Accuracy: 0.499
Epoch 10 Loss: 0.715 Val Loss: 0.806
Accuracy: 0.500
Epoch 11 Loss: 0.763 Val Loss: 0.824
Accuracy: 0.500
Epoch 12 Loss: 0.777 Val Loss: 0.798
Accuracy: 0.500
Epoch 13 Loss: 0.756 Val Loss: 0.751
Accuracy: 0.502
Epoch 14 Loss: 0.719 Val Loss: 0.711
Accuracy: 0.514
Epoch 15 Loss: 0.690 Val Loss: 0.688
Accuracy: 0.541
Epoch 16 Loss: 0.

In [7]:
# Write the collected labels and predictions to CSV. Each fold was
# concatenated as columns, so transposing puts the folds on rows; the index
# is reset so the rows are numbered 0..n-1 in the file.
true_datas.transpose().reset_index(drop=True).to_csv(path_or_buf="true_nci.csv")
predict_datas.transpose().reset_index(drop=True).to_csv(path_or_buf="pred_nci.csv")