In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import Sampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Seed every RNG this notebook imports so repeated runs start from the same state.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Dataset key for this run: stored on Args.data (and so handed to load_data)
# and interpolated into the "../{data}_data/" input directory further down.
data = "ctrp"

In [5]:
class Args:
    """Settings object handed to ``load_data``.

    Args:
        device: Device string, e.g. ``"cpu"`` or ``"cuda:<index>"``.
            Defaults to ``"cpu"``, the value that used to be hard-coded.
        dataset: Dataset name to store on ``self.data``. When omitted, the
            notebook-level ``data`` variable is used, so ``Args()`` keeps
            its previous behaviour.
    """

    def __init__(self, device="cpu", dataset=None):
        self.device = device  # cuda:number or cpu
        # The original note listed "gdsc or ccle" while this notebook uses
        # "ctrp"; which names are valid is decided by load_data (not shown).
        self.data = data if dataset is None else dataset


args = Args()

# Only res, null_mask and pos_num are used by the cells visible below; the
# remaining return values are unpacked to match load_data's 7-tuple.
(
    res,
    drug_feature,
    exprs,
    mut,
    cna,
    null_mask,
    pos_num,
) = load_data(args)

# Positional index -> label lookups for the rows and columns of `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

In [6]:
def main(PATH, train, val):
    """Train the autoencoder and the DF model for one split and return its validation output.

    Reads the notebook-level ``device`` for model placement.

    Args:
        PATH: Directory that contains ``gene_exp_part1.csv.gz`` and
            ``gene_exp_part2.csv.gz`` (with or without a trailing slash).
        train: DataFrame whose columns ``0`` and ``1`` identify each sample
            and whose column ``2`` holds the label.
        val: DataFrame with the same layout as ``train``, used for validation.

    Returns:
        The ``(val_labels, best_val_out)`` pair returned by ``train_df_model``.
    """
    # Join with os.path so both files resolve the same way regardless of
    # whether PATH ends with a separator.
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder.
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)

    # Extract the compressed features. The input is moved to the model's
    # device, and no autograd graph is needed because the result is only
    # converted to NumPy.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(
            normalized_gene_exp_tensor.to(device)
        )
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )

    # Prepare the drug response data.
    drug_response, nsc_sm = prepare_drug_data(is_nsc=False, is_gdsc=False, is_1=False)
    mfp = calculate_morgan_fingerprints(drug_response, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Use the `val` argument rather than whatever `test` happens to be
    # defined at notebook level.
    train_labels = train[2]
    val_labels = val[2]
    train_data = train[[0, 1]]
    val_data = val[[0, 1]]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    # Build the label tensors from the underlying arrays so the conversion
    # does not depend on the Series index.
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels.to_numpy()).double(),
        torch.tensor(val_labels.to_numpy()).double(),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [7]:
# NOTE(review): `epochs` is not read by any visible cell — confirm before deleting.
epochs = []
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
# Replaces the auto-detected device from the setup cell; `main` reads this
# global, so every fold runs on the CPU.
device = "cpu"
PATH = f"../{data}_data/"

# Per-fold outputs are gathered here and concatenated once after the loop,
# instead of growing a DataFrame from an empty frame on every iteration.
fold_true = []
fold_pred = []

for train_index, test_index in kfold.split(np.arange(pos_num)):
    sampler = Sampler(res, train_index, test_index, null_mask.T)

    train_data = pd.DataFrame(sampler.train_data, index=res.index, columns=res.columns)
    test_data = pd.DataFrame(sampler.test_data, index=res.index, columns=res.columns)

    train_mask = pd.DataFrame(sampler.train_mask, index=res.index, columns=res.columns)
    test_mask = pd.DataFrame(sampler.test_mask, index=res.index, columns=res.columns)

    # (row, column) positions selected by each mask, computed once and reused
    # for both the index columns and the label lookup.
    train_pos = train_mask.values.nonzero()
    test_pos = test_mask.values.nonzero()

    # Long format: column 0 = row position, 1 = column position, 2 = label.
    train = pd.DataFrame(train_pos).T
    train[2] = train_data.values[train_pos].astype(int)

    test = pd.DataFrame(test_pos).T
    test[2] = test_data.values[test_pos].astype(int)

    # Swap positional indices for the row / column labels of `res`.
    train[0] = [cells[i] for i in train[0]]
    train[1] = [drugs[i] for i in train[1]]
    test[0] = [cells[i] for i in test[0]]
    test[1] = [drugs[i] for i in test[1]]

    val_labels, best_val_out = main(PATH, train, test)
    fold_true.append(pd.DataFrame(val_labels.cpu().numpy()))
    fold_pred.append(pd.DataFrame(best_val_out.cpu().numpy()))

# Side-by-side concatenation: each fold contributes its own column(s).
true_datas = pd.concat(fold_true, axis=1)
predict_datas = pd.concat(fold_pred, axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████| 800/800 [39:06<00:00,  2.93s/it]


Morgan fingerprints shape: (460, 256)
Training data size: 232024, Validation data size: 43818
Epoch 1 Loss: 0.826 Val Loss: 2.069
Accuracy: 0.500
Epoch 2 Loss: 1.568 Val Loss: 1.619
Accuracy: 0.500
Epoch 3 Loss: 1.233 Val Loss: 0.876
Accuracy: 0.500
Epoch 4 Loss: 0.716 Val Loss: 0.649
Accuracy: 0.620
Epoch 5 Loss: 0.671 Val Loss: 0.723
Accuracy: 0.523
Epoch 6 Loss: 0.821 Val Loss: 0.712
Accuracy: 0.529
Epoch 7 Loss: 0.805 Val Loss: 0.645
Accuracy: 0.599
Epoch 8 Loss: 0.691 Val Loss: 0.623
Accuracy: 0.690
Epoch 9 Loss: 0.607 Val Loss: 0.672
Accuracy: 0.559
Epoch 10 Loss: 0.597 Val Loss: 0.744
Accuracy: 0.508
Epoch 11 Loss: 0.626 Val Loss: 0.791
Accuracy: 0.500
Epoch 12 Loss: 0.650 Val Loss: 0.792
Accuracy: 0.500
Epoch 13 Loss: 0.649 Val Loss: 0.756
Accuracy: 0.509
Epoch 14 Loss: 0.626 Val Loss: 0.701
Accuracy: 0.533
Epoch 15 Loss: 0.594 Val Loss: 0.647
Accuracy: 0.591
Epoch 16 Loss: 0.568 Val Loss: 0.607
Accuracy: 0.666
Epoch 17 Loss: 0.554 Val Loss: 0.583
Accuracy: 0.723
Epoch 18 Loss:

100%|███████████████████████████████████████████████████████████████████████████████████████| 800/800 [39:42<00:00,  2.98s/it]


Morgan fingerprints shape: (460, 256)
Training data size: 232024, Validation data size: 43818
Epoch 1 Loss: 0.826 Val Loss: 2.066
Accuracy: 0.500
Epoch 2 Loss: 1.568 Val Loss: 1.616
Accuracy: 0.500
Epoch 3 Loss: 1.234 Val Loss: 0.874
Accuracy: 0.500
Epoch 4 Loss: 0.716 Val Loss: 0.649
Accuracy: 0.621
Epoch 5 Loss: 0.671 Val Loss: 0.724
Accuracy: 0.523
Epoch 6 Loss: 0.821 Val Loss: 0.713
Accuracy: 0.529
Epoch 7 Loss: 0.805 Val Loss: 0.646
Accuracy: 0.597
Epoch 8 Loss: 0.691 Val Loss: 0.623
Accuracy: 0.688
Epoch 9 Loss: 0.607 Val Loss: 0.671
Accuracy: 0.559
Epoch 10 Loss: 0.597 Val Loss: 0.743
Accuracy: 0.508
Epoch 11 Loss: 0.626 Val Loss: 0.790
Accuracy: 0.500
Epoch 12 Loss: 0.650 Val Loss: 0.791
Accuracy: 0.500
Epoch 13 Loss: 0.649 Val Loss: 0.755
Accuracy: 0.506
Epoch 14 Loss: 0.626 Val Loss: 0.700
Accuracy: 0.528
Epoch 15 Loss: 0.594 Val Loss: 0.647
Accuracy: 0.588
Epoch 16 Loss: 0.568 Val Loss: 0.607
Accuracy: 0.667
Epoch 17 Loss: 0.554 Val Loss: 0.583
Accuracy: 0.718
Epoch 18 Loss:

100%|███████████████████████████████████████████████████████████████████████████████████████| 800/800 [39:51<00:00,  2.99s/it]


Morgan fingerprints shape: (460, 256)
Training data size: 232024, Validation data size: 43818
Epoch 1 Loss: 0.825 Val Loss: 2.066
Accuracy: 0.500
Epoch 2 Loss: 1.568 Val Loss: 1.615
Accuracy: 0.500
Epoch 3 Loss: 1.233 Val Loss: 0.873
Accuracy: 0.500
Epoch 4 Loss: 0.716 Val Loss: 0.649
Accuracy: 0.620
Epoch 5 Loss: 0.671 Val Loss: 0.725
Accuracy: 0.522
Epoch 6 Loss: 0.821 Val Loss: 0.713
Accuracy: 0.527
Epoch 7 Loss: 0.805 Val Loss: 0.646
Accuracy: 0.599
Epoch 8 Loss: 0.691 Val Loss: 0.623
Accuracy: 0.687
Epoch 9 Loss: 0.607 Val Loss: 0.671
Accuracy: 0.558
Epoch 10 Loss: 0.597 Val Loss: 0.743
Accuracy: 0.509
Epoch 11 Loss: 0.626 Val Loss: 0.790
Accuracy: 0.500
Epoch 12 Loss: 0.650 Val Loss: 0.791
Accuracy: 0.500
Epoch 13 Loss: 0.649 Val Loss: 0.754
Accuracy: 0.506
Epoch 14 Loss: 0.626 Val Loss: 0.700
Accuracy: 0.534
Epoch 15 Loss: 0.594 Val Loss: 0.646
Accuracy: 0.590
Epoch 16 Loss: 0.568 Val Loss: 0.606
Accuracy: 0.668
Epoch 17 Loss: 0.554 Val Loss: 0.582
Accuracy: 0.722
Epoch 18 Loss:

100%|███████████████████████████████████████████████████████████████████████████████████████| 800/800 [46:27<00:00,  3.48s/it]


Morgan fingerprints shape: (460, 256)
Training data size: 232024, Validation data size: 43818
Epoch 1 Loss: 0.826 Val Loss: 2.066
Accuracy: 0.500
Epoch 2 Loss: 1.568 Val Loss: 1.616
Accuracy: 0.500
Epoch 3 Loss: 1.234 Val Loss: 0.874
Accuracy: 0.500
Epoch 4 Loss: 0.716 Val Loss: 0.648
Accuracy: 0.623
Epoch 5 Loss: 0.671 Val Loss: 0.723
Accuracy: 0.524
Epoch 6 Loss: 0.821 Val Loss: 0.712
Accuracy: 0.530
Epoch 7 Loss: 0.805 Val Loss: 0.645
Accuracy: 0.600
Epoch 8 Loss: 0.692 Val Loss: 0.623
Accuracy: 0.686
Epoch 9 Loss: 0.607 Val Loss: 0.671
Accuracy: 0.561
Epoch 10 Loss: 0.597 Val Loss: 0.744
Accuracy: 0.509
Epoch 11 Loss: 0.626 Val Loss: 0.790
Accuracy: 0.500
Epoch 12 Loss: 0.650 Val Loss: 0.792
Accuracy: 0.500
Epoch 13 Loss: 0.649 Val Loss: 0.755
Accuracy: 0.506
Epoch 14 Loss: 0.626 Val Loss: 0.700
Accuracy: 0.531
Epoch 15 Loss: 0.594 Val Loss: 0.646
Accuracy: 0.589
Epoch 16 Loss: 0.568 Val Loss: 0.606
Accuracy: 0.668
Epoch 17 Loss: 0.554 Val Loss: 0.582
Accuracy: 0.718
Epoch 18 Loss:

100%|█████████████████████████████████████████████████████████████████████████████████████| 800/800 [1:40:12<00:00,  7.52s/it]


Morgan fingerprints shape: (460, 256)
Training data size: 232024, Validation data size: 43818
Epoch 1 Loss: 0.825 Val Loss: 2.069
Accuracy: 0.500
Epoch 2 Loss: 1.568 Val Loss: 1.618
Accuracy: 0.500
Epoch 3 Loss: 1.234 Val Loss: 0.874
Accuracy: 0.500
Epoch 4 Loss: 0.716 Val Loss: 0.648
Accuracy: 0.622
Epoch 5 Loss: 0.671 Val Loss: 0.723
Accuracy: 0.524
Epoch 6 Loss: 0.821 Val Loss: 0.711
Accuracy: 0.530
Epoch 7 Loss: 0.806 Val Loss: 0.645
Accuracy: 0.603
Epoch 8 Loss: 0.692 Val Loss: 0.622
Accuracy: 0.690
Epoch 9 Loss: 0.607 Val Loss: 0.670
Accuracy: 0.556
Epoch 10 Loss: 0.597 Val Loss: 0.743
Accuracy: 0.507
Epoch 11 Loss: 0.627 Val Loss: 0.789
Accuracy: 0.500
Epoch 12 Loss: 0.651 Val Loss: 0.790
Accuracy: 0.500
Epoch 13 Loss: 0.650 Val Loss: 0.753
Accuracy: 0.505
Epoch 14 Loss: 0.627 Val Loss: 0.698
Accuracy: 0.533
Epoch 15 Loss: 0.595 Val Loss: 0.644
Accuracy: 0.586
Epoch 16 Loss: 0.569 Val Loss: 0.604
Accuracy: 0.669
Epoch 17 Loss: 0.555 Val Loss: 0.580
Accuracy: 0.724
Epoch 18 Loss:

In [8]:
# Transpose so the per-fold columns become rows, drop the repeated column
# labels, and name the files after `data` so they follow the dataset chosen
# at the top of the notebook (identical to the old names for "ctrp").
true_datas.T.reset_index(drop=True).to_csv(f"true_{data}.csv")
predict_datas.T.reset_index(drop=True).to_csv(f"pred_{data}.csv")