In [1]:
import argparse
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import Sampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed every random number generator used by this notebook
# (NumPy, the standard library, and PyTorch) with the same fixed value.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

In [4]:
# Dataset key: stored on Args for load_data and used to build the
# "../{data}_data/" directory path in the cross-validation cell.
data = "gdsc1"

In [5]:
class Args:
    """Minimal settings object handed to ``load_data``."""

    def __init__(self):
        # Both values are read from the notebook-level globals set in earlier cells.
        self.data = data  # Dataset{gdsc or ccle}
        self.device = device  # cuda:number or cpu


args = Args()
(res, drug_feature, exprs, mut, cna, null_mask, pos_num) = load_data(args)

# Positional index -> label lookups for the rows and columns of `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load gdsc1


In [6]:
def main(PATH, train, val):
    """Train the autoencoder and the DF model for one train/validation split.

    Args:
        PATH: Directory holding ``gene_exp_part1.csv.gz`` and
            ``gene_exp_part2.csv.gz``. A trailing slash is optional.
        train: DataFrame whose columns 0 and 1 are forwarded to
            ``prepare_train_val_test_data`` as the sample identifiers and
            whose column 2 holds the training labels.
        val: DataFrame with the same layout as ``train``; used for validation.

    Returns:
        The ``(val_labels, best_val_out)`` pair returned by ``train_df_model``.
    """
    # Normalise the directory once so both files are addressed the same way,
    # whether or not the caller included a trailing slash.
    data_dir = PATH if PATH.endswith("/") else PATH + "/"
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=data_dir + "gene_exp_part1.csv.gz",
        data2=data_dir + "gene_exp_part2.csv.gz",
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)

    # Extract the compressed features
    compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )

    # Prepare the drug response data
    # NOTE(review): these flags are hard-coded and not derived from the
    # notebook's `data` setting; presumably they select GDSC1 -- confirm
    # before running on another dataset.
    drug_response, nsc_sm = prepare_drug_data(is_nsc=False, is_gdsc=True, is_1=True)
    mfp = calculate_morgan_fingerprints(drug_response, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Use the `val` argument rather than a notebook-level global so the
    # function does not depend on hidden state in the calling cell.
    train_labels = train[2]
    val_labels = val[2]
    train_data = train[[0, 1]]
    val_data = val[[0, 1]]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )

    df_model = DF().to(device)
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels).double().to(device),
        torch.tensor(val_labels).double().to(device),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [7]:
epochs = []  # not referenced by any visible cell; kept in case other code expects it
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
PATH = f"../{data}_data/"


def _mask_to_triples(mask, values):
    """Build a frame with one row per nonzero entry of ``mask``.

    Column 0 is the row label (looked up in ``cells``), column 1 the column
    label (looked up in ``drugs``), and column 2 the matching entry of
    ``values`` cast to int.
    """
    nonzero = mask.values.nonzero()  # computed once, reused for the labels
    triples = pd.DataFrame(nonzero).T
    triples[2] = values.values[nonzero].astype(int)
    triples[0] = triples[0].map(cells)
    triples[1] = triples[1].map(drugs)
    return triples


# Per-fold results are collected in lists and concatenated once after the
# loop, instead of re-copying a growing DataFrame on every iteration.
true_frames = []
predict_frames = []

for fold, (train_index, test_index) in enumerate(kfold.split(np.arange(pos_num))):
    sampler = Sampler(res, train_index, test_index, null_mask.T, fold)

    train_data = pd.DataFrame(sampler.train_data, index=res.index, columns=res.columns)
    test_data = pd.DataFrame(sampler.test_data, index=res.index, columns=res.columns)

    train_mask = pd.DataFrame(sampler.train_mask, index=res.index, columns=res.columns)
    test_mask = pd.DataFrame(sampler.test_mask, index=res.index, columns=res.columns)

    train = _mask_to_triples(train_mask, train_data)
    test = _mask_to_triples(test_mask, test_data)

    val_labels, best_val_out = main(PATH, train, test)
    true_frames.append(pd.DataFrame(val_labels.cpu().numpy()))
    predict_frames.append(pd.DataFrame(best_val_out.cpu().numpy()))

# One column block per fold; folds of unequal length are aligned on the row
# index (outer join), exactly as the incremental axis=1 concat did.
true_datas = pd.concat(true_frames, axis=1)
predict_datas = pd.concat(predict_frames, axis=1)

100%|██████████| 800/800 [00:31<00:00, 25.34it/s]


Morgan fingerprints shape: (332, 256)
Training data size: 240628, Validation data size: 20476
Epoch 1 Loss: 0.937 Val Loss: 2.236
Accuracy: 0.500
Epoch 2 Loss: 0.777 Val Loss: 2.306
Accuracy: 0.500
Epoch 3 Loss: 0.796 Val Loss: 1.776
Accuracy: 0.500
Epoch 4 Loss: 0.626 Val Loss: 1.159
Accuracy: 0.500
Epoch 5 Loss: 0.453 Val Loss: 0.744
Accuracy: 0.551
Epoch 6 Loss: 0.401 Val Loss: 0.596
Accuracy: 0.688
Epoch 7 Loss: 0.459 Val Loss: 0.567
Accuracy: 0.700
Epoch 8 Loss: 0.483 Val Loss: 0.572
Accuracy: 0.700
Epoch 9 Loss: 0.440 Val Loss: 0.620
Accuracy: 0.671
Epoch 10 Loss: 0.386 Val Loss: 0.709
Accuracy: 0.628
Epoch 11 Loss: 0.360 Val Loss: 0.810
Accuracy: 0.571
Epoch 12 Loss: 0.359 Val Loss: 0.892
Accuracy: 0.522
Epoch 13 Loss: 0.368 Val Loss: 0.933
Accuracy: 0.514
Epoch 14 Loss: 0.373 Val Loss: 0.931
Accuracy: 0.522
Epoch 15 Loss: 0.370 Val Loss: 0.892
Accuracy: 0.555
Epoch 16 Loss: 0.358 Val Loss: 0.827
Accuracy: 0.594
Epoch 17 Loss: 0.342 Val Loss: 0.751
Accuracy: 0.643
Epoch 18 Loss:

100%|██████████| 800/800 [00:31<00:00, 25.40it/s]


Morgan fingerprints shape: (332, 256)
Training data size: 240628, Validation data size: 20476
Epoch 1 Loss: 0.937 Val Loss: 2.236
Accuracy: 0.500
Epoch 2 Loss: 0.777 Val Loss: 2.306
Accuracy: 0.500
Epoch 3 Loss: 0.796 Val Loss: 1.777
Accuracy: 0.500
Epoch 4 Loss: 0.626 Val Loss: 1.161
Accuracy: 0.500
Epoch 5 Loss: 0.452 Val Loss: 0.747
Accuracy: 0.552
Epoch 6 Loss: 0.400 Val Loss: 0.599
Accuracy: 0.693
Epoch 7 Loss: 0.459 Val Loss: 0.571
Accuracy: 0.694
Epoch 8 Loss: 0.483 Val Loss: 0.577
Accuracy: 0.701
Epoch 9 Loss: 0.440 Val Loss: 0.625
Accuracy: 0.668
Epoch 10 Loss: 0.385 Val Loss: 0.714
Accuracy: 0.627
Epoch 11 Loss: 0.360 Val Loss: 0.816
Accuracy: 0.571
Epoch 12 Loss: 0.359 Val Loss: 0.897
Accuracy: 0.523
Epoch 13 Loss: 0.368 Val Loss: 0.939
Accuracy: 0.523
Epoch 14 Loss: 0.373 Val Loss: 0.938
Accuracy: 0.523
Epoch 15 Loss: 0.369 Val Loss: 0.898
Accuracy: 0.555
Epoch 16 Loss: 0.358 Val Loss: 0.833
Accuracy: 0.585
Epoch 17 Loss: 0.342 Val Loss: 0.757
Accuracy: 0.639
Epoch 18 Loss:

100%|██████████| 800/800 [00:31<00:00, 25.31it/s]


Morgan fingerprints shape: (332, 256)
Training data size: 240628, Validation data size: 20476
Epoch 1 Loss: 0.937 Val Loss: 2.232
Accuracy: 0.500
Epoch 2 Loss: 0.777 Val Loss: 2.301
Accuracy: 0.500
Epoch 3 Loss: 0.797 Val Loss: 1.773
Accuracy: 0.500
Epoch 4 Loss: 0.626 Val Loss: 1.157
Accuracy: 0.500
Epoch 5 Loss: 0.453 Val Loss: 0.745
Accuracy: 0.553
Epoch 6 Loss: 0.401 Val Loss: 0.598
Accuracy: 0.687
Epoch 7 Loss: 0.459 Val Loss: 0.569
Accuracy: 0.700
Epoch 8 Loss: 0.483 Val Loss: 0.575
Accuracy: 0.700
Epoch 9 Loss: 0.440 Val Loss: 0.622
Accuracy: 0.670
Epoch 10 Loss: 0.386 Val Loss: 0.711
Accuracy: 0.629
Epoch 11 Loss: 0.360 Val Loss: 0.812
Accuracy: 0.571
Epoch 12 Loss: 0.359 Val Loss: 0.893
Accuracy: 0.524
Epoch 13 Loss: 0.368 Val Loss: 0.935
Accuracy: 0.515
Epoch 14 Loss: 0.373 Val Loss: 0.933
Accuracy: 0.524
Epoch 15 Loss: 0.370 Val Loss: 0.893
Accuracy: 0.547
Epoch 16 Loss: 0.358 Val Loss: 0.828
Accuracy: 0.585
Epoch 17 Loss: 0.343 Val Loss: 0.753
Accuracy: 0.640
Epoch 18 Loss:

100%|██████████| 800/800 [00:57<00:00, 13.85it/s]


Morgan fingerprints shape: (332, 256)
Training data size: 240628, Validation data size: 20476
Epoch 1 Loss: 0.937 Val Loss: 2.241
Accuracy: 0.500
Epoch 2 Loss: 0.777 Val Loss: 2.310
Accuracy: 0.500
Epoch 3 Loss: 0.796 Val Loss: 1.779
Accuracy: 0.500
Epoch 4 Loss: 0.626 Val Loss: 1.161
Accuracy: 0.500
Epoch 5 Loss: 0.453 Val Loss: 0.745
Accuracy: 0.553
Epoch 6 Loss: 0.400 Val Loss: 0.597
Accuracy: 0.688
Epoch 7 Loss: 0.459 Val Loss: 0.568
Accuracy: 0.702
Epoch 8 Loss: 0.483 Val Loss: 0.573
Accuracy: 0.697
Epoch 9 Loss: 0.440 Val Loss: 0.621
Accuracy: 0.672
Epoch 10 Loss: 0.386 Val Loss: 0.710
Accuracy: 0.629
Epoch 11 Loss: 0.360 Val Loss: 0.812
Accuracy: 0.571
Epoch 12 Loss: 0.359 Val Loss: 0.894
Accuracy: 0.523
Epoch 13 Loss: 0.368 Val Loss: 0.937
Accuracy: 0.516
Epoch 14 Loss: 0.373 Val Loss: 0.935
Accuracy: 0.523
Epoch 15 Loss: 0.369 Val Loss: 0.895
Accuracy: 0.555
Epoch 16 Loss: 0.358 Val Loss: 0.830
Accuracy: 0.594
Epoch 17 Loss: 0.342 Val Loss: 0.754
Accuracy: 0.642
Epoch 18 Loss:

100%|██████████| 800/800 [01:35<00:00,  8.40it/s]


Morgan fingerprints shape: (332, 256)
Training data size: 240630, Validation data size: 20474
Epoch 1 Loss: 0.937 Val Loss: 2.235
Accuracy: 0.500
Epoch 2 Loss: 0.777 Val Loss: 2.305
Accuracy: 0.500
Epoch 3 Loss: 0.796 Val Loss: 1.776
Accuracy: 0.500
Epoch 4 Loss: 0.626 Val Loss: 1.159
Accuracy: 0.500
Epoch 5 Loss: 0.453 Val Loss: 0.745
Accuracy: 0.560
Epoch 6 Loss: 0.400 Val Loss: 0.598
Accuracy: 0.686
Epoch 7 Loss: 0.459 Val Loss: 0.569
Accuracy: 0.698
Epoch 8 Loss: 0.483 Val Loss: 0.575
Accuracy: 0.697
Epoch 9 Loss: 0.440 Val Loss: 0.622
Accuracy: 0.671
Epoch 10 Loss: 0.386 Val Loss: 0.711
Accuracy: 0.627
Epoch 11 Loss: 0.360 Val Loss: 0.812
Accuracy: 0.572
Epoch 12 Loss: 0.359 Val Loss: 0.893
Accuracy: 0.523
Epoch 13 Loss: 0.368 Val Loss: 0.935
Accuracy: 0.515
Epoch 14 Loss: 0.373 Val Loss: 0.933
Accuracy: 0.523
Epoch 15 Loss: 0.370 Val Loss: 0.893
Accuracy: 0.547
Epoch 16 Loss: 0.358 Val Loss: 0.828
Accuracy: 0.594
Epoch 17 Loss: 0.343 Val Loss: 0.752
Accuracy: 0.640
Epoch 18 Loss:

In [8]:
# Write one row per fold. File names follow the `data` setting so the outputs
# stay correctly labelled if the dataset is changed in the configuration cell.
true_datas.T.reset_index(drop=True).to_csv(f"true_{data}.csv")
predict_datas.T.reset_index(drop=True).to_csv(f"pred_{data}.csv")