In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import Sampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed all three random number generators used by the notebook with the same
# value so that reruns start from the same state.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [4]:
# Dataset identifier: stored on Args.data for load_data and used to build the
# data directory path for the cross-validation run.
data = "gdsc2"

In [5]:
class Args:
    """Small argument holder handed to ``load_data``.

    Both attributes are copied from notebook-level variables set in earlier
    cells.
    """

    def __init__(self):
        # torch.device chosen earlier: CUDA when available, otherwise CPU.
        self.device = device
        # Dataset name (the original note lists "gdsc or ccle").
        self.data = data


args = Args()
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Positional index -> label lookups for the rows and columns of `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load gdsc2


In [6]:
def main(PATH, train, val):
    """Train the expression autoencoder and the DF model for one data split.

    Args:
        PATH: Directory that contains ``gene_exp_part1.csv.gz`` and
            ``gene_exp_part2.csv.gz``. A trailing slash is optional.
        train: DataFrame of training samples. Columns ``0`` and ``1`` are
            forwarded as the sample identifiers (cell and drug names in this
            notebook) and column ``2`` holds the labels.
        val: DataFrame of validation samples with the same layout as ``train``.

    Returns:
        The ``(val_labels, best_val_out)`` pair returned by ``train_df_model``.

    Note:
        Models and label tensors are placed on the notebook-level ``device``.
    """
    # Build both file paths the same way so the result does not depend on
    # whether PATH ends with a separator.
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder on the normalized expression data.
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)

    # Extract the compressed features. No gradients are needed here, so skip
    # building the autograd graph for this forward pass.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )

    # Prepare the drug response data and the drug fingerprints.
    drug_response, nsc_sm = prepare_drug_data(is_nsc=False, is_gdsc=True, is_1=False)
    mfp = calculate_morgan_fingerprints(drug_response, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Use the `val` argument rather than a notebook global so the function
    # only depends on what the caller passes in.
    train_labels = train[2]
    val_labels = val[2]
    train_data = train[[0, 1]]
    val_data = val[[0, 1]]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    val_labels_out, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels).double().to(device),
        torch.tensor(val_labels).double().to(device),
    )
    print("DF model training completed.")
    return val_labels_out, best_val_out

In [7]:
# NOTE(review): `epochs` is not referenced in the visible cells; kept in case
# other code reads it.
epochs = []
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)
PATH = f"../{data}_data/"


def _mask_to_triples(values, mask):
    """Collect the entries selected by ``mask`` as (row name, column name, label) rows.

    Args:
        values: DataFrame holding the label matrix.
        mask: DataFrame of the same shape; non-zero entries mark the cells
            that belong to the split.

    Returns:
        DataFrame with columns ``0`` (name looked up in ``cells``), ``1``
        (name looked up in ``drugs``) and ``2`` (integer label).
    """
    # Compute the selected positions once and reuse them for names and labels.
    rows, cols = mask.values.nonzero()
    return pd.DataFrame(
        {
            0: [cells[r] for r in rows],
            1: [drugs[c] for c in cols],
            2: values.values[rows, cols].astype(int),
        }
    )


# Per-fold outputs are gathered in lists and concatenated once after the loop
# instead of re-concatenating a growing DataFrame on every iteration.
true_folds = []
predict_folds = []

for fold, (train_index, test_index) in enumerate(kfold.split(np.arange(pos_num))):
    sampler = Sampler(res, train_index, test_index, null_mask.T, fold)

    train_data = pd.DataFrame(sampler.train_data, index=res.index, columns=res.columns)
    test_data = pd.DataFrame(sampler.test_data, index=res.index, columns=res.columns)

    train_mask = pd.DataFrame(sampler.train_mask, index=res.index, columns=res.columns)
    test_mask = pd.DataFrame(sampler.test_mask, index=res.index, columns=res.columns)

    train = _mask_to_triples(train_data, train_mask)
    test = _mask_to_triples(test_data, test_mask)

    val_labels, best_val_out = main(PATH, train, test)
    true_folds.append(pd.DataFrame(val_labels.cpu().numpy()))
    predict_folds.append(pd.DataFrame(best_val_out.cpu().numpy()))

# Folds can differ in size; the column-wise concat aligns on the row index and
# pads shorter folds with NaN.
true_datas = pd.concat(true_folds, axis=1)
predict_datas = pd.concat(predict_folds, axis=1)

100%|██████████| 800/800 [00:56<00:00, 14.21it/s]


Morgan fingerprints shape: (240, 256)
Training data size: 178648, Validation data size: 11466
Epoch 1 Loss: 0.951 Val Loss: 2.313
Accuracy: 0.500
Epoch 2 Loss: 0.611 Val Loss: 2.525
Accuracy: 0.500
Epoch 3 Loss: 0.658 Val Loss: 2.087
Accuracy: 0.500
Epoch 4 Loss: 0.551 Val Loss: 1.468
Accuracy: 0.500
Epoch 5 Loss: 0.409 Val Loss: 0.933
Accuracy: 0.514
Epoch 6 Loss: 0.317 Val Loss: 0.635
Accuracy: 0.646
Epoch 7 Loss: 0.322 Val Loss: 0.535
Accuracy: 0.766
Epoch 8 Loss: 0.367 Val Loss: 0.517
Accuracy: 0.765
Epoch 9 Loss: 0.366 Val Loss: 0.540
Accuracy: 0.769
Epoch 10 Loss: 0.326 Val Loss: 0.603
Accuracy: 0.679
Epoch 11 Loss: 0.287 Val Loss: 0.697
Accuracy: 0.630
Epoch 12 Loss: 0.270 Val Loss: 0.795
Accuracy: 0.604
Epoch 13 Loss: 0.269 Val Loss: 0.872
Accuracy: 0.604
Epoch 14 Loss: 0.274 Val Loss: 0.915
Accuracy: 0.579
Epoch 15 Loss: 0.278 Val Loss: 0.919
Accuracy: 0.595
Epoch 16 Loss: 0.276 Val Loss: 0.889
Accuracy: 0.604
Epoch 17 Loss: 0.269 Val Loss: 0.833
Accuracy: 0.620
Epoch 18 Loss:

100%|██████████| 800/800 [00:53<00:00, 14.97it/s]


Morgan fingerprints shape: (240, 256)
Training data size: 178648, Validation data size: 11466
Epoch 1 Loss: 0.951 Val Loss: 2.302
Accuracy: 0.500
Epoch 2 Loss: 0.612 Val Loss: 2.514
Accuracy: 0.500
Epoch 3 Loss: 0.658 Val Loss: 2.080
Accuracy: 0.500
Epoch 4 Loss: 0.550 Val Loss: 1.465
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.935
Accuracy: 0.513
Epoch 6 Loss: 0.316 Val Loss: 0.642
Accuracy: 0.634
Epoch 7 Loss: 0.322 Val Loss: 0.545
Accuracy: 0.759
Epoch 8 Loss: 0.366 Val Loss: 0.528
Accuracy: 0.758
Epoch 9 Loss: 0.365 Val Loss: 0.550
Accuracy: 0.756
Epoch 10 Loss: 0.325 Val Loss: 0.613
Accuracy: 0.669
Epoch 11 Loss: 0.286 Val Loss: 0.705
Accuracy: 0.626
Epoch 12 Loss: 0.269 Val Loss: 0.802
Accuracy: 0.603
Epoch 13 Loss: 0.268 Val Loss: 0.879
Accuracy: 0.587
Epoch 14 Loss: 0.274 Val Loss: 0.921
Accuracy: 0.576
Epoch 15 Loss: 0.277 Val Loss: 0.925
Accuracy: 0.576
Epoch 16 Loss: 0.275 Val Loss: 0.895
Accuracy: 0.603
Epoch 17 Loss: 0.268 Val Loss: 0.841
Accuracy: 0.613
Epoch 18 Loss:

100%|██████████| 800/800 [01:40<00:00,  7.92it/s]


Morgan fingerprints shape: (240, 256)
Training data size: 178650, Validation data size: 11464
Epoch 1 Loss: 0.951 Val Loss: 2.319
Accuracy: 0.500
Epoch 2 Loss: 0.611 Val Loss: 2.531
Accuracy: 0.500
Epoch 3 Loss: 0.658 Val Loss: 2.093
Accuracy: 0.500
Epoch 4 Loss: 0.550 Val Loss: 1.473
Accuracy: 0.500
Epoch 5 Loss: 0.409 Val Loss: 0.937
Accuracy: 0.514
Epoch 6 Loss: 0.317 Val Loss: 0.639
Accuracy: 0.643
Epoch 7 Loss: 0.322 Val Loss: 0.539
Accuracy: 0.763
Epoch 8 Loss: 0.367 Val Loss: 0.521
Accuracy: 0.763
Epoch 9 Loss: 0.366 Val Loss: 0.543
Accuracy: 0.758
Epoch 10 Loss: 0.325 Val Loss: 0.606
Accuracy: 0.687
Epoch 11 Loss: 0.287 Val Loss: 0.699
Accuracy: 0.632
Epoch 12 Loss: 0.270 Val Loss: 0.796
Accuracy: 0.604
Epoch 13 Loss: 0.269 Val Loss: 0.873
Accuracy: 0.592
Epoch 14 Loss: 0.274 Val Loss: 0.916
Accuracy: 0.579
Epoch 15 Loss: 0.278 Val Loss: 0.919
Accuracy: 0.592
Epoch 16 Loss: 0.276 Val Loss: 0.889
Accuracy: 0.604
Epoch 17 Loss: 0.269 Val Loss: 0.833
Accuracy: 0.619
Epoch 18 Loss:

100%|██████████| 800/800 [01:04<00:00, 12.32it/s]


Morgan fingerprints shape: (240, 256)
Training data size: 178650, Validation data size: 11464
Epoch 1 Loss: 0.951 Val Loss: 2.309
Accuracy: 0.500
Epoch 2 Loss: 0.612 Val Loss: 2.523
Accuracy: 0.500
Epoch 3 Loss: 0.658 Val Loss: 2.087
Accuracy: 0.500
Epoch 4 Loss: 0.550 Val Loss: 1.470
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.937
Accuracy: 0.514
Epoch 6 Loss: 0.316 Val Loss: 0.641
Accuracy: 0.642
Epoch 7 Loss: 0.322 Val Loss: 0.542
Accuracy: 0.760
Epoch 8 Loss: 0.366 Val Loss: 0.525
Accuracy: 0.759
Epoch 9 Loss: 0.365 Val Loss: 0.548
Accuracy: 0.755
Epoch 10 Loss: 0.325 Val Loss: 0.612
Accuracy: 0.690
Epoch 11 Loss: 0.286 Val Loss: 0.705
Accuracy: 0.632
Epoch 12 Loss: 0.269 Val Loss: 0.803
Accuracy: 0.605
Epoch 13 Loss: 0.268 Val Loss: 0.881
Accuracy: 0.593
Epoch 14 Loss: 0.274 Val Loss: 0.924
Accuracy: 0.578
Epoch 15 Loss: 0.277 Val Loss: 0.929
Accuracy: 0.593
Epoch 16 Loss: 0.275 Val Loss: 0.899
Accuracy: 0.605
Epoch 17 Loss: 0.268 Val Loss: 0.844
Accuracy: 0.621
Epoch 18 Loss:

100%|██████████| 800/800 [01:00<00:00, 13.21it/s]


Morgan fingerprints shape: (240, 256)
Training data size: 178650, Validation data size: 11464
Epoch 1 Loss: 0.951 Val Loss: 2.303
Accuracy: 0.500
Epoch 2 Loss: 0.612 Val Loss: 2.516
Accuracy: 0.500
Epoch 3 Loss: 0.658 Val Loss: 2.081
Accuracy: 0.500
Epoch 4 Loss: 0.550 Val Loss: 1.466
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.935
Accuracy: 0.513
Epoch 6 Loss: 0.317 Val Loss: 0.640
Accuracy: 0.642
Epoch 7 Loss: 0.322 Val Loss: 0.541
Accuracy: 0.762
Epoch 8 Loss: 0.366 Val Loss: 0.524
Accuracy: 0.761
Epoch 9 Loss: 0.365 Val Loss: 0.547
Accuracy: 0.757
Epoch 10 Loss: 0.325 Val Loss: 0.610
Accuracy: 0.690
Epoch 11 Loss: 0.287 Val Loss: 0.703
Accuracy: 0.633
Epoch 12 Loss: 0.269 Val Loss: 0.801
Accuracy: 0.607
Epoch 13 Loss: 0.269 Val Loss: 0.878
Accuracy: 0.577
Epoch 14 Loss: 0.274 Val Loss: 0.921
Accuracy: 0.577
Epoch 15 Loss: 0.277 Val Loss: 0.925
Accuracy: 0.577
Epoch 16 Loss: 0.276 Val Loss: 0.895
Accuracy: 0.607
Epoch 17 Loss: 0.268 Val Loss: 0.840
Accuracy: 0.634
Epoch 18 Loss:

In [8]:
# Write the collected labels and predictions with the folds laid out along the
# rows. The file names are derived from `data` so they always match the dataset
# that was actually loaded.
true_datas.T.reset_index(drop=True).to_csv(f"true_{data}.csv")
predict_datas.T.reset_index(drop=True).to_csv(f"pred_{data}.csv")