In [8]:
import argparse
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [9]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import NewSampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Dataset name. NOTE(review): no other cell shown here reads `data`; Args sets its own "nci".
data = "nci"
# Directory prefix that main() prepends to the gene-expression CSV file names.
PATH = "../nci_data/"

In [11]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
class Args:
    """Minimal settings object handed to ``load_data``."""

    def __init__(self):
        # Dataset key understood by load_data; this notebook loads "nci".
        self.data = "nci"
        # torch.device selected earlier in the notebook.
        self.device = device


args = Args()
(res, drug_feature, exprs, mut, cna, null_mask, pos_num) = load_data(args)

# Positional index -> label lookups for the rows (cells) and columns (drugs) of `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load nci


In [13]:
# Totals of `res` along each axis: one value per column (drug) and one per row (cell).
drug_sum = res.sum(axis=0)
cell_sum = res.sum(axis=1)

# Axes of `res` to evaluate: 0 = cell, 1 = drug.
# Only the cell axis is enabled; append 1 to also evaluate drugs.
target_dim = [0]

In [26]:
def main(PATH, train, val):
    """Train the autoencoder and the DF model on one train/validation split.

    Steps: load gene expression data, train an autoencoder on it, encode the
    expression matrix into compressed features, compute Morgan fingerprints
    for the drugs, assemble model inputs for the given pairs, and train the
    DF model.

    NOTE: the autoencoder is retrained from scratch on every call, so a caller
    looping over many targets pays that cost each time.

    Parameters
    ----------
    PATH : str
        Directory containing ``gene_exp_part1.csv.gz`` and
        ``gene_exp_part2.csv.gz``. A trailing slash is optional.
    train, val : pandas.DataFrame
        Frames with columns ``0``, ``1`` and ``2``. Column ``2`` holds the
        labels. As built by the ``DeepDSC`` wrapper in this notebook, column
        ``0`` holds cell labels and column ``1`` drug labels.

    Returns
    -------
    tuple
        The two values returned by ``train_df_model`` -- presumably the
        validation labels and the best validation outputs; confirm against
        ``DeepDSC.DeepDSC.train_df_model``.
    """
    import os  # local import: only needed to build the two input paths

    print("Loading gene expression data...")
    # Join both file names the same way so PATH works with or without a
    # trailing slash (the two paths used to be concatenated differently).
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder.
    print("Training autoencoder...")
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)
    print("Autoencoder training completed.")

    # Extract the compressed features.
    print("Extracting compressed features...")
    # Inference only: skip building an autograd graph for this forward pass.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )
    print(f"Compressed features shape: {compressed_features.shape}")
    drug_response, nsc_sm = prepare_drug_data(is_nsc=True)
    mfp = calculate_morgan_fingerprints(drug_response.T, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Convert labels through NumPy: handing a pandas Series straight to
    # torch.tensor goes element by element and depends on the index labels.
    train_labels = torch.tensor(train[2].to_numpy()).double().to(device)
    val_labels = torch.tensor(val[2].to_numpy()).double().to(device)

    # Swap the two identifier columns and relabel them 0 and 1.
    train_data = train[[1, 0]]
    train_data.columns = [0, 1]
    val_data = val[[1, 0]]
    val_data.columns = [0, 1]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    out_val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        train_labels,
        val_labels,
    )
    print("DF model training completed.")
    return out_val_labels, best_val_out

In [27]:
def DeepDSC(res_mat, null_mask, target_dim, target_index, seed):
    """Run one DeepDSC train/evaluate round for a single held-out target.

    ``NewSampler`` supplies train/test value matrices and masks. Every
    non-zero mask entry becomes one row of (cell label, drug label, integer
    response), and the resulting train and test frames are passed to ``main``.

    Relies on the notebook globals ``res`` (axis labels), ``cells`` / ``drugs``
    (position -> label lookups) and ``PATH``.

    Returns the ``(val_labels, best_val_out)`` pair produced by ``main``.
    """
    sampler = NewSampler(res_mat, null_mask, target_dim, target_index, seed)

    def labelled(matrix):
        # Wrap a sampler matrix in a frame carrying the axis labels of `res`.
        return pd.DataFrame(matrix, index=res.index, columns=res.columns)

    def to_triplets(value_frame, mask_frame):
        # Coordinates of every non-zero mask entry, as (row positions, column positions).
        coords = mask_frame.values.nonzero()
        triplets = pd.DataFrame(coords).T
        triplets[2] = value_frame.values[coords].astype(int)
        # Replace positional indices with the corresponding cell / drug labels.
        triplets[0] = [cells[row] for row in triplets[0]]
        triplets[1] = [drugs[col] for col in triplets[1]]
        return triplets

    train = to_triplets(labelled(sampler.train_data), labelled(sampler.train_mask))
    test = to_triplets(labelled(sampler.test_data), labelled(sampler.test_mask))

    labels, outputs = main(PATH, train, test)
    return labels, outputs

In [28]:
# Leave-one-target-out evaluation: for every row/column of `res` along each
# axis in `target_dim`, run DeepDSC and collect its validation labels and
# predictions as one column block per run.
n_kfold = 1
true_data_s = pd.DataFrame()
predict_data_s = pd.DataFrame()
# Per-run frames are gathered in lists and concatenated once at the end.
# The lists also keep partial results if the long-running loop is interrupted.
true_frames = []
predict_frames = []
for dim in target_dim:
    # Totals along the axis being evaluated (drug totals for dim 1, cell totals for dim 0).
    totals = drug_sum if dim else cell_sum
    for seed, target_index in enumerate(tqdm(np.arange(res.shape[dim]))):
        # Positional lookup: `target_index` is a position, while the Series is
        # indexed by labels, so plain `[...]` integer indexing is ambiguous.
        if totals.iloc[target_index] < 10:
            continue
        for fold in range(n_kfold):
            val_labels, best_val_out = DeepDSC(
                res.values, null_mask.values, dim, target_index, seed
            )
            # Record every run inside the loop so no result is overwritten.
            true_frames.append(pd.DataFrame(val_labels.cpu().numpy()))
            predict_frames.append(pd.DataFrame(best_val_out.cpu().numpy()))

# Store under the names that the export cell writes to CSV.
if true_frames:
    true_data_s = pd.concat(true_frames, axis=1)
    predict_data_s = pd.concat(predict_frames, axis=1)

  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<04:23,  3.04it/s][A
  0%|          | 2/800 [00:00<04:01,  3.30it/s][A
  0%|          | 3/800 [00:00<03:51,  3.44it/s][A
  0%|          | 4/800 [00:01<03:47,  3.50it/s][A
  1%|          | 5/800 [00:01<03:44,  3.55it/s][A
  1%|          | 6/800 [00:01<03:41,  3.59it/s][A
  1%|          | 7/800 [00:02<03:57,  3.34it/s][A
  1%|          | 8/800 [00:02<03:55,  3.36it/s][A
  1%|          | 9/800 [00:02<03:50,  3.43it/s][A
  1%|▏         | 10/800 [00:02<03:50,  3.43it/s][A
  1%|▏         | 11/800 [00:03<03:50,  3.42it/s][A
  2%|▏         | 12/800 [00:03<03:52,  3.40it/s][A
  2%|▏         | 13/800 [00:03<03:57,  3.31it/s][A
  2%|▏         | 14/800 [00:04<03:58,  3.30it/s][A
  2%|▏         | 15/800 [00:04<03:54,  3.35it/s][A
  2%|▏         | 16/800 [00:04<03:48,  3.43it/s][A
  2%|▏         | 17/800 [00:04<03:46,  3.45it/s][A
  2%|▏         | 18/800 [00:05<03:44,  3.49it/s][A
  2%|▏         | 19/800 [00:0

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57035, Validation data size: 912
Epoch 1 Loss: 0.773 Val Loss: 2.133
Accuracy: 0.500
Epoch 2 Loss: 2.002 Val Loss: 1.471
Accuracy: 0.500
Epoch 3 Loss: 1.394 Val Loss: 0.747
Accuracy: 0.512
Epoch 4 Loss: 0.738 Val Loss: 0.803
Accuracy: 0.498
Epoch 5 Loss: 0.839 Val Loss: 0.922
Accuracy: 0.501
Epoch 6 Loss: 0.970 Val Loss: 0.849
Accuracy: 0.500
Epoch 7 Loss: 0.889 Val Loss: 0.724
Accuracy: 0.499
Epoch 8 Loss: 0.749 Val Loss: 0.682
Accuracy: 0.559
Epoch 9 Loss: 0.688 Val Loss: 0.730
Accuracy: 0.507
Epoch 10 Loss: 0.719 Val Loss: 0.787
Accuracy: 0.501
Epoch 11 Loss: 0.767 Val Loss: 0.801
Accuracy: 0.501
Epoch 12 Loss: 0.779 Val Loss: 0.773
Accuracy: 0.500
Epoch 13 Loss: 0.755 Val Loss: 0.728
Accuracy: 0.501
Epoch 14 Loss: 0.718 Val Loss: 0.692
Accuracy: 0.511
Epoch 15 Loss: 0.690 Val Loss: 0.675
Accuracy: 0.577
Epoch 16 Loss: 0.68

  2%|▏         | 1/60 [08:47<8:38:28, 527.26s/it]

Epoch 100 Loss: 0.661 Val Loss: 0.659
Accuracy: 0.617
DF model training completed.


  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<07:09,  1.86it/s][A
  0%|          | 2/800 [00:01<06:33,  2.03it/s][A
  0%|          | 3/800 [00:01<06:31,  2.04it/s][A
  0%|          | 4/800 [00:02<06:57,  1.91it/s][A
  1%|          | 5/800 [00:02<06:45,  1.96it/s][A
  1%|          | 6/800 [00:03<06:34,  2.01it/s][A
  1%|          | 7/800 [00:03<06:46,  1.95it/s][A
  1%|          | 8/800 [00:04<06:48,  1.94it/s][A
  1%|          | 9/800 [00:04<06:33,  2.01it/s][A
  1%|▏         | 10/800 [00:04<06:15,  2.10it/s][A
  1%|▏         | 11/800 [00:05<06:07,  2.14it/s][A
  2%|▏         | 12/800 [00:05<06:13,  2.11it/s][A
  2%|▏         | 13/800 [00:06<06:08,  2.14it/s][A
  2%|▏         | 14/800 [00:06<06:12,  2.11it/s][A
  2%|▏         | 15/800 [00:07<06:09,  2.12it/s][A
  2%|▏         | 16/800 [00:07<06:09,  2.12it/s][A
  2%|▏         | 17/800 [00:08<06:22,  2.05it/s][A
  2%|▏         | 18/800 [00:08<06:26,  2.02it/s][A
  2%|▏         | 19/800 [00:0

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57044, Validation data size: 975
Epoch 1 Loss: 0.775 Val Loss: 2.661
Accuracy: 0.392
Epoch 2 Loss: 1.990 Val Loss: 1.863
Accuracy: 0.392
Epoch 3 Loss: 1.392 Val Loss: 0.881
Accuracy: 0.389
Epoch 4 Loss: 0.739 Val Loss: 0.716
Accuracy: 0.614
Epoch 5 Loss: 0.835 Val Loss: 0.777
Accuracy: 0.609
Epoch 6 Loss: 0.967 Val Loss: 0.738
Accuracy: 0.610
Epoch 7 Loss: 0.889 Val Loss: 0.691
Accuracy: 0.606
Epoch 8 Loss: 0.748 Val Loss: 0.735
Accuracy: 0.402
Epoch 9 Loss: 0.687 Val Loss: 0.852
Accuracy: 0.383
Epoch 10 Loss: 0.717 Val Loss: 0.950
Accuracy: 0.394
Epoch 11 Loss: 0.765 Val Loss: 0.975
Accuracy: 0.393
Epoch 12 Loss: 0.778 Val Loss: 0.934
Accuracy: 0.394
Epoch 13 Loss: 0.755 Val Loss: 0.861
Accuracy: 0.393
Epoch 14 Loss: 0.717 Val Loss: 0.789
Accuracy: 0.388
Epoch 15 Loss: 0.689 Val Loss: 0.738
Accuracy: 0.403
Epoch 16 Loss: 0.68

  3%|▎         | 2/60 [18:40<9:07:10, 566.05s/it]

Epoch 100 Loss: 0.659 Val Loss: 0.744
Accuracy: 0.428
DF model training completed.


  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<07:45,  1.72it/s][A
  0%|          | 2/800 [00:01<06:59,  1.90it/s][A
  0%|          | 3/800 [00:01<07:02,  1.88it/s][A
  0%|          | 4/800 [00:02<07:07,  1.86it/s][A
  1%|          | 5/800 [00:02<06:44,  1.96it/s][A
  1%|          | 6/800 [00:03<06:39,  1.99it/s][A
  1%|          | 7/800 [00:03<06:28,  2.04it/s][A
  1%|          | 8/800 [00:04<06:16,  2.10it/s][A
  1%|          | 9/800 [00:04<06:39,  1.98it/s][A
  1%|▏         | 10/800 [00:05<06:41,  1.97it/s][A
  1%|▏         | 11/800 [00:05<06:41,  1.96it/s][A
  2%|▏         | 12/800 [00:06<06:28,  2.03it/s][A
  2%|▏         | 13/800 [00:06<06:36,  1.99it/s][A
  2%|▏         | 14/800 [00:07<06:19,  2.07it/s][A
  2%|▏         | 15/800 [00:07<06:22,  2.05it/s][A
  2%|▏         | 16/800 [00:08<06:35,  1.98it/s][A
  2%|▏         | 17/800 [00:08<06:35,  1.98it/s][A
  2%|▏         | 18/800 [00:09<06:36,  1.97it/s][A
  2%|▏         | 19/800 [00:0

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57026, Validation data size: 906
Epoch 1 Loss: 0.774 Val Loss: 2.145
Accuracy: 0.500
Epoch 2 Loss: 2.002 Val Loss: 1.479
Accuracy: 0.500
Epoch 3 Loss: 1.393 Val Loss: 0.746
Accuracy: 0.509
Epoch 4 Loss: 0.738 Val Loss: 0.792
Accuracy: 0.499
Epoch 5 Loss: 0.839 Val Loss: 0.910
Accuracy: 0.500
Epoch 6 Loss: 0.969 Val Loss: 0.840
Accuracy: 0.501
Epoch 7 Loss: 0.889 Val Loss: 0.721
Accuracy: 0.502
Epoch 8 Loss: 0.749 Val Loss: 0.685
Accuracy: 0.552
Epoch 9 Loss: 0.688 Val Loss: 0.736
Accuracy: 0.504
Epoch 10 Loss: 0.719 Val Loss: 0.795
Accuracy: 0.501
Epoch 11 Loss: 0.767 Val Loss: 0.809
Accuracy: 0.501
Epoch 12 Loss: 0.779 Val Loss: 0.780
Accuracy: 0.502
Epoch 13 Loss: 0.755 Val Loss: 0.734
Accuracy: 0.500
Epoch 14 Loss: 0.718 Val Loss: 0.696
Accuracy: 0.517
Epoch 15 Loss: 0.690 Val Loss: 0.678
Accuracy: 0.577
Epoch 16 Loss: 0.68

  5%|▌         | 3/60 [27:26<8:40:28, 547.87s/it]

Epoch 100 Loss: 0.661 Val Loss: 0.647
Accuracy: 0.606
DF model training completed.


  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<04:08,  3.22it/s][A
  0%|          | 2/800 [00:00<03:53,  3.42it/s][A
  0%|          | 3/800 [00:00<03:57,  3.36it/s][A
  0%|          | 4/800 [00:01<04:01,  3.30it/s][A
  1%|          | 5/800 [00:01<04:03,  3.27it/s][A
  1%|          | 6/800 [00:01<04:06,  3.22it/s][A
  1%|          | 7/800 [00:02<04:08,  3.19it/s][A
  1%|          | 8/800 [00:02<04:08,  3.18it/s][A
  1%|          | 9/800 [00:02<04:08,  3.18it/s][A
  1%|▏         | 10/800 [00:03<04:10,  3.15it/s][A
  1%|▏         | 11/800 [00:03<04:12,  3.12it/s][A
  2%|▏         | 12/800 [00:03<04:11,  3.14it/s][A
  2%|▏         | 13/800 [00:04<04:10,  3.14it/s][A
  2%|▏         | 14/800 [00:04<04:11,  3.12it/s][A
  2%|▏         | 15/800 [00:04<04:10,  3.14it/s][A
  2%|▏         | 16/800 [00:05<04:09,  3.14it/s][A
  2%|▏         | 17/800 [00:05<04:11,  3.11it/s][A
  2%|▏         | 18/800 [00:05<04:23,  2.97it/s][A
  2%|▏         | 19/800 [00:0

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57035, Validation data size: 984
Epoch 1 Loss: 0.774 Val Loss: 2.324
Accuracy: 0.459
Epoch 2 Loss: 1.997 Val Loss: 1.609
Accuracy: 0.459
Epoch 3 Loss: 1.394 Val Loss: 0.790
Accuracy: 0.465
Epoch 4 Loss: 0.739 Val Loss: 0.763
Accuracy: 0.539
Epoch 5 Loss: 0.837 Val Loss: 0.863
Accuracy: 0.540
Epoch 6 Loss: 0.968 Val Loss: 0.804
Accuracy: 0.540
Epoch 7 Loss: 0.889 Val Loss: 0.709
Accuracy: 0.542
Epoch 8 Loss: 0.749 Val Loss: 0.700
Accuracy: 0.516
Epoch 9 Loss: 0.687 Val Loss: 0.774
Accuracy: 0.460
Epoch 10 Loss: 0.718 Val Loss: 0.848
Accuracy: 0.459
Epoch 11 Loss: 0.766 Val Loss: 0.867
Accuracy: 0.460
Epoch 12 Loss: 0.779 Val Loss: 0.836
Accuracy: 0.459
Epoch 13 Loss: 0.755 Val Loss: 0.782
Accuracy: 0.451
Epoch 14 Loss: 0.718 Val Loss: 0.734
Accuracy: 0.437
Epoch 15 Loss: 0.690 Val Loss: 0.706
Accuracy: 0.514
Epoch 16 Loss: 0.68

  7%|▋         | 4/60 [36:47<8:36:01, 552.89s/it]

Epoch 100 Loss: 0.660 Val Loss: 0.707
Accuracy: 0.528
DF model training completed.


  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<05:59,  2.22it/s][A
  0%|          | 2/800 [00:01<06:48,  1.95it/s][A
  0%|          | 3/800 [00:01<06:36,  2.01it/s][A
  0%|          | 4/800 [00:01<06:27,  2.06it/s][A
  1%|          | 5/800 [00:02<06:07,  2.17it/s][A
  1%|          | 6/800 [00:02<06:08,  2.15it/s][A
  1%|          | 7/800 [00:03<06:15,  2.11it/s][A
  1%|          | 8/800 [00:03<06:03,  2.18it/s][A
  1%|          | 9/800 [00:04<05:58,  2.21it/s][A
  1%|▏         | 10/800 [00:04<06:13,  2.11it/s][A
  1%|▏         | 11/800 [00:05<06:08,  2.14it/s][A
  2%|▏         | 12/800 [00:05<06:15,  2.10it/s][A
  2%|▏         | 13/800 [00:06<06:27,  2.03it/s][A
  2%|▏         | 14/800 [00:06<06:20,  2.06it/s][A
  2%|▏         | 15/800 [00:07<06:29,  2.02it/s][A
  2%|▏         | 16/800 [00:07<06:31,  2.00it/s][A
  2%|▏         | 17/800 [00:08<06:30,  2.00it/s][A
  2%|▏         | 18/800 [00:08<06:08,  2.12it/s][A
  2%|▏         | 19/800 [00:0

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57079, Validation data size: 646
Epoch 1 Loss: 0.773 Val Loss: 2.155
Accuracy: 0.500
Epoch 2 Loss: 2.008 Val Loss: 1.496
Accuracy: 0.500
Epoch 3 Loss: 1.392 Val Loss: 0.768
Accuracy: 0.508
Epoch 4 Loss: 0.736 Val Loss: 0.824
Accuracy: 0.502
Epoch 5 Loss: 0.842 Val Loss: 0.941
Accuracy: 0.502
Epoch 6 Loss: 0.970 Val Loss: 0.867
Accuracy: 0.500
Epoch 7 Loss: 0.888 Val Loss: 0.745
Accuracy: 0.498
Epoch 8 Loss: 0.748 Val Loss: 0.705
Accuracy: 0.497
Epoch 9 Loss: 0.687 Val Loss: 0.755
Accuracy: 0.498
Epoch 10 Loss: 0.720 Val Loss: 0.813
Accuracy: 0.500
Epoch 11 Loss: 0.768 Val Loss: 0.827
Accuracy: 0.500
Epoch 12 Loss: 0.779 Val Loss: 0.799
Accuracy: 0.498
Epoch 13 Loss: 0.755 Val Loss: 0.754
Accuracy: 0.502
Epoch 14 Loss: 0.717 Val Loss: 0.718
Accuracy: 0.502
Epoch 15 Loss: 0.689 Val Loss: 0.702
Accuracy: 0.491
Epoch 16 Loss: 0.68

  7%|▋         | 4/60 [44:32<10:23:41, 668.23s/it]


KeyboardInterrupt: 

In [None]:
# Write both result frames to CSV in the working directory; file names embed args.data.
true_data_s.to_csv(f"new_cell_true_{args.data}.csv")
predict_data_s.to_csv(f"new_cell_pred_{args.data}.csv")