In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import NewSampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Directory holding the gene-expression archives that main() reads.
# NOTE(review): this cell names gdsc1, while Args (next cells) sets
# data = "gdsc2" for load_data -- confirm the mix is intentional.
PATH = "../gdsc1_data/"
# Dataset tag; not referenced by any other cell visible in this notebook.
data = "gdsc1"

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
class Args:
    """Plain attribute container handed to ``load_data``."""

    def __init__(self):
        self.device = device  # cuda:number or cpu
        self.data = "gdsc2"  # Dataset{gdsc or ccle}


args = Args()
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Position -> label lookups for the rows (index) and columns of ``res``.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load gdsc2


In [6]:
# Totals of ``res`` along each axis: one value per row (cell_sum) and one
# value per column (drug_sum).
cell_sum = res.sum(axis=1)
drug_sum = res.sum(axis=0)

# Axes of ``res`` whose entries are held out one at a time.
# NOTE(review): the labels below come from the original cell; the evaluation
# loop filters dim 1 with ``drug_sum`` and writes ``new_drug_*`` files, so
# confirm which axis is really meant by "Cell".
target_dim = [
    # 0,  # Drug
    1  # Cell
]

In [7]:
def main(PATH, train, val):
    """Run the DeepDSC pipeline for a single train/validation split.

    The function (1) loads gene expression and trains an autoencoder on it,
    (2) encodes every sample into compressed features, (3) loads drug data and
    computes Morgan fingerprints, and (4) trains the ``DF`` model on the given
    split.

    NOTE(review): steps 1-3 do not depend on ``train``/``val`` yet are repeated
    on every call; caching them would save most of the per-target runtime, but
    would also change which autoencoder weights each target sees.

    Parameters
    ----------
    PATH : str
        Directory containing ``gene_exp_part1.csv.gz`` and
        ``gene_exp_part2.csv.gz``. A trailing separator is optional.
    train, val : pandas.DataFrame
        Columns ``0`` and ``1`` are forwarded to ``prepare_train_val_test_data``
        as the sample pairs; column ``2`` holds the labels.

    Returns
    -------
    tuple
        ``(val_labels, best_val_out)`` exactly as returned by
        ``train_df_model``.
    """
    # os.path.join yields a valid path whether or not PATH ends with a slash
    # (the previous string concatenation silently required one for part2).
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder.
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)

    # Extract the compressed features. Only the values are needed, so skip
    # autograd bookkeeping for this forward pass.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )

    # Prepare the drug response data.
    drug_response, nsc_sm = prepare_drug_data(is_nsc=False, is_gdsc=True, is_1=False)
    mfp = calculate_morgan_fingerprints(drug_response, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Convert labels through NumPy so the tensor is built from the values in
    # one step, independent of the Series index labels.
    train_labels = torch.tensor(train[2].to_numpy()).double().to(device)
    val_labels = torch.tensor(val[2].to_numpy()).double().to(device)
    train_data = train[[0, 1]]
    val_data = val[[0, 1]]
    val_data.columns = [0, 1]

    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    # Distinct names so the model's outputs do not shadow the input labels.
    val_labels_out, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        train_labels,
        val_labels,
    )
    print("DF model training completed.")
    return val_labels_out, best_val_out

In [8]:
def DeepDSC(res_mat, null_mask, target_dim, target_index, seed):
    """Hold out one target with ``NewSampler`` and evaluate it via ``main``.

    The sampler's train/test matrices and masks are turned into
    ``(row, column, label)`` tables, positions are mapped to names through the
    notebook-level ``cells`` / ``drugs`` dicts, and both tables are passed to
    ``main`` together with the global ``PATH``.

    Returns ``(None, None)`` after printing a notice when the held-out labels
    contain fewer than two distinct values; otherwise returns the
    ``(val_labels, best_val_out)`` pair produced by ``main``.
    """
    sampler = NewSampler(res_mat, null_mask, target_dim, target_index, seed)

    def _labelled(matrix):
        # Attach the row/column labels of the global ``res`` to a sampler matrix.
        return pd.DataFrame(matrix, index=res.index, columns=res.columns)

    def _triplets(values, mask):
        # One row per non-zero mask entry: row position, column position, int label.
        positions = mask.values.nonzero()
        table = pd.DataFrame(positions).T
        table[2] = values.values[positions].astype(int)
        return table

    train_values = _labelled(sampler.train_data)
    test_values = _labelled(sampler.test_data)
    train_selected = _labelled(sampler.train_mask)
    test_selected = _labelled(sampler.test_mask)

    train = _triplets(train_values, train_selected)
    test = _triplets(test_values, test_selected)

    # A single-class held-out set is skipped before any training happens.
    if len(np.unique(test[2])) < 2:
        print(f"Target {target_index} skipped: Validation set has only one class.")
        return None, None

    # Replace positional indices with the corresponding names.
    for split in (train, test):
        split[0] = [cells[i] for i in split[0]]
        split[1] = [drugs[i] for i in split[1]]

    held_out_labels, held_out_scores = main(PATH, train, test)
    return held_out_labels, held_out_scores

In [None]:
n_kfold = 1

# Each evaluated target contributes one single-column frame; they are joined
# once at the end instead of re-concatenating the growing result every
# iteration (which copies all previous columns each time).
true_frames = []
predict_frames = []
try:
    for dim in target_dim:
        # A truthy dim is filtered on drug_sum, dim 0 on cell_sum.
        target_sums = drug_sum if dim else cell_sum
        for seed, target_index in enumerate(tqdm(np.arange(res.shape[dim]))):
            # target_index is a position, not a label: use .iloc so pandas does
            # not fall back to the deprecated positional Series[int] lookup.
            if target_sums.iloc[target_index] < 10:
                continue
            val_labels, best_val_out = DeepDSC(
                res.values, null_mask.T.values, dim, target_index, seed
            )

            # DeepDSC returns (None, None) for targets it skipped.
            if val_labels is None:
                continue
            true_frames.append(pd.DataFrame(val_labels.cpu().numpy()))
            predict_frames.append(pd.DataFrame(best_val_out.cpu().numpy()))
finally:
    # Built in ``finally`` so partial results remain available if this
    # long-running loop is interrupted or fails part-way.
    true_datas = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()
    predict_datas = (
        pd.concat(predict_frames, axis=1) if predict_frames else pd.DataFrame()
    )

  if drug_sum[target_index] < 10:

  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<02:18,  5.76it/s][A
  1%|▏         | 10/800 [00:00<00:18, 41.86it/s][A
  2%|▏         | 16/800 [00:00<00:24, 31.88it/s][A
  2%|▎         | 20/800 [00:00<00:26, 29.51it/s][A
  3%|▎         | 24/800 [00:00<00:27, 28.07it/s][A
  4%|▎         | 28/800 [00:00<00:28, 27.16it/s][A
  4%|▍         | 31/800 [00:01<00:28, 26.67it/s][A
  4%|▍         | 34/800 [00:01<00:29, 26.25it/s][A
  5%|▍         | 37/800 [00:01<00:29, 25.93it/s][A
  5%|▌         | 40/800 [00:01<00:29, 25.67it/s][A
  5%|▌         | 43/800 [00:01<00:29, 25.48it/s][A
  6%|▌         | 46/800 [00:01<00:29, 25.37it/s][A
  6%|▌         | 49/800 [00:01<00:29, 25.33it/s][A
  6%|▋         | 52/800 [00:01<00:29, 25.31it/s][A
  7%|▋         | 55/800 [00:02<00:29, 25.30it/s][A
  7%|▋         | 58/800 [00:02<00:29, 25.29it/s][A
  8%|▊         | 61/800 [00:02<00:29, 25.25it/s][A
  8%|▊         | 64/800 [00:02<00:29, 2

Morgan fingerprints shape: (240, 256)
Training data size: 189889, Validation data size: 64
Epoch 1 Loss: 0.940 Val Loss: 2.281
Accuracy: 0.500
Epoch 2 Loss: 0.712 Val Loss: 2.327
Accuracy: 0.500
Epoch 3 Loss: 0.733 Val Loss: 1.746
Accuracy: 0.500
Epoch 4 Loss: 0.577 Val Loss: 1.065
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.598
Accuracy: 0.609
Epoch 6 Loss: 0.339 Val Loss: 0.423
Accuracy: 0.844
Epoch 7 Loss: 0.383 Val Loss: 0.382
Accuracy: 0.875
Epoch 8 Loss: 0.416 Val Loss: 0.372
Accuracy: 0.875
Epoch 9 Loss: 0.385 Val Loss: 0.392
Accuracy: 0.828
Epoch 10 Loss: 0.333 Val Loss: 0.450
Accuracy: 0.719
Epoch 11 Loss: 0.301 Val Loss: 0.531
Accuracy: 0.656
Epoch 12 Loss: 0.294 Val Loss: 0.605
Accuracy: 0.625
Epoch 13 Loss: 0.300 Val Loss: 0.652
Accuracy: 0.625
Epoch 14 Loss: 0.306 Val Loss: 0.660
Accuracy: 0.625
Epoch 15 Loss: 0.306 Val Loss: 0.632
Accuracy: 0.625
Epoch 16 Loss: 0.299 Val Loss: 0.578
Accuracy: 0.656
Epoch 17 Loss: 0.287 Val Loss: 0.511
Accuracy: 0.688
Epoch 18 Loss: 0.

  0%|          | 1/910 [01:03<16:08:18, 63.91s/it]

Epoch 99 Loss: 0.189 Val Loss: 0.260
Accuracy: 0.891
Epoch 100 Loss: 0.189 Val Loss: 0.260
Accuracy: 0.891
DF model training completed.


  if drug_sum[target_index] < 10:

  0%|          | 0/800 [00:00<?, ?it/s][A
  1%|          | 9/800 [00:00<00:09, 79.88it/s][A
  2%|▏         | 17/800 [00:00<00:21, 35.75it/s][A
  3%|▎         | 22/800 [00:00<00:24, 31.42it/s][A
  3%|▎         | 26/800 [00:00<00:26, 29.42it/s][A
  4%|▍         | 30/800 [00:00<00:27, 28.07it/s][A
  4%|▍         | 33/800 [00:01<00:28, 27.31it/s][A
  4%|▍         | 36/800 [00:01<00:28, 26.69it/s][A
  5%|▍         | 39/800 [00:01<00:29, 26.22it/s][A
  5%|▌         | 42/800 [00:01<00:29, 25.85it/s][A
  6%|▌         | 45/800 [00:01<00:29, 25.64it/s][A
  6%|▌         | 48/800 [00:01<00:29, 25.48it/s][A
  6%|▋         | 51/800 [00:01<00:29, 25.40it/s][A
  7%|▋         | 54/800 [00:01<00:29, 25.35it/s][A
  7%|▋         | 57/800 [00:02<00:29, 25.29it/s][A
  8%|▊         | 60/800 [00:02<00:29, 25.23it/s][A
  8%|▊         | 63/800 [00:02<00:29, 25.17it/s][A
  8%|▊         | 66/800 [00:02<00:29, 25.11it/s][A
  9%|▊         | 69/800 [00:02<00:29, 2

Morgan fingerprints shape: (240, 256)
Training data size: 189890, Validation data size: 70
Epoch 1 Loss: 0.940 Val Loss: 2.325
Accuracy: 0.500
Epoch 2 Loss: 0.712 Val Loss: 2.395
Accuracy: 0.500
Epoch 3 Loss: 0.733 Val Loss: 1.829
Accuracy: 0.500
Epoch 4 Loss: 0.577 Val Loss: 1.154
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.682
Accuracy: 0.600
Epoch 6 Loss: 0.339 Val Loss: 0.501
Accuracy: 0.829
Epoch 7 Loss: 0.383 Val Loss: 0.460
Accuracy: 0.857
Epoch 8 Loss: 0.416 Val Loss: 0.454
Accuracy: 0.857
Epoch 9 Loss: 0.385 Val Loss: 0.480
Accuracy: 0.814
Epoch 10 Loss: 0.333 Val Loss: 0.547
Accuracy: 0.700
Epoch 11 Loss: 0.301 Val Loss: 0.637
Accuracy: 0.643
Epoch 12 Loss: 0.294 Val Loss: 0.719
Accuracy: 0.614
Epoch 13 Loss: 0.300 Val Loss: 0.771
Accuracy: 0.614
Epoch 14 Loss: 0.306 Val Loss: 0.783
Accuracy: 0.614
Epoch 15 Loss: 0.306 Val Loss: 0.757
Accuracy: 0.614
Epoch 16 Loss: 0.299 Val Loss: 0.703
Accuracy: 0.643
Epoch 17 Loss: 0.287 Val Loss: 0.633
Accuracy: 0.671
Epoch 18 Loss: 0.

  0%|          | 2/910 [01:58<14:40:25, 58.18s/it]

Epoch 99 Loss: 0.189 Val Loss: 0.340
Accuracy: 0.857
Epoch 100 Loss: 0.189 Val Loss: 0.339
Accuracy: 0.857
DF model training completed.


  if drug_sum[target_index] < 10:

  0%|          | 0/800 [00:00<?, ?it/s][A
  1%|          | 9/800 [00:00<00:09, 79.81it/s][A
  2%|▏         | 17/800 [00:00<00:21, 35.83it/s][A
  3%|▎         | 22/800 [00:00<00:24, 31.28it/s][A
  3%|▎         | 26/800 [00:00<00:26, 29.30it/s][A
  4%|▍         | 30/800 [00:00<00:27, 28.02it/s][A
  4%|▍         | 33/800 [00:01<00:28, 27.30it/s][A
  4%|▍         | 36/800 [00:01<00:28, 26.73it/s][A
  5%|▍         | 39/800 [00:01<00:28, 26.27it/s][A
  5%|▌         | 42/800 [00:01<00:29, 25.91it/s][A
  6%|▌         | 45/800 [00:01<00:29, 25.61it/s][A
  6%|▌         | 48/800 [00:01<00:29, 25.43it/s][A
  6%|▋         | 51/800 [00:01<00:29, 25.33it/s][A
  7%|▋         | 54/800 [00:01<00:29, 25.22it/s][A
  7%|▋         | 57/800 [00:02<00:29, 25.13it/s][A
  8%|▊         | 60/800 [00:02<00:29, 25.06it/s][A
  8%|▊         | 63/800 [00:02<00:29, 25.04it/s][A
  8%|▊         | 66/800 [00:02<00:29, 25.06it/s][A
  9%|▊         | 69/800 [00:02<00:29, 2

Morgan fingerprints shape: (240, 256)
Training data size: 189890, Validation data size: 72
Epoch 1 Loss: 0.940 Val Loss: 2.356
Accuracy: 0.500
Epoch 2 Loss: 0.712 Val Loss: 2.424
Accuracy: 0.500
Epoch 3 Loss: 0.733 Val Loss: 1.853
Accuracy: 0.500
Epoch 4 Loss: 0.577 Val Loss: 1.172
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.692
Accuracy: 0.597
Epoch 6 Loss: 0.339 Val Loss: 0.500
Accuracy: 0.806
Epoch 7 Loss: 0.383 Val Loss: 0.454
Accuracy: 0.847
Epoch 8 Loss: 0.416 Val Loss: 0.451
Accuracy: 0.833
Epoch 9 Loss: 0.385 Val Loss: 0.484
Accuracy: 0.819
Epoch 10 Loss: 0.333 Val Loss: 0.558
Accuracy: 0.694
Epoch 11 Loss: 0.301 Val Loss: 0.652
Accuracy: 0.639
Epoch 12 Loss: 0.294 Val Loss: 0.737
Accuracy: 0.611
Epoch 13 Loss: 0.300 Val Loss: 0.790
Accuracy: 0.611
Epoch 14 Loss: 0.306 Val Loss: 0.802
Accuracy: 0.611
Epoch 15 Loss: 0.306 Val Loss: 0.774
Accuracy: 0.611
Epoch 16 Loss: 0.299 Val Loss: 0.717
Accuracy: 0.639
Epoch 17 Loss: 0.287 Val Loss: 0.645
Accuracy: 0.667
Epoch 18 Loss: 0.

  0%|          | 3/910 [02:52<14:11:39, 56.34s/it]

Epoch 99 Loss: 0.189 Val Loss: 0.259
Accuracy: 0.861
Epoch 100 Loss: 0.189 Val Loss: 0.258
Accuracy: 0.861
DF model training completed.


  if drug_sum[target_index] < 10:

  0%|          | 0/800 [00:00<?, ?it/s][A
  1%|          | 9/800 [00:00<00:09, 80.13it/s][A
  2%|▏         | 18/800 [00:00<00:23, 33.81it/s][A
  3%|▎         | 23/800 [00:00<00:25, 30.43it/s][A
  3%|▎         | 27/800 [00:00<00:26, 28.79it/s][A
  4%|▍         | 31/800 [00:01<00:27, 27.70it/s][A
  4%|▍         | 34/800 [00:01<00:28, 27.07it/s][A
  5%|▍         | 37/800 [00:01<00:28, 26.55it/s][A
  5%|▌         | 40/800 [00:01<00:29, 26.15it/s][A
  5%|▌         | 43/800 [00:01<00:29, 25.83it/s][A
  6%|▌         | 46/800 [00:01<00:29, 25.58it/s][A
  6%|▌         | 49/800 [00:01<00:29, 25.38it/s][A
  6%|▋         | 52/800 [00:01<00:29, 25.24it/s][A
  7%|▋         | 55/800 [00:01<00:29, 25.12it/s][A
  7%|▋         | 58/800 [00:02<00:29, 25.04it/s][A
  8%|▊         | 61/800 [00:02<00:29, 24.99it/s][A
  8%|▊         | 64/800 [00:02<00:29, 24.95it/s][A
  8%|▊         | 67/800 [00:02<00:29, 24.94it/s][A
  9%|▉         | 70/800 [00:02<00:29, 2

Morgan fingerprints shape: (240, 256)
Training data size: 189938, Validation data size: 42
Epoch 1 Loss: 0.939 Val Loss: 2.230
Accuracy: 0.500
Epoch 2 Loss: 0.712 Val Loss: 2.348
Accuracy: 0.500
Epoch 3 Loss: 0.733 Val Loss: 1.835
Accuracy: 0.500
Epoch 4 Loss: 0.577 Val Loss: 1.209
Accuracy: 0.500
Epoch 5 Loss: 0.408 Val Loss: 0.764
Accuracy: 0.595
Epoch 6 Loss: 0.339 Val Loss: 0.587
Accuracy: 0.762
Epoch 7 Loss: 0.383 Val Loss: 0.550
Accuracy: 0.738
Epoch 8 Loss: 0.416 Val Loss: 0.557
Accuracy: 0.762
Epoch 9 Loss: 0.385 Val Loss: 0.603
Accuracy: 0.762
Epoch 10 Loss: 0.333 Val Loss: 0.688
Accuracy: 0.714
Epoch 11 Loss: 0.301 Val Loss: 0.791
Accuracy: 0.643
Epoch 12 Loss: 0.294 Val Loss: 0.884
Accuracy: 0.619
Epoch 13 Loss: 0.300 Val Loss: 0.946
Accuracy: 0.619
Epoch 14 Loss: 0.306 Val Loss: 0.968
Accuracy: 0.619
Epoch 15 Loss: 0.306 Val Loss: 0.952
Accuracy: 0.619
Epoch 16 Loss: 0.299 Val Loss: 0.909
Accuracy: 0.643
Epoch 17 Loss: 0.287 Val Loss: 0.850
Accuracy: 0.690
Epoch 18 Loss: 0.

In [None]:
# Output files are tagged with the dataset name stored on ``args``.
true_csv_path = f"new_drug_true_{args.data}.csv"
pred_csv_path = f"new_drug_pred_{args.data}.csv"

# Persist the collected ground-truth labels and model outputs.
true_datas.to_csv(true_csv_path)
predict_datas.to_csv(pred_csv_path)