In [1]:
import argparse
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from load_data import load_data
from sampler import NewSampler
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

# Directory that holds the NCI input files, and the dataset tag used in this notebook.
PATH = "../nci_data/"
data = "nci"

# Run on the GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
class Args:
    """Small attribute bag handed to ``load_data`` in place of parsed CLI arguments."""

    def __init__(self):
        # Dataset key read by load_data. This notebook uses "nci"; the original
        # comment listed "gdsc" and "ccle" as the options — TODO confirm which
        # keys load_data actually accepts.
        self.data = "nci"
        # torch.device chosen in the configuration cell (cuda or cpu).
        self.device = device


# Load the response matrix together with the companion tables returned by load_data.
args = Args()
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Positional index -> label lookups: rows of `res` feed `cells`, columns feed `drugs`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

# Totals of the response matrix per row (`cell_sum`) and per column (`drug_sum`).
cell_sum = res.values.sum(axis=1)
drug_sum = res.values.sum(axis=0)

# Axes of `res` whose entries are held out one target at a time.
# The original annotations were 0 -> "Drug" (currently disabled) and 1 -> "Cell".
# NOTE(review): the evaluation loop filters on `drug_sum` (column totals) when the
# dim is 1, so these labels may be swapped — confirm the intended naming.
target_dim = [1]

load nci


In [3]:
def main(PATH, train, test):
    """Run the DeepDSC pipeline for one train/validation split.

    Steps: load gene expression, train the autoencoder, encode the expression
    matrix, compute Morgan fingerprints, assemble the train/validation inputs
    and train the DF model.

    Args:
        PATH: Directory containing ``gene_exp_part1.csv.gz`` and
            ``gene_exp_part2.csv.gz``. A trailing slash is optional.
        train: DataFrame with columns ``0``, ``1`` and ``2``. Column ``2`` is
            used as the label; columns ``1`` and ``0`` (in that order) are
            passed on as the pair identifiers.
        test: Same layout as ``train``; used as the validation split.

    Returns:
        The ``(val_labels, best_val_out)`` pair returned by ``train_df_model``.
    """
    # Local import keeps this cell self-contained (no new top-level import needed).
    import os

    # NOTE(review): nothing up to the fingerprint computation depends on
    # `train`/`test`, yet it is repeated on every call (including autoencoder
    # training). Consider computing it once and reusing it across targets.
    print("Loading gene expression data...")
    # os.path.join avoids the doubled/missing separator that plain string
    # concatenation produced depending on whether PATH ends with "/".
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder
    print("Training autoencoder...")
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)
    print("Autoencoder training completed.")

    # Extract the compressed features
    print("Extracting compressed features...")
    # Inference only: no autograd graph is needed for the encoded features.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(normalized_gene_exp_tensor)
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )
    print(f"Compressed features shape: {compressed_features.shape}")
    drug_response, nsc_sm = prepare_drug_data(is_nsc=True)
    mfp = calculate_morgan_fingerprints(drug_response.T, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Convert labels to NumPy first: torch.tensor() on a pandas Series goes
    # through label-based indexing and fails when the index is not 0..n-1.
    train_labels = train[2].to_numpy()
    val_labels = test[2].to_numpy()

    # Swap the two identifier columns and relabel them 0/1 for the helper below.
    train_data = train[[1, 0]]
    train_data.columns = [0, 1]
    val_data = test[[1, 0]]
    val_data.columns = [0, 1]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels).double().to(device),
        torch.tensor(val_labels).double().to(device),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [4]:
def DeepDSC(res_mat, null_mask, target_dim, target_index, seed):
    """Build the train/test split for one target and run ``main`` on it.

    A ``NewSampler`` is created from the arguments. Its train/test data and
    masks are turned into three-column frames: column ``0`` holds the row
    label (looked up in ``cells``), column ``1`` the column label (looked up
    in ``drugs``) and column ``2`` the integer response at that position.

    Returns:
        ``(val_labels, best_val_out)`` from ``main``, or ``(None, None)`` when
        the test labels contain fewer than two distinct values.
    """
    sampler = NewSampler(res_mat, null_mask, target_dim, target_index, seed)

    def _labelled(matrix):
        # Wrap a sampler matrix with the row/column labels of the global `res`.
        return pd.DataFrame(matrix, index=res.index, columns=res.columns)

    def _triplets(values_frame, mask_frame):
        # One row per non-zero mask entry: (row position, column position, value).
        positions = mask_frame.values.nonzero()
        frame = pd.DataFrame(positions).T
        frame[2] = values_frame.values[positions].astype(int)
        return frame

    train_values = _labelled(sampler.train_data)
    test_values = _labelled(sampler.test_data)
    train_selector = _labelled(sampler.train_mask)
    test_selector = _labelled(sampler.test_mask)

    train = _triplets(train_values, train_selector)
    test = _triplets(test_values, test_selector)

    # A single-class validation set cannot be evaluated; report and bail out.
    if np.unique(test[2]).size < 2:
        print(f"Target {target_index} skipped: Validation set has only one class.")
        return None, None

    # Replace positional indices with the actual cell / drug labels.
    for split in (train, test):
        split[0] = [cells[position] for position in split[0]]
        split[1] = [drugs[position] for position in split[1]]

    return main(PATH, train, test)

In [7]:
# Evaluate every eligible target and collect its validation labels / predictions.
n_kfold = 1
true_data_s = pd.DataFrame()
predict_data_s = pd.DataFrame()

# (labels, predictions) frames, one pair per evaluated target. Accumulating
# here and concatenating once avoids re-copying the result on every iteration.
collected = []
try:
    for dim in target_dim:
        # Truthy dim -> filter on column totals (drug_sum); otherwise row totals.
        support = drug_sum if dim else cell_sum
        for seed, target_index in enumerate(tqdm(np.arange(res.shape[dim]))):
            # Skip targets whose total is below 10.
            if support[target_index] < 10:
                continue
            # Only the result of the last fold is kept (n_kfold is 1 here).
            for fold in range(n_kfold):
                val_labels, best_val_out = DeepDSC(
                    res.values, null_mask.values, dim, target_index, seed
                )

            # DeepDSC returns (None, None) when the validation split has a
            # single class; there is nothing to record for such a target.
            if val_labels is None or best_val_out is None:
                continue

            collected.append(
                (
                    pd.DataFrame(val_labels.cpu().numpy()),
                    pd.DataFrame(best_val_out.cpu().numpy()),
                )
            )
finally:
    # Runs on interruption too, so partial results stay available for inspection.
    if collected:
        true_data_s = pd.concat([labels for labels, _ in collected], axis=1)
        predict_data_s = pd.concat([preds for _, preds in collected], axis=1)

  0%|          | 0/1005 [00:00<?, ?it/s]

Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:00<09:59,  1.33it/s][A
  2%|▏         | 15/800 [00:00<00:33, 23.43it/s][A
  3%|▎         | 24/800 [00:00<00:23, 33.05it/s][A
  4%|▍         | 32/800 [00:01<00:18, 40.54it/s][A
  5%|▍         | 39/800 [00:01<00:16, 46.32it/s][A
  6%|▌         | 46/800 [00:01<00:14, 51.19it/s][A
  7%|▋         | 53/800 [00:01<00:13, 55.12it/s][A
  8%|▊         | 60/800 [00:01<00:12, 58.23it/s][A
  8%|▊         | 67/800 [00:01<00:12, 60.57it/s][A
  9%|▉         | 74/800 [00:01<00:11, 62.31it/s][A
 10%|█         | 81/800 [00:01<00:11, 63.58it/s][A
 11%|█         | 88/800 [00:01<00:11, 64.53it/s][A
 12%|█▏        | 95/800 [00:02<00:10, 65.18it/s][A
 13%|█▎        | 102/800 [00:02<00:10, 65.60it/s][A
 14%|█▎        | 109/800 [00:02<00:10, 65.90it/s][A
 14%|█▍        | 116/800 [00:02<00:10, 66.14it/s][A
 15%|█▌        | 123/800 [00:02<00:10, 66.27it/s][A
 16%|█▋        | 130/800 [00:02<00:10, 66.38it/s][A
 17%|█▋        |

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57959, Validation data size: 60
Epoch 1 Loss: 0.774 Val Loss: 3.281
Accuracy: 0.367
Epoch 2 Loss: 2.000 Val Loss: 2.348
Accuracy: 0.367
Epoch 3 Loss: 1.393 Val Loss: 1.076
Accuracy: 0.367
Epoch 4 Loss: 0.738 Val Loss: 0.657
Accuracy: 0.633
Epoch 5 Loss: 0.839 Val Loss: 0.687
Accuracy: 0.633
Epoch 6 Loss: 0.969 Val Loss: 0.666
Accuracy: 0.633
Epoch 7 Loss: 0.889 Val Loss: 0.669
Accuracy: 0.633
Epoch 8 Loss: 0.748 Val Loss: 0.777
Accuracy: 0.367
Epoch 9 Loss: 0.687 Val Loss: 0.938
Accuracy: 0.367
Epoch 10 Loss: 0.719 Val Loss: 1.048
Accuracy: 0.367
Epoch 11 Loss: 0.767 Val Loss: 1.061
Accuracy: 0.367
Epoch 12 Loss: 0.779 Val Loss: 0.995
Accuracy: 0.367
Epoch 13 Loss: 0.755 Val Loss: 0.894
Accuracy: 0.367
Epoch 14 Loss: 0.718 Val Loss: 0.798
Accuracy: 0.367
Epoch 15 Loss: 0.690 Val Loss: 0.728
Accuracy: 0.367
Epoch 16 Loss: 0.681

  0%|          | 1/1005 [00:24<6:54:16, 24.76s/it]

Epoch 96 Loss: 0.661 Val Loss: 0.658
Accuracy: 0.633
Epoch 97 Loss: 0.661 Val Loss: 0.658
Accuracy: 0.633
Epoch 98 Loss: 0.661 Val Loss: 0.658
Accuracy: 0.633
Epoch 99 Loss: 0.661 Val Loss: 0.658
Accuracy: 0.633
Epoch 100 Loss: 0.661 Val Loss: 0.658
Accuracy: 0.633
DF model training completed.
Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  2%|▏         | 14/800 [00:00<00:05, 136.65it/s][A
  4%|▎         | 28/800 [00:00<00:09, 82.92it/s] [A
  5%|▍         | 38/800 [00:00<00:10, 75.98it/s][A
  6%|▌         | 47/800 [00:00<00:10, 72.67it/s][A
  7%|▋         | 55/800 [00:00<00:10, 70.81it/s][A
  8%|▊         | 63/800 [00:00<00:10, 69.54it/s][A
  9%|▉         | 71/800 [00:00<00:10, 68.69it/s][A
 10%|▉         | 78/800 [00:01<00:10, 68.14it/s][A
 11%|█         | 85/800 [00:01<00:10, 67.74it/s][A
 12%|█▏        | 92/800 [00:01<00:10, 67.44it/s][A
 12%|█▏        | 99/800 [00:01<00:10, 67.22it/s][A
 13%|█▎        | 106/800 [00:01<00:10, 67.09it/s][A
 14%|█▍        | 113/800 [00:01<00:10, 66.99it/s][A
 15%|█▌        | 120/800 [00:01<00:10, 66.92it/s][A
 16%|█▌        | 127/800 [00:01<00:10, 66.84it/s][A
 17%|█▋        | 134/800 [00:01<00:09, 66.79it/s][A
 18%|█▊        | 141/800 [00:02<00:09, 66.77it/s][A
 18%|█▊        | 148/800 [00:02<00:09, 66.79it/s][A
 19%|█▉    

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (60, 500)
Morgan fingerprints shape: (1005, 256)
Training data size: 57959, Validation data size: 60
Epoch 1 Loss: 0.774 Val Loss: 1.199
Accuracy: 0.417
Epoch 2 Loss: 2.002 Val Loss: 0.894
Accuracy: 0.417
Epoch 3 Loss: 1.393 Val Loss: 0.679
Accuracy: 0.583
Epoch 4 Loss: 0.738 Val Loss: 0.809
Accuracy: 0.583
Epoch 5 Loss: 0.839 Val Loss: 0.874
Accuracy: 0.583
Epoch 6 Loss: 0.969 Val Loss: 0.819
Accuracy: 0.583
Epoch 7 Loss: 0.889 Val Loss: 0.728
Accuracy: 0.583
Epoch 8 Loss: 0.748 Val Loss: 0.681
Accuracy: 0.583
Epoch 9 Loss: 0.687 Val Loss: 0.693
Accuracy: 0.583
Epoch 10 Loss: 0.719 Val Loss: 0.724
Accuracy: 0.417
Epoch 11 Loss: 0.767 Val Loss: 0.738
Accuracy: 0.417
Epoch 12 Loss: 0.779 Val Loss: 0.728
Accuracy: 0.417
Epoch 13 Loss: 0.755 Val Loss: 0.707
Accuracy: 0.417
Epoch 14 Loss: 0.718 Val Loss: 0.689
Accuracy: 0.583
Epoch 15 Loss: 0.690 Val Loss: 0.680
Accuracy: 0.583
Epoch 16 Loss: 0.681

  0%|          | 2/1005 [00:44<6:04:37, 21.81s/it]

Epoch 96 Loss: 0.661 Val Loss: 0.685
Accuracy: 0.583
Epoch 97 Loss: 0.661 Val Loss: 0.685
Accuracy: 0.583
Epoch 98 Loss: 0.661 Val Loss: 0.685
Accuracy: 0.583
Epoch 99 Loss: 0.661 Val Loss: 0.685
Accuracy: 0.583
Epoch 100 Loss: 0.661 Val Loss: 0.685
Accuracy: 0.583
DF model training completed.
Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  2%|▏         | 15/800 [00:00<00:05, 131.71it/s][A
  4%|▎         | 29/800 [00:00<00:09, 84.04it/s] [A
  5%|▍         | 39/800 [00:00<00:09, 76.70it/s][A
  6%|▌         | 48/800 [00:00<00:10, 73.15it/s][A
  7%|▋         | 56/800 [00:00<00:10, 71.15it/s][A
  8%|▊         | 64/800 [00:00<00:10, 69.76it/s][A
  9%|▉         | 72/800 [00:00<00:10, 68.86it/s][A
 10%|▉         | 79/800 [00:01<00:10, 68.26it/s][A
 11%|█         | 86/800 [00:01<00:10, 67.80it/s][A
 12%|█▏        | 93/800 [00:01<00:10, 67.48it/s][A
 12%|█▎        | 100/800 [00:01<00:10, 67.27it/s][A
 13%|█▎        | 107/800 [00:01<00:10, 67.11it/s][A
 14%|█▍        | 114/800 [00:01<00:10, 66.98it/s][A
 15%|█▌        | 121/800 [00:01<00:10, 66.89it/s][A
 16%|█▌        | 128/800 [00:01<00:10, 66.87it/s][A
 17%|█▋        | 135/800 [00:01<00:09, 66.83it/s][A
 18%|█▊        | 142/800 [00:02<00:09, 66.76it/s][A
 19%|█▊        | 149/800 [00:02<00:09, 66.75it/s][A
 20%|█▉   

KeyboardInterrupt: 

In [None]:
# Show the validation labels gathered by the evaluation loop (one column per target).
true_data_s