In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import NewSampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Name of the dataset this notebook runs on.
data = "gdsc1"
# Directory holding that dataset's files. Derived from `data` so the dataset
# name and its directory cannot drift apart when the dataset is switched.
# NOTE(review): assumes every dataset lives in "../<name>_data/" - confirm.
PATH = f"../{data}_data/"

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [5]:
class Args:
    """Minimal settings object handed to ``load_data``.

    Attributes
    ----------
    device
        Torch device to run on (cuda or cpu).
    data
        Name of the dataset to load (this notebook uses "gdsc1").

    Parameters
    ----------
    run_device : optional
        Overrides the notebook-level ``device`` when given.
    dataset : optional
        Overrides the notebook-level ``data`` when given.
    """

    def __init__(self, run_device=None, dataset=None):
        # Fall back to the notebook's configuration cells instead of repeating
        # the literals here, so editing the config cell is enough to switch
        # dataset or device.
        self.device = device if run_device is None else run_device
        self.data = data if dataset is None else dataset


# Build the settings object and load every matrix returned by `load_data`.
args = Args()
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Positional index -> label lookups for the rows (`cells`) and columns
# (`drugs`) of the response matrix `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

load gdsc1


In [6]:
# Row-wise and column-wise totals of the response matrix `res`; the driver
# loop uses them to skip targets whose total is below its threshold.
cell_sum = res.sum(axis=1)
drug_sum = res.sum(axis=0)

# Axes of `res` to iterate over as hold-out targets:
#   0 -> Cell (enabled)
#   1 -> Drug (currently disabled; add 1 to the list to enable it)
target_dim = [0]

In [7]:
def main(PATH, train, val):
    """Train the autoencoder and the DF model for one train/validation split.

    Steps: load and normalise gene expression, train the autoencoder on it,
    encode every sample into compressed features, compute Morgan fingerprints
    for the drugs, assemble the train/validation inputs and train the DF model.

    Parameters
    ----------
    PATH : str
        Directory containing ``gene_exp_part1.csv.gz`` and
        ``gene_exp_part2.csv.gz`` (with or without a trailing separator).
    train, val : pandas.DataFrame
        Frames whose columns ``0`` and ``1`` identify each sample and whose
        column ``2`` holds the label (the caller in this notebook passes
        cell name, drug name and response label).

    Returns
    -------
    tuple
        ``(val_labels, best_val_out)`` exactly as returned by
        ``train_df_model``.
    """
    print("Loading gene expression data...")
    # os.path.join builds both paths the same way regardless of whether PATH
    # ends with a separator (previously one path got "//", the other none).
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder
    print("Training autoencoder...")
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)
    print("Autoencoder training completed.")

    # Extract the compressed features
    print("Extracting compressed features...")
    # Inference only: skip autograd bookkeeping, and make sure the input lives
    # on the same device as the model (a no-op when it already does).
    # NOTE(review): the encoder is used in whatever train/eval mode
    # train_autoencoder left it in - confirm whether eval() is needed.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(
            normalized_gene_exp_tensor.to(device)
        )
    compressed_features = pd.DataFrame(
        compressed_features_tensor.detach().cpu().numpy(), index=gene_exp.columns
    )
    print(f"Compressed features shape: {compressed_features.shape}")
    drug_response, nsc_sm = prepare_drug_data(is_nsc=True)
    mfp = calculate_morgan_fingerprints(drug_response.T, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Plain arrays rather than pandas Series, so tensor construction does not
    # depend on the frames' index labels.
    train_labels = np.asarray(train[2])
    val_labels = np.asarray(val[2])
    train_data = train[[0, 1]]
    # Copy before relabelling so the caller's frame is never touched.
    val_data = val[[0, 1]].copy()
    val_data.columns = [0, 1]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    # NOTE(review): the recorded run of this notebook failed after the size
    # message above with KeyError "None of [Index(['5-Fluorouracil', ...])] are
    # in the [index]", i.e. the drug names in column 1 were not found in an
    # index used for lookup. Presumably the fingerprint table built with
    # is_nsc=True is keyed by different drug identifiers than `res.columns` -
    # confirm against prepare_drug_data / prepare_train_val_test_data.
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels).double().to(device),
        torch.tensor(val_labels).double().to(device),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [8]:
def DeepDSC(res_mat, null_mask, target_dim, target_index, seed):
    """Run one DeepDSC experiment for a single sampled train/test split.

    A ``NewSampler`` is built from the arguments; its train/test matrices and
    masks are converted into three-column tables (column 0: cell label,
    column 1: drug label, column 2: integer value at that position) and passed
    to ``main``.

    Returns
    -------
    tuple
        ``(val_labels, best_val_out)`` as returned by ``main``.
    """
    sampler = NewSampler(res_mat, null_mask, target_dim, target_index, seed)

    def as_triples(values, mask):
        # Wrap both matrices with the labels of `res`, then keep only the
        # positions where the mask is non-zero.
        labelled_values = pd.DataFrame(values, index=res.index, columns=res.columns)
        labelled_mask = pd.DataFrame(mask, index=res.index, columns=res.columns)
        row_pos, col_pos = labelled_mask.values.nonzero()
        return pd.DataFrame(
            {
                0: [cells[r] for r in row_pos],
                1: [drugs[c] for c in col_pos],
                2: labelled_values.values[row_pos, col_pos].astype(int),
            }
        )

    train = as_triples(sampler.train_data, sampler.train_mask)
    test = as_triples(sampler.test_data, sampler.test_mask)

    return main(PATH, train, test)

In [10]:
# Number of repeated runs per held-out target.
n_kfold = 1

# One single-run frame per (target, fold); joined once after the loop instead
# of growing a DataFrame with repeated concat calls.
true_frames = []
predict_frames = []

for dim in target_dim:
    # dim == 0 iterates over rows of `res` (cells), dim == 1 over columns (drugs).
    target_sum = drug_sum if dim else cell_sum
    for seed, target_index in enumerate(tqdm(np.arange(res.shape[dim]))):
        # Skip targets whose total in `res` is below 10. `.iloc` makes the
        # positional lookup explicit, since the sums are indexed by label.
        if target_sum.iloc[target_index] < 10:
            continue
        for fold in range(n_kfold):
            val_labels, best_val_out = DeepDSC(
                res.values, null_mask.T.values, dim, target_index, seed
            )
            # Record every run inside the loop; previously this happened after
            # the target loop (and into undefined names), so at most the last
            # target's result could have been kept.
            true_frames.append(pd.DataFrame(val_labels.detach().cpu().numpy()))
            predict_frames.append(pd.DataFrame(best_val_out.detach().cpu().numpy()))

# These are the names the saving cell writes to disk.
true_data_s = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()
predict_data_s = (
    pd.concat(predict_frames, axis=1) if predict_frames else pd.DataFrame()
)

  if cell_sum[target_index] < 10:


Loading gene expression data...
Training autoencoder...



  0%|          | 0/800 [00:00<?, ?it/s][A
  0%|          | 1/800 [00:01<21:08,  1.59s/it][A
  1%|▏         | 10/800 [00:01<01:39,  7.92it/s][A
  2%|▏         | 15/800 [00:01<01:11, 10.94it/s][A
  2%|▏         | 19/800 [00:02<00:58, 13.41it/s][A
  3%|▎         | 23/800 [00:02<00:49, 15.77it/s][A
  3%|▎         | 26/800 [00:02<00:44, 17.44it/s][A
  4%|▎         | 29/800 [00:02<00:40, 19.02it/s][A
  4%|▍         | 32/800 [00:02<00:37, 20.42it/s][A
  4%|▍         | 35/800 [00:02<00:35, 21.60it/s][A
  5%|▍         | 38/800 [00:02<00:33, 22.55it/s][A
  5%|▌         | 41/800 [00:02<00:32, 23.29it/s][A
  6%|▌         | 44/800 [00:03<00:31, 23.85it/s][A
  6%|▌         | 47/800 [00:03<00:31, 24.24it/s][A
  6%|▋         | 50/800 [00:03<00:30, 24.55it/s][A
  7%|▋         | 53/800 [00:03<00:30, 24.77it/s][A
  7%|▋         | 56/800 [00:03<00:29, 24.93it/s][A
  7%|▋         | 59/800 [00:03<00:29, 25.05it/s][A
  8%|▊         | 62/800 [00:03<00:29, 25.13it/s][A
  8%|▊         | 65/8

Autoencoder training completed.
Extracting compressed features...
Compressed features shape: (1084, 500)


  0%|          | 0/331 [00:53<?, ?it/s]

Morgan fingerprints shape: (1005, 256)
Training data size: 260246, Validation data size: 382





KeyError: "None of [Index(['5-Fluorouracil', '5-Fluorouracil', '5-Fluorouracil', '5-Fluorouracil',\n       '5-Fluorouracil', '5-Fluorouracil', '5-Fluorouracil', '5-Fluorouracil',\n       '5-Fluorouracil', '5-Fluorouracil',\n       ...\n       'torin2', 'torin2', 'torin2', 'torin2', 'torin2', 'torin2', 'torin2',\n       'torin2', 'torin2', 'torin2'],\n      dtype='object', length=260246)] are in the [index]"

In [None]:
# Write the collected ground-truth labels and predictions to CSV files named
# after the dataset.
csv_outputs = {
    f"new_cell_true_{args.data}.csv": true_data_s,
    f"new_cell_pred_{args.data}.csv": predict_data_s,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name)