In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

from load_data import load_data
from sampler import Sampler

from DeepDSC.DeepDSC import (AE, DF, GeneExpressionDataset,
                             calculate_morgan_fingerprints, prepare_data,
                             prepare_drug_data, prepare_train_val_test_data,
                             train_autoencoder, train_df_model)

In [3]:
# Dataset key and the directory its input files are read from.
data = "gdsc2"
PATH = "../gdsc2_data/"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
class Args:
    """Small options holder handed to ``load_data``.

    Both attributes are taken from the notebook-level configuration so the
    dataset is chosen in exactly one place.

    Attributes:
        device: The module-level ``device`` object.
        data: The module-level ``data`` dataset key ("gdsc2" in this notebook).
    """

    def __init__(self):
        self.device = device  # torch.device selected in the configuration cell
        self.data = data  # dataset key from the configuration cell

# Seed the Python, NumPy and PyTorch random number generators with the same fixed value.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Load the response table `res` plus the other objects returned by load_data.
args = Args()
res, drug_feature, exprs, mut, cna, null_mask, pos_num = load_data(args)

# Position -> label lookups for the rows (`cells`) and columns (`drugs`) of `res`.
cells = dict(enumerate(res.index))
drugs = dict(enumerate(res.columns))

# Totals of `res` along each axis: one value per row and one per column.
cell_sum = res.sum(axis=1)
drug_sum = res.sum(axis=0)

load gdsc2


In [5]:
def main(PATH, train, val):
    """Run the DeepDSC pipeline for one train/validation split.

    Steps, in order: read gene expression through ``prepare_data``, fit the
    ``AE`` autoencoder on it, encode all samples into compressed features,
    compute Morgan fingerprints for the drugs, build model inputs with
    ``prepare_train_val_test_data`` and fit the ``DF`` model.

    The autoencoder is retrained on every call and the module-level ``device``
    is used for all models and tensors.

    Args:
        PATH: Directory containing ``gene_exp_part1.csv.gz`` and
            ``gene_exp_part2.csv.gz`` (a trailing separator is optional).
        train: DataFrame with column labels 0, 1 and 2. Columns 1 and 0, in
            that order, are forwarded as the pair identifiers (relabelled 0
            and 1) and column 2 is used as the labels.
            NOTE(review): presumably the identifier columns are cell and
            drug — confirm which is which against the caller.
        val: Validation DataFrame with the same layout as ``train``.

    Returns:
        The ``(val_labels, best_val_out)`` pair returned by ``train_df_model``.
    """
    print("Loading gene expression data...")
    # os.path.join builds both paths the same way, with or without a trailing
    # separator on PATH.
    normalized_gene_exp_tensor, gene_exp = prepare_data(
        data1=os.path.join(PATH, "gene_exp_part1.csv.gz"),
        data2=os.path.join(PATH, "gene_exp_part2.csv.gz"),
    )
    normalized_gene_exp_dataset = GeneExpressionDataset(normalized_gene_exp_tensor)
    normalized_gene_exp_dataloader = DataLoader(
        normalized_gene_exp_dataset, batch_size=10000, shuffle=True
    )

    # Train the autoencoder
    print("Training autoencoder...")
    autoencoder = AE(normalized_gene_exp_tensor.shape[1]).to(device)
    train_autoencoder(autoencoder, normalized_gene_exp_dataloader)
    print("Autoencoder training completed.")

    # Extract the compressed features
    print("Extracting compressed features...")
    # The model lives on `device`, so the input is moved there as well (a no-op
    # when it is already on that device). No gradients are needed for encoding.
    with torch.no_grad():
        compressed_features_tensor = autoencoder.encoder(
            normalized_gene_exp_tensor.to(device)
        )
    compressed_features = pd.DataFrame(
        compressed_features_tensor.cpu().detach().numpy(), index=gene_exp.columns
    )
    print(f"Compressed features shape: {compressed_features.shape}")
    drug_response, nsc_sm = prepare_drug_data(is_nsc=True)
    mfp = calculate_morgan_fingerprints(drug_response.T, nsc_sm)
    print(f"Morgan fingerprints shape: {mfp.shape}")

    # Labels and identifier pairs come from the `train` / `val` arguments.
    train_labels = train[2]
    val_labels = val[2]
    # Swap the first two columns and relabel them 0 and 1; copy first so the
    # caller's frames are never touched.
    train_data = train[[1, 0]].copy()
    train_data.columns = [0, 1]
    val_data = val[[1, 0]].copy()
    val_data.columns = [0, 1]
    print(
        f"Training data size: {len(train_data)}, Validation data size: {len(val_data)}"
    )
    train_data, val_data = prepare_train_val_test_data(
        train_data, val_data, compressed_features, mfp
    )
    df_model = DF().to(device)
    # Convert labels through NumPy: building a tensor directly from a Series
    # goes through label-based item access and depends on the Series index.
    val_labels, best_val_out = train_df_model(
        df_model,
        train_data,
        val_data,
        torch.tensor(train_labels.to_numpy(dtype="float64")).double().to(device),
        torch.tensor(val_labels.to_numpy(dtype="float64")).double().to(device),
    )
    print("DF model training completed.")
    return val_labels, best_val_out

In [8]:
# Axes of `res` that the evaluation loop iterates over as prediction targets.
# Only axis 1 (tagged "Cell" in the original notes) is enabled; add 0 (tagged
# "Drug") to the list to run over the other axis too.
# NOTE(review): these tags look swapped relative to the rest of the notebook —
# `drugs` is built from res.columns and the loop checks `drug_sum` when the
# axis is 1. Confirm which entity each axis really represents.
target_dim = [1]

In [12]:
epochs = []  # not read anywhere in the visible cells
n_kfold = 1
min_label_sum = 10  # targets whose total in cell_sum / drug_sum is below this are skipped

# Per-target results are collected in lists and concatenated once after the
# loop instead of re-copying a growing DataFrame on every iteration.
true_frames = []
predict_frames = []

for dim in target_dim:
    n_targets = res.shape[dim]
    # `total` lets tqdm show real progress; enumerate() alone has no length.
    for seed, target_index in tqdm(enumerate(np.arange(n_targets)), total=n_targets):
        # Pick the slice along the axis being iterated: a row for axis 0,
        # a column for axis 1.
        if dim == 0:
            target_values = res.iloc[target_index, :]
        else:
            target_values = res.iloc[:, target_index]
        is_positive = target_values.dropna() > 0
        n_labels = len(is_positive)
        n_positive = int(is_positive.sum())

        if n_labels == 0:
            # Every entry is missing, so there is nothing to evaluate.
            print(f"Target {target_index} skipped: no non-missing labels.")
            continue
        if n_positive == 0 or n_positive == n_labels:
            print(f"Target {target_index} skipped: All labels are {'0' if n_positive == 0 else '1'}.")
            continue

        # Positional lookup: target_index is a position, not an index label.
        if dim:
            label_sum = drug_sum.iloc[target_index]
        else:
            label_sum = cell_sum.iloc[target_index]
        if label_sum < min_label_sum:
            continue

        epochs = []
        # NOTE(review): `DeepDSC` is not defined in the visible cells (the
        # recorded run stops here with NameError). The visible entry point is
        # `main(PATH, train, val)`; a wrapper that builds the train/val frames
        # for this target (presumably with `Sampler`) has to be defined before
        # this cell runs.
        # When n_kfold > 1 only the last fold's outputs are kept.
        for fold in range(n_kfold):
            val_labels, best_val_out = DeepDSC(
                res.values, null_mask.T.values, dim, target_index, seed
            )

        true_frames.append(pd.DataFrame(val_labels.cpu().numpy()))
        predict_frames.append(pd.DataFrame(best_val_out.cpu().numpy()))

# pd.concat rejects an empty list, so fall back to an empty frame.
true_data_s = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()
predict_data_s = (
    pd.concat(predict_frames, axis=1) if predict_frames else pd.DataFrame()
)

  if drug_sum[target_index] < 10:
0it [00:00, ?it/s]


NameError: name 'DeepDSC' is not defined

In [8]:
# Export the frames built by the evaluation loop (`true_data_s` and
# `predict_data_s`). Each is transposed so every collected column becomes a
# row, and the row index is reset to 0..n-1 before writing.
true_data_s.T.reset_index(drop=True).to_csv("true_nci.csv")
predict_data_s.T.reset_index(drop=True).to_csv("pred_nci.csv")