In [2]:
import os
import sys
import argparse
import torch
import torch.multiprocessing
import torch.nn as nn
from torch.nn.modules.module import Module
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances, adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import snf
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.cluster import spectral_clustering, KMeans
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
from MIND_model import *  

# CCLE dataset
We train and test the proposed model on the CCLE datasets.

In [2]:
# Hyperparameters used by the MIND training runs in this notebook.
emb_dim = 64    # embedding size handed to the MIND constructor
lr = 0.0001     # learning rate handed to my_train
epoch = 5_000   # first positional argument of my_train

In [3]:
# Train MIND on the full CCLE tables and save the resulting embeddings to CSV.

# Seed NumPy and PyTorch so that repeated runs start from the same random state.
np.random.seed(31415)
torch.manual_seed(31415)

# Every table is read with its first row as the header and its first column as the row index.
RNA_data = pd.read_csv('./CCLE_preprocessed/RNA_data.csv', header=0, index_col=0)
methyl_data = pd.read_csv('./CCLE_preprocessed/meth_data.csv', header=0, index_col=0)
cnv_data = pd.read_csv('./CCLE_preprocessed/cna_data.csv', header=0, index_col=0)
clinical_data = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)
mtb_data = pd.read_csv('./CCLE_preprocessed/metabolomics_data.csv', header=0, index_col=0)
miRNA_data = pd.read_csv('./CCLE_preprocessed/miRNA_data.csv', header=0, index_col=0)
rppa_data = pd.read_csv('./CCLE_preprocessed/rppa_data.csv', header=0, index_col=0)

# The sample count is taken from the clinical table (one row per sample).
N = clinical_data.shape[0]
print('total number of patients = {}'.format(N))

# Modality name -> table; this dictionary is what the MIND constructor receives.
data_dict = {'RNA': RNA_data, 'meth': methyl_data, 'cna': cnv_data, 'metabolomics':mtb_data, 'miRNA':miRNA_data, 'rppa':rppa_data}

# Missingness report per modality. Only the first column is inspected, i.e. a
# NaN there is treated as "this sample lacks the whole modality".
for mod_name, mod_df in data_dict.items():
    first_col_missing = mod_df.iloc[:, 0].isna()
    print('{} data missing {}/{}, missing proportion = {}'.format(
        mod_name, first_col_missing.sum(), mod_df.shape[0], np.round(first_col_missing.mean(), 3)))

# Build the model on the selected device and train it.
test = MIND(data_dict=data_dict, device=device, emb_dim=emb_dim).to(device)
test.my_train(epoch, lr=lr)

# Take element 0 of get_embedding()'s return value and move it to host memory.
# NOTE(review): presumably this is the per-sample embedding matrix - confirm in MIND_model.
with torch.no_grad():
    z_emb = test.get_embedding()[0].cpu().numpy()

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs('./CCLE_results', exist_ok=True)

# Rows are labelled with the RNA table's index.
# NOTE(review): assumes every modality table shares this row order - confirm.
pd.DataFrame(z_emb, index=RNA_data.index).to_csv('./CCLE_results/embeddings.csv')

total number of patients = 1088
RNA data missing 69/1088, missing proportion = 0.063
meth data missing 245/1088, missing proportion = 0.225
cna data missing 93/1088, missing proportion = 0.085
metabolomics data missing 160/1088, missing proportion = 0.147
miRNA data missing 134/1088, missing proportion = 0.123
rppa data missing 189/1088, missing proportion = 0.174
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


# Downstream task 1: Cancer type classification
We fit XGBoost classifiers to predict the cancer type of each patient from the output embeddings.

In [3]:
# Cross-validated cancer-type classification from the saved embeddings.
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

clinic_data = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)

# Keep only samples whose 'tcga_code' is present and is not the
# 'UNABLE TO CLASSIFY' placeholder.
my_idxxx = np.logical_and(~clinic_data['tcga_code'].isna(), clinic_data['tcga_code'] != 'UNABLE TO CLASSIFY')
non_missing_types_names = clinic_data.index.to_numpy()[my_idxxx]
types_vec = clinic_data['tcga_code'].loc[non_missing_types_names]

# Map the string codes to integer class ids.
encoder = LabelEncoder()
labels = encoder.fit_transform(types_vec)

# Restrict the embeddings to the labelled samples, in the same order as `labels`.
emb = pd.read_csv('./CCLE_results/embeddings.csv', index_col=0).loc[non_missing_types_names]

# Loop-invariant values, computed once instead of once per fold.
emb_values = emb.to_numpy()
n_classes = len(np.unique(labels))

skf = StratifiedKFold(n_splits=5)
acc_temp = 0.  # running sum of per-fold test accuracies
for train_idx, test_idx in skf.split(emb, labels):
    X_train, X_test = emb_values[train_idx], emb_values[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model = xgb.XGBClassifier(
        objective='multi:softmax',  # or 'multi:softprob' if you want probabilities
        num_class=n_classes,
        eval_metric='mlogloss'
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc_temp += np.mean(y_pred == y_test)

# Average over the configured number of folds, so that changing n_splits above
# cannot silently skew the reported mean.
print('Test accuracy = {}'.format(acc_temp / skf.get_n_splits()))



Test accuracy = 0.6588917965638867


# Downstream task 2: Reconstruction
We compare predicted vs. observed values in the same way as in the TCGA example.

In [4]:
# Retrain MIND on the "*_train" tables and save its predictions for the
# samples listed in each "*_test" table.

# Seed NumPy and PyTorch so that repeated runs start from the same random state.
np.random.seed(31415)
torch.manual_seed(31415)

# Every table is read with its first row as the header and its first column as the row index.
RNA_data = pd.read_csv('./CCLE_preprocessed/RNA_data_train.csv', header=0, index_col=0)
methyl_data = pd.read_csv('./CCLE_preprocessed/meth_data_train.csv', header=0, index_col=0)
cnv_data = pd.read_csv('./CCLE_preprocessed/cna_data_train.csv', header=0, index_col=0)
clinical_data = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)
mtb_data = pd.read_csv('./CCLE_preprocessed/metabolomics_data_train.csv', header=0, index_col=0)
miRNA_data = pd.read_csv('./CCLE_preprocessed/miRNA_data_train.csv', header=0, index_col=0)
rppa_data = pd.read_csv('./CCLE_preprocessed/rppa_data_train.csv', header=0, index_col=0)

N = clinical_data.shape[0]
print('total number of patients = {}'.format(N))

# Modality name -> table. The keys double as the file-name prefixes of the
# "*_data_test.csv" / "*_data_test_pred.csv" files used below.
data_dict = {'RNA': RNA_data, 'meth': methyl_data, 'cna': cnv_data, 'metabolomics':mtb_data, 'miRNA':miRNA_data, 'rppa':rppa_data}

# Missingness report per modality, based on NaNs in the first column only.
for i, j in data_dict.items():
    print('{} data missing {}/{}, missing proportion = {}'.format(i, j.iloc[:, 0].isna().sum(), j.shape[0], np.round(j.iloc[:, 0].isna().mean(), 3)))

test = MIND(data_dict=data_dict, device=device, emb_dim=emb_dim).to(device)
test.my_train(epoch, lr=lr)

# Embedding of the retrained model; it is not used again inside this cell.
with torch.no_grad():
    z_emb = test.get_embedding()[0].cpu().numpy()

# pred is indexed positionally below.
# NOTE(review): assumes predict() returns one tensor per modality, in data_dict order - confirm.
pred = test.predict()

# Create the output directory here so that this cell does not depend on an
# earlier cell having created it.
os.makedirs('./CCLE_results', exist_ok=True)

for i, nammme in enumerate(data_dict.keys()):
    test_obs = pd.read_csv('./CCLE_preprocessed/{}_data_test.csv'.format(nammme), header=0, index_col=0)
    # detach().cpu() makes the conversion valid for CUDA or grad-tracking
    # tensors as well; for a plain CPU tensor it yields the same array as .numpy().
    pred_values = pred[i].detach().cpu().numpy()
    # Label rows with the training-table index, then keep only the test samples.
    hms = pd.DataFrame(pred_values, index=RNA_data.index).loc[test_obs.index]
    hms.to_csv('./CCLE_results/{}_data_test_pred.csv'.format(nammme))

total number of patients = 1088
RNA data missing 163/1088, missing proportion = 0.15
meth data missing 323/1088, missing proportion = 0.297
cna data missing 183/1088, missing proportion = 0.168
metabolomics data missing 249/1088, missing proportion = 0.229
miRNA data missing 221/1088, missing proportion = 0.203
rppa data missing 273/1088, missing proportion = 0.251
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


In [4]:
# Pearson correlation between predicted and observed test values, one number
# per modality, computed over all entries of the flattened tables.
mods = ['RNA', 'meth', 'cna', 'metabolomics', 'miRNA', 'rppa']

# NaN-filled result frame sized from `mods`, so that editing the list cannot
# leave the frame with the wrong number of rows.
res_corr = pd.DataFrame(np.nan, columns=['Pearson Correlaiton'], index=mods)
for mod in mods:
    pred_df = pd.read_csv('./CCLE_results/{}_data_test_pred.csv'.format(mod), header=0, index_col=0)
    obs_df = pd.read_csv('./CCLE_preprocessed/{}_data_test.csv'.format(mod), header=0, index_col=0)
    # ravel() pairs entries purely by position; tables with the same element
    # count but a different layout would otherwise be correlated silently.
    if pred_df.shape != obs_df.shape:
        raise ValueError('{}: predicted shape {} does not match observed shape {}'.format(
            mod, pred_df.shape, obs_df.shape))
    pred = pred_df.to_numpy().ravel()
    obs = obs_df.to_numpy().ravel()
    res_corr.loc[mod, 'Pearson Correlaiton'] = np.corrcoef(pred, obs)[0, 1]
print(res_corr)

              Pearson Correlaiton
RNA                      0.571903
meth                     0.446769
cna                      0.174209
metabolomics             0.337860
miRNA                    0.244199
rppa                     0.378902
