In [1]:
import os
import sys
import argparse
import torch
import torch.multiprocessing
import torch.nn as nn
from torch.nn.modules.module import Module
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances, adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import snf
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.cluster import spectral_clustering, KMeans
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt

device = 'cuda' if torch.cuda.is_available() else 'cpu'
from MIND_model import *  

  from .autonotebook import tqdm as notebook_tqdm


# CCMA dataset
We train and test the proposed model on the CCMA dataset.

In [2]:
# Hyperparameters shared by the MIND training runs in this notebook.
epoch = 5000   # first positional argument to MIND.my_train
lr = 1e-4      # passed to MIND.my_train as `lr`
emb_dim = 64   # passed to the MIND constructor as `emb_dim`

In [4]:
# Seed NumPy and PyTorch before the model is built and trained.
np.random.seed(31415)
torch.manual_seed(31415)

# Load the preprocessed CCMA tables; the first CSV column is used as the row index.
RNA_data = pd.read_csv('./CCMA_preprocessed/mRNA.csv', header=0, index_col=0)
methyl_data = pd.read_csv('./CCMA_preprocessed/meth.csv', header=0, index_col=0)
cnv_data = pd.read_csv('./CCMA_preprocessed/CNV.csv', header=0, index_col=0)
clinical_data = pd.read_csv('./CCMA_preprocessed/clinical.csv', header=0, index_col=0)

N = clinical_data.shape[0]
print('total number of patients = {}'.format(N))
data_dict = {'mRNA': RNA_data, 'methyl': methyl_data, 'CNV': cnv_data}

# Per modality, count the rows whose first column is NaN.
# NOTE(review): this presumably marks patients missing the whole modality — confirm
# against the preprocessing step.
for modality, frame in data_dict.items():
    first_col_missing = frame.iloc[:, 0].isna()
    print('{} data missing {}/{}, missing proportion = {}'.format(
        modality, first_col_missing.sum(), frame.shape[0], np.round(first_col_missing.mean(), 3)))

# Build the model on the selected device and train it.
test = MIND(data_dict=data_dict, device=device, emb_dim=emb_dim).to(device)
test.my_train(epoch, lr=lr)

# Take the first element returned by get_embedding() and move it to host memory.
with torch.no_grad():
    z_emb = test.get_embedding()[0].cpu().numpy()

# exist_ok=True replaces the separate isdir() check and is safe on re-runs.
os.makedirs('./CCMA_results', exist_ok=True)

# NOTE(review): labelling rows with RNA_data.index assumes the embedding rows follow
# that order — confirm against MIND.
pd.DataFrame(z_emb, index=RNA_data.index).to_csv('./CCMA_results/embeddings.csv')

total number of patients = 181
mRNA data missing 5/181, missing proportion = 0.028
methyl data missing 30/181, missing proportion = 0.166
CNV data missing 57/181, missing proportion = 0.315
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


# Downstream task 1: Cancer type classification
We fit XGBoost classifiers to predict the cancer types of patients from the output embeddings.

In [5]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier


clinic_data = pd.read_csv('./CCMA_preprocessed/clinical.csv', header=0, index_col=0)

# Keep only patients whose cancer type is in this list; per the original note this
# removes cancer types with sample size 1.
kept_types = ['H3K27-DMG', 'ATRT', 'H3WT-HGG', 'Adult HGG', 'OS', 'MDB', 'H3G34-DHG', 'MRT']
keep_mask = clinic_data['cancer_type_abbrev'].isin(kept_types).to_numpy()
non_missing_types_names = clinic_data.index.to_numpy()[keep_mask]
types_vec = clinic_data['cancer_type_abbrev'].loc[non_missing_types_names]

# Encode the cancer-type strings as integer class labels.
encoder = LabelEncoder()
labels = encoder.fit_transform(types_vec)

# Load the saved embeddings for the retained patients, in the same order as `labels`.
emb = pd.read_csv('./CCMA_results/embeddings.csv', index_col=0).loc[non_missing_types_names]

# Loop-invariant values, computed once instead of on every fold.
features = emb.to_numpy()
n_classes = len(np.unique(labels))

# Stratified 5-fold cross-validation; per-fold accuracies are summed in acc_temp.
skf = StratifiedKFold(n_splits=5)
acc_temp = 0.
for train_idx, test_idx in skf.split(emb, labels):
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    model = xgb.XGBClassifier(
        objective='multi:softmax',  # or 'multi:softprob' if you want probabilities
        num_class=n_classes,
        eval_metric='mlogloss'
    )
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc_temp += np.mean(y_pred == y_test)

# Average over the configured number of folds rather than a hard-coded 5.
print('Test accuracy = {}'.format(acc_temp / skf.get_n_splits()))




Test accuracy = 0.7925925925925926


# Downstream task 2: Reconstruction
We compare predicted vs. observed values in the same way as the TCGA example.

In [7]:
# Seed NumPy and PyTorch before the model is built and trained.
np.random.seed(31415)
torch.manual_seed(31415)

# Load the training split of each modality; the first CSV column is the row index.
RNA_data = pd.read_csv('./CCMA_preprocessed/mRNA_train.csv', header=0, index_col=0)
methyl_data = pd.read_csv('./CCMA_preprocessed/meth_train.csv', header=0, index_col=0)
cnv_data = pd.read_csv('./CCMA_preprocessed/CNV_train.csv', header=0, index_col=0)
clinical_data = pd.read_csv('./CCMA_preprocessed/clinical.csv', header=0, index_col=0)

N = clinical_data.shape[0]
print('total number of patients = {}'.format(N))
# The keys double as file-name prefixes for the '<key>_test.csv' files read below.
data_dict = {'mRNA': RNA_data, 'meth': methyl_data, 'CNV': cnv_data}

# Per modality, count the rows whose first column is NaN.
for modality, frame in data_dict.items():
    first_col_missing = frame.iloc[:, 0].isna()
    print('{} data missing {}/{}, missing proportion = {}'.format(
        modality, first_col_missing.sum(), frame.shape[0], np.round(first_col_missing.mean(), 3)))

# Build the model on the selected device and train it.
test = MIND(data_dict=data_dict, device=device, emb_dim=emb_dim).to(device)
test.my_train(epoch, lr=lr)

# Make sure the output directory exists even if the embedding cell was not run first.
os.makedirs('./CCMA_results', exist_ok=True)

# NOTE(review): pred[k] is assumed to correspond to the k-th key of data_dict, and its
# rows to RNA_data.index — confirm against MIND.predict.
pred = test.predict()
for k, mod_name in enumerate(data_dict.keys()):
    test_obs = pd.read_csv('./CCMA_preprocessed/{}_test.csv'.format(mod_name), header=0, index_col=0)
    # .detach().cpu() makes the conversion work for CUDA or grad-tracking tensors;
    # Tensor.numpy() alone raises in both of those cases.
    pred_values = pred[k].detach().cpu().numpy()
    # Keep only the predictions for the patients present in the held-out table.
    test_pred = pd.DataFrame(pred_values, index=RNA_data.index).loc[test_obs.index]
    test_pred.to_csv('./CCMA_results/{}_test_pred.csv'.format(mod_name))

total number of patients = 181
mRNA data missing 19/181, missing proportion = 0.105
meth data missing 43/181, missing proportion = 0.238
CNV data missing 68/181, missing proportion = 0.376
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


In [8]:
# Pearson correlation between predicted and observed held-out values, one row per modality.
mods = ['mRNA', 'meth', 'CNV']
corr_col = 'Pearson Correlaiton'  # label kept verbatim so the printed table is unchanged

# Size the NaN-initialised table from `mods` instead of a hard-coded row count.
res_corr = pd.DataFrame(np.full((len(mods), 1), np.nan), columns=[corr_col], index=mods)
for mod in mods:
    # Flatten both tables so a single correlation is computed over all entries.
    # NOTE(review): assumes the two CSVs share row and column order — confirm.
    pred = pd.read_csv('./CCMA_results/{}_test_pred.csv'.format(mod), header=0, index_col=0).to_numpy().ravel()
    obs = pd.read_csv('./CCMA_preprocessed/{}_test.csv'.format(mod), header=0, index_col=0).to_numpy().ravel()
    res_corr.loc[mod, corr_col] = np.corrcoef(pred, obs)[0, 1]
print(res_corr)

      Pearson Correlaiton
mRNA             0.549651
meth             0.648310
CNV              0.836352
