In [1]:
import os
import sys
import argparse
import torch
import torch.multiprocessing
import torch.nn as nn
from torch.nn.modules.module import Module
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances, adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import snf
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.cluster import spectral_clustering, KMeans
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the star import hides which names come from MIND_model; the cells
# visible here only use `MIND`. Because it runs after `device` is assigned, a
# `device` exported by MIND_model would silently replace the one above -- TODO confirm.
from MIND_model import *
# Seed the NumPy and PyTorch global random number generators.
np.random.seed(31415)
torch.manual_seed(31415)

<torch._C.Generator at 0x1d505838750>

# Ablation studies using the CCLE dataset

In [2]:
# specify hyperparameters
# The multi-modal ablation cells below pass these explicitly as `epoch=epoch, lr=lr`.
lr = 1e-4     # forwarded to MIND.my_train as `lr`
epoch = 5000  # forwarded to MIND.my_train as its first positional argument

In [22]:
# Helper function for ablation
# testing using different models
def cls_bench(emb_size=64, perp=30, alpha=5e-2, beta=1.):
    """Score one ablation run and write the summaries to ``./CCLE_results_ablation``.

    Two CSV files are written, both tagged with
    ``perp=<perp>_emb=<emb_size>_alpha=<alpha>_beta=<beta>``:

    * ``cls_<tag>.csv`` -- one row holding the mean 5-fold cross-validated
      accuracy of five classifiers (columns RF, SVM, KNN, NN, XGB) that predict
      the label-encoded ``tcga_code`` from the saved embedding.
    * ``corr_<tag>.csv`` -- for each modality, the Pearson correlation between
      the flattened predicted and observed test matrices.

    Parameters
    ----------
    emb_size, perp, alpha, beta
        Used only to build the file-name tag. The values are formatted with
        ``str.format``, so they must be passed exactly as they were when the
        embedding / prediction files were written (``0.`` and ``0`` give
        different file names).

    Returns
    -------
    None
        Results are only written to disk.
    """
    results_dir = './CCLE_results_ablation'
    run_tag = 'perp={}_emb={}_alpha={}_beta={}'.format(perp, emb_size, alpha, beta)

    # Suppress warnings only for the duration of this call. The previous
    # `warnings.filterwarnings("ignore")` changed the kernel-wide filters and kept
    # hiding warnings in every cell executed afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        clinic_data = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)
        # Keep only rows whose tcga_code is present and not the placeholder label.
        has_label = np.logical_and(
            ~clinic_data['tcga_code'].isna(),
            clinic_data['tcga_code'] != 'UNABLE TO CLASSIFY',
        )
        labelled_names = clinic_data.index.to_numpy()[has_label]
        y = LabelEncoder().fit_transform(clinic_data['tcga_code'].loc[labelled_names])

        # Select the embedding rows in the same order as `y`.
        X = pd.read_csv(
            '{}/embeddings_{}.csv'.format(results_dir, run_tag), index_col=0
        ).loc[labelled_names]

        # The dict keys double as the column labels of the result table, so the
        # labels cannot drift out of step with the estimators.
        models = {
            'RF': RandomForestClassifier(n_estimators=100, random_state=42),
            'SVM': SVC(kernel='rbf', gamma='scale', random_state=42),
            'KNN': KNeighborsClassifier(n_neighbors=5),
            'NN': MLPClassifier(
                hidden_layer_sizes=(100, 50),  # two hidden layers
                max_iter=1000,  # increased iterations for convergence
                random_state=42,
            ),
            'XGB': xgb.XGBClassifier(
                objective='multi:softmax',
                num_class=len(np.unique(y)),
                eval_metric='mlogloss',
            ),
        }

        # Stratified folds keep the class proportions similar across folds.
        cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        mean_accuracy = []
        for model in models.values():
            # Standardise the features inside the pipeline so the scaler is
            # refitted on each training fold.
            pipeline = make_pipeline(StandardScaler(), model)
            scores = cross_val_score(pipeline, X, y, cv=cv_strategy, scoring='accuracy')
            mean_accuracy.append(scores.mean())

        res_cls = pd.DataFrame([mean_accuracy], columns=list(models))

        mods = ['RNA', 'meth', 'cna', 'metabolomics', 'miRNA', 'rppa']
        # The misspelt header is kept as-is: it is part of the CSV files on disk.
        corr_col = 'Pearson Correlaiton'
        res_corr = pd.DataFrame(np.nan, index=mods, columns=[corr_col])
        for mod in mods:
            pred = pd.read_csv(
                '{}/{}_data_test_pred_{}.csv'.format(results_dir, mod, run_tag),
                header=0, index_col=0,
            ).to_numpy().ravel()
            obs = pd.read_csv(
                './CCLE_preprocessed/{}_data_test.csv'.format(mod),
                header=0, index_col=0,
            ).to_numpy().ravel()
            # Off-diagonal entry of the 2x2 correlation matrix.
            res_corr.loc[mod, corr_col] = np.corrcoef(pred, obs)[0, 1]

        res_corr.to_csv('{}/corr_{}.csv'.format(results_dir, run_tag))
        res_cls.to_csv('{}/cls_{}.csv'.format(results_dir, run_tag))

    



        




def _load_ccle_modalities(suffix=''):
    """Read the six CCLE modality tables into a dict keyed by modality name.

    ``suffix`` is inserted between ``<name>_data`` and ``.csv``: ``''`` reads
    ``<name>_data.csv`` and ``'_train'`` reads ``<name>_data_train.csv``.
    """
    names = ['RNA', 'meth', 'cna', 'metabolomics', 'miRNA', 'rppa']
    return {
        name: pd.read_csv(
            './CCLE_preprocessed/{}_data{}.csv'.format(name, suffix),
            header=0, index_col=0,
        )
        for name in names
    }


def CCLE_ablation_helper(emb_size=64, perp=30, alpha=5e-2, beta=1., epoch=3000, lr=1e-4):
    """Train MIND twice for one hyperparameter setting and save its outputs.

    Phase 1 trains on the ``*_data.csv`` tables and writes the embedding to
    ``embeddings_<tag>.csv``. Phase 2 trains a fresh model on the
    ``*_data_train.csv`` tables and writes, per modality, the predictions for
    the rows listed in ``<modality>_data_test.csv`` to
    ``<modality>_data_test_pred_<tag>.csv``. All outputs go to
    ``./CCLE_results_ablation`` and ``<tag>`` is
    ``perp=<perp>_emb=<emb_size>_alpha=<alpha>_beta=<beta>``.

    Parameters
    ----------
    emb_size, perp, alpha, beta
        Forwarded to ``MIND`` as ``emb_dim``, ``perp``, ``alpha`` and ``beta``;
        also used to build the file-name tag.
    epoch, lr
        Forwarded to ``MIND.my_train``.

    Returns
    -------
    None
    """
    results_dir = './CCLE_results_ablation'
    run_tag = 'perp={}_emb={}_alpha={}_beta={}'.format(perp, emb_size, alpha, beta)

    # Create the output directory before any training so that an unwritable
    # location fails immediately rather than after a full training run.
    os.makedirs(results_dir, exist_ok=True)

    clinical_data = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)

    # ---- Phase 1: train on the full tables and save the embedding ----
    data_dict = _load_ccle_modalities()
    print(run_tag)

    test = MIND(data_dict=data_dict, device=device, emb_dim=emb_size, alpha=alpha, perp=perp, beta=beta).to(device)
    test.my_train(epoch, lr=lr)

    with torch.no_grad():
        z_emb = test.get_embedding()[0].cpu().numpy()

    # Embedding rows are labelled with the RNA table's index.
    pd.DataFrame(z_emb, index=data_dict['RNA'].index).to_csv(
        '{}/embeddings_{}.csv'.format(results_dir, run_tag)
    )

    # ---- Phase 2: train on the training tables and save test predictions ----
    data_dict = _load_ccle_modalities('_train')
    print('total number of patients = {}'.format(clinical_data.shape[0]))

    # Rebinding `test` drops the reference to the phase-1 model.
    test = MIND(data_dict=data_dict, device=device, emb_dim=emb_size, alpha=alpha, perp=perp, beta=beta).to(device)
    test.my_train(epoch, lr=lr)

    with torch.no_grad():
        # NOTE(review): the embedding is not used in this phase. The call is kept
        # only because it ran here originally and the side effects of
        # get_embedding (if any) are not visible from this notebook.
        test.get_embedding()

    pred = test.predict()
    train_index = data_dict['RNA'].index
    for i, mod_name in enumerate(data_dict.keys()):
        test_obs = pd.read_csv('./CCLE_preprocessed/{}_data_test.csv'.format(mod_name), header=0, index_col=0)
        # Keep only the rows that appear in this modality's test table.
        test_pred = pd.DataFrame(pred[i].cpu().numpy(), index=train_index).loc[test_obs.index]
        test_pred.to_csv('{}/{}_data_test_pred_{}.csv'.format(results_dir, mod_name, run_tag))

    




# Variants of MIND

In [4]:
# Loss-term ablation: switch prior tilting (beta) and regularisation (alpha) off in turn.
for overrides in (
    {'alpha': 0., 'beta': 0.},  # no prior tilting, no regularisation
    {'alpha': 0.},              # with prior tilting, no regularisation
    {'beta': 0.},               # no prior tilting, with regularisation
):
    CCLE_ablation_helper(epoch=epoch, lr=lr, **overrides)

perp=30_emb=64_alpha=0.0_beta=0.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=64_alpha=0.0_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=64_alpha=0.05_beta=0.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


In [6]:
# with prior tilting, with regularisation
# (alpha and beta are left at the helper's defaults, 5e-2 and 1.)
CCLE_ablation_helper(epoch=epoch, lr=lr)

perp=30_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


# Impact of the embedding size

In [8]:
# Sweep the embedding dimension; every other setting stays at the helper's default.
for emb_size in (8, 16, 32, 64, 128, 256):
    CCLE_ablation_helper(emb_size=emb_size, epoch=epoch, lr=lr)

perp=30_emb=16_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=32_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=128_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


# Impact of the perplexity parameter in t-SNE

In [9]:
# Sweep the perplexity; every other setting stays at the helper's default.
for perp in (8, 15, 30, 45, 60):
    CCLE_ablation_helper(perp=perp, epoch=epoch, lr=lr)

perp=8_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=15_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=30_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=45_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
perp=60_emb=64_alpha=0.05_beta=1.0
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
total number of patients = 1088
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


# storing results

In [23]:
# Score the four loss-term variants.
for overrides in (
    {'alpha': 0., 'beta': 0.},  # no prior tilting, no regularisation
    {'alpha': 0.},              # with prior tilting, no regularisation
    {'beta': 0.},               # no prior tilting, with regularisation
    {},                         # original
):
    cls_bench(**overrides)

In [24]:
# impact of the embedding size
for emb_size in (8, 16, 32, 64, 128, 256):
    cls_bench(emb_size=emb_size)

In [25]:
# impact of the perplexity parameter in t-SNE
for perp in (8, 15, 30, 45, 60):
    cls_bench(perp=perp)

# MIND based on single modalities

In [12]:
def CCLE_ablation_indv_helper(emb_size=64, perp=30, alpha=5e-2, beta=1., epoch=3000, lr=1e-4):
    """Train MIND on each modality alone and benchmark the resulting embedding.

    For every modality, rows whose first column is NaN are dropped, a MIND
    model is trained on that single table, and the embedding is written to
    ``./CCLE_results_ablation/embeddings_CCLE_<modality>_alone.csv``. Five
    classifiers are then scored with 5-fold stratified cross-validation on the
    task of predicting the label-encoded ``tcga_code`` from that embedding.

    Parameters
    ----------
    emb_size, perp, alpha, beta
        Forwarded to ``MIND`` as ``emb_dim``, ``perp``, ``alpha`` and ``beta``.
    epoch, lr
        Forwarded to ``MIND.my_train``.

    Returns
    -------
    pandas.DataFrame
        Mean cross-validated accuracy, one row per modality and one column per
        classifier (RF, SVM, KNN, NN, XGB).
    """
    results_dir = './CCLE_results_ablation'
    mods = ['RNA', 'meth', 'cna', 'metabolomics', 'miRNA', 'rppa']
    classifier_names = ['RF', 'SVM', 'KNN', 'NN', 'XGB']

    data_dict = {
        mod: pd.read_csv('./CCLE_preprocessed/{}_data.csv'.format(mod), header=0, index_col=0)
        for mod in mods
    }
    clinical_data_full = pd.read_csv('./CCLE_preprocessed/clinic_data.csv', header=0, index_col=0)

    # Create the output directory once, before any training, so an unwritable
    # location fails immediately.
    os.makedirs(results_dir, exist_ok=True)

    full_res = pd.DataFrame(np.zeros((len(mods), len(classifier_names))), index=mods, columns=classifier_names)

    for mod in mods:
        print(mod)
        data_ind = data_dict[mod]
        # Rows with a NaN in the first column are excluded
        # (presumably samples lacking this modality -- TODO confirm).
        observed = ~data_ind.iloc[:, 0].isna()
        mod_idx = data_ind.index[observed]
        data_ind_dict = {mod: data_ind.loc[observed]}
        clinic_data = clinical_data_full.loc[mod_idx]

        test = MIND(data_dict=data_ind_dict, device=device, emb_dim=emb_size, alpha=alpha, perp=perp, beta=beta).to(device)
        test.my_train(epoch, lr=lr)

        with torch.no_grad():
            z_emb = test.get_embedding()[0].cpu().numpy()

        emb_path = '{}/embeddings_CCLE_{}_alone.csv'.format(results_dir, mod)
        pd.DataFrame(z_emb, index=mod_idx).to_csv(emb_path)

        # Suppress warnings only around the benchmark. The previous
        # `warnings.filterwarnings("ignore")` changed the kernel-wide filters and
        # kept hiding warnings after this function returned.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Keep only rows whose tcga_code is present and not the placeholder label.
            has_label = np.logical_and(
                ~clinic_data['tcga_code'].isna(),
                clinic_data['tcga_code'] != 'UNABLE TO CLASSIFY',
            )
            labelled_names = clinic_data.index.to_numpy()[has_label]
            y = LabelEncoder().fit_transform(clinic_data['tcga_code'].loc[labelled_names])
            # The embedding is read back from the file just written (as before),
            # with rows selected in the same order as `y`.
            X = pd.read_csv(emb_path, index_col=0).loc[labelled_names]

            # Listed in the same order as `classifier_names`.
            models = [
                RandomForestClassifier(n_estimators=100, random_state=42),
                SVC(kernel='rbf', gamma='scale', random_state=42),
                KNeighborsClassifier(n_neighbors=5),
                MLPClassifier(
                    hidden_layer_sizes=(100, 50),  # two hidden layers
                    max_iter=1000,  # increased iterations for convergence
                    random_state=42,
                ),
                xgb.XGBClassifier(
                    objective='multi:softmax',
                    num_class=len(np.unique(y)),
                    eval_metric='mlogloss',
                ),
            ]

            # Stratified folds keep the class proportions similar across folds.
            cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            mean_accuracy = []
            for model in models:
                # Standardise the features inside the pipeline so the scaler
                # is refitted on each training fold.
                pipeline = make_pipeline(StandardScaler(), model)
                scores = cross_val_score(pipeline, X, y, cv=cv_strategy, scoring='accuracy')
                mean_accuracy.append(scores.mean())

        full_res.loc[mod] = np.array(mean_accuracy)
    return full_res


# `ans` holds the modality x classifier accuracy table returned by the helper.
# NOTE(review): this call relies on the helper's defaults (epoch=3000, lr=1e-4)
# instead of the notebook-level `epoch`/`lr` (5000, 1e-4) that the multi-modal
# runs pass explicitly -- confirm the shorter training is intentional before
# comparing the two sets of results.
ans = CCLE_ablation_indv_helper()

RNA
Epoch=0
Epoch=1000
Epoch=2000
meth
Epoch=0
Epoch=1000
Epoch=2000
cna
Epoch=0
Epoch=1000
Epoch=2000
metabolomics
Epoch=0
Epoch=1000
Epoch=2000
miRNA
Epoch=0
Epoch=1000
Epoch=2000
rppa
Epoch=0
Epoch=1000
Epoch=2000
