In [2]:
import os
import sys
import argparse
import torch
import torch.multiprocessing
import torch.nn as nn
from torch.nn.modules.module import Module
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances, adjusted_rand_score, normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import snf
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.cluster import spectral_clustering, KMeans
from sklearn.metrics import v_measure_score
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
from MIND import MIND

  from .autonotebook import tqdm as notebook_tqdm


# Synthetic data with high noise
This notebook demonstrates the training and testing process of the proposed method on the high-noise synthetic data. The procedure for the low-noise version is identical.
## Training 

In [11]:
# specify hyperparameters
emb_dim = 64
lr = 1e-4
epoch = 5000

In [13]:
np.random.seed(31415)
torch.manual_seed(31415)
for i, frac in enumerate((1+np.arange(9))/10):
    print(frac)
    sim_3 = pd.read_csv('./synthetic_data/sim_methyl_high.csv', index_col=0)
    sim_2 = pd.read_csv('./synthetic_data/sim_protein_high.csv', index_col=0)
    sim_1 = pd.read_csv('./synthetic_data/sim_expr_high.csv', index_col=0)
    sim_cls = pd.read_csv('./synthetic_data/sim_cls_high.csv', index_col=1)

    non_shared = list(np.random.choice(range(sim_1.shape[0]), size=int((1 - frac) * sim_1.shape[0]), replace=False))
    shared = list(set(range(sim_1.shape[0])) - set(non_shared))
    sim_1_presence = torch.tensor([_ in shared + non_shared[:len(non_shared) // 3] for _ in range(sim_1.shape[0])])
    sim_2_presence = torch.tensor(
        [_ in shared + non_shared[len(non_shared) // 3:(2 * len(non_shared) // 3)] for _ in range(sim_2.shape[0])])
    sim_3_presence = torch.tensor([_ in shared + non_shared[(2 * len(non_shared) // 3):] for _ in range(sim_3.shape[0])])

    sim_1 = torch.tensor(sim_1.to_numpy(), dtype=torch.float)
    sim_2 = torch.tensor(sim_2.to_numpy(), dtype=torch.float)
    sim_3 = torch.tensor(sim_3.to_numpy(), dtype=torch.float)
    sim_3 = torch.log(sim_3 / (1 - sim_3))

    sim_1_test = sim_1[~sim_1_presence] * 1.0
    sim_2_test = sim_2[~sim_2_presence] * 1.0
    sim_3_test = sim_3[~sim_3_presence] * 1.0
    test_list = [sim_1_test.to(device), sim_2_test.to(device), sim_3_test.to(device)]
    
    sim_1[~sim_1_presence] = float('nan')
    sim_2[~sim_2_presence] = float('nan')
    sim_3[~sim_3_presence] = float('nan')

    data_dict = {'RNA_expr': pd.DataFrame(sim_1.cpu().numpy()), 
                 'Protein': pd.DataFrame(sim_2.cpu().numpy()),
                 'DNA_methyl': pd.DataFrame(sim_3.cpu().numpy())}

    test = MIND(data_dict=data_dict, device=device, emb_dim=emb_dim).to(device)
    test.my_train(epoch, lr=lr)
    with torch.no_grad():
        z = test.get_embedding()[0].cpu().numpy()
    np.savetxt('./synth_results/embeddings_frac_{}.txt'.format(frac), z)

    reconstructed = test.predict()
    masks = [sim_1_presence, sim_2_presence, sim_3_presence]
    names = ['RNA_expr', 'Protein', 'DNA_methyl']
    for _ in range(len(reconstructed)):
        pred = reconstructed[_][~masks[_]].cpu().numpy()
        obs = test_list[_].cpu().numpy()
        np.savetxt('./synth_results/{}_missing_obs_high_noise_frac_{}.txt'.format(names[_], frac), obs.ravel())
        np.savetxt('./synth_results/{}_missing_pred_high_noise_frac_{}.txt'.format(names[_], frac), pred.ravel())

0.1
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.2
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.3
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.4
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.5
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.6
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.7
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.8
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000
0.9
Epoch=0
Epoch=1000
Epoch=2000
Epoch=3000
Epoch=4000


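## Downstream task 1: Clustering
Apply spectral clustering to the output embeddings and compare the resulting labels with the true membership. The metric is normalised mutual information, computed here with `v_measure_score` (identical to NMI with arithmetic-mean normalisation) and averaged over 10 clustering runs per fraction.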

In [20]:
cluster_number = 15  # the true number of clusters
n_repeats = 10       # spectral-clustering runs averaged per fraction
fractions = (1 + np.arange(9)) / 10

# The ground-truth labels do not depend on the fraction; read them once.
sim_cls = pd.read_csv('./synthetic_data/sim_cls_high.csv', index_col=1)
true_labels = sim_cls['cluster.id'].to_numpy()

my_res_NMI = pd.DataFrame(0., index=fractions, columns=['NMI'])
for frac in fractions:
    z = np.loadtxt('./synth_results/embeddings_frac_{}.txt'.format(frac))
    # The RBF affinity depends only on the embedding, so build it once per
    # fraction rather than once per repeat.
    affinity = pairwise_kernels(z, metric='rbf')
    scores = []
    for rep in range(n_repeats):
        # A distinct but fixed seed per repeat keeps the average reproducible
        # while still averaging over different initialisations.
        labels = spectral_clustering(affinity, n_clusters=cluster_number, random_state=rep)
        scores.append(v_measure_score(true_labels, labels))
    my_res_NMI.loc[frac, 'NMI'] = sum(scores) / n_repeats
print(my_res_NMI)

          NMI
0.1  0.589354
0.2  0.730374
0.3  0.787495
0.4  0.884238
0.5  0.902163
0.6  0.932714
0.7  0.963517
0.8  0.984615
0.9  0.996303


## Downstream task 2: Classification
Fit XGBoost classifiers to predict membership from the output embeddings, and report the accuracy averaged over 10-fold cross-validation.

In [22]:
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
n_splits = 10  # number of cross-validation folds (also the averaging divisor)
fractions = (1 + np.arange(9)) / 10

# The labels are the same for every fraction; read them once.
sim_cls = pd.read_csv('./synthetic_data/sim_cls_high.csv', index_col=1)
# Shift the ids down by one for XGBoost, which expects classes 0..num_class-1.
# NOTE(review): assumes 'cluster.id' is 1-based -- confirm against the CSV.
labels = sim_cls['cluster.id'].to_numpy() - 1

my_res_ACC = pd.DataFrame(0., index=fractions, columns=['Accuracy'])
for frac in fractions:
    z = np.loadtxt('./synth_results/embeddings_frac_{}.txt'.format(frac))
    # Fixed seed so the shuffled fold assignment is reproducible across runs.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=31415)
    fold_accuracies = []
    for train_idx, test_idx in kf.split(z):
        X_train, X_test = z[train_idx], z[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        model = xgb.XGBClassifier(
            objective='multi:softmax',  # or 'multi:softprob' if you want probabilities
            num_class=len(np.unique(labels)),
            eval_metric='mlogloss'
        )
        model.fit(X_train, y_train)
        # Predict classes and record the fraction predicted correctly.
        y_pred = model.predict(X_test)
        fold_accuracies.append(np.mean(y_pred == y_test))
    my_res_ACC.loc[frac, 'Accuracy'] = sum(fold_accuracies) / n_splits
print(my_res_ACC)

     Accuracy
0.1     0.786
0.2     0.860
0.3     0.890
0.4     0.948
0.5     0.946
0.6     0.952
0.7     0.964
0.8     0.964
0.9     0.976


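## Downstream task 3: Reconstruction
Predict the masked portion of the synthetic data from the output embeddings, and report the Pearson correlation between predicted and held-out values, averaged over the three modalities.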

In [26]:
fractions = np.array(range(1, 10)) / 10.
modalities = ['RNA_expr', 'Protein', 'DNA_methyl']

ans = pd.DataFrame(0., columns=['Averaged Correlation'], index=fractions)
for frac in fractions:
    anss = []
    for modality in modalities:
        # Load the flattened predictions and held-out observations for this
        # modality; the "pred"/"obs" file names match the variables they fill.
        pred = np.loadtxt('./synth_results/{}_missing_pred_high_noise_frac_{}.txt'.format(modality, frac))
        obs = np.loadtxt('./synth_results/{}_missing_obs_high_noise_frac_{}.txt'.format(modality, frac))
        # Off-diagonal entry of the 2x2 correlation matrix = Pearson r.
        anss.append(np.corrcoef(pred, obs)[1, 0])

    # Unweighted mean of the per-modality correlations.
    ans.loc[frac] = sum(anss) / len(anss)

print(ans)

     Averaged Correlation
0.1              0.695623
0.2              0.722080
0.3              0.736973
0.4              0.750532
0.5              0.763948
0.6              0.766739
0.7              0.782772
0.8              0.775191
0.9              0.802561
