In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.cluster import SpectralClustering

from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.manifold import TSNE

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

pandas2ri.activate()

import magic
import scprep

%matplotlib inline

# from sklearnex import patch_sklearn
# patch_sklearn()

import warnings

from sklearn.cluster import KMeans
from tqdm import tqdm

# r('''library(Seurat)''')

In [2]:
def get_cluster_metrics(pred, labels):
    ari_res = []
    ami_res = []
    nmi_res = []

#     try:
#         r.assign("data", pred.T)
#         seurat = r('''
#             countsData = data.frame(data)
#             pbmc <- CreateSeuratObject(counts = countsData, project = "thal_single_cell", min.cells = 1, min.features = 1)
#             pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", verbose=FALSE)
#             all.genes <- rownames(pbmc)
#             pbmc <- ScaleData(pbmc, features = all.genes, verbose=FALSE)
#             pbmc <- RunPCA(pbmc, verbose=FALSE)
#             pbmc <- FindNeighbors(pbmc, verbose=FALSE)
#             pbmc <- FindClusters(pbmc, verbose=FALSE)
#             Idents(pbmc)
#         ''')
#         ari_res.append(adjusted_rand_score(labels, seurat))
#         ami_res.append(adjusted_mutual_info_score(labels, seurat))
#         nmi_res.append(normalized_mutual_info_score(labels, seurat))
#     except Exception as e:
#         pass

    pred_ = KMeans(n_clusters=len(np.unique(labels)), random_state=42).fit_predict(pred)

    ari_res.append(adjusted_rand_score(labels, pred_))
    ami_res.append(adjusted_mutual_info_score(labels, pred_))
    nmi_res.append(normalized_mutual_info_score(labels, pred_))

    warnings.filterwarnings("error")

    affinities = ['cosine', 'linear', 'poly']

    for i in affinities:
        try:
            pred_ = SpectralClustering(
                n_clusters=len(np.unique(labels)), 
                random_state=42, 
                affinity=i
            ).fit_predict(pred)
            ari_res.append(adjusted_rand_score(labels, pred_))
            ami_res.append(adjusted_mutual_info_score(labels, pred_))
            nmi_res.append(normalized_mutual_info_score(labels, pred_))
        except:
            ari_res.append(0)
            ami_res.append(0)
            nmi_res.append(0)

    warnings.resetwarnings()
    
    return max(ari_res), max(ami_res), max(nmi_res)

In [3]:
def get_data(i):
    df = pd.read_csv('../data/{}/data.csv.gz'.format(i), index_col=0)
    tmp = np.sign(df)
    cols = (np.sum(tmp) > int((df.shape[0])*0.05))
    rows = (np.sum(tmp, axis=1) > int((df.shape[1])*0.05))
    df = np.log(df.loc[rows, cols] + 1)
    df_norm = df.copy()
    df_norm = scprep.normalize.library_size_normalize(df_norm)    
    df_norm = scprep.transform.sqrt(df_norm)
    X_norm = pd.DataFrame(df_norm, columns=df.columns)
    labels = df.index
    return X_norm, labels

In [4]:
dir_list = !ls ../data/ | grep -v zip
dir_list

['baron',
 'bmcite',
 'brosens',
 'carey',
 'cbmc',
 'chang',
 'Fujii',
 'hcabm40k',
 'hrvatin',
 'jakel',
 'jiang',
 'loureiro',
 'manno',
 'mingyao',
 'pbmc3k',
 'Selewa',
 'Xu']

In [5]:
len(dir_list)

17

In [6]:
res = []
for i in tqdm(dir_list):
    X_norm, labels = get_data(i)
    pred = magic.MAGIC().fit_transform(X_norm)
    res.append(get_cluster_metrics(pred, labels))

  0%|          | 0/17 [00:00<?, ?it/s]

Calculating MAGIC...
  Running MAGIC on 1791 cells and 8669 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 1.07 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.25 seconds.
    Calculating affinities...
    Calculated affinities in 0.24 seconds.
  Calculated graph and diffusion operator in 1.59 seconds.
  Running MAGIC with `solver='exact'` on 8669-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.26 seconds.
Calculated MAGIC in 1.89 seconds.


  6%|▌         | 1/17 [00:15<04:01, 15.10s/it]

Calculating MAGIC...
  Running MAGIC on 5445 cells and 5421 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 2.41 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 1.88 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 1.84 seconds.
  Calculated graph and diffusion operator in 6.15 seconds.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 1.39 seconds.
Calculated MAGIC in 7.60 seconds.


 12%|█▏        | 2/17 [01:43<14:36, 58.44s/it]

Calculating MAGIC...
  Running MAGIC on 3567 cells and 8102 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 2.18 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.87 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.85 seconds.
  Calculated graph and diffusion operator in 3.93 seconds.
  Running MAGIC with `solver='exact'` on 8102-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 1.23 seconds.
Calculated MAGIC in 5.21 seconds.


 18%|█▊        | 3/17 [02:35<12:51, 55.11s/it]

Calculating MAGIC...
  Running MAGIC on 5362 cells and 9582 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 9.07 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 3.40 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 3.31 seconds.
  Calculated graph and diffusion operator in 15.85 seconds.
  Running MAGIC with `solver='exact'` on 9582-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 5.27 seconds.
Calculated MAGIC in 21.22 seconds.


 24%|██▎       | 4/17 [04:52<18:57, 87.48s/it]

Calculating MAGIC...
  Running MAGIC on 2492 cells and 4531 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 1.91 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.48 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.45 seconds.
  Calculated graph and diffusion operator in 2.85 seconds.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 0.42 seconds.
Calculated MAGIC in 3.30 seconds.


 29%|██▉       | 5/17 [05:45<15:02, 75.18s/it]

Calculating MAGIC...
  Running MAGIC on 1471 cells and 1002 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 0.22 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.19 seconds.
    Calculating affinities...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):
  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.16 seconds.
  Calculated graph and diffusion operator in 0.57 seconds.
  Calculating imputation...
  Calculated imputation in 0.03 seconds.
Calculated MAGIC in 0.61 seconds.


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):
 35%|███▌      | 6/17 [06:35<12:12, 66.58s/it]

Calculating MAGIC...
  Running MAGIC on 2482 cells and 10689 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 4.23 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.47 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.45 seconds.
  Calculated graph and diffusion operator in 5.18 seconds.
  Running MAGIC with `solver='exact'` on 10689-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 0.68 seconds.
Calculated MAGIC in 5.92 seconds.


 41%|████      | 7/17 [07:09<09:20, 56.05s/it]

Calculating MAGIC...
  Running MAGIC on 4064 cells and 5125 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 10.81 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 1.77 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 1.72 seconds.
  Calculated graph and diffusion operator in 14.35 seconds.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 1.95 seconds.
Calculated MAGIC in 16.37 seconds.


 47%|████▋     | 8/17 [09:19<11:54, 79.42s/it]

Calculating MAGIC...
  Running MAGIC on 4533 cells and 7992 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 10.73 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.98 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.91 seconds.
  Calculated graph and diffusion operator in 12.67 seconds.
  Running MAGIC with `solver='exact'` on 7992-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 2.83 seconds.
Calculated MAGIC in 15.59 seconds.


 53%|█████▎    | 9/17 [11:13<12:01, 90.19s/it]

Calculating MAGIC...
  Running MAGIC on 2913 cells and 6286 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 4.71 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.47 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.46 seconds.
  Calculated graph and diffusion operator in 5.65 seconds.
  Running MAGIC with `solver='exact'` on 6286-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 0.95 seconds.
Calculated MAGIC in 6.65 seconds.


 59%|█████▉    | 10/17 [12:37<10:19, 88.53s/it]

Calculating MAGIC...
  Running MAGIC on 1014 cells and 14568 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 4.86 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.11 seconds.
    Calculating affinities...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):
  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.13 seconds.
  Calculated graph and diffusion operator in 5.15 seconds.
  Running MAGIC with `solver='exact'` on 14568-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.19 seconds.
Calculated MAGIC in 5.40 seconds.


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):
 65%|██████▍   | 11/17 [13:11<07:09, 71.65s/it]

Calculating MAGIC...
  Running MAGIC on 6656 cells and 10124 genes.
  Calculating graph and diffusion operator...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(


    Calculating PCA...


  with _logger.task("PCA"):


    Calculated PCA in 13.85 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 4.08 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 4.03 seconds.
  Calculated graph and diffusion operator in 22.06 seconds.
  Running MAGIC with `solver='exact'` on 10124-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 8.93 seconds.
Calculated MAGIC in 31.16 seconds.


 71%|███████   | 12/17 [14:59<06:54, 82.83s/it]

Calculating MAGIC...
  Running MAGIC on 1943 cells and 10502 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 10.94 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.44 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.39 seconds.
  Calculated graph and diffusion operator in 11.80 seconds.
  Running MAGIC with `solver='exact'` on 10502-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 0.69 seconds.
Calculated MAGIC in 12.56 seconds.


 76%|███████▋  | 13/17 [16:15<05:22, 80.74s/it]

Calculating MAGIC...
  Running MAGIC on 6600 cells and 5564 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 4.61 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 3.36 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 3.82 seconds.
  Calculated graph and diffusion operator in 11.85 seconds.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 5.53 seconds.
Calculated MAGIC in 17.45 seconds.


 82%|████████▏ | 14/17 [18:06<04:29, 89.81s/it]

Calculating MAGIC...
  Running MAGIC on 2024 cells and 3857 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 8.07 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 0.33 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 0.43 seconds.
  Calculated graph and diffusion operator in 8.85 seconds.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 0.47 seconds.
Calculated MAGIC in 9.35 seconds.


 88%|████████▊ | 15/17 [19:06<02:41, 80.81s/it]

Calculating MAGIC...
  Running MAGIC on 479 cells and 4021 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 1.93 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.05 seconds.
    Calculating affinities...
    Calculated affinities in 0.02 seconds.
  Calculated graph and diffusion operator in 2.01 seconds.
  Calculating imputation...
  Calculated imputation in 0.01 seconds.
Calculated MAGIC in 2.03 seconds.


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):
  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(
  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):
 94%|█████████▍| 16/17 [19:58<01:12, 72.24s/it]

Calculating MAGIC...
  Running MAGIC on 4252 cells and 7389 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...


  with _logger.task("MAGIC"):
  _logger.info(
  with _logger.task("graph and diffusion operator"):
  _logger.debug(msg)
  _logger.debug(
  with _logger.task("PCA"):


    Calculated PCA in 5.36 seconds.
    Calculating KNN search...


  _logger.debug("Initializing kernel...")
  with _logger.task("KNN search"):


    Calculated KNN search in 1.12 seconds.
    Calculating affinities...


  with _logger.task("affinities"):
  _logger.debug(
  _logger.debug(


    Calculated affinities in 1.10 seconds.
  Calculated graph and diffusion operator in 7.60 seconds.
  Running MAGIC with `solver='exact'` on 7389-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...


  _logger.debug("Using addition symmetrization.")
  with _logger.task("imputation"):


  Calculated imputation in 2.19 seconds.
Calculated MAGIC in 9.85 seconds.


100%|██████████| 17/17 [21:40<00:00, 76.52s/it]


In [11]:
pd.DataFrame(res, columns=['ARI', 'AMI', 'NMI'], index=dir_list)

Unnamed: 0,ARI,AMI,NMI
baron,0.765658,0.835604,0.839344
bmcite,0.588317,0.682699,0.683016
brosens,0.468512,0.649397,0.651711
carey,0.743721,0.778168,0.778936
cbmc,0.662201,0.746191,0.75107
chang,0.161977,0.302264,0.324759
Fujii,0.443908,0.565823,0.568675
hcabm40k,0.034687,0.05168,0.054587
hrvatin,0.777089,0.867684,0.872608
jakel,0.411675,0.587467,0.589291
