In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.cluster import SpectralClustering

from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.manifold import TSNE

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

pandas2ri.activate()

# import magic
import scprep

%matplotlib inline

# from sklearnex import patch_sklearn
# patch_sklearn()

import warnings

from sklearn.cluster import KMeans
from tqdm import tqdm

# r('''library(Seurat)''')

from dca.api import dca

  _config = yaml.load(open(_config_path))


In [2]:
def get_cluster_metrics(pred, labels):
    """Score how well clusterings of ``pred`` recover the reference ``labels``.

    ``pred`` is clustered once with KMeans and once with SpectralClustering
    for each of the affinities 'cosine', 'linear' and 'poly'. Every run asks
    for as many clusters as there are distinct values in ``labels`` and uses
    ``random_state=42``. Each clustering is compared with ``labels`` using
    the adjusted Rand index (ARI), adjusted mutual information (AMI) and
    normalized mutual information (NMI).

    Parameters
    ----------
    pred : array-like
        Data handed unchanged to ``fit_predict`` of both clusterers
        (presumably samples x features -- confirm against callers).
    labels : array-like
        Reference label for each sample in ``pred``.

    Returns
    -------
    tuple
        ``(max ARI, max AMI, max NMI)``. Each maximum is taken independently
        over all clusterings, so the three numbers may come from different
        algorithms.

    Notes
    -----
    While the spectral clusterings run, every warning is promoted to an
    exception; a run that warns or fails contributes a score of 0 for all
    three metrics instead of aborting the evaluation.
    """
    n_clusters = len(np.unique(labels))
    ari_res = []
    ami_res = []
    nmi_res = []

    def _record(assignments):
        # Append the three agreement scores of one clustering.
        ari_res.append(adjusted_rand_score(labels, assignments))
        ami_res.append(adjusted_mutual_info_score(labels, assignments))
        nmi_res.append(normalized_mutual_info_score(labels, assignments))

    # NOTE: a Seurat-based clustering (driven through rpy2) used to be
    # attempted here as well; it was already disabled and has been dropped.

    # KMeans runs under the ambient warning filters.
    _record(KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pred))

    # catch_warnings() restores the caller's filters on exit, even if
    # something unexpected propagates. The previous
    # filterwarnings()/resetwarnings() pair wiped every filter installed in
    # the kernel instead of restoring them.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")
        for affinity in ('cosine', 'linear', 'poly'):
            try:
                _record(
                    SpectralClustering(
                        n_clusters=n_clusters,
                        random_state=42,
                        affinity=affinity,
                    ).fit_predict(pred)
                )
            except Exception:
                # Best effort: a failed or warning run scores 0.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                ari_res.append(0)
                ami_res.append(0)
                nmi_res.append(0)

    return max(ari_res), max(ami_res), max(nmi_res)

In [3]:
def get_data(i, min_frac=0.05, data_dir='../data'):
    """Load one dataset and drop sparsely populated rows and columns.

    Reads ``<data_dir>/<i>/data.csv.gz`` using the first CSV column as the
    index, then filters the table:

    * a column is kept when the sum of the signs of its entries is greater
      than ``int(n_rows * min_frac)``;
    * a row is kept when the sum of the signs of its entries is greater
      than ``int(n_cols * min_frac)``.

    Both masks are computed on the unfiltered table. For non-negative data
    the sign sum equals the number of non-zero entries.

    Parameters
    ----------
    i : str
        Name of the dataset sub-directory.
    min_frac : float, optional
        Fraction used to derive both thresholds (default 0.05).
    data_dir : str, optional
        Directory that contains the dataset sub-directories
        (default ``'../data'``).

    Returns
    -------
    X : pandas.DataFrame
        The filtered table with its index replaced by 0..n-1.
    labels : pandas.Index
        The original index values of the rows that were kept.
    """
    df = pd.read_csv('{}/{}/data.csv.gz'.format(data_dir, i), index_col=0)
    signs = np.sign(df)
    # The axis is spelled out on purpose: np.sum(DataFrame) without an axis
    # depends on pandas treating axis=None as axis=0, which is deprecated in
    # favour of a full reduction to a scalar.
    keep_cols = signs.sum(axis=0) > int(df.shape[0] * min_frac)
    keep_rows = signs.sum(axis=1) > int(df.shape[1] * min_frac)
    df = df.loc[keep_rows, keep_cols]
    X = df.reset_index(drop=True)
    labels = df.index
    return X, labels

In [4]:
# Capture the entries of ../data/ through the shell, dropping every line that
# contains "zip"; the result is displayed as the cell output.
dir_list = !ls ../data/ | grep -v zip
dir_list

['baron',
 'bmcite',
 'brosens',
 'carey',
 'cbmc',
 'chang',
 'Fujii',
 'hcabm40k',
 'hrvatin',
 'jakel',
 'jiang',
 'loureiro',
 'manno',
 'mingyao',
 'pbmc3k',
 'Selewa',
 'Xu']

In [5]:
# Number of entries captured in dir_list.
len(dir_list)

17

In [6]:
# For every dataset: load the filtered table, run DCA on it, then score how
# well clusterings of the resulting matrix agree with the dataset's labels.
res = []
for i in tqdm(dir_list):
    X_norm, labels = get_data(i)
    adata = sc.AnnData(X_norm)
    # The return value is not used and adata.X is read right after, so dca
    # presumably updates `adata` in place -- confirm against the dca API docs.
    dca(adata, verbose=False)
    metrics = get_cluster_metrics(adata.X, labels)
    res.append(metrics)



dca: Successfully preprocessed 8669 genes and 1791 cells.






dca: Calculating reconstructions...




dca: Successfully preprocessed 5421 genes and 5445 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 8102 genes and 3567 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 9582 genes and 5362 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 4531 genes and 2492 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 1002 genes and 1471 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 10689 genes and 2482 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 5125 genes and 4064 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 7992 genes and 4533 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 6286 genes and 2913 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 14568 genes and 1014 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 10124 genes and 6656 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 10502 genes and 1943 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 5564 genes and 6600 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 3857 genes and 2024 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 4021 genes and 479 cells.




dca: Calculating reconstructions...




dca: Successfully preprocessed 7389 genes and 4252 cells.




dca: Calculating reconstructions...


100%|██████████| 17/17 [38:19<00:00, 135.26s/it]


In [7]:
# Each entry of `res` is the (ARI, AMI, NMI) tuple returned by
# get_cluster_metrics, so name the columns instead of showing 0/1/2.
# Rows follow the order in which the datasets were processed.
pd.DataFrame(res, columns=['ARI', 'AMI', 'NMI'])

Unnamed: 0,0,1,2
0,0.639186,0.756684,0.762076
1,0.442704,0.49924,0.499812
2,0.377462,0.550599,0.553496
3,0.479896,0.616351,0.61771
4,0.443218,0.615552,0.62274
5,0.150238,0.285529,0.308555
6,0.197755,0.321579,0.326047
7,0.03521,0.05322,0.056538
8,0.668088,0.815548,0.822436
9,0.463438,0.571807,0.573987
