## Imports

In [17]:
import numpy as np
import scanpy as sc 
import sklearn.metrics as sm

## Data preprocessing

We can work with the data either with or without PCA. Either way, let's first reduce the number of features to the 2,500 most highly variable genes.

In [3]:
# Load the processed BMMC dataset and keep only the gene-expression (GEX) features.
# NOTE(review): the file name says "cite" while the variable is called `multiome`;
# the name is kept so that other cells keep working — confirm which dataset is meant.
multiome = sc.read_h5ad("data/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad")
# Subsetting an AnnData returns a view. `.copy()` materialises it so that the
# later in-place steps (HVG selection, PCA) write into a real object instead of
# triggering an implicit copy and an ImplicitModificationWarning.
gex = multiome[:, multiome.var["feature_types"] == "GEX"].copy()

  utils.warn_names_duplicates("var")


In [5]:
# Feature selection: flag the 2500 most variable genes; the flag is read back
# from gex.var["highly_variable"] in the next cell.
# NOTE(review): the "seurat_v3" flavor is documented as expecting raw counts —
# confirm that gex.X holds counts in this "processed" file.
sc.pp.highly_variable_genes(
    gex,
    flavor="seurat_v3",
    n_top_genes=2500,
)

  self.data[key] = value


In [11]:
# Per-gene flag written by the highly-variable-gene selection step.
hvg_mask = gex.var["highly_variable"]
# Expression matrix restricted to the selected genes (no PCA applied).
gex_data = gex[:, hvg_mask].X
gex_data.shape  # expected: (90k, 2500)

(90261, 2500)

In [13]:
# Reduce the expression data to 30 principal components; the embedding is read
# back from gex.obsm["X_pca"] in the next cell.
# NOTE(review): presumably scanpy restricts the PCA to the genes flagged in
# gex.var["highly_variable"] by default — verify for the installed version.
sc.pp.pca(
    gex,
    n_comps=30,
)

In [14]:
# PCA embedding computed by the previous cell.
pca_embedding = gex.obsm["X_pca"]
# Copy it into a standalone float32 array so it can be used outside of AnnData.
gex_data_pcaed = np.array(pca_embedding, dtype=np.float32)
gex_data_pcaed.shape  # expected: (90k, 30)

(90261, 30)

In [21]:
# Annotated cell type of every cell; used as the reference labelling when the
# clustering is scored below.
gex_labels = gex.obs.loc[:, "cell_type"]
gex_labels.shape  # one label per cell

(90261,)

## Evaluation pipeline

Let's evaluate the PCA-reduced data with Leiden clustering. To do this, we need to wrap the array in an AnnData object again.

In [18]:
# Wrap the PCA embedding in a fresh AnnData so scanpy's graph tools can run on it.
gex_ad_eval = sc.AnnData(X=gex_data_pcaed)

# Build the neighbour graph directly on the matrix stored in .X.
sc.pp.neighbors(gex_ad_eval, use_rep="X")

# Leiden community detection; cluster labels are stored in gex_ad_eval.obs["leiden"].
sc.tl.leiden(gex_ad_eval, key_added="leiden", resolution=1)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Show the Leiden cluster assigned to each cell. This is a categorical pandas
# Series (one entry per cell), not a numpy array.
gex_ad_eval.obs["leiden"]

0        19
1         2
2        12
3         1
4        47
         ..
90256     3
90257     1
90258    38
90259    25
90260    42
Name: leiden, Length: 90261, dtype: category
Categories (54, object): ['0', '1', '2', '3', ..., '50', '51', '52', '53']

In [20]:
# Score the Leiden clustering against the annotated cell types.
# Both label vectors are extracted once and reused for every metric.
true_labels = gex_labels.values
cluster_labels = gex_ad_eval.obs["leiden"]

ari = sm.adjusted_rand_score(true_labels, cluster_labels)
ami = sm.adjusted_mutual_info_score(true_labels, cluster_labels)
homogeneity = sm.homogeneity_score(true_labels, cluster_labels)
completeness = sm.completeness_score(true_labels, cluster_labels)

# Report layout is unchanged except for the corrected spelling of "Homogeneity".
print("""
ARI: {}\n
AMI: {}\n
Homogeneity: {}\n
Completeness: {}
""".format(ari, ami, homogeneity, completeness))


ARI: 0.06248161009715558

AMI: 0.26549472386422207

Homogenuity: 0.30198739490647486

Completeness: 0.24136953972126018

