In [None]:
import scanpy as sc
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
import matplotlib.pyplot as plt

In [None]:
# Load the expression matrix and transpose it immediately after reading.
# Later cells put barcodes on obs and genes on var, so the file is presumably
# stored genes x cells -- TODO confirm against the data source.
adata = sc.read_h5ad('data/gene_sorted_filtered_matrix.h5ad').transpose()

# Axis labels: tab-separated files without a header row.
barcodes = pd.read_table('data/barcodes_filtered.tsv', header=None)
genes = pd.read_table('data/genes.tsv', header=None)

# Reference annotation: tab-separated, first row is the header.
ground_truth_labels = pd.read_table('data/ground_truth_labels.tsv')


In [None]:
# Name the matrix axes from the first column of the barcode and gene tables.
adata.obs_names = barcodes[0].values
adata.var_names = genes[0].values

# Build a barcode -> cell-type lookup WITHOUT rebinding `ground_truth_labels`.
# Rebinding it to the "NAME"-indexed frame removed the "NAME" column, so running
# this cell a second time raised KeyError; a separate name keeps the cell re-runnable.
label_lookup = ground_truth_labels.set_index("NAME")["New_cellType"]
adata.obs['ground_truth_labels'] = adata.obs_names.map(label_lookup)

# Barcodes absent from the annotation table map to NaN; surface that instead of
# letting it pass silently into plotting and scoring.
n_unlabelled = int(adata.obs['ground_truth_labels'].isna().sum())
if n_unlabelled:
    print(f"Warning: {n_unlabelled} cells have no ground-truth label")

# Sanity check: rows of adata should match barcodes, columns should match genes.
print(adata.shape)
print(barcodes.shape)
print(genes.shape)

In [None]:
# Basic quality control. Order matters: cells are filtered first, so the
# per-gene cell counts used by the second call are taken over the kept cells.
sc.pp.filter_cells(adata, min_genes=100)  # min_genes threshold applied per cell
sc.pp.filter_genes(adata, min_cells=3)    # min_cells threshold applied per gene

# Report the (cells, genes) dimensions after filtering.
print((adata.n_obs, adata.n_vars))

In [None]:
# Normalise per-cell totals with scanpy's defaults (no explicit target_sum is
# passed), then apply the log1p transform. The return values are discarded, so
# the results are taken from `adata` itself.
# NOTE(review): re-running this cell would transform already-transformed
# values -- reload the data before running it again.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [None]:
# Baseline embedding and clustering pipeline, all with scanpy defaults except
# where noted. The steps are order-dependent: the neighbour graph is built
# after PCA, and both UMAP and Leiden are run after the neighbour graph.
# NOTE(review): no random_state is passed anywhere, so reproducibility relies
# on scanpy's default seeds -- TODO confirm for the installed version.
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# Leiden clustering via the igraph backend, limited to 2 iterations; the
# cluster assignments are later read from adata.obs['leiden'].
sc.tl.leiden(adata, flavor="igraph", n_iterations=2)

In [None]:
# Side-by-side UMAPs: predicted Leiden clusters (left) vs. reference labels (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# One (obs column, panel title) pair per axis, drawn left to right.
panels = [
    ("leiden", "Baseline Cluster Predictions"),
    ("ground_truth_labels", "Ground Truth Clusters"),
]
for panel_ax, (obs_key, panel_title) in zip(ax, panels):
    # show=False defers rendering so both panels land in the same figure.
    sc.pl.umap(adata, color=obs_key, ax=panel_ax, title=panel_title, show=False)

plt.tight_layout()
plt.show()

In [None]:
# Score the Leiden clustering against the reference annotation.
# Cells whose barcode had no entry in the annotation table carry a missing
# label. Missing values are not valid cluster labels for the scikit-learn
# metrics, so only labelled cells are scored and the number dropped is reported.
has_label = adata.obs['ground_truth_labels'].notna()
n_unlabelled = int((~has_label).sum())
if n_unlabelled:
    print(f"Excluding {n_unlabelled} cells without a ground-truth label from scoring")

# NOTE: this rebinds `ground_truth_labels` from the annotation table to an
# array of per-cell labels; earlier cells that treat it as a DataFrame must not
# be re-run afterwards without reloading the data.
ground_truth_labels = adata.obs.loc[has_label, 'ground_truth_labels'].values
leiden_labels = adata.obs.loc[has_label, 'leiden'].values

# Both metrics are chance-adjusted and invariant to how clusters are named.
ami = adjusted_mutual_info_score(ground_truth_labels, leiden_labels)
ari = adjusted_rand_score(ground_truth_labels, leiden_labels)

print("Adjusted Mutual Information: ", ami)
print("Adjusted Rand Index: ", ari)
