In [6]:
import pandas as pd
import os
from sklearn.metrics import normalized_mutual_info_score as NMI

# Number of parallel jobs; passed as n_jobs to kneighbors_graph further down.
n_cores = 20
# Random seed handed to la.find_partition in the clustering step.
seed = 888
# Directory that is scanned for the 'PCA*' CSV inputs.
data_path = '/data3/hratch/tc2c_analyses_1/natcomm_revisions/processed/'

In [27]:
# Load every file in data_path whose name starts with 'PCA' into a DataFrame
# (first CSV column used as the index). The dictionary key is the text between
# the first '-' and '.csv' in the file name, e.g. 'PCA-<method>.csv' -> '<method>'.
# File names are sorted because os.listdir returns entries in arbitrary order;
# the dict order determines the column order of downstream tables and which
# frame is treated as the "first" one.
pca_dfs = {
    fname.split('-')[1].split('.csv')[0]: pd.read_csv(os.path.join(data_path, fname), index_col=0)
    for fname in sorted(os.listdir(data_path))
    if fname.startswith('PCA')
}

In [40]:
# get the graph and clusters from PCA space - meant to emulate typical single-cell clustering pipeline
# this should probably be replaced with sc.pp.neighbors and sc.tl.leiden on an adata object
from sklearn.neighbors import kneighbors_graph
import leidenalg as la
import igraph as ig

def construct_graph(your_matrix):
    """Build an igraph ``Graph`` from the nonzero pattern of a matrix.

    Parameters
    ----------
    your_matrix
        Any object exposing ``.shape`` and ``.nonzero()`` (returning row and
        column index arrays), e.g. the sparse matrix from ``kneighbors_graph``.

    Returns
    -------
    ig.Graph
        A graph with ``max(your_matrix.shape)`` vertices and one edge per
        nonzero entry ``(i, j)``. The stored values themselves are not used.

    NOTE(review): the original author marked this helper as unverified
    ("if using this, make sure this is correct"). The graph is created with
    ``ig.Graph``'s default settings -- presumably undirected, so reciprocal
    neighbor pairs would yield parallel edges; confirm this is intended.
    """
    row_idx, col_idx = your_matrix.nonzero()
    n_vertices = max(your_matrix.shape)
    return ig.Graph(n_vertices, zip(row_idx.tolist(), col_idx.tolist()))

# One 15-nearest-neighbor graph per PCA embedding, keyed like pca_dfs.
neighbor_graphs = {
    method: kneighbors_graph(pca_df, n_neighbors=15, n_jobs=n_cores)
    for method, pca_df in pca_dfs.items()
}

# Partition each neighbor graph by modularity and keep only the per-vertex
# membership list; keys become '<method> Clusters'.
clusters = {
    method + ' Clusters': la.find_partition(
        construct_graph(knn_matrix),
        la.ModularityVertexPartition,
        seed=seed,
    ).membership
    for method, knn_matrix in neighbor_graphs.items()
}

In [68]:
# Cluster memberships are positional (one entry per graph vertex, built from the
# rows of each PCA frame), while the labels below come from a single frame.
# Every frame must therefore list the same data points in the same order,
# otherwise labels and cluster assignments would be silently misaligned.
reference_index = list(pca_dfs.values())[0].index
for method_name, pca_frame in pca_dfs.items():
    if not pca_frame.index.equals(reference_index):
        raise ValueError(
            "PCA frame '{}' does not share the row order of the first PCA frame; "
            "cluster labels cannot be aligned positionally".format(method_name)
        )

meta_labels = pd.Series(reference_index)
meta = pd.DataFrame(data = clusters)
# Data point names have the form '<cell type>_<sample>' (e.g. 'B_C100').
meta['Cell Type'] = meta_labels.apply(lambda x: x.split('_')[0])
meta['Sample'] = meta_labels.apply(lambda x: x.split('_')[1])
meta['Data Point'] = meta_labels
# this is the important output for batch correction analysis
meta.head()

Unnamed: 0,Log1P Clusters,NonZeroFraction Clusters,RawCounts Clusters,Cell Type,Sample,Data Point
0,4,0,0,B,C100,B_C100
1,0,3,1,Epithelial,C100,Epithelial_C100
2,2,2,1,Macrophages,C100,Macrophages_C100
3,4,0,0,NK,C100,NK_C100
4,4,0,0,T,C100,T_C100


In [66]:
# if batch correction is working, the NMI between cluster label and sample
# will be higher in batch correction processing method and lower without batch correction
for cluster_label in clusters:
    score = NMI(meta[cluster_label], meta['Sample'])
    # Keys look like '<method> Clusters' and contain no underscore, so splitting
    # on '_' would leave the suffix in the message; strip ' Clusters' instead.
    method_name = cluster_label.rsplit(' Clusters', 1)[0]
    print('NMI for {} method is: {}'.format(method_name, score))

NMI for Log1P Clusters method is: 0.21848833412166768
NMI for NonZeroFraction Clusters method is: 0.2143362608825604
NMI for RawCounts Clusters method is: 0.18613831604726722


In [67]:
# if batch correction is working, we may also see an improvement
# in the NMI between cluster label and cell type label in the batch correction method
for cluster_label in clusters:
    score = NMI(meta[cluster_label], meta['Cell Type'])
    # Keys look like '<method> Clusters' and contain no underscore, so splitting
    # on '_' would leave the suffix in the message; strip ' Clusters' instead.
    method_name = cluster_label.rsplit(' Clusters', 1)[0]
    print('NMI for {} method is: {}'.format(method_name, score))

NMI for Log1P Clusters method is: 0.4676131727957071
NMI for NonZeroFraction Clusters method is: 0.3893818069164669
NMI for RawCounts Clusters method is: 0.32517066438163206
