In [1]:
import pandas as pd
import numpy as np
import anndata
import json
from collections import defaultdict

In [2]:
gene_clusters = anndata.read_h5ad('GeneClustering.h5ad')
gene_clusters = gene_clusters.obs['leiden']

In [3]:
total_corr = pd.read_msgpack('RelatedCorr.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
dmr_records = []
with pd.HDFStore('FinalDMGDMR.h5') as f:
    for k in f.keys():
        if k.endswith('Corr'):
            corr_index = f[k]
        else:
            continue
        this_corr = total_corr.loc[corr_index].copy()
        this_corr['gene_cluster'] = this_corr['Gene'].map(gene_clusters)
        dmr_records.append(this_corr[['DMR', 'gene_cluster']])
dmr_records = pd.concat(dmr_records).reset_index(drop=True).drop_duplicates()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  


In [5]:
cluster_dmr_index = dmr_records.groupby('gene_cluster').apply(lambda i: i['DMR'].tolist()).to_dict()

In [6]:
with open('GeneCluster.relatedDMR.index.json', 'w') as f:
    json.dump(cluster_dmr_index, f)

In [7]:
for cluster, dmrs in cluster_dmr_index.items():
    print(f'Cluster {cluster}', len(dmrs), sep='\t')

Cluster 0	9133
Cluster 1	5870
Cluster 2	18999
Cluster 3	11921
Cluster 4	5227
Cluster 5	5790
Cluster 6	11745
Cluster 7	7535
Cluster 8	7953
