In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Global matplotlib rc defaults: one label size and one stroke width, reused below.
labelsize = 6
linewidth = 0.6
rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
for tick_axis in ('xtick', 'ytick'):
    rc(tick_axis, labelsize=labelsize)
    rc(f'{tick_axis}.major', width=linewidth)
    # minor ticks are drawn 0.2 thinner than major ticks
    rc(f'{tick_axis}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters; the "# Parameters" cell below reassigns every one of them.
# chrom: chromosome name, used to filter the loaded tables and as the output directory name.
chrom = 'chr2'
# genes: identifiers handed one by one to get_annotation().
# NOTE(review): get_gene() looks genes up in gene_meta, which is indexed by 'gene_id', and the
# parameters cell below passes versioned Ensembl IDs, so a symbol such as 'Lhx6' presumably
# raises KeyError — confirm.
genes = ['Lhx6']
# slop: flank added on each side of a gene when selecting DMRs (presumably base pairs — confirm).
slop = 250000
# n_pc: number of principal components passed to sc.pp.neighbors.
n_pc = 10
# resolution: resolution passed to sc.tl.leiden.
resolution = 1

In [4]:
# Parameters
# These assignments replace the default values set in the previous cell.
# NOTE(review): looks like a papermill-style injected parameters cell — confirm before editing by hand.
chrom = "chr6"
genes = ["ENSMUSG00000030067.17", "ENSMUSG00000045095.16", "ENSMUSG00000030068.4", "ENSMUSG00000030310.10", "ENSMUSG00000059187.12", "ENSMUSG00000030283.7", "ENSMUSG00000030222.13", "ENSMUSG00000056755.13", "ENSMUSG00000015766.14", "ENSMUSG00000029757.16", "ENSMUSG00000063568.11", "ENSMUSG00000029778.12", "ENSMUSG00000107476.2", "ENSMUSG00000030315.15", "ENSMUSG00000030223.14", "ENSMUSG00000029838.11", "ENSMUSG00000046178.3", "ENSMUSG00000034312.14", "ENSMUSG00000029821.15", "ENSMUSG00000004633.17", "ENSMUSG00000030084.11", "ENSMUSG00000037984.9", "ENSMUSG00000049939.6", "ENSMUSG00000029765.12", "ENSMUSG00000036402.13", "ENSMUSG00000030102.11", "ENSMUSG00000029822.15", "ENSMUSG00000030225.11", "ENSMUSG00000063063.12", "ENSMUSG00000055403.13", "ENSMUSG00000033174.17", "ENSMUSG00000037973.6", "ENSMUSG00000035357.16", "ENSMUSG00000030268.17", "ENSMUSG00000061436.15", "ENSMUSG00000030123.15", "ENSMUSG00000030020.13", "ENSMUSG00000038665.15", "ENSMUSG00000017978.18", "ENSMUSG00000029769.16", "ENSMUSG00000040797.16", "ENSMUSG00000030226.12", "ENSMUSG00000038456.9", "ENSMUSG00000045441.5", "ENSMUSG00000045613.9", "ENSMUSG00000042810.14", "ENSMUSG00000030350.8", "ENSMUSG00000030306.14", "ENSMUSG00000030199.16", "ENSMUSG00000090063.6", "ENSMUSG00000016487.15", "ENSMUSG00000025889.13", "ENSMUSG00000030287.15", "ENSMUSG00000000440.12", "ENSMUSG00000029999.14", "ENSMUSG00000068551.12", "ENSMUSG00000107427.1", "ENSMUSG00000039629.14", "ENSMUSG00000058446.14", "ENSMUSG00000038115.16", "ENSMUSG00000035199.6", "ENSMUSG00000030307.8", "ENSMUSG00000057716.6", "ENSMUSG00000041477.14", "ENSMUSG00000030352.15", "ENSMUSG00000030323.13", "ENSMUSG00000086040.8", "ENSMUSG00000052581.13", "ENSMUSG00000033420.12", "ENSMUSG00000092090.2", "ENSMUSG00000038077.7", "ENSMUSG00000030257.16", "ENSMUSG00000029925.13", "ENSMUSG00000041741.10", "ENSMUSG00000107623.1", "ENSMUSG00000035245.12", "ENSMUSG00000029869.7", "ENSMUSG00000096630.2", "ENSMUSG00000097816.1", "ENSMUSG00000009394.13", 
"ENSMUSG00000030231.11", "ENSMUSG00000030092.14", "ENSMUSG00000033720.12", "ENSMUSG00000024211.15", "ENSMUSG00000107902.1", "ENSMUSG00000064330.9", "ENSMUSG00000030316.13", "ENSMUSG00000030091.17", "ENSMUSG00000038388.14", "ENSMUSG00000030029.14", "ENSMUSG00000030235.17", "ENSMUSG00000063455.16", "ENSMUSG00000029913.14", "ENSMUSG00000025702.15", "ENSMUSG00000107563.2", "ENSMUSG00000053004.9", "ENSMUSG00000035378.17", "ENSMUSG00000030249.15", "ENSMUSG00000030101.15", "ENSMUSG00000029797.13", "ENSMUSG00000051331.15", "ENSMUSG00000001930.17", "ENSMUSG00000108276.1", "ENSMUSG00000030093.7", "ENSMUSG00000086804.8", "ENSMUSG00000072679.10", "ENSMUSG00000034037.14", "ENSMUSG00000030086.16", "ENSMUSG00000064293.14", "ENSMUSG00000034648.9", "ENSMUSG00000064080.12", "ENSMUSG00000072683.3", "ENSMUSG00000030022.14", "ENSMUSG00000037788.14", "ENSMUSG00000108228.2", "ENSMUSG00000040112.9", "ENSMUSG00000090667.2", "ENSMUSG00000035158.15", "ENSMUSG00000030228.18", "ENSMUSG00000029804.17", "ENSMUSG00000097462.7", "ENSMUSG00000029638.17", "ENSMUSG00000087380.7", "ENSMUSG00000093661.1", "ENSMUSG00000029790.16", "ENSMUSG00000001520.12", "ENSMUSG00000107614.1", "ENSMUSG00000029761.16", "ENSMUSG00000029687.16", "ENSMUSG00000061414.8", "ENSMUSG00000038648.5", "ENSMUSG00000097596.1", "ENSMUSG00000033216.9", "ENSMUSG00000051343.11", "ENSMUSG00000108063.1", "ENSMUSG00000056215.13", "ENSMUSG00000029735.17", "ENSMUSG00000086946.7", "ENSMUSG00000054966.13", "ENSMUSG00000072872.3", "ENSMUSG00000092004.1", "ENSMUSG00000087379.1", "ENSMUSG00000030351.5", "ENSMUSG00000038871.5", "ENSMUSG00000042770.8", "ENSMUSG00000048636.7", "ENSMUSG00000029776.11", "ENSMUSG00000087341.4", "ENSMUSG00000106737.1", "ENSMUSG00000097166.1", "ENSMUSG00000042599.8", "ENSMUSG00000093385.3", "ENSMUSG00000038022.11", "ENSMUSG00000086720.1", "ENSMUSG00000044471.12", "ENSMUSG00000051586.16", "ENSMUSG00000030043.11", "ENSMUSG00000056091.12", "ENSMUSG00000107638.2", "ENSMUSG00000097834.2", "ENSMUSG00000029553.10", 
"ENSMUSG00000097616.7", "ENSMUSG00000029672.16", "ENSMUSG00000038301.15", "ENSMUSG00000052955.5", "ENSMUSG00000040187.15", "ENSMUSG00000030270.11", "ENSMUSG00000038065.13", "ENSMUSG00000085416.1", "ENSMUSG00000059659.7", "ENSMUSG00000072780.3", "ENSMUSG00000107705.1", "ENSMUSG00000010797.6", "ENSMUSG00000029837.2", "ENSMUSG00000030047.14", "ENSMUSG00000107666.1", "ENSMUSG00000001630.13", "ENSMUSG00000037997.12", "ENSMUSG00000073067.10", "ENSMUSG00000030203.17", "ENSMUSG00000041460.14", "ENSMUSG00000029534.17", "ENSMUSG00000029780.14", "ENSMUSG00000086887.1", "ENSMUSG00000055003.14", "ENSMUSG00000038538.17", "ENSMUSG00000039904.9", "ENSMUSG00000063975.13", "ENSMUSG00000068794.7", "ENSMUSG00000071226.11", "ENSMUSG00000029772.17", "ENSMUSG00000097484.1", "ENSMUSG00000087231.7", "ENSMUSG00000030254.16", "ENSMUSG00000029916.11", "ENSMUSG00000097563.1", "ENSMUSG00000030220.13", "ENSMUSG00000107667.2", "ENSMUSG00000045100.11", "ENSMUSG00000087083.2", "ENSMUSG00000000416.16", "ENSMUSG00000107570.1", "ENSMUSG00000089862.8", "ENSMUSG00000000627.15", "ENSMUSG00000073045.9", "ENSMUSG00000032899.14", "ENSMUSG00000030209.14", "ENSMUSG00000009376.15", "ENSMUSG00000038784.13", "ENSMUSG00000044378.12", "ENSMUSG00000015112.15", "ENSMUSG00000039159.16", "ENSMUSG00000025609.15", "ENSMUSG00000102802.3", "ENSMUSG00000056832.14", "ENSMUSG00000033726.8", "ENSMUSG00000029868.13", "ENSMUSG00000030236.10", "ENSMUSG00000000184.12", "ENSMUSG00000049093.9", "ENSMUSG00000034387.13", "ENSMUSG00000107718.2", "ENSMUSG00000042460.5", "ENSMUSG00000060780.2", "ENSMUSG00000068748.7", "ENSMUSG00000094672.2", "ENSMUSG00000033152.13", "ENSMUSG00000030189.15", "ENSMUSG00000039578.17", "ENSMUSG00000037172.14", "ENSMUSG00000030265.14", "ENSMUSG00000030103.11", "ENSMUSG00000079462.1", "ENSMUSG00000099869.6", "ENSMUSG00000046192.4", "ENSMUSG00000030087.11", "ENSMUSG00000055799.13", "ENSMUSG00000007827.10", "ENSMUSG00000086762.1", "ENSMUSG00000107875.1", "ENSMUSG00000030345.16", "ENSMUSG00000033788.15", 
"ENSMUSG00000001376.17", "ENSMUSG00000057604.9", "ENSMUSG00000060477.14", "ENSMUSG00000107458.1", "ENSMUSG00000093675.7", "ENSMUSG00000086282.1", "ENSMUSG00000046764.8", "ENSMUSG00000029685.15", "ENSMUSG00000030246.11", "ENSMUSG00000102623.1", "ENSMUSG00000023964.15", "ENSMUSG00000030302.16", "ENSMUSG00000107719.1", "ENSMUSG00000093482.8", "ENSMUSG00000059201.12", "ENSMUSG00000108274.1", "ENSMUSG00000030168.13", "ENSMUSG00000048794.14", "ENSMUSG00000034083.16", "ENSMUSG00000058440.14", "ENSMUSG00000034023.16", "ENSMUSG00000044162.12", "ENSMUSG00000030237.14", "ENSMUSG00000108067.1", "ENSMUSG00000107796.1", "ENSMUSG00000034430.16", "ENSMUSG00000034063.8", "ENSMUSG00000030088.15", "ENSMUSG00000030172.15", "ENSMUSG00000108181.1", "ENSMUSG00000093559.1", "ENSMUSG00000103331.1", "ENSMUSG00000107745.1", "ENSMUSG00000044156.14", "ENSMUSG00000108171.1", "ENSMUSG00000102287.1", "ENSMUSG00000041390.18", "ENSMUSG00000052852.8", "ENSMUSG00000107706.1", "ENSMUSG00000107966.1", "ENSMUSG00000034245.10", "ENSMUSG00000091620.3", "ENSMUSG00000005225.15", "ENSMUSG00000038058.14", "ENSMUSG00000085785.7", "ENSMUSG00000030208.15", "ENSMUSG00000030313.15", "ENSMUSG00000059182.7", "ENSMUSG00000030359.14", "ENSMUSG00000107622.1", "ENSMUSG00000085058.2", "ENSMUSG00000030269.14", "ENSMUSG00000002897.5", "ENSMUSG00000025608.9", "ENSMUSG00000039841.14", "ENSMUSG00000001518.12", "ENSMUSG00000005893.14", "ENSMUSG00000107960.1", "ENSMUSG00000030051.10", "ENSMUSG00000007216.6", "ENSMUSG00000068263.11", "ENSMUSG00000107448.1", "ENSMUSG00000086763.1", "ENSMUSG00000091260.1", "ENSMUSG00000030137.8", "ENSMUSG00000107230.1", "ENSMUSG00000086542.1", "ENSMUSG00000072778.3", "ENSMUSG00000041698.11", "ENSMUSG00000030292.10", "ENSMUSG00000029811.14", "ENSMUSG00000030322.12", "ENSMUSG00000030041.9", "ENSMUSG00000076498.2", "ENSMUSG00000030074.9", "ENSMUSG00000042638.14", "ENSMUSG00000043340.3", "ENSMUSG00000044086.8", "ENSMUSG00000068323.12", "ENSMUSG00000063415.12", "ENSMUSG00000079263.8", 
"ENSMUSG00000108081.1", "ENSMUSG00000086212.7", "ENSMUSG00000030108.14", "ENSMUSG00000107653.1", "ENSMUSG00000115343.1", "ENSMUSG00000029924.12", "ENSMUSG00000030201.15", "ENSMUSG00000107494.1", "ENSMUSG00000054556.6", "ENSMUSG00000089069.1", "ENSMUSG00000057230.14", "ENSMUSG00000108060.2", "ENSMUSG00000038759.15", "ENSMUSG00000107419.1", "ENSMUSG00000041372.10", "ENSMUSG00000054488.14", "ENSMUSG00000030353.15", "ENSMUSG00000030272.15", "ENSMUSG00000029788.13", "ENSMUSG00000030317.8", "ENSMUSG00000033182.12", "ENSMUSG00000087317.1", "ENSMUSG00000108022.1", "ENSMUSG00000001763.14", "ENSMUSG00000030170.14", "ENSMUSG00000071553.10", "ENSMUSG00000076327.1", "ENSMUSG00000107634.1", "ENSMUSG00000030207.15", "ENSMUSG00000029766.7", "ENSMUSG00000019124.10", "ENSMUSG00000107714.1", "ENSMUSG00000061762.12", "ENSMUSG00000105646.1", "ENSMUSG00000029862.15", "ENSMUSG00000029782.19", "ENSMUSG00000030111.9", "ENSMUSG00000029683.7", "ENSMUSG00000085238.1", "ENSMUSG00000092134.1", "ENSMUSG00000086675.1", "ENSMUSG00000106905.1", "ENSMUSG00000030319.8", "ENSMUSG00000049037.8", "ENSMUSG00000079598.4", "ENSMUSG00000001642.18", "ENSMUSG00000042607.15", "ENSMUSG00000003452.15", "ENSMUSG00000025701.12", "ENSMUSG00000055027.17", "ENSMUSG00000049112.9", "ENSMUSG00000103586.5", "ENSMUSG00000070995.10", "ENSMUSG00000029661.16", "ENSMUSG00000005364.11", "ENSMUSG00000087204.1", "ENSMUSG00000004446.12", "ENSMUSG00000000248.16", "ENSMUSG00000079264.1", "ENSMUSG00000108077.1", "ENSMUSG00000029552.19", "ENSMUSG00000107512.1", "ENSMUSG00000003500.13", "ENSMUSG00000029759.9", "ENSMUSG00000085264.1", "ENSMUSG00000042447.13", "ENSMUSG00000029686.15", "ENSMUSG00000032652.13", "ENSMUSG00000108141.1", "ENSMUSG00000100166.1", "ENSMUSG00000108005.1", "ENSMUSG00000107538.1", "ENSMUSG00000029629.17", "ENSMUSG00000030000.10", "ENSMUSG00000032641.18", "ENSMUSG00000107799.1", "ENSMUSG00000029814.10", "ENSMUSG00000029676.15", "ENSMUSG00000031668.14", "ENSMUSG00000059908.9", "ENSMUSG00000086855.1", 
"ENSMUSG00000048108.13", "ENSMUSG00000000811.13", "ENSMUSG00000038600.12", "ENSMUSG00000042042.14", "ENSMUSG00000048206.6", "ENSMUSG00000030309.16", "ENSMUSG00000029883.12", "ENSMUSG00000093317.1", "ENSMUSG00000089307.1", "ENSMUSG00000107789.1", "ENSMUSG00000047228.9", "ENSMUSG00000108071.1", "ENSMUSG00000107993.2", "ENSMUSG00000036899.10", "ENSMUSG00000052131.7", "ENSMUSG00000035125.10", "ENSMUSG00000030109.15", "ENSMUSG00000106833.1", "ENSMUSG00000097730.3", "ENSMUSG00000029847.13", "ENSMUSG00000040163.14", "ENSMUSG00000092247.2", "ENSMUSG00000039070.5", "ENSMUSG00000107835.1", "ENSMUSG00000030069.15", "ENSMUSG00000030337.16", "ENSMUSG00000038028.9", "ENSMUSG00000079679.3", "ENSMUSG00000029684.14", "ENSMUSG00000079262.3", "ENSMUSG00000107624.1", "ENSMUSG00000063810.7", "ENSMUSG00000012535.14", "ENSMUSG00000108006.1", "ENSMUSG00000061353.11", "ENSMUSG00000030347.6", "ENSMUSG00000087670.1", "ENSMUSG00000029860.16", "ENSMUSG00000089997.2", "ENSMUSG00000040234.16", "ENSMUSG00000087302.1", "ENSMUSG00000030321.15", "ENSMUSG00000107408.1", "ENSMUSG00000029781.7", "ENSMUSG00000107595.1", "ENSMUSG00000107396.2", "ENSMUSG00000091438.1", "ENSMUSG00000047203.3", "ENSMUSG00000108035.1", "ENSMUSG00000038836.15", "ENSMUSG00000098846.1", "ENSMUSG00000108011.1", "ENSMUSG00000030344.11", "ENSMUSG00000030079.15", "ENSMUSG00000001157.13", "ENSMUSG00000088867.1", "ENSMUSG00000097364.1", "ENSMUSG00000107103.1", "ENSMUSG00000014542.3", "ENSMUSG00000054435.16", "ENSMUSG00000107795.1", "ENSMUSG00000108184.1", "ENSMUSG00000097354.7", "ENSMUSG00000067825.11", "ENSMUSG00000071494.5", "ENSMUSG00000108111.1", "ENSMUSG00000098285.1", "ENSMUSG00000034456.15", "ENSMUSG00000072878.4", "ENSMUSG00000098674.1"]
# The remaining parameters repeat the default values.
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# Results are written to a directory named after the chromosome, relative to the working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True lets the notebook be re-run without failing on an existing directory.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# Per-cell clustering table; only its 'CellClass' and 'SubType' columns are read here.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types of the 'Exc' and 'Inh' cell classes, without outlier groups,
# with spaces turned into underscores.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
neuron_subtypes = cell_tidy_data.loc[neuron_mask, 'SubType'].unique()
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in neuron_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
# ATAC peak table; keep only the rows whose label starts with this chromosome's 'Sub<chrom>_' prefix.
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(f'Sub{chrom}_')].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to the chromosome being processed.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
).loc[lambda table: table['chrom'] == chrom].copy()

In [9]:
# Exon intervals read from a headerless, tab-separated BED file.
# NOTE(review): exon_bed is not referenced by any other cell visible here — confirm it is still needed.
exon_bed = pd.read_csv('/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
                       header=None, sep='\t')
# Assigning the names afterwards raises if the file does not have exactly five columns.
exon_bed.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']

## DMR Info

In [10]:
# DMR rate table: rows are DMRs, kept only when their label carries this chromosome's prefix.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.str.startswith(f'Sub{chrom}_')].copy()

# DMR-gene correlation records, indexed by (DMR, Gene) and limited to the DMRs kept above.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
corr_dmr_kept = dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)
dmr_corr = dmr_corr.loc[corr_dmr_kept].copy()

# DMR coordinates: the fourth BED column becomes the index, the first three the interval.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hit matrix, subset first to the kept DMRs and then to the selected clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix for the kept DMRs.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of rows in dmr_hits, i.e. one per entry of dmr_rate.index after the chromosome filter.
dmr_hits.shape[0]

269856

In [12]:
def get_gene(gene_id):
    """Fetch one gene's record from ``gene_meta``.

    Returns the tuple ``(gene_id, chrom, start, end, strand)``, where the first
    element is the row label and the rest are read from the row's columns.
    """
    record = gene_meta.loc[gene_id]
    fields = ('chrom', 'start', 'end', 'strand')
    return (record.name,) + tuple(record[field] for field in fields)

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their rate profile across ``use_clusters``.

    DMRs from ``dmr_bed`` that lie entirely inside the window
    ``(gene_start - slop, gene_end + slop)`` are selected. Their rows of
    ``dmr_rate`` (columns ``use_clusters``) have missing values replaced by the
    column mean, are scaled, reduced by PCA and grouped with Leiden clustering
    on a neighbor graph built from ``n_pc`` principal components.

    Parameters
    ----------
    gene_id
        Row label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: one row per selected DMR, holding the ``leiden`` label.

    Raises
    ------
    ValueError
        If no DMR falls inside the window.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must be strictly inside the slop-extended gene
    in_window = (dmr_bed['start'] > gene_start - slop) & (dmr_bed['end'] < gene_end + slop)
    related_dmr = dmr_bed[in_window]
    if related_dmr.empty:
        # fail early with a readable message; np.log2(0) below is -inf and cannot become an int
        raise ValueError(f'No DMR found within slop={slop} of gene {gene_id}')

    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    # replace missing rates with the mean of their column
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())

    # construct Adata: DMRs are observations, columns of the rate table are variables
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame(index=related_dmr_rate.index),
                            var=pd.DataFrame(index=related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden labels returned by ``calculate_gene`` and adds, for
    every DMR around the gene:

    * ``Corr`` - value from ``dmr_corr`` for this gene, 0 when there is no record;
    * ``reldist_tss`` / ``in_gene_body`` - strand-aware position of the DMR
      center, expressed as a fraction of the gene length;
    * ``is_*_te``, ``in_intron`` / ``in_exon`` / ``in_utr3`` / ``in_utr5``,
      ``feDMR``, ``adultDMR`` - boolean flags derived from ``dmr_annot``;
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>``, ``ATACPeak.<cluster>`` -
      per-cluster columns taken from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Row label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, all of the columns above concatenated side by side.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # correlation records of this gene, re-indexed by DMR only
    gene_mask = dmr_corr.index.get_level_values('Gene') == gene_id
    this_corr = dmr_corr.loc[gene_mask, 'Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # DMR center relative to the gene: 0 at the strand-aware gene start, 1 at the gene end
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the TE groups are addressed by column position in dmr_annot, so these
    # ranges silently break if the annotation columns are reordered — confirm against the file.
    te_column_ranges = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_range in te_column_ranges:
        te_columns = annot_df.columns[column_range]
        # flagged when any column of the group is non-zero for the DMR
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: annotation flag AND center inside the gene body
    for flag_name, annot_column in (('in_intron', 'intron'), ('in_exon', 'exon'),
                                    ('in_utr3', 'UTR3'), ('in_utr5', 'UTR5')):
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names,
                           columns=this_hypo_hits.var_names).add_prefix('HypoDMR.')

    # DMR rate, rows ordered by Leiden label, missing values replaced by the column mean
    dmr_order = gene_cluster['leiden'].sort_values().index
    related_dmr_rate = dmr_rate.loc[dmr_order, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())

    # atac peak (looked up with the un-prefixed cluster names, prefixed afterwards)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].add_prefix('ATACPeak.')
    related_dmr_rate = related_dmr_rate.add_prefix('DMRRate.')

    dmr_annotation = pd.concat([gene_cluster, hits_df, related_dmr_rate, atac_peak_df],
                               axis=1, sort=True)
    return dmr_annotation

In [15]:
for gene in genes:
    print(gene)
    # The cluster-level file is written last, so its presence marks a gene as finished.
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # per-DMR table
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # mean of the per-DMR table within each Leiden cluster
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000030067.17
ENSMUSG00000045095.16
ENSMUSG00000030068.4
ENSMUSG00000030310.10
ENSMUSG00000059187.12
ENSMUSG00000030283.7
ENSMUSG00000030222.13
ENSMUSG00000056755.13
ENSMUSG00000015766.14
ENSMUSG00000029757.16
ENSMUSG00000063568.11
ENSMUSG00000029778.12
ENSMUSG00000107476.2
ENSMUSG00000030315.15
ENSMUSG00000030223.14
ENSMUSG00000029838.11
ENSMUSG00000046178.3
ENSMUSG00000034312.14
ENSMUSG00000029821.15
ENSMUSG00000004633.17
ENSMUSG00000030084.11
ENSMUSG00000037984.9
ENSMUSG00000049939.6
ENSMUSG00000029765.12
ENSMUSG00000036402.13
ENSMUSG00000030102.11
ENSMUSG00000029822.15
ENSMUSG00000030225.11
ENSMUSG00000063063.12
ENSMUSG00000055403.13
ENSMUSG00000033174.17
ENSMUSG00000037973.6
ENSMUSG00000035357.16
ENSMUSG00000030268.17
ENSMUSG00000061436.15
ENSMUSG00000030123.15
ENSMUSG00000030020.13
ENSMUSG00000038665.15
ENSMUSG00000017978.18
ENSMUSG00000029769.16
ENSMUSG00000040797.16
ENSMUSG00000030226.12
ENSMUSG00000038456.9
ENSMUSG00000045441.5
ENSMUSG00000045613.9
ENSMUSG00000042810.1

ENSMUSG00000000248.16
ENSMUSG00000079264.1
ENSMUSG00000108077.1
ENSMUSG00000029552.19
ENSMUSG00000107512.1
ENSMUSG00000003500.13
ENSMUSG00000029759.9
ENSMUSG00000085264.1
ENSMUSG00000042447.13
ENSMUSG00000029686.15
ENSMUSG00000032652.13
ENSMUSG00000108141.1
ENSMUSG00000100166.1
ENSMUSG00000108005.1
ENSMUSG00000107538.1
ENSMUSG00000029629.17
ENSMUSG00000030000.10
ENSMUSG00000032641.18
ENSMUSG00000107799.1
ENSMUSG00000029814.10
ENSMUSG00000029676.15
ENSMUSG00000031668.14
ENSMUSG00000059908.9
ENSMUSG00000086855.1
ENSMUSG00000048108.13
ENSMUSG00000000811.13
ENSMUSG00000038600.12
ENSMUSG00000042042.14
ENSMUSG00000048206.6
ENSMUSG00000030309.16
ENSMUSG00000029883.12
ENSMUSG00000093317.1
ENSMUSG00000089307.1
ENSMUSG00000107789.1
ENSMUSG00000047228.9
ENSMUSG00000108071.1
ENSMUSG00000107993.2
ENSMUSG00000036899.10
ENSMUSG00000052131.7
ENSMUSG00000035125.10
ENSMUSG00000030109.15
ENSMUSG00000106833.1
ENSMUSG00000097730.3
ENSMUSG00000029847.13
ENSMUSG00000040163.14
ENSMUSG00000092247.2
ENSMUSG0000