In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from the
# libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Shared sizes for the matplotlib defaults configured below.
labelsize = 6
linewidth = 0.6

# Each entry is (rc group, keyword options); minor ticks are 0.2 thinner than
# the common line width used everywhere else.
for _rc_group, _rc_options in [
    ('lines', {'linewidth': linewidth}),
    ('axes', {'labelsize': labelsize, 'linewidth': linewidth}),
    ('xtick', {'labelsize': labelsize}),
    ('ytick', {'labelsize': labelsize}),
    ('xtick.major', {'width': linewidth}),
    ('ytick.major', {'width': linewidth}),
    ('xtick.minor', {'width': linewidth - 0.2}),
    ('ytick.minor', {'width': linewidth - 0.2}),
]:
    rc(_rc_group, **_rc_options)
# Do not leave the loop helpers behind in the notebook namespace.
del _rc_group, _rc_options

In [3]:
# Default run parameters.
# NOTE(review): the "# Parameters" cell that follows assigns the same five
# names again, so these defaults only apply when that cell is absent.
chrom = 'chr2'  # chromosome used to subset the gene, DMR and ATAC tables
# Genes to process. NOTE(review): get_gene() looks each entry up in the
# gene_id index of gene_meta, so a gene symbol such as this default may not
# resolve; confirm.
genes = ['Lhx6']
slop = 250000  # margin added on both sides of a gene when selecting DMRs (presumably bp; confirm)
n_pc = 10  # number of principal components passed to sc.pp.neighbors
resolution = 1  # resolution passed to sc.tl.leiden

In [4]:
# Parameters
chrom = "chr9"
genes = ["ENSMUSG00000066687.5", "ENSMUSG00000049556.5", "ENSMUSG00000074415.14", "ENSMUSG00000042757.16", "ENSMUSG00000032020.15", "ENSMUSG00000090626.9", "ENSMUSG00000110996.1", "ENSMUSG00000032418.15", "ENSMUSG00000013076.17", "ENSMUSG00000032352.16", "ENSMUSG00000034275.18", "ENSMUSG00000032009.8", "ENSMUSG00000041794.13", "ENSMUSG00000111593.1", "ENSMUSG00000111462.2", "ENSMUSG00000035934.16", "ENSMUSG00000032500.10", "ENSMUSG00000032024.10", "ENSMUSG00000074505.5", "ENSMUSG00000032087.10", "ENSMUSG00000032172.8", "ENSMUSG00000110408.1", "ENSMUSG00000110741.1", "ENSMUSG00000037410.13", "ENSMUSG00000111022.1", "ENSMUSG00000110876.1", "ENSMUSG00000032503.18", "ENSMUSG00000025892.16", "ENSMUSG00000045994.17", "ENSMUSG00000045414.7", "ENSMUSG00000010066.15", "ENSMUSG00000032289.15", "ENSMUSG00000032452.12", "ENSMUSG00000032017.14", "ENSMUSG00000032238.17", "ENSMUSG00000052698.15", "ENSMUSG00000043987.18", "ENSMUSG00000053199.13", "ENSMUSG00000049313.8", "ENSMUSG00000034135.15", "ENSMUSG00000110784.1", "ENSMUSG00000032224.15", "ENSMUSG00000032012.8", "ENSMUSG00000111235.1", "ENSMUSG00000032313.11", "ENSMUSG00000032202.11", "ENSMUSG00000034898.16", "ENSMUSG00000111278.1", "ENSMUSG00000097870.2", "ENSMUSG00000031990.15", "ENSMUSG00000050730.17", "ENSMUSG00000001942.8", "ENSMUSG00000032036.15", "ENSMUSG00000032040.15", "ENSMUSG00000038119.15", "ENSMUSG00000085168.2", "ENSMUSG00000039542.16", "ENSMUSG00000032280.16", "ENSMUSG00000042487.6", "ENSMUSG00000032437.10", "ENSMUSG00000031928.15", "ENSMUSG00000070287.3", "ENSMUSG00000035032.12", "ENSMUSG00000037674.15", "ENSMUSG00000032373.15", "ENSMUSG00000032412.8", "ENSMUSG00000031963.8", "ENSMUSG00000111066.1", "ENSMUSG00000039313.14", "ENSMUSG00000032252.14", "ENSMUSG00000032221.14", "ENSMUSG00000103247.1", "ENSMUSG00000032530.14", "ENSMUSG00000111532.1", "ENSMUSG00000009828.15", "ENSMUSG00000043067.15", "ENSMUSG00000058444.8", "ENSMUSG00000058173.12", "ENSMUSG00000032228.16", "ENSMUSG00000110862.1", 
"ENSMUSG00000032443.16", "ENSMUSG00000032327.14", "ENSMUSG00000099616.1", "ENSMUSG00000062296.8", "ENSMUSG00000032259.8", "ENSMUSG00000034910.5", "ENSMUSG00000074345.4", "ENSMUSG00000032006.15", "ENSMUSG00000031934.14", "ENSMUSG00000032528.5", "ENSMUSG00000099722.3", "ENSMUSG00000032207.10", "ENSMUSG00000032340.8", "ENSMUSG00000042688.16", "ENSMUSG00000057710.7", "ENSMUSG00000032479.15", "ENSMUSG00000035164.8", "ENSMUSG00000059495.14", "ENSMUSG00000032816.15", "ENSMUSG00000053310.11", "ENSMUSG00000032462.14", "ENSMUSG00000109844.2", "ENSMUSG00000047220.5", "ENSMUSG00000079564.3", "ENSMUSG00000118396.1", "ENSMUSG00000037206.15", "ENSMUSG00000058587.9", "ENSMUSG00000032314.14", "ENSMUSG00000032338.9", "ENSMUSG00000032297.11", "ENSMUSG00000037419.9", "ENSMUSG00000043659.11", "ENSMUSG00000111520.1", "ENSMUSG00000032411.15", "ENSMUSG00000037112.16", "ENSMUSG00000032267.8", "ENSMUSG00000091345.9", "ENSMUSG00000032911.6", "ENSMUSG00000023345.17", "ENSMUSG00000053641.9", "ENSMUSG00000032394.6", "ENSMUSG00000032532.7", "ENSMUSG00000046997.5", "ENSMUSG00000032232.14", "ENSMUSG00000001943.8", "ENSMUSG00000032220.10", "ENSMUSG00000032350.9", "ENSMUSG00000042496.18", "ENSMUSG00000032268.13", "ENSMUSG00000038264.8", "ENSMUSG00000023192.12", "ENSMUSG00000111846.1", "ENSMUSG00000032527.13", "ENSMUSG00000038708.10", "ENSMUSG00000040219.4", "ENSMUSG00000056880.12", "ENSMUSG00000091816.1", "ENSMUSG00000041361.13", "ENSMUSG00000041729.15", "ENSMUSG00000036943.9", "ENSMUSG00000032547.12", "ENSMUSG00000111143.1", "ENSMUSG00000037716.15", "ENSMUSG00000037801.13", "ENSMUSG00000086360.1", "ENSMUSG00000032356.12", "ENSMUSG00000041986.16", "ENSMUSG00000032402.12", "ENSMUSG00000050627.13", "ENSMUSG00000098875.1", "ENSMUSG00000086539.8", "ENSMUSG00000032531.15", "ENSMUSG00000032058.14", "ENSMUSG00000032482.9", "ENSMUSG00000109032.1", "ENSMUSG00000032497.18", "ENSMUSG00000097655.3", "ENSMUSG00000111511.1", "ENSMUSG00000036912.17", "ENSMUSG00000078307.3", "ENSMUSG00000032360.16", 
"ENSMUSG00000032312.7", "ENSMUSG00000100341.1", "ENSMUSG00000044229.9", "ENSMUSG00000101734.1", "ENSMUSG00000074061.4", "ENSMUSG00000056919.9", "ENSMUSG00000111357.1", "ENSMUSG00000102257.1", "ENSMUSG00000084995.8", "ENSMUSG00000032577.16", "ENSMUSG00000040111.16", "ENSMUSG00000032534.18", "ENSMUSG00000032343.15", "ENSMUSG00000041444.14", "ENSMUSG00000032059.13", "ENSMUSG00000018620.3", "ENSMUSG00000032413.12", "ENSMUSG00000032193.9", "ENSMUSG00000094992.1", "ENSMUSG00000036027.12", "ENSMUSG00000111189.1", "ENSMUSG00000100294.6", "ENSMUSG00000032511.17", "ENSMUSG00000032243.8", "ENSMUSG00000025786.17", "ENSMUSG00000040524.9", "ENSMUSG00000032548.14", "ENSMUSG00000032257.6", "ENSMUSG00000059547.3", "ENSMUSG00000034485.10", "ENSMUSG00000110914.1", "ENSMUSG00000111598.1", "ENSMUSG00000032309.15", "ENSMUSG00000097623.7", "ENSMUSG00000032274.9", "ENSMUSG00000047409.13", "ENSMUSG00000035941.15", "ENSMUSG00000038957.13", "ENSMUSG00000032376.12", "ENSMUSG00000032556.11", "ENSMUSG00000032216.15", "ENSMUSG00000025887.10", "ENSMUSG00000104572.5", "ENSMUSG00000111482.1", "ENSMUSG00000054693.14", "ENSMUSG00000031936.9", "ENSMUSG00000074269.10", "ENSMUSG00000086648.1", "ENSMUSG00000085449.1", "ENSMUSG00000111669.1", "ENSMUSG00000102441.1", "ENSMUSG00000070315.12", "ENSMUSG00000090255.7", "ENSMUSG00000111364.1", "ENSMUSG00000032611.9", "ENSMUSG00000110981.1", "ENSMUSG00000099879.1", "ENSMUSG00000007908.14", "ENSMUSG00000032498.9", "ENSMUSG00000099051.1", "ENSMUSG00000054156.8", "ENSMUSG00000074354.11", "ENSMUSG00000033419.15", "ENSMUSG00000032481.17", "ENSMUSG00000047766.15", "ENSMUSG00000031870.16", "ENSMUSG00000074508.4", "ENSMUSG00000032332.17", "ENSMUSG00000034584.3", "ENSMUSG00000046186.8", "ENSMUSG00000031933.17", "ENSMUSG00000035284.10", "ENSMUSG00000032311.17", "ENSMUSG00000039977.16", "ENSMUSG00000034684.12", "ENSMUSG00000045620.7", "ENSMUSG00000032076.19", "ENSMUSG00000036395.15", "ENSMUSG00000111673.1", "ENSMUSG00000036768.6", "ENSMUSG00000035594.10", 
"ENSMUSG00000010660.3", "ENSMUSG00000032199.13", "ENSMUSG00000032440.13", "ENSMUSG00000048537.16", "ENSMUSG00000032609.12", "ENSMUSG00000032181.7", "ENSMUSG00000032077.5", "ENSMUSG00000097967.1", "ENSMUSG00000040729.8", "ENSMUSG00000053110.13", "ENSMUSG00000104614.5", "ENSMUSG00000111389.1", "ENSMUSG00000032392.11", "ENSMUSG00000047193.16", "ENSMUSG00000111767.1", "ENSMUSG00000110901.1", "ENSMUSG00000102963.1", "ENSMUSG00000087466.3", "ENSMUSG00000062275.7", "ENSMUSG00000033590.8", "ENSMUSG00000074059.3", "ENSMUSG00000110956.1", "ENSMUSG00000066705.7", "ENSMUSG00000111555.1", "ENSMUSG00000032064.17", "ENSMUSG00000099823.1", "ENSMUSG00000089718.1", "ENSMUSG00000032050.17", "ENSMUSG00000033491.13", "ENSMUSG00000046846.4", "ENSMUSG00000061393.14", "ENSMUSG00000025239.3", "ENSMUSG00000104041.1", "ENSMUSG00000034593.16", "ENSMUSG00000098127.1", "ENSMUSG00000111867.1", "ENSMUSG00000036972.14", "ENSMUSG00000032549.7", "ENSMUSG00000070324.6", "ENSMUSG00000032021.14", "ENSMUSG00000111033.1", "ENSMUSG00000032177.17", "ENSMUSG00000032269.8", "ENSMUSG00000110766.1", "ENSMUSG00000032470.17", "ENSMUSG00000034858.16", "ENSMUSG00000015354.8", "ENSMUSG00000039714.9", "ENSMUSG00000098573.1", "ENSMUSG00000032264.9", "ENSMUSG00000111602.1", "ENSMUSG00000033453.8", "ENSMUSG00000103891.1", "ENSMUSG00000074062.5", "ENSMUSG00000074060.10", "ENSMUSG00000037287.15", "ENSMUSG00000050912.15", "ENSMUSG00000066510.5", "ENSMUSG00000089497.1", "ENSMUSG00000086381.1", "ENSMUSG00000105962.1", "ENSMUSG00000111836.1", "ENSMUSG00000045594.10", "ENSMUSG00000111068.1", "ENSMUSG00000032405.10", "ENSMUSG00000053889.5", "ENSMUSG00000032109.15", "ENSMUSG00000091159.1", "ENSMUSG00000053646.13", "ENSMUSG00000032051.9", "ENSMUSG00000110789.1", "ENSMUSG00000111381.1", "ENSMUSG00000032586.9", "ENSMUSG00000034218.16", "ENSMUSG00000061701.11", "ENSMUSG00000101411.1", "ENSMUSG00000102259.1", "ENSMUSG00000111510.2", "ENSMUSG00000051243.14", "ENSMUSG00000111806.1", "ENSMUSG00000110922.1", "ENSMUSG00000111107.1", 
"ENSMUSG00000031996.17", "ENSMUSG00000110747.1", "ENSMUSG00000086236.9", "ENSMUSG00000099724.2", "ENSMUSG00000013584.5", "ENSMUSG00000111758.1", "ENSMUSG00000091425.1", "ENSMUSG00000031994.13", "ENSMUSG00000110727.1", "ENSMUSG00000100807.2", "ENSMUSG00000004098.7", "ENSMUSG00000098191.1", "ENSMUSG00000111222.1", "ENSMUSG00000111002.1", "ENSMUSG00000101648.1", "ENSMUSG00000091908.7", "ENSMUSG00000110925.1", "ENSMUSG00000034533.10", "ENSMUSG00000104120.1", "ENSMUSG00000089709.1", "ENSMUSG00000038112.15", "ENSMUSG00000040188.10", "ENSMUSG00000004661.15", "ENSMUSG00000032300.7", "ENSMUSG00000096202.1", "ENSMUSG00000097651.2", "ENSMUSG00000032357.12", "ENSMUSG00000035024.16", "ENSMUSG00000032013.6", "ENSMUSG00000032015.16", "ENSMUSG00000111199.1", "ENSMUSG00000111117.1", "ENSMUSG00000032374.14", "ENSMUSG00000111780.1", "ENSMUSG00000032035.15", "ENSMUSG00000032449.13", "ENSMUSG00000094985.2", "ENSMUSG00000032329.4", "ENSMUSG00000032477.14", "ENSMUSG00000111871.1", "ENSMUSG00000074465.2", "ENSMUSG00000110923.1", "ENSMUSG00000079355.5", "ENSMUSG00000037257.8", "ENSMUSG00000111874.1", "ENSMUSG00000032584.12", "ENSMUSG00000048534.7", "ENSMUSG00000090150.10", "ENSMUSG00000046480.6", "ENSMUSG00000096986.2", "ENSMUSG00000016087.13", "ENSMUSG00000032561.15", "ENSMUSG00000098685.1", "ENSMUSG00000087456.1", "ENSMUSG00000001833.17", "ENSMUSG00000049526.8", "ENSMUSG00000032323.13", "ENSMUSG00000032086.12", "ENSMUSG00000086624.1", "ENSMUSG00000032382.14", "ENSMUSG00000044976.17", "ENSMUSG00000111334.1", "ENSMUSG00000046167.5", "ENSMUSG00000098627.7", "ENSMUSG00000097720.1", "ENSMUSG00000031932.14", "ENSMUSG00000050641.7", "ENSMUSG00000111570.1", "ENSMUSG00000001946.14", "ENSMUSG00000032271.13", "ENSMUSG00000032258.15", "ENSMUSG00000043943.14", "ENSMUSG00000035382.9", "ENSMUSG00000097524.1", "ENSMUSG00000111503.1", "ENSMUSG00000057818.8", "ENSMUSG00000111748.1", "ENSMUSG00000044244.18", "ENSMUSG00000032179.7", "ENSMUSG00000049723.14", "ENSMUSG00000099543.1", "ENSMUSG00000085253.3", 
"ENSMUSG00000032570.17", "ENSMUSG00000037705.13", "ENSMUSG00000056271.13", "ENSMUSG00000032192.9", "ENSMUSG00000110824.1", "ENSMUSG00000094970.3", "ENSMUSG00000104674.1", "ENSMUSG00000032612.14", "ENSMUSG00000032278.11", "ENSMUSG00000008590.4", "ENSMUSG00000032507.15", "ENSMUSG00000102702.1", "ENSMUSG00000025232.8", "ENSMUSG00000032435.9", "ENSMUSG00000032198.9", "ENSMUSG00000111840.1", "ENSMUSG00000093721.1", "ENSMUSG00000032380.9", "ENSMUSG00000040433.16", "ENSMUSG00000032262.13", "ENSMUSG00000109896.1", "ENSMUSG00000076355.2", "ENSMUSG00000111061.1", "ENSMUSG00000032463.10", "ENSMUSG00000041268.17", "ENSMUSG00000100176.1", "ENSMUSG00000032010.15", "ENSMUSG00000046460.15", "ENSMUSG00000032128.15", "ENSMUSG00000111126.1", "ENSMUSG00000111700.1", "ENSMUSG00000111691.1", "ENSMUSG00000036777.8", "ENSMUSG00000111858.1", "ENSMUSG00000087135.7", "ENSMUSG00000046603.16", "ENSMUSG00000049314.11", "ENSMUSG00000025787.6", "ENSMUSG00000032397.7", "ENSMUSG00000070304.12", "ENSMUSG00000034115.10", "ENSMUSG00000044860.9", "ENSMUSG00000061559.15", "ENSMUSG00000047237.13", "ENSMUSG00000111533.1", "ENSMUSG00000085430.2", "ENSMUSG00000034908.16", "ENSMUSG00000111658.1", "ENSMUSG00000099930.1", "ENSMUSG00000032328.12", "ENSMUSG00000032285.15", "ENSMUSG00000111794.1", "ENSMUSG00000078308.2", "ENSMUSG00000032375.15", "ENSMUSG00000085969.1", "ENSMUSG00000036986.16", "ENSMUSG00000032434.9", "ENSMUSG00000019659.8", "ENSMUSG00000111353.1", "ENSMUSG00000037784.14", "ENSMUSG00000098590.4", "ENSMUSG00000063380.3", "ENSMUSG00000111354.1", "ENSMUSG00000032420.8", "ENSMUSG00000032387.15", "ENSMUSG00000106232.1", "ENSMUSG00000042138.8", "ENSMUSG00000074259.10", "ENSMUSG00000041608.8", "ENSMUSG00000100836.1", "ENSMUSG00000036737.12", "ENSMUSG00000086596.2", "ENSMUSG00000110783.1", "ENSMUSG00000097241.2", "ENSMUSG00000111652.1", "ENSMUSG00000039285.12", "ENSMUSG00000111561.1", "ENSMUSG00000049103.14", "ENSMUSG00000038379.15", "ENSMUSG00000032315.6", "ENSMUSG00000042444.10", 
"ENSMUSG00000032363.15", "ENSMUSG00000109564.1", "ENSMUSG00000032131.16", "ENSMUSG00000031937.7", "ENSMUSG00000095956.2", "ENSMUSG00000111303.1", "ENSMUSG00000032431.6", "ENSMUSG00000037493.6", "ENSMUSG00000102573.1", "ENSMUSG00000111379.1", "ENSMUSG00000031993.7", "ENSMUSG00000087167.1", "ENSMUSG00000041696.14", "ENSMUSG00000087469.1", "ENSMUSG00000074139.8", "ENSMUSG00000111786.1", "ENSMUSG00000032451.6", "ENSMUSG00000110864.1", "ENSMUSG00000032589.14", "ENSMUSG00000099016.1", "ENSMUSG00000032033.11", "ENSMUSG00000042360.12", "ENSMUSG00000035769.9", "ENSMUSG00000004099.16", "ENSMUSG00000111225.1", "ENSMUSG00000074146.9", "ENSMUSG00000003131.7", "ENSMUSG00000110851.1", "ENSMUSG00000032218.7", "ENSMUSG00000011958.17", "ENSMUSG00000042787.15", "ENSMUSG00000060594.6", "ENSMUSG00000023186.14", "ENSMUSG00000042254.14", "ENSMUSG00000101581.6", "ENSMUSG00000032239.10", "ENSMUSG00000118397.1", "ENSMUSG00000033350.7", "ENSMUSG00000050471.17", "ENSMUSG00000096109.2"]
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# Per-gene result files are written into a directory named after the
# chromosome, created relative to the current working directory if missing.
output_dir = pathlib.Path(chrom)
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# Per-cell clustering table.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Cluster names used as columns throughout the notebook: the unique SubType
# values of cells whose CellClass is 'Exc' or 'Inh', skipping any name that
# contains 'Outlier', with spaces replaced by underscores.
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in cell_tidy_data.loc[cell_tidy_data['CellClass'].isin(['Exc', 'Inh']), 'SubType'].unique()
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
# ATAC peak table, limited to the rows whose index label starts with
# 'Sub<chrom>_' for the selected chromosome.
atac_peak = (
    pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
    .loc[lambda peaks: peaks.index.str.startswith(f'Sub{chrom}_')]
    .copy()
)

## Gene Info

In [8]:
# Gene annotation table indexed by gene_id, limited to the selected chromosome.
gene_meta = (
    pd.read_csv(
        '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
        sep='\t',
        index_col='gene_id',
    )
    .loc[lambda genes_table: genes_table['chrom'] == chrom]
    .copy()
)

In [9]:
# Exon intervals from a header-less, tab-separated BED file with five columns.
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'gene_id', 'gene_name'],
)

## DMR Info

In [10]:
# Per-DMR rate table ('Rate' key of the HDF store), limited to DMRs whose ID
# starts with 'Sub<chrom>_'. Its index defines the DMR set used by every
# object loaded below.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.str.startswith(f'Sub{chrom}_')].copy()

# DMR-gene correlation records, indexed by (DMR, Gene) and limited to the
# DMRs kept in dmr_rate.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[
    lambda corr: corr.index.get_level_values('DMR').isin(dmr_rate.index)
].copy()

# DMR coordinates: header-less BED file whose fourth column (position 3)
# becomes the index; only rows on the selected chromosome are kept.
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    header=None,
    index_col=3,
    sep='\t',
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hits: rows restricted to the kept DMRs, then columns restricted to
# the clusters in use_clusters.
dmr_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad'
)
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix, rows restricted to the kept DMRs.
dmr_annot = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad'
)
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of rows (DMRs) left in dmr_hits after subsetting.
dmr_hits.n_obs

241767

In [12]:
def get_gene(gene_id):
    """Look up one gene in the module-level ``gene_meta`` table.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)`` where the first item is the
        row's index label and the rest are read from the columns of the
        same name.
    """
    record = gene_meta.loc[gene_id]
    return (record.name,) + tuple(record[field] for field in ('chrom', 'start', 'end', 'strand'))

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their rate profile across clusters.

    DMRs from ``dmr_bed`` whose start is greater than ``gene_start - slop``
    and whose end is less than ``gene_end + slop`` are selected. Their rows of
    ``dmr_rate`` (columns ``use_clusters``) are scaled with ``sc.pp.scale``,
    reduced with ``sc.pp.pca`` and clustered with ``sc.tl.leiden`` on a
    neighbor graph built from the first ``n_pc`` components.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: one row per selected DMR, carrying the ``leiden``
        column written by ``sc.tl.leiden``.

    Raises
    ------
    ValueError
        If no DMR lies inside the window, so there is nothing to cluster.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must lie strictly inside the slop window
    in_window = (dmr_bed['start'] > gene_start - slop) & (dmr_bed['end'] < gene_end + slop)
    related_dmr = dmr_bed[in_window]
    if related_dmr.empty:
        # Fail with a readable message instead of an obscure error from the
        # scaling / PCA / log2 steps below.
        raise ValueError(
            f'No DMR lies within {slop} of gene {gene_id!r} on {chrom}; nothing to cluster.'
        )

    # Missing rates are replaced by the mean of the same cluster column.
    # NOTE(review): a column that is entirely NaN has a NaN mean and stays NaN.
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: observations are DMRs, variables are clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # The neighborhood size grows with log2 of the number of DMRs.
    # NOTE(review): this rounds to fewer than 2 neighbors when only one or two
    # DMRs are selected; confirm such genes cannot occur.
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden assignment returned by ``calculate_gene`` and adds,
    for every DMR in it:

    * ``Corr``: value from ``dmr_corr`` for this gene (0 when there is none);
    * ``reldist_tss`` / ``in_gene_body``: strand-aware position of the DMR
      center relative to the gene, in units of gene length;
    * ``is_*_te`` flags, ``in_intron`` / ``in_exon`` / ``in_utr3`` /
      ``in_utr5`` flags, and ``feDMR`` / ``adultDMR`` flags derived from
      ``dmr_annot``;
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>``
      columns taken from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, with all columns above concatenated along axis 1.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # Correlation of each DMR with this gene; DMRs without a record get 0.
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # Distance of the DMR center from the strand-appropriate gene end, in gene
    # lengths: 0 at gene_start ('+') or gene_end (otherwise), 1 at the far end.
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    # Dense copy of the annotation matrix for the selected DMRs
    # (todense() is called, so X is assumed to be a scipy sparse matrix).
    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols: a DMR is flagged when any column in the range is non-zero.
    # NOTE(review): the ranges are positional; presumably they cover the DNA /
    # LINE / LTR / SINE annotation columns. Verify against dmr_annot.var_names.
    te_column_ranges = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_range in te_column_ranges:
        te_columns = annot_df.columns[column_range]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the annotation only counts when the
    # DMR also lies inside this gene's body.
    gene_features = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_features:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by Leiden cluster; missing values are replaced
    # by the mean of the same cluster column.
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak: columns are renamed before the frame is collected, so the
    # prefix does not depend on mutating an object already stored in the list.
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [15]:
# Annotate the DMRs of every requested gene and write two files per gene.
# A gene is skipped when its cluster-level file already exists.
for gene in genes:
    print(gene)
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # Per-DMR table.
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # Per-Leiden-cluster column means, written to the path checked above.
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000066687.5
ENSMUSG00000049556.5
ENSMUSG00000074415.14
ENSMUSG00000042757.16
ENSMUSG00000032020.15
ENSMUSG00000090626.9
ENSMUSG00000110996.1
ENSMUSG00000032418.15
ENSMUSG00000013076.17
ENSMUSG00000032352.16
ENSMUSG00000034275.18
ENSMUSG00000032009.8
ENSMUSG00000041794.13
ENSMUSG00000111593.1
ENSMUSG00000111462.2
ENSMUSG00000035934.16
ENSMUSG00000032500.10
ENSMUSG00000032024.10
ENSMUSG00000074505.5
ENSMUSG00000032087.10
ENSMUSG00000032172.8
ENSMUSG00000110408.1
ENSMUSG00000110741.1
ENSMUSG00000037410.13
ENSMUSG00000111022.1
ENSMUSG00000110876.1
ENSMUSG00000032503.18
ENSMUSG00000025892.16
ENSMUSG00000045994.17
ENSMUSG00000045414.7
ENSMUSG00000010066.15
ENSMUSG00000032289.15
ENSMUSG00000032452.12
ENSMUSG00000032017.14
ENSMUSG00000032238.17
ENSMUSG00000052698.15
ENSMUSG00000043987.18
ENSMUSG00000053199.13
ENSMUSG00000049313.8
ENSMUSG00000034135.15
ENSMUSG00000110784.1
ENSMUSG00000032224.15
ENSMUSG00000032012.8
ENSMUSG00000111235.1
ENSMUSG00000032313.11
ENSMUSG00000032202.11
ENSMUS

ENSMUSG00000086624.1
ENSMUSG00000032382.14
ENSMUSG00000044976.17
ENSMUSG00000111334.1
ENSMUSG00000046167.5
ENSMUSG00000098627.7
ENSMUSG00000097720.1
ENSMUSG00000031932.14
ENSMUSG00000050641.7
ENSMUSG00000111570.1
ENSMUSG00000001946.14
ENSMUSG00000032271.13
ENSMUSG00000032258.15
ENSMUSG00000043943.14
ENSMUSG00000035382.9
ENSMUSG00000097524.1
ENSMUSG00000111503.1
ENSMUSG00000057818.8
ENSMUSG00000111748.1
ENSMUSG00000044244.18
ENSMUSG00000032179.7
ENSMUSG00000049723.14
ENSMUSG00000099543.1
ENSMUSG00000085253.3
ENSMUSG00000032570.17
ENSMUSG00000037705.13
ENSMUSG00000056271.13
ENSMUSG00000032192.9
ENSMUSG00000110824.1
ENSMUSG00000094970.3
ENSMUSG00000104674.1
ENSMUSG00000032612.14
ENSMUSG00000032278.11
ENSMUSG00000008590.4
ENSMUSG00000032507.15
ENSMUSG00000102702.1
ENSMUSG00000025232.8
ENSMUSG00000032435.9
ENSMUSG00000032198.9
ENSMUSG00000111840.1
ENSMUSG00000093721.1
ENSMUSG00000032380.9
ENSMUSG00000040433.16
ENSMUSG00000032262.13
ENSMUSG00000109896.1
ENSMUSG00000076355.2
ENSMUSG0000011106