In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc
# Figure styling shared by the whole notebook: small labels and thin strokes.
labelsize = 6
linewidth = 0.6
rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
# rc() accepts a tuple of group names, so both axes are set with one call each.
rc(('xtick', 'ytick'), labelsize=labelsize)
rc(('xtick.major', 'ytick.major'), width=linewidth)
# Minor ticks are drawn slightly thinner than major ticks.
rc(('xtick.minor', 'ytick.minor'), width=linewidth - 0.2)

In [3]:
# Default parameters. The "# Parameters" cell that follows re-assigns every one
# of these names, so the values here only apply when that cell is absent.
# NOTE(review): the default gene is a symbol, while gene_meta is indexed by
# gene_id and looked up with gene_meta.loc[...] — confirm the defaults still run.
chrom = 'chr2'  # chromosome used to filter genes, DMRs and ATAC peaks
genes = ['Lhx6']  # genes to process; each one gets its own pair of output files
slop = 250000  # flank added on each side of the gene span when selecting DMRs (BED coordinate units)
n_pc = 10  # number of principal components given to the neighbor graph
resolution = 1  # resolution passed to Leiden clustering

In [4]:
# Parameters
chrom = "chr4"
genes = ["ENSMUSG00000001985.9", "ENSMUSG00000028559.17", "ENSMUSG00000008575.17", "ENSMUSG00000008489.18", "ENSMUSG00000028399.18", "ENSMUSG00000028664.14", "ENSMUSG00000044813.15", "ENSMUSG00000028519.16", "ENSMUSG00000040972.8", "ENSMUSG00000028546.17", "ENSMUSG00000040387.16", "ENSMUSG00000014592.20", "ENSMUSG00000028804.20", "ENSMUSG00000061887.14", "ENSMUSG00000028906.16", "ENSMUSG00000090053.9", "ENSMUSG00000060206.11", "ENSMUSG00000040270.16", "ENSMUSG00000039809.10", "ENSMUSG00000039410.16", "ENSMUSG00000087413.1", "ENSMUSG00000059713.12", "ENSMUSG00000028402.18", "ENSMUSG00000028545.13", "ENSMUSG00000032890.17", "ENSMUSG00000008305.18", "ENSMUSG00000045083.14", "ENSMUSG00000028803.18", "ENSMUSG00000041544.11", "ENSMUSG00000003411.10", "ENSMUSG00000110373.1", "ENSMUSG00000040606.13", "ENSMUSG00000045071.14", "ENSMUSG00000028373.16", "ENSMUSG00000040183.13", "ENSMUSG00000037306.13", "ENSMUSG00000040964.16", "ENSMUSG00000033295.14", "ENSMUSG00000028525.16", "ENSMUSG00000085816.1", "ENSMUSG00000028532.14", "ENSMUSG00000028565.18", "ENSMUSG00000048232.12", "ENSMUSG00000061859.17", "ENSMUSG00000006586.15", "ENSMUSG00000035275.14", "ENSMUSG00000028975.16", "ENSMUSG00000041000.8", "ENSMUSG00000036062.13", "ENSMUSG00000076123.1", "ENSMUSG00000040659.3", "ENSMUSG00000028341.9", "ENSMUSG00000089945.13", "ENSMUSG00000025791.18", "ENSMUSG00000028518.8", "ENSMUSG00000039546.9", "ENSMUSG00000028496.17", "ENSMUSG00000028391.16", "ENSMUSG00000037996.17", "ENSMUSG00000033948.3", "ENSMUSG00000041272.11", "ENSMUSG00000028538.12", "ENSMUSG00000028909.17", "ENSMUSG00000028488.15", "ENSMUSG00000028456.18", "ENSMUSG00000070867.4", "ENSMUSG00000025743.14", "ENSMUSG00000087459.8", "ENSMUSG00000028347.14", "ENSMUSG00000038729.23", "ENSMUSG00000029055.17", "ENSMUSG00000063446.4", "ENSMUSG00000038368.16", "ENSMUSG00000042228.14", "ENSMUSG00000028634.17", "ENSMUSG00000028226.15", "ENSMUSG00000055373.8", "ENSMUSG00000034762.9", "ENSMUSG00000059049.14", "ENSMUSG00000028530.14", 
"ENSMUSG00000041235.12", "ENSMUSG00000045672.15", "ENSMUSG00000035615.12", "ENSMUSG00000035305.5", "ENSMUSG00000098760.1", "ENSMUSG00000028444.17", "ENSMUSG00000028683.14", "ENSMUSG00000028830.14", "ENSMUSG00000040536.15", "ENSMUSG00000038816.14", "ENSMUSG00000028613.15", "ENSMUSG00000028957.12", "ENSMUSG00000041058.15", "ENSMUSG00000048747.14", "ENSMUSG00000028238.6", "ENSMUSG00000087614.1", "ENSMUSG00000040372.2", "ENSMUSG00000102450.1", "ENSMUSG00000028977.16", "ENSMUSG00000066191.12", "ENSMUSG00000028661.8", "ENSMUSG00000028358.15", "ENSMUSG00000060862.10", "ENSMUSG00000028782.14", "ENSMUSG00000057637.13", "ENSMUSG00000044288.6", "ENSMUSG00000084851.1", "ENSMUSG00000035069.3", "ENSMUSG00000028337.14", "ENSMUSG00000086740.1", "ENSMUSG00000037366.14", "ENSMUSG00000034401.16", "ENSMUSG00000028289.12", "ENSMUSG00000028527.18", "ENSMUSG00000035649.17", "ENSMUSG00000028528.16", "ENSMUSG00000086080.7", "ENSMUSG00000059810.18", "ENSMUSG00000066113.16", "ENSMUSG00000041143.16", "ENSMUSG00000085885.1", "ENSMUSG00000048706.3", "ENSMUSG00000115115.1", "ENSMUSG00000086136.1", "ENSMUSG00000028434.12", "ENSMUSG00000087027.2", "ENSMUSG00000023151.9", "ENSMUSG00000028351.5", "ENSMUSG00000007613.15", "ENSMUSG00000028573.18", "ENSMUSG00000028747.10", "ENSMUSG00000038764.14", "ENSMUSG00000037242.8", "ENSMUSG00000038024.17", "ENSMUSG00000039911.13", "ENSMUSG00000085581.1", "ENSMUSG00000028937.14", "ENSMUSG00000085931.1", "ENSMUSG00000042616.8", "ENSMUSG00000028414.17", "ENSMUSG00000052135.8", "ENSMUSG00000048899.8", "ENSMUSG00000039270.9", "ENSMUSG00000038172.14", "ENSMUSG00000001089.14", "ENSMUSG00000033985.17", "ENSMUSG00000078532.9", "ENSMUSG00000073779.7", "ENSMUSG00000028278.14", "ENSMUSG00000085047.1", "ENSMUSG00000049122.17", "ENSMUSG00000028920.9", "ENSMUSG00000028868.13", "ENSMUSG00000028222.2", "ENSMUSG00000042500.11", "ENSMUSG00000023232.17", "ENSMUSG00000085323.1", "ENSMUSG00000073821.11", "ENSMUSG00000050511.1", "ENSMUSG00000028497.12", "ENSMUSG00000085562.7", 
"ENSMUSG00000028708.16", "ENSMUSG00000087049.1", "ENSMUSG00000034926.3", "ENSMUSG00000085549.7", "ENSMUSG00000028602.12", "ENSMUSG00000015243.4", "ENSMUSG00000083377.1", "ENSMUSG00000039579.15", "ENSMUSG00000037553.14", "ENSMUSG00000057530.14", "ENSMUSG00000006386.15", "ENSMUSG00000110067.1", "ENSMUSG00000028413.13", "ENSMUSG00000028544.14", "ENSMUSG00000078639.1", "ENSMUSG00000028328.13", "ENSMUSG00000028599.10", "ENSMUSG00000039005.13", "ENSMUSG00000029063.16", "ENSMUSG00000049119.14", "ENSMUSG00000086150.1", "ENSMUSG00000028364.15", "ENSMUSG00000028796.17", "ENSMUSG00000084595.1", "ENSMUSG00000028521.17", "ENSMUSG00000087641.1", "ENSMUSG00000028572.13", "ENSMUSG00000085968.1", "ENSMUSG00000084833.1", "ENSMUSG00000028760.16", "ENSMUSG00000055296.14", "ENSMUSG00000028730.14", "ENSMUSG00000028631.7", "ENSMUSG00000086070.1", "ENSMUSG00000028207.18", "ENSMUSG00000028339.17", "ENSMUSG00000003644.17", "ENSMUSG00000043621.13", "ENSMUSG00000042388.14", "ENSMUSG00000028614.14", "ENSMUSG00000038668.14", "ENSMUSG00000028980.14", "ENSMUSG00000028385.14", "ENSMUSG00000038070.15", "ENSMUSG00000039577.17", "ENSMUSG00000028412.17", "ENSMUSG00000029049.14", "ENSMUSG00000085996.1", "ENSMUSG00000061322.15", "ENSMUSG00000078612.9", "ENSMUSG00000086052.1", "ENSMUSG00000028689.14", "ENSMUSG00000028487.18", "ENSMUSG00000005045.16", "ENSMUSG00000051435.11", "ENSMUSG00000086927.1", "ENSMUSG00000028259.13", "ENSMUSG00000087366.7", "ENSMUSG00000041216.15", "ENSMUSG00000028370.7", "ENSMUSG00000054659.13", "ENSMUSG00000087399.1", "ENSMUSG00000028785.13", "ENSMUSG00000036856.4", "ENSMUSG00000073991.4", "ENSMUSG00000045205.16", "ENSMUSG00000028988.13", "ENSMUSG00000050989.9", "ENSMUSG00000028807.2", "ENSMUSG00000041351.16", "ENSMUSG00000028369.15", "ENSMUSG00000086554.1", "ENSMUSG00000028476.13", "ENSMUSG00000028382.15", "ENSMUSG00000087128.7", "ENSMUSG00000073759.6", "ENSMUSG00000028552.13", "ENSMUSG00000028681.11", "ENSMUSG00000038047.18", "ENSMUSG00000028603.15", "ENSMUSG00000028931.12", 
"ENSMUSG00000039852.17", "ENSMUSG00000104211.1", "ENSMUSG00000054885.11", "ENSMUSG00000085909.1", "ENSMUSG00000057280.15", "ENSMUSG00000028274.17", "ENSMUSG00000085614.1", "ENSMUSG00000086355.1", "ENSMUSG00000063077.14", "ENSMUSG00000028514.15", "ENSMUSG00000028403.15", "ENSMUSG00000028556.15", "ENSMUSG00000028383.17", "ENSMUSG00000038578.15", "ENSMUSG00000008932.9", "ENSMUSG00000039137.18", "ENSMUSG00000028753.12", "ENSMUSG00000084898.1", "ENSMUSG00000057751.14", "ENSMUSG00000028750.12", "ENSMUSG00000028563.16", "ENSMUSG00000028555.15", "ENSMUSG00000035696.15", "ENSMUSG00000029054.8", "ENSMUSG00000082062.1", "ENSMUSG00000085922.1", "ENSMUSG00000086072.1", "ENSMUSG00000041193.15", "ENSMUSG00000041261.9", "ENSMUSG00000028886.15", "ENSMUSG00000020220.16", "ENSMUSG00000014030.15", "ENSMUSG00000084757.7", "ENSMUSG00000090125.3", "ENSMUSG00000085643.1", "ENSMUSG00000028619.15", "ENSMUSG00000028649.19", "ENSMUSG00000028318.14", "ENSMUSG00000039953.13", "ENSMUSG00000050390.12", "ENSMUSG00000040410.14", "ENSMUSG00000028492.13", "ENSMUSG00000042489.15", "ENSMUSG00000086284.1", "ENSMUSG00000073860.12", "ENSMUSG00000087016.1", "ENSMUSG00000028292.14", "ENSMUSG00000085804.1", "ENSMUSG00000033423.16", "ENSMUSG00000028637.16", "ENSMUSG00000028838.11", "ENSMUSG00000043003.14", "ENSMUSG00000049648.4", "ENSMUSG00000046447.3", "ENSMUSG00000028854.9", "ENSMUSG00000028832.11", "ENSMUSG00000028617.10", "ENSMUSG00000041161.8", "ENSMUSG00000066036.14", "ENSMUSG00000086249.1", "ENSMUSG00000028644.16", "ENSMUSG00000057722.17", "ENSMUSG00000048485.12", "ENSMUSG00000039158.11", "ENSMUSG00000090839.1", "ENSMUSG00000086364.1", "ENSMUSG00000028641.16", "ENSMUSG00000039713.16", "ENSMUSG00000118416.1", "ENSMUSG00000078531.3", "ENSMUSG00000085022.2", "ENSMUSG00000028612.10", "ENSMUSG00000001334.9", "ENSMUSG00000040860.16", "ENSMUSG00000029026.16", "ENSMUSG00000043333.12", "ENSMUSG00000028457.18", "ENSMUSG00000028621.17", "ENSMUSG00000057572.15", "ENSMUSG00000046593.3", "ENSMUSG00000028439.14", 
"ENSMUSG00000000085.16", "ENSMUSG00000028344.12", "ENSMUSG00000028618.11", "ENSMUSG00000086326.1", "ENSMUSG00000028420.13", "ENSMUSG00000029656.13", "ENSMUSG00000084787.1", "ENSMUSG00000028991.15", "ENSMUSG00000070806.5", "ENSMUSG00000094403.1", "ENSMUSG00000078722.2", "ENSMUSG00000085517.7", "ENSMUSG00000078689.8", "ENSMUSG00000028314.6", "ENSMUSG00000085522.1", "ENSMUSG00000086164.1", "ENSMUSG00000040690.15", "ENSMUSG00000092492.1", "ENSMUSG00000028677.19", "ENSMUSG00000058006.12", "ENSMUSG00000028645.11", "ENSMUSG00000041025.15", "ENSMUSG00000037692.14", "ENSMUSG00000095139.2", "ENSMUSG00000087241.1", "ENSMUSG00000028910.12", "ENSMUSG00000043924.16", "ENSMUSG00000085632.1", "ENSMUSG00000035407.8", "ENSMUSG00000035517.17", "ENSMUSG00000037348.15", "ENSMUSG00000086251.1", "ENSMUSG00000028348.7", "ENSMUSG00000028718.16", "ENSMUSG00000102796.1", "ENSMUSG00000028300.14", "ENSMUSG00000029064.15", "ENSMUSG00000028517.8", "ENSMUSG00000087171.1", "ENSMUSG00000073987.4", "ENSMUSG00000097224.1", "ENSMUSG00000028876.7", "ENSMUSG00000043698.9", "ENSMUSG00000087343.1", "ENSMUSG00000028575.11", "ENSMUSG00000028393.10", "ENSMUSG00000028646.16", "ENSMUSG00000028873.16", "ENSMUSG00000042608.15", "ENSMUSG00000028584.3", "ENSMUSG00000006699.17", "ENSMUSG00000034853.16", "ENSMUSG00000028443.6", "ENSMUSG00000028549.17", "ENSMUSG00000078772.2", "ENSMUSG00000088349.1", "ENSMUSG00000035031.15", "ENSMUSG00000087525.1", "ENSMUSG00000051351.14", "ENSMUSG00000073792.11", "ENSMUSG00000103595.1", "ENSMUSG00000034171.13", "ENSMUSG00000045917.17", "ENSMUSG00000087098.1", "ENSMUSG00000077780.1", "ENSMUSG00000040928.15", "ENSMUSG00000085032.1", "ENSMUSG00000086017.1", "ENSMUSG00000028653.16", "ENSMUSG00000041135.15", "ENSMUSG00000028243.5", "ENSMUSG00000033191.14", "ENSMUSG00000007880.16", "ENSMUSG00000049969.3", "ENSMUSG00000028874.14", "ENSMUSG00000085515.1", "ENSMUSG00000066037.14", "ENSMUSG00000036052.14", "ENSMUSG00000028607.16", "ENSMUSG00000028745.18", "ENSMUSG00000057375.13", 
"ENSMUSG00000040569.13", "ENSMUSG00000028898.15", "ENSMUSG00000029050.15", "ENSMUSG00000040359.14", "ENSMUSG00000062937.7", "ENSMUSG00000080780.1", "ENSMUSG00000078716.10", "ENSMUSG00000028698.13", "ENSMUSG00000028542.17", "ENSMUSG00000086483.1", "ENSMUSG00000028578.8", "ENSMUSG00000084828.1", "ENSMUSG00000028571.15", "ENSMUSG00000039492.7", "ENSMUSG00000061455.13", "ENSMUSG00000086565.1", "ENSMUSG00000091237.1", "ENSMUSG00000028642.7", "ENSMUSG00000103127.1", "ENSMUSG00000087470.1", "ENSMUSG00000028550.15", "ENSMUSG00000039967.14", "ENSMUSG00000010517.7", "ENSMUSG00000028553.12", "ENSMUSG00000066026.14", "ENSMUSG00000084918.1", "ENSMUSG00000078584.9", "ENSMUSG00000062627.9", "ENSMUSG00000039813.14", "ENSMUSG00000085895.1", "ENSMUSG00000086027.1", "ENSMUSG00000097871.1", "ENSMUSG00000056494.7", "ENSMUSG00000028224.14", "ENSMUSG00000035683.13", "ENSMUSG00000034871.2", "ENSMUSG00000028826.14", "ENSMUSG00000055963.12", "ENSMUSG00000063851.12", "ENSMUSG00000028217.11", "ENSMUSG00000028788.14", "ENSMUSG00000087010.1", "ENSMUSG00000028600.15", "ENSMUSG00000034359.16", "ENSMUSG00000040728.15", "ENSMUSG00000084987.1", "ENSMUSG00000050323.14", "ENSMUSG00000087451.1", "ENSMUSG00000049657.9", "ENSMUSG00000038544.14", "ENSMUSG00000028228.5", "ENSMUSG00000039611.2", "ENSMUSG00000049225.14", "ENSMUSG00000041052.8", "ENSMUSG00000078598.11", "ENSMUSG00000087194.9", "ENSMUSG00000045573.9", "ENSMUSG00000041120.6", "ENSMUSG00000086332.1", "ENSMUSG00000066060.5", "ENSMUSG00000028551.14", "ENSMUSG00000032897.17", "ENSMUSG00000028907.7", "ENSMUSG00000086606.1", "ENSMUSG00000069733.11", "ENSMUSG00000033326.15", "ENSMUSG00000086713.1", "ENSMUSG00000028524.21", "ENSMUSG00000028223.8", "ENSMUSG00000055960.12", "ENSMUSG00000035126.19", "ENSMUSG00000047636.3", "ENSMUSG00000046219.9", "ENSMUSG00000088014.1", "ENSMUSG00000086712.2", "ENSMUSG00000052767.5", "ENSMUSG00000029038.9", "ENSMUSG00000073889.10", "ENSMUSG00000047502.14", "ENSMUSG00000037295.7", "ENSMUSG00000044254.6", 
"ENSMUSG00000028332.13", "ENSMUSG00000111410.1", "ENSMUSG00000042202.15", "ENSMUSG00000094439.1", "ENSMUSG00000028426.10", "ENSMUSG00000055198.3", "ENSMUSG00000028643.11", "ENSMUSG00000057977.10", "ENSMUSG00000029009.17", "ENSMUSG00000048766.6", "ENSMUSG00000028362.2", "ENSMUSG00000106597.1", "ENSMUSG00000035969.15", "ENSMUSG00000028763.18", "ENSMUSG00000081225.3", "ENSMUSG00000028893.8", "ENSMUSG00000081660.1", "ENSMUSG00000049972.4", "ENSMUSG00000097459.1", "ENSMUSG00000070868.11", "ENSMUSG00000085027.1"]
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# All per-gene result files are written into a folder named after the
# chromosome, relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True makes re-running the notebook safe when the folder is already there.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# NOTE(review): read_msgpack was deprecated in pandas 0.25 and removed in 1.0,
# so this notebook needs an older pandas to run.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Keep the SubType labels of cells whose CellClass is 'Exc' or 'Inh'.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
neuron_subtypes = cell_tidy_data.loc[neuron_mask, 'SubType'].unique()

# Drop any label containing 'Outlier' and replace spaces with underscores;
# these names are later used as column labels of the per-cluster tables.
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in neuron_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
# Load the table stored in SubType.ATAC_peak_merged.msg, then keep only the
# rows whose index label starts with this chromosome's 'Sub<chrom>_' prefix.
atac_peak = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg'
)
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(f'Sub{chrom}_')].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to genes on the selected chromosome.
gene_meta = (
    pd.read_csv(
        '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
        sep='\t',
        index_col='gene_id',
    )
    .loc[lambda meta: meta['chrom'] == chrom]
    .copy()
)

In [9]:
# Exon intervals from a headerless, tab-separated BED-like file.
# NOTE(review): exon_bed is not referenced again in the part of the notebook
# visible here — confirm whether a later cell still needs it.
exon_bed = pd.read_csv('/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
                       header=None, sep='\t')
# The file has no header row, so the column names are assigned by hand.
exon_bed.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']

## DMR Info

In [10]:
# Table stored under the 'Rate' key, kept only for rows whose index label
# starts with 'Sub<chrom>_'. Every other table below is aligned to this
# filtered index, so this statement must run first.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.map(lambda i: i.startswith(f'Sub{chrom}_'))].copy()

# DMR-gene correlation records, indexed by (DMR, Gene) and limited to the
# DMRs that survived the chromosome filter above.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
)
dmr_corr = dmr_corr.set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)].copy()

# Intervals from a headerless BED file; the 4th column (position 3) becomes
# the index, the remaining three are named chrom/start/end.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed[dmr_bed['chrom'] == chrom].copy()

# AnnData read from TotalHits.HypoDMR.h5ad: rows are subset to the filtered
# dmr_rate index, then columns to the selected clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# AnnData read from DMRAnnotation.h5ad, with the same row subset as dmr_hits.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of DMR rows kept for this chromosome.
dmr_hits.n_obs

283201

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta`` (the table is indexed by
        its ``gene_id`` column).

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)`` taken from the matching row.

    Raises
    ------
    KeyError
        If ``gene_id`` is not present in ``gene_meta``.
    """
    record = gene_meta.loc[gene_id]
    chrom_name, start, end, strand = (
        record[field] for field in ('chrom', 'start', 'end', 'strand')
    )
    return record.name, chrom_name, start, end, strand

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs surrounding one gene by their per-cluster rate profile.

    DMRs lying inside the gene span extended by ``slop`` on both sides are
    selected, their rates across ``use_clusters`` are scaled, reduced by PCA,
    and grouped with Leiden clustering on a neighbor graph.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        The ``obs`` table of the temporary AnnData: one row per selected DMR,
        with the cluster label in the ``leiden`` column.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must fall strictly inside the extended
    # gene window (dmr_bed is already restricted to the chosen chromosome)
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window].copy()

    # missing rates are replaced by the column mean, i.e. the mean of that
    # cluster over the selected DMRs
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: DMRs are observations, clusters are variables
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of selected DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Assemble the per-DMR annotation table for one gene.

    Starts from the Leiden labels returned by ``calculate_gene`` and adds,
    for every selected DMR: its correlation with the gene, its position
    relative to the gene, flags taken from ``dmr_annot``, and the per-cluster
    hypo-DMR calls, rates and ATAC peak values.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR. Columns are ``leiden``, ``Corr``, ``reldist_tss``,
        ``in_gene_body``, the ``is_*_te`` flags, ``in_intron``, ``in_exon``,
        ``in_utr3``, ``in_utr5``, ``feDMR``, ``adultDMR``, followed by
        ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and
        ``ATACPeak.<cluster>`` for every entry of ``use_clusters``.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # correlation with this gene; DMRs without a record get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # position of the DMR center as a fraction of the gene length, measured
    # from gene_start on '+' and from gene_end otherwise
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols: a DMR is flagged when any column of the group is non-zero
    # NOTE(review): the groups are picked by column position, which assumes
    # the column order of the DMR annotation file — TODO confirm
    te_column_groups = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_slice in te_column_groups:
        te_columns = annot_df.columns[column_slice]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the feature flag only counts when
    # the DMR center also lies inside this gene's body
    gene_features = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_features:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')

    # DMR rate, rows ordered by Leiden label; missing values take the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')

    # atac peak; columns are prefixed before the frame is combined with the others
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')

    dmr_annotation = pd.concat([gene_cluster, hits_df, related_dmr_rate, atac_peak_df],
                               axis=1, sort=True)
    return dmr_annotation

In [None]:
# NOTE(review): to_msgpack was deprecated in pandas 0.25 and removed in 1.0,
# so this cell needs an older pandas to run.
for gene in genes:
    print(gene)
    # The cluster summary is written last, so its presence marks a gene whose
    # two output files are both complete; such genes are skipped on re-runs.
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if check_path.exists():
        continue

    dmr_annotation = get_annotation(gene)
    dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

    # Mean of every annotation column within each Leiden cluster. The same
    # path object as the skip check is reused so the two can never diverge.
    cluster_annotation = dmr_annotation.groupby('leiden').mean()
    cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000001985.9
ENSMUSG00000028559.17
ENSMUSG00000008575.17
ENSMUSG00000008489.18
ENSMUSG00000028399.18
ENSMUSG00000028664.14
ENSMUSG00000044813.15
ENSMUSG00000028519.16
ENSMUSG00000040972.8
ENSMUSG00000028546.17
ENSMUSG00000040387.16
ENSMUSG00000014592.20
ENSMUSG00000028804.20
ENSMUSG00000061887.14
ENSMUSG00000028906.16
ENSMUSG00000090053.9
ENSMUSG00000060206.11
ENSMUSG00000040270.16
ENSMUSG00000039809.10
ENSMUSG00000039410.16
ENSMUSG00000087413.1
ENSMUSG00000059713.12
ENSMUSG00000028402.18
ENSMUSG00000028545.13
ENSMUSG00000032890.17
ENSMUSG00000008305.18
ENSMUSG00000045083.14
ENSMUSG00000028803.18
ENSMUSG00000041544.11
ENSMUSG00000003411.10
ENSMUSG00000110373.1
ENSMUSG00000040606.13
ENSMUSG00000045071.14
ENSMUSG00000028373.16
ENSMUSG00000040183.13
ENSMUSG00000037306.13
ENSMUSG00000040964.16
ENSMUSG00000033295.14
ENSMUSG00000028525.16
ENSMUSG00000085816.1
ENSMUSG00000028532.14
ENSMUSG00000028565.18
ENSMUSG00000048232.12
ENSMUSG00000061859.17
ENSMUSG00000006586.15
ENSMUSG000000352