In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Global figure styling: small labels and thin strokes everywhere.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
# `rc` accepts a tuple of group names, so both axes are configured together.
rc(('xtick', 'ytick'), labelsize=labelsize)
rc(('xtick.major', 'ytick.major'), width=linewidth)
# Minor ticks are drawn slightly thinner than major ticks.
rc(('xtick.minor', 'ytick.minor'), width=linewidth - 0.2)

In [3]:
# Default parameters; the "# Parameters" cell below reassigns every one of them.
chrom = 'chr2'  # chromosome whose genes / DMRs / peaks are kept by the loading cells
# NOTE(review): genes are looked up with gene_meta.loc[...] on a 'gene_id' index, and the
# list in the next cell uses Ensembl IDs, so this gene-symbol default would presumably
# fail that lookup -- confirm before relying on the defaults.
genes = ['Lhx6']
slop = 250000  # flank added on each side of the gene when selecting DMRs (same units as the BED start/end)
n_pc = 10  # number of principal components passed to the neighbour-graph step
resolution = 1  # resolution passed to Leiden clustering

In [4]:
# Parameters
# NOTE(review): this cell looks like run-time injected parameters (e.g. papermill) -- confirm.
# Every name assigned here overrides the default set in the previous cell.
chrom = "chr7"
genes = ["ENSMUSG00000053025.13", "ENSMUSG00000030657.11", "ENSMUSG00000030583.16", "ENSMUSG00000025813.14", "ENSMUSG00000021217.7", "ENSMUSG00000001741.12", "ENSMUSG00000030970.16", "ENSMUSG00000110230.1", "ENSMUSG00000058145.16", "ENSMUSG00000030862.13", "ENSMUSG00000055723.6", "ENSMUSG00000010476.14", "ENSMUSG00000015709.9", "ENSMUSG00000030691.15", "ENSMUSG00000038156.16", "ENSMUSG00000046321.8", "ENSMUSG00000030839.12", "ENSMUSG00000073856.11", "ENSMUSG00000060402.7", "ENSMUSG00000078591.1", "ENSMUSG00000051910.13", "ENSMUSG00000030518.17", "ENSMUSG00000052889.11", "ENSMUSG00000030638.13", "ENSMUSG00000085899.1", "ENSMUSG00000032743.15", "ENSMUSG00000058420.8", "ENSMUSG00000048078.16", "ENSMUSG00000038244.14", "ENSMUSG00000043456.16", "ENSMUSG00000030500.6", "ENSMUSG00000025790.14", "ENSMUSG00000046027.17", "ENSMUSG00000031024.13", "ENSMUSG00000066189.9", "ENSMUSG00000040125.4", "ENSMUSG00000055489.8", "ENSMUSG00000052353.13", "ENSMUSG00000055407.14", "ENSMUSG00000038296.14", "ENSMUSG00000025475.17", "ENSMUSG00000051527.11", "ENSMUSG00000055320.17", "ENSMUSG00000052707.8", "ENSMUSG00000025789.9", "ENSMUSG00000030930.14", "ENSMUSG00000041836.10", "ENSMUSG00000030527.15", "ENSMUSG00000030772.6", "ENSMUSG00000036111.8", "ENSMUSG00000005533.10", "ENSMUSG00000045659.18", "ENSMUSG00000047248.20", "ENSMUSG00000055078.7", "ENSMUSG00000030982.19", "ENSMUSG00000109814.1", "ENSMUSG00000066406.15", "ENSMUSG00000018909.15", "ENSMUSG00000008734.9", "ENSMUSG00000030905.5", "ENSMUSG00000031028.14", "ENSMUSG00000030653.17", "ENSMUSG00000038623.9", "ENSMUSG00000053528.6", "ENSMUSG00000062542.11", "ENSMUSG00000110195.1", "ENSMUSG00000025133.9", "ENSMUSG00000030956.15", "ENSMUSG00000086254.1", "ENSMUSG00000108390.1", "ENSMUSG00000087088.2", "ENSMUSG00000038540.14", "ENSMUSG00000035314.10", "ENSMUSG00000030528.12", "ENSMUSG00000035547.14", "ENSMUSG00000055409.14", "ENSMUSG00000091002.2", "ENSMUSG00000108811.1", "ENSMUSG00000002771.12", "ENSMUSG00000025324.8", 
"ENSMUSG00000030525.8", "ENSMUSG00000107985.1", "ENSMUSG00000031027.15", "ENSMUSG00000030513.14", "ENSMUSG00000048897.15", "ENSMUSG00000030539.13", "ENSMUSG00000030688.15", "ENSMUSG00000005686.17", "ENSMUSG00000058975.7", "ENSMUSG00000060376.7", "ENSMUSG00000030516.14", "ENSMUSG00000046182.8", "ENSMUSG00000031075.18", "ENSMUSG00000085792.2", "ENSMUSG00000042978.10", "ENSMUSG00000038260.10", "ENSMUSG00000070469.12", "ENSMUSG00000030766.15", "ENSMUSG00000109341.1", "ENSMUSG00000030987.5", "ENSMUSG00000030861.15", "ENSMUSG00000048787.13", "ENSMUSG00000010755.17", "ENSMUSG00000036528.15", "ENSMUSG00000078816.9", "ENSMUSG00000042105.18", "ENSMUSG00000036862.15", "ENSMUSG00000061286.7", "ENSMUSG00000060260.13", "ENSMUSG00000032875.8", "ENSMUSG00000015981.12", "ENSMUSG00000070509.15", "ENSMUSG00000088683.1", "ENSMUSG00000030876.7", "ENSMUSG00000030729.17", "ENSMUSG00000054612.4", "ENSMUSG00000006205.13", "ENSMUSG00000030556.13", "ENSMUSG00000039202.12", "ENSMUSG00000052512.17", "ENSMUSG00000030541.16", "ENSMUSG00000101755.1", "ENSMUSG00000038738.15", "ENSMUSG00000037541.21", "ENSMUSG00000053049.3", "ENSMUSG00000102516.1", "ENSMUSG00000030909.6", "ENSMUSG00000030643.14", "ENSMUSG00000055116.8", "ENSMUSG00000043671.14", "ENSMUSG00000070570.5", "ENSMUSG00000025586.17", "ENSMUSG00000062444.15", "ENSMUSG00000030880.14", "ENSMUSG00000030871.11", "ENSMUSG00000034951.10", "ENSMUSG00000097247.2", "ENSMUSG00000030602.13", "ENSMUSG00000054746.18", "ENSMUSG00000030376.8", "ENSMUSG00000039257.11", "ENSMUSG00000074227.12", "ENSMUSG00000030523.18", "ENSMUSG00000056856.13", "ENSMUSG00000073805.5", "ENSMUSG00000045795.10", "ENSMUSG00000030760.14", "ENSMUSG00000058230.12", "ENSMUSG00000039428.11", "ENSMUSG00000108489.1", "ENSMUSG00000062017.15", "ENSMUSG00000108635.1", "ENSMUSG00000038459.10", "ENSMUSG00000078670.3", "ENSMUSG00000108572.1", "ENSMUSG00000042055.13", "ENSMUSG00000085443.1", "ENSMUSG00000035004.3", "ENSMUSG00000047767.17", "ENSMUSG00000030780.15", "ENSMUSG00000055652.14", 
"ENSMUSG00000108303.1", "ENSMUSG00000108432.1", "ENSMUSG00000030555.16", "ENSMUSG00000030522.14", "ENSMUSG00000087492.2", "ENSMUSG00000030844.11", "ENSMUSG00000030770.15", "ENSMUSG00000085265.1", "ENSMUSG00000030510.11", "ENSMUSG00000033533.14", "ENSMUSG00000066129.15", "ENSMUSG00000030946.13", "ENSMUSG00000030747.5", "ENSMUSG00000109194.1", "ENSMUSG00000033847.15", "ENSMUSG00000032812.18", "ENSMUSG00000037519.17", "ENSMUSG00000032776.9", "ENSMUSG00000038371.15", "ENSMUSG00000109336.1", "ENSMUSG00000097789.2", "ENSMUSG00000053046.16", "ENSMUSG00000097756.2", "ENSMUSG00000030911.7", "ENSMUSG00000104420.1", "ENSMUSG00000108790.1", "ENSMUSG00000030922.12", "ENSMUSG00000109001.1", "ENSMUSG00000030761.16", "ENSMUSG00000100600.6", "ENSMUSG00000044952.5", "ENSMUSG00000104454.1", "ENSMUSG00000002068.16", "ENSMUSG00000031026.15", "ENSMUSG00000037606.18", "ENSMUSG00000047517.12", "ENSMUSG00000097471.8", "ENSMUSG00000030898.15", "ENSMUSG00000056216.9", "ENSMUSG00000048779.5", "ENSMUSG00000108293.1", "ENSMUSG00000100005.7", "ENSMUSG00000069806.5", "ENSMUSG00000033676.13", "ENSMUSG00000008496.19", "ENSMUSG00000030847.8", "ENSMUSG00000042246.5", "ENSMUSG00000093540.2", "ENSMUSG00000030889.14", "ENSMUSG00000097178.7", "ENSMUSG00000030650.18", "ENSMUSG00000030921.17", "ENSMUSG00000030788.16", "ENSMUSG00000034825.14", "ENSMUSG00000062028.8", "ENSMUSG00000030757.13", "ENSMUSG00000025478.15", "ENSMUSG00000045777.14", "ENSMUSG00000108379.1", "ENSMUSG00000025104.13", "ENSMUSG00000035064.17", "ENSMUSG00000074221.12", "ENSMUSG00000066568.12", "ENSMUSG00000025103.8", "ENSMUSG00000109097.1", "ENSMUSG00000030491.16", "ENSMUSG00000084821.1", "ENSMUSG00000033510.14", "ENSMUSG00000058447.8", "ENSMUSG00000068151.7", "ENSMUSG00000004609.11", "ENSMUSG00000109407.1", "ENSMUSG00000099853.3", "ENSMUSG00000053111.13", "ENSMUSG00000030878.11", "ENSMUSG00000006204.7", "ENSMUSG00000109125.1", "ENSMUSG00000030854.17", "ENSMUSG00000035354.9", "ENSMUSG00000054793.8", "ENSMUSG00000030471.17", 
"ENSMUSG00000052749.9", "ENSMUSG00000118412.1", "ENSMUSG00000066197.5", "ENSMUSG00000109517.1", "ENSMUSG00000097023.8", "ENSMUSG00000006763.14", "ENSMUSG00000091239.2", "ENSMUSG00000085745.1", "ENSMUSG00000035704.17", "ENSMUSG00000035211.9", "ENSMUSG00000054676.15", "ENSMUSG00000108332.1", "ENSMUSG00000109093.1", "ENSMUSG00000002205.16", "ENSMUSG00000031004.8", "ENSMUSG00000045928.2", "ENSMUSG00000030814.17", "ENSMUSG00000108752.1", "ENSMUSG00000108449.1", "ENSMUSG00000038763.12", "ENSMUSG00000030530.15", "ENSMUSG00000040167.16", "ENSMUSG00000070366.5", "ENSMUSG00000030986.13", "ENSMUSG00000040298.6", "ENSMUSG00000109532.1", "ENSMUSG00000097016.2", "ENSMUSG00000098022.7", "ENSMUSG00000030850.17", "ENSMUSG00000103882.1", "ENSMUSG00000038663.7", "ENSMUSG00000049350.6", "ENSMUSG00000030472.9", "ENSMUSG00000030421.9", "ENSMUSG00000059277.12", "ENSMUSG00000108486.1", "ENSMUSG00000048236.8", "ENSMUSG00000049123.10", "ENSMUSG00000030397.10", "ENSMUSG00000062300.14", "ENSMUSG00000051048.17", "ENSMUSG00000007279.14", "ENSMUSG00000009687.14", "ENSMUSG00000047730.17", "ENSMUSG00000013367.5", "ENSMUSG00000099137.1", "ENSMUSG00000039608.6", "ENSMUSG00000108317.1", "ENSMUSG00000109248.1", "ENSMUSG00000030450.11", "ENSMUSG00000030873.9", "ENSMUSG00000109107.1", "ENSMUSG00000030960.16", "ENSMUSG00000066979.6", "ENSMUSG00000034660.8", "ENSMUSG00000008318.10", "ENSMUSG00000102384.1", "ENSMUSG00000051041.7", "ENSMUSG00000085161.1", "ENSMUSG00000030945.17", "ENSMUSG00000015133.17", "ENSMUSG00000078616.2", "ENSMUSG00000001829.17", "ENSMUSG00000053367.15", "ENSMUSG00000001773.14", "ENSMUSG00000109014.1", "ENSMUSG00000089273.1", "ENSMUSG00000109570.1", "ENSMUSG00000109314.1", "ENSMUSG00000030706.17", "ENSMUSG00000085124.1", "ENSMUSG00000030963.6", "ENSMUSG00000029461.17", "ENSMUSG00000004328.15", "ENSMUSG00000030554.16", "ENSMUSG00000078796.4", "ENSMUSG00000030954.10", "ENSMUSG00000054808.15", "ENSMUSG00000037020.16", "ENSMUSG00000108722.1", "ENSMUSG00000109414.1", 
"ENSMUSG00000030607.7", "ENSMUSG00000030752.8", "ENSMUSG00000074006.3", "ENSMUSG00000040139.14", "ENSMUSG00000108596.1", "ENSMUSG00000039405.7", "ENSMUSG00000040136.10", "ENSMUSG00000048330.14", "ENSMUSG00000030671.9", "ENSMUSG00000102124.1", "ENSMUSG00000109382.1", "ENSMUSG00000047026.9", "ENSMUSG00000097220.1", "ENSMUSG00000085560.2", "ENSMUSG00000108991.1", "ENSMUSG00000016626.10", "ENSMUSG00000108825.1", "ENSMUSG00000070822.5", "ENSMUSG00000030600.15", "ENSMUSG00000094822.2", "ENSMUSG00000078815.8", "ENSMUSG00000109450.1", "ENSMUSG00000110282.1", "ENSMUSG00000060314.13", "ENSMUSG00000100650.1", "ENSMUSG00000109317.1", "ENSMUSG00000108954.1", "ENSMUSG00000109808.1", "ENSMUSG00000058761.5", "ENSMUSG00000055102.15", "ENSMUSG00000002266.17", "ENSMUSG00000073962.9", "ENSMUSG00000053985.10", "ENSMUSG00000035713.15", "ENSMUSG00000088141.1", "ENSMUSG00000055048.7", "ENSMUSG00000097749.2", "ENSMUSG00000109722.1", "ENSMUSG00000109555.1", "ENSMUSG00000054054.3", "ENSMUSG00000078810.4", "ENSMUSG00000109464.1", "ENSMUSG00000030877.11", "ENSMUSG00000110272.1", "ENSMUSG00000056394.17", "ENSMUSG00000058028.14", "ENSMUSG00000030917.13", "ENSMUSG00000109290.1", "ENSMUSG00000030726.16", "ENSMUSG00000025147.7", "ENSMUSG00000031015.8", "ENSMUSG00000031090.14", "ENSMUSG00000086232.2", "ENSMUSG00000030592.18", "ENSMUSG00000030718.9", "ENSMUSG00000025512.15", "ENSMUSG00000110125.1", "ENSMUSG00000108627.1", "ENSMUSG00000051900.12", "ENSMUSG00000061702.10", "ENSMUSG00000077176.1", "ENSMUSG00000108351.1", "ENSMUSG00000110248.1", "ENSMUSG00000030834.7", "ENSMUSG00000030447.15", "ENSMUSG00000053714.8", "ENSMUSG00000043366.16", "ENSMUSG00000110135.1", "ENSMUSG00000108608.1", "ENSMUSG00000049685.8", "ENSMUSG00000032777.9", "ENSMUSG00000037974.16", "ENSMUSG00000108910.1", "ENSMUSG00000092364.2", "ENSMUSG00000044997.3", "ENSMUSG00000048782.15", "ENSMUSG00000108753.1", "ENSMUSG00000030782.16", "ENSMUSG00000109460.1", "ENSMUSG00000108719.1", "ENSMUSG00000109149.1", "ENSMUSG00000019194.15", 
"ENSMUSG00000040046.14", "ENSMUSG00000030499.9", "ENSMUSG00000108389.1", "ENSMUSG00000108803.1", "ENSMUSG00000030786.18", "ENSMUSG00000108652.1", "ENSMUSG00000109831.1", "ENSMUSG00000030745.9", "ENSMUSG00000003863.18", "ENSMUSG00000053338.9", "ENSMUSG00000030704.14", "ENSMUSG00000108905.1", "ENSMUSG00000030515.9", "ENSMUSG00000030663.12", "ENSMUSG00000063089.4", "ENSMUSG00000085751.1", "ENSMUSG00000092652.1", "ENSMUSG00000004508.6", "ENSMUSG00000082684.1", "ENSMUSG00000108678.1", "ENSMUSG00000036578.7", "ENSMUSG00000025139.14", "ENSMUSG00000108730.1", "ENSMUSG00000048677.9", "ENSMUSG00000108828.1", "ENSMUSG00000035582.13", "ENSMUSG00000089259.1", "ENSMUSG00000108330.1", "ENSMUSG00000049571.18", "ENSMUSG00000090855.1", "ENSMUSG00000108623.1", "ENSMUSG00000066306.13", "ENSMUSG00000050382.14", "ENSMUSG00000000215.11", "ENSMUSG00000030769.15", "ENSMUSG00000030474.9", "ENSMUSG00000006095.12", "ENSMUSG00000030703.8", "ENSMUSG00000109110.1", "ENSMUSG00000086814.1", "ENSMUSG00000074003.4", "ENSMUSG00000032640.10", "ENSMUSG00000097634.2", "ENSMUSG00000097644.1", "ENSMUSG00000030968.3", "ENSMUSG00000085236.3", "ENSMUSG00000035390.16", "ENSMUSG00000108749.1", "ENSMUSG00000072244.11", "ENSMUSG00000040205.8", "ENSMUSG00000015980.14", "ENSMUSG00000042659.15", "ENSMUSG00000030789.9", "ENSMUSG00000030737.17", "ENSMUSG00000108850.1", "ENSMUSG00000108334.1", "ENSMUSG00000009487.10", "ENSMUSG00000110078.1", "ENSMUSG00000109898.1", "ENSMUSG00000110027.1", "ENSMUSG00000066513.4", "ENSMUSG00000110101.1", "ENSMUSG00000005611.15", "ENSMUSG00000109917.1", "ENSMUSG00000030748.9", "ENSMUSG00000023072.14", "ENSMUSG00000109398.2", "ENSMUSG00000063179.13", "ENSMUSG00000109379.1", "ENSMUSG00000039361.11", "ENSMUSG00000108460.1", "ENSMUSG00000110167.1", "ENSMUSG00000040940.18", "ENSMUSG00000109079.1", "ENSMUSG00000049580.12", "ENSMUSG00000108600.1", "ENSMUSG00000108875.1", "ENSMUSG00000030846.15", "ENSMUSG00000053395.15", "ENSMUSG00000034990.15", "ENSMUSG00000108869.1", "ENSMUSG00000009545.14", 
"ENSMUSG00000110062.1", "ENSMUSG00000108676.1", "ENSMUSG00000097585.2", "ENSMUSG00000108714.1", "ENSMUSG00000034867.16", "ENSMUSG00000005621.11", "ENSMUSG00000035401.9", "ENSMUSG00000044903.15", "ENSMUSG00000108580.1", "ENSMUSG00000092225.1", "ENSMUSG00000030763.7", "ENSMUSG00000066269.7", "ENSMUSG00000074377.12", "ENSMUSG00000004056.15", "ENSMUSG00000109321.1", "ENSMUSG00000030577.14", "ENSMUSG00000110028.1", "ENSMUSG00000078671.11", "ENSMUSG00000108754.1", "ENSMUSG00000072259.3", "ENSMUSG00000108545.1", "ENSMUSG00000091205.3", "ENSMUSG00000108793.1", "ENSMUSG00000092592.1", "ENSMUSG00000109608.1", "ENSMUSG00000074164.12", "ENSMUSG00000054236.6", "ENSMUSG00000109816.1", "ENSMUSG00000108990.1", "ENSMUSG00000090619.2", "ENSMUSG00000078799.5", "ENSMUSG00000045693.8", "ENSMUSG00000036560.14", "ENSMUSG00000035642.16", "ENSMUSG00000011154.17", "ENSMUSG00000109239.1", "ENSMUSG00000030830.18", "ENSMUSG00000063133.2", "ENSMUSG00000108522.1", "ENSMUSG00000108455.1", "ENSMUSG00000030882.19", "ENSMUSG00000060508.16", "ENSMUSG00000008789.10", "ENSMUSG00000066108.7", "ENSMUSG00000092032.2", "ENSMUSG00000030659.14", "ENSMUSG00000109344.1", "ENSMUSG00000108323.1", "ENSMUSG00000040614.14", "ENSMUSG00000031085.16", "ENSMUSG00000109254.1", "ENSMUSG00000086662.7", "ENSMUSG00000015721.18", "ENSMUSG00000110316.1", "ENSMUSG00000066516.5"]
# Remaining overrides; these values match the defaults assigned in the earlier cell.
slop = 250000  # flank added on each side of the gene when selecting DMRs
n_pc = 10  # number of principal components passed to the neighbour-graph step
resolution = 1  # resolution passed to Leiden clustering


In [5]:
# Per-chromosome output directory, relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True makes re-running the notebook safe when the directory is already there.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types observed among cells whose CellClass is 'Exc' or 'Inh'.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
neuron_subtypes = cell_tidy_data[neuron_mask]['SubType'].unique()

# Drop any sub-type whose name contains 'Outlier'; spaces become underscores
# in the names that are kept.
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in neuron_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg'
)
# Keep only the rows whose index label starts with 'Sub<chrom>_'.
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(f'Sub{chrom}_')].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to genes on the selected chromosome.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)
gene_meta = gene_meta.loc[gene_meta['chrom'].eq(chrom)].copy()

In [9]:
# Headerless, tab-separated exon BED file; column names are assigned after loading.
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
)
exon_bed.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']

## DMR Info

In [10]:
# Rate table from the HDF store: keep the rows whose index label starts with 'Sub<chrom>_'.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.str.startswith(f'Sub{chrom}_')].copy()

# DMR-gene correlation table, indexed by (DMR, Gene) and limited to the DMRs kept above.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[dmr_corr.index.isin(dmr_rate.index, level='DMR')].copy()

# DMR coordinates: the fourth BED column becomes the index, rows on other chromosomes are dropped.
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    sep='\t',
    header=None,
    index_col=3,
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'].eq(chrom)].copy()

# Hit matrix: observations restricted to the kept DMRs, variables to the selected clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# Annotation matrix: observations restricted to the kept DMRs.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index].copy()

In [11]:
# Number of observations (rows) left in the hit matrix after filtering.
dmr_hits.n_obs

253173

In [12]:
def get_gene(gene_id):
    """Look up one gene in the module-level ``gene_meta`` table.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)``, where the first item is the
        row's index label and the rest are read from the columns of the same name.
    """
    record = gene_meta.loc[gene_id]
    fields = ('chrom', 'start', 'end', 'strand')
    return (record.name, *(record[field] for field in fields))

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around a gene by their rate profile across ``use_clusters``.

    DMRs from ``dmr_bed`` whose start is above ``gene_start - slop`` and whose
    end is below ``gene_end + slop`` are selected. Their ``dmr_rate`` values are
    mean-imputed per column, scaled, reduced with PCA and grouped by Leiden
    clustering on a nearest-neighbour graph.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        The ``obs`` table of the AnnData object: one row per selected DMR, with
        the ``leiden`` cluster label added by ``sc.tl.leiden``.

    Raises
    ------
    ValueError
        If fewer than two DMRs fall inside the window, because nothing can be
        clustered in that case.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must lie strictly inside the slop-extended gene span
    in_window = (dmr_bed['start'] > gene_start - slop) & (dmr_bed['end'] < gene_end + slop)
    related_dmr = dmr_bed[in_window]
    n_dmr = related_dmr.shape[0]
    if n_dmr < 2:
        # Previously this surfaced as an opaque error from log2 / PCA further down.
        raise ValueError(
            f'{gene_id}: {n_dmr} DMR(s) found within {slop} of the gene; at least 2 are needed for clustering'
        )

    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    # Missing rates are replaced by the mean of the same column over the selected DMRs.
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())

    # construct Adata: observations are DMRs, variables are the selected clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # PCA may yield fewer components than n_pc when only a few DMRs are
    # available; asking the neighbour step for more PCs than exist raises.
    n_pcs_used = min(n_pc, adata.obsm['X_pca'].shape[1])
    # Neighbourhood size grows with log2 of the DMR count; never go below 2
    # (the rounded log2 is 1 when exactly two DMRs are selected).
    n_neighbors = max(2, int(round(np.log2(adata.shape[0]))))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs_used)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden cluster assignment returned by ``calculate_gene``
    and adds, for every DMR around the gene:

    * ``Corr``: the value from ``dmr_corr`` for this (DMR, gene) pair, 0 when absent;
    * ``reldist_tss``: DMR centre position relative to the gene, in units of
      gene length, measured from ``gene_start`` on the '+' strand and from
      ``gene_end`` otherwise;
    * ``in_gene_body``: whether ``reldist_tss`` lies strictly between 0 and 1;
    * boolean flags derived from ``dmr_annot`` (TE classes, intron / exon / UTR
      restricted to this gene's body, ``feDMR``, ``adultDMR``);
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>``
      columns taken from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, columns as listed above, joined on the DMR index.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # Correlation with this gene; DMRs without an entry in dmr_corr get 0.
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # Position of each DMR centre relative to the gene, oriented by strand.
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): these are positional slices into the annotation columns; they
    # assume the column order of DMRAnnotation.h5ad places the DNA / LINE / LTR /
    # SINE TE groups at exactly these positions -- confirm against the file.
    te_column_slices = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_slice in te_column_slices:
        te_columns = annot_df.columns[column_slice]
        # A DMR is flagged when any column of the group is non-zero.
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the annotation must be set AND the
    # DMR centre must fall inside this gene's body
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_feature_columns:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by Leiden label, missing values replaced by the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak: rename the columns before collecting the frame, like the other profiles
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    # Column-wise join on the DMR index.
    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [15]:
for gene in genes:
    print(gene)
    detail_path = output_dir / f'{gene}.DMR_detail.msg'
    check_path = output_dir / f'{gene}.DMR_cluster.msg'

    # The cluster summary is the last file written for a gene, so its
    # presence means the gene was already processed and can be skipped.
    if not check_path.exists():
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(detail_path, compress='zlib')

        # Per-cluster mean of every annotation column.
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000053025.13
ENSMUSG00000030657.11
ENSMUSG00000030583.16
ENSMUSG00000025813.14
ENSMUSG00000021217.7
ENSMUSG00000001741.12
ENSMUSG00000030970.16
ENSMUSG00000110230.1
ENSMUSG00000058145.16
ENSMUSG00000030862.13
ENSMUSG00000055723.6
ENSMUSG00000010476.14
ENSMUSG00000015709.9
ENSMUSG00000030691.15
ENSMUSG00000038156.16
ENSMUSG00000046321.8
ENSMUSG00000030839.12
ENSMUSG00000073856.11
ENSMUSG00000060402.7
ENSMUSG00000078591.1
ENSMUSG00000051910.13
ENSMUSG00000030518.17
ENSMUSG00000052889.11
ENSMUSG00000030638.13
ENSMUSG00000085899.1
ENSMUSG00000032743.15
ENSMUSG00000058420.8
ENSMUSG00000048078.16
ENSMUSG00000038244.14
ENSMUSG00000043456.16
ENSMUSG00000030500.6
ENSMUSG00000025790.14
ENSMUSG00000046027.17
ENSMUSG00000031024.13
ENSMUSG00000066189.9
ENSMUSG00000040125.4
ENSMUSG00000055489.8
ENSMUSG00000052353.13
ENSMUSG00000055407.14
ENSMUSG00000038296.14
ENSMUSG00000025475.17
ENSMUSG00000051527.11
ENSMUSG00000055320.17
ENSMUSG00000052707.8
ENSMUSG00000025789.9
ENSMUSG00000030930.14
ENS

ENSMUSG00000051900.12
ENSMUSG00000061702.10
ENSMUSG00000077176.1
ENSMUSG00000108351.1
ENSMUSG00000110248.1
ENSMUSG00000030834.7
ENSMUSG00000030447.15
ENSMUSG00000053714.8
ENSMUSG00000043366.16
ENSMUSG00000110135.1
ENSMUSG00000108608.1
ENSMUSG00000049685.8
ENSMUSG00000032777.9
ENSMUSG00000037974.16
ENSMUSG00000108910.1
ENSMUSG00000092364.2
ENSMUSG00000044997.3
ENSMUSG00000048782.15
ENSMUSG00000108753.1
ENSMUSG00000030782.16
ENSMUSG00000109460.1
ENSMUSG00000108719.1
ENSMUSG00000109149.1
ENSMUSG00000019194.15
ENSMUSG00000040046.14
ENSMUSG00000030499.9
ENSMUSG00000108389.1
ENSMUSG00000108803.1
ENSMUSG00000030786.18
ENSMUSG00000108652.1
ENSMUSG00000109831.1
ENSMUSG00000030745.9
ENSMUSG00000003863.18
ENSMUSG00000053338.9
ENSMUSG00000030704.14
ENSMUSG00000108905.1
ENSMUSG00000030515.9
ENSMUSG00000030663.12
ENSMUSG00000063089.4
ENSMUSG00000085751.1
ENSMUSG00000092652.1
ENSMUSG00000004508.6
ENSMUSG00000082684.1
ENSMUSG00000108678.1
ENSMUSG00000036578.7
ENSMUSG00000025139.14
ENSMUSG00000108730.1