In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Shared figure styling: small labels and thin strokes everywhere.
labelsize = 6
linewidth = 0.6
rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
for _tick in ('xtick', 'ytick'):
    rc(_tick, labelsize=labelsize)
    rc(f'{_tick}.major', width=linewidth)
    # minor ticks are drawn slightly thinner than major ticks
    rc(f'{_tick}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters; a later "# Parameters" cell may override them.
# Chromosome to process; used to subset every per-DMR / per-gene table.
chrom = 'chr2'
# Genes to process. Entries must be index labels of `gene_meta`, which is keyed
# by `gene_id`, so a gene symbol such as 'Lhx6' would raise KeyError in get_gene.
# NOTE(review): ENSMUSG00000026890 is believed to be Lhx6 — confirm against the
# gencode vM22 table.
genes = ['ENSMUSG00000026890.19']
# Flanking distance (bp) added on both sides of the gene when selecting DMRs.
slop = 250000
# Number of principal components passed to the neighbor-graph construction.
n_pc = 10
# Leiden clustering resolution.
resolution = 1

In [4]:
# Parameters
# NOTE(review): this cell looks tool-injected (e.g. by papermill) — confirm.
# Every name assigned here replaces the default of the same name set in the
# previous cell.
chrom = "chr2"
genes = ["ENSMUSG00000005089.15", "ENSMUSG00000027194.16", "ENSMUSG00000038860.15", "ENSMUSG00000027238.17", "ENSMUSG00000026787.3", "ENSMUSG00000026799.15", "ENSMUSG00000035226.5", "ENSMUSG00000026883.17", "ENSMUSG00000070880.10", "ENSMUSG00000092201.7", "ENSMUSG00000026827.12", "ENSMUSG00000042631.7", "ENSMUSG00000055485.6", "ENSMUSG00000039046.15", "ENSMUSG00000027210.20", "ENSMUSG00000027422.15", "ENSMUSG00000035513.19", "ENSMUSG00000050896.12", "ENSMUSG00000027347.18", "ENSMUSG00000068373.15", "ENSMUSG00000026932.14", "ENSMUSG00000009621.18", "ENSMUSG00000026872.18", "ENSMUSG00000074607.11", "ENSMUSG00000042359.18", "ENSMUSG00000027195.10", "ENSMUSG00000074736.10", "ENSMUSG00000027593.15", "ENSMUSG00000026970.16", "ENSMUSG00000087694.8", "ENSMUSG00000006800.14", "ENSMUSG00000040133.2", "ENSMUSG00000087125.1", "ENSMUSG00000026888.14", "ENSMUSG00000026824.11", "ENSMUSG00000026885.13", "ENSMUSG00000026950.18", "ENSMUSG00000016386.15", "ENSMUSG00000050211.14", "ENSMUSG00000027207.15", "ENSMUSG00000086353.1", "ENSMUSG00000027560.4", "ENSMUSG00000027217.13", "ENSMUSG00000026768.10", "ENSMUSG00000074575.4", "ENSMUSG00000040282.13", "ENSMUSG00000005102.13", "ENSMUSG00000027220.2", "ENSMUSG00000048482.14", "ENSMUSG00000050199.13", "ENSMUSG00000038400.15", "ENSMUSG00000058594.15", "ENSMUSG00000049744.15", "ENSMUSG00000038718.15", "ENSMUSG00000027016.17", "ENSMUSG00000049044.16", "ENSMUSG00000027641.13", "ENSMUSG00000027111.15", "ENSMUSG00000035168.16", "ENSMUSG00000037843.6", "ENSMUSG00000048038.8", "ENSMUSG00000027200.17", "ENSMUSG00000027544.16", "ENSMUSG00000085631.1", "ENSMUSG00000050600.5", "ENSMUSG00000027030.15", "ENSMUSG00000070866.4", "ENSMUSG00000027298.17", "ENSMUSG00000037754.13", "ENSMUSG00000037110.20", "ENSMUSG00000086994.1", "ENSMUSG00000027108.15", "ENSMUSG00000026737.12", "ENSMUSG00000048058.17", "ENSMUSG00000027204.13", "ENSMUSG00000026842.16", "ENSMUSG00000075028.11", "ENSMUSG00000035392.17", "ENSMUSG00000017929.13", "ENSMUSG00000084946.2", 
"ENSMUSG00000015846.14", "ENSMUSG00000039621.13", "ENSMUSG00000068040.10", "ENSMUSG00000075270.11", "ENSMUSG00000000247.11", "ENSMUSG00000027011.14", "ENSMUSG00000045967.11", "ENSMUSG00000026991.20", "ENSMUSG00000032724.5", "ENSMUSG00000085640.1", "ENSMUSG00000027087.11", "ENSMUSG00000026837.15", "ENSMUSG00000017412.15", "ENSMUSG00000096972.7", "ENSMUSG00000026796.16", "ENSMUSG00000027188.8", "ENSMUSG00000075020.6", "ENSMUSG00000026778.13", "ENSMUSG00000027335.9", "ENSMUSG00000003418.11", "ENSMUSG00000026727.10", "ENSMUSG00000044033.16", "ENSMUSG00000053166.14", "ENSMUSG00000009216.7", "ENSMUSG00000026773.19", "ENSMUSG00000087528.1", "ENSMUSG00000040479.11", "ENSMUSG00000050447.15", "ENSMUSG00000027397.14", "ENSMUSG00000040084.9", "ENSMUSG00000027577.14", "ENSMUSG00000033396.13", "ENSMUSG00000056486.18", "ENSMUSG00000026767.12", "ENSMUSG00000062646.12", "ENSMUSG00000047907.11", "ENSMUSG00000027333.18", "ENSMUSG00000027165.16", "ENSMUSG00000038467.15", "ENSMUSG00000027472.14", "ENSMUSG00000061411.12", "ENSMUSG00000041921.16", "ENSMUSG00000074923.10", "ENSMUSG00000079056.12", "ENSMUSG00000040506.16", "ENSMUSG00000055632.18", "ENSMUSG00000000305.12", "ENSMUSG00000027395.15", "ENSMUSG00000054510.5", "ENSMUSG00000057147.13", "ENSMUSG00000027344.14", "ENSMUSG00000038696.14", "ENSMUSG00000087539.1", "ENSMUSG00000026655.15", "ENSMUSG00000036890.13", "ENSMUSG00000064289.15", "ENSMUSG00000000392.17", "ENSMUSG00000087029.1", "ENSMUSG00000027327.16", "ENSMUSG00000092679.1", "ENSMUSG00000004085.14", "ENSMUSG00000038740.9", "ENSMUSG00000027187.10", "ENSMUSG00000027639.16", "ENSMUSG00000027489.15", "ENSMUSG00000091192.1", "ENSMUSG00000026765.12", "ENSMUSG00000026672.11", "ENSMUSG00000086496.1", "ENSMUSG00000058740.14", "ENSMUSG00000027010.16", "ENSMUSG00000026836.15", "ENSMUSG00000057182.15", "ENSMUSG00000010505.16", "ENSMUSG00000097770.1", "ENSMUSG00000026676.7", "ENSMUSG00000087185.1", "ENSMUSG00000027070.14", "ENSMUSG00000034906.15", "ENSMUSG00000079499.9", 
"ENSMUSG00000006494.11", "ENSMUSG00000009614.16", "ENSMUSG00000027583.13", "ENSMUSG00000075324.13", "ENSMUSG00000027293.13", "ENSMUSG00000035399.12", "ENSMUSG00000035033.15", "ENSMUSG00000027650.12", "ENSMUSG00000090625.1", "ENSMUSG00000017418.13", "ENSMUSG00000038085.13", "ENSMUSG00000027351.14", "ENSMUSG00000041911.3", "ENSMUSG00000041997.16", "ENSMUSG00000052155.5", "ENSMUSG00000027339.15", "ENSMUSG00000050556.9", "ENSMUSG00000103412.1", "ENSMUSG00000005882.18", "ENSMUSG00000026748.13", "ENSMUSG00000017144.8", "ENSMUSG00000033902.12", "ENSMUSG00000104459.1", "ENSMUSG00000026843.15", "ENSMUSG00000017861.11", "ENSMUSG00000027452.11", "ENSMUSG00000000889.8", "ENSMUSG00000085709.1", "ENSMUSG00000079110.11", "ENSMUSG00000026890.19", "ENSMUSG00000085431.7", "ENSMUSG00000026764.15", "ENSMUSG00000075538.2", "ENSMUSG00000087158.1", "ENSMUSG00000102286.1", "ENSMUSG00000068735.14", "ENSMUSG00000098312.1", "ENSMUSG00000027048.15", "ENSMUSG00000036053.17", "ENSMUSG00000027209.17", "ENSMUSG00000026726.10", "ENSMUSG00000027180.12", "ENSMUSG00000102674.1", "ENSMUSG00000059842.12", "ENSMUSG00000064329.13", "ENSMUSG00000027270.14", "ENSMUSG00000027624.19", "ENSMUSG00000074673.15", "ENSMUSG00000085439.1", "ENSMUSG00000026994.9", "ENSMUSG00000085261.1", "ENSMUSG00000027296.7", "ENSMUSG00000085222.1", "ENSMUSG00000045624.15", "ENSMUSG00000079502.8", "ENSMUSG00000075415.13", "ENSMUSG00000017943.15", "ENSMUSG00000004113.18", "ENSMUSG00000036040.14", "ENSMUSG00000054074.9", "ENSMUSG00000086544.1", "ENSMUSG00000101448.1", "ENSMUSG00000026784.14", "ENSMUSG00000027221.5", "ENSMUSG00000049630.6", "ENSMUSG00000027363.15", "ENSMUSG00000074758.10", "ENSMUSG00000086509.2", "ENSMUSG00000087473.7", "ENSMUSG00000085417.1", "ENSMUSG00000027007.16", "ENSMUSG00000048186.14", "ENSMUSG00000027364.14", "ENSMUSG00000026896.14", "ENSMUSG00000085335.1", "ENSMUSG00000027071.14", "ENSMUSG00000080880.1", "ENSMUSG00000026832.12", "ENSMUSG00000086831.1", "ENSMUSG00000061689.15", "ENSMUSG00000026750.6", 
"ENSMUSG00000026806.15", "ENSMUSG00000027655.14", "ENSMUSG00000086756.1", "ENSMUSG00000039844.19", "ENSMUSG00000017858.12", "ENSMUSG00000026730.12", "ENSMUSG00000023206.16", "ENSMUSG00000036202.16", "ENSMUSG00000027568.9", "ENSMUSG00000027075.16", "ENSMUSG00000032841.15", "ENSMUSG00000037307.11", "ENSMUSG00000053552.14", "ENSMUSG00000015932.8", "ENSMUSG00000026826.13", "ENSMUSG00000074766.10", "ENSMUSG00000100197.1", "ENSMUSG00000025782.12", "ENSMUSG00000026834.13", "ENSMUSG00000085014.1", "ENSMUSG00000054455.5", "ENSMUSG00000097774.1", "ENSMUSG00000027009.18", "ENSMUSG00000027340.15", "ENSMUSG00000036327.18", "ENSMUSG00000104106.1", "ENSMUSG00000027168.21", "ENSMUSG00000035268.14", "ENSMUSG00000025314.16", "ENSMUSG00000027074.14", "ENSMUSG00000034738.8", "ENSMUSG00000027634.14", "ENSMUSG00000079604.9", "ENSMUSG00000084908.2", "ENSMUSG00000027022.13", "ENSMUSG00000027523.20", "ENSMUSG00000091457.1", "ENSMUSG00000016356.18", "ENSMUSG00000044647.16", "ENSMUSG00000098004.1", "ENSMUSG00000057234.9", "ENSMUSG00000026788.13", "ENSMUSG00000038572.7", "ENSMUSG00000026712.3", "ENSMUSG00000075316.11", "ENSMUSG00000104277.1", "ENSMUSG00000085485.1", "ENSMUSG00000037279.13", "ENSMUSG00000103413.1", "ENSMUSG00000027401.9", "ENSMUSG00000027014.14", "ENSMUSG00000085135.7", "ENSMUSG00000034683.12", "ENSMUSG00000040591.18", "ENSMUSG00000085889.1", "ENSMUSG00000027378.16", "ENSMUSG00000027276.7", "ENSMUSG00000033955.13", "ENSMUSG00000085427.7", "ENSMUSG00000082010.3", "ENSMUSG00000084829.7", "ENSMUSG00000044991.10", "ENSMUSG00000060332.9", "ENSMUSG00000086874.1", "ENSMUSG00000085938.1", "ENSMUSG00000050530.14", "ENSMUSG00000039128.13", "ENSMUSG00000087301.7", "ENSMUSG00000026988.15", "ENSMUSG00000026786.14", "ENSMUSG00000027332.11", "ENSMUSG00000027381.16", "ENSMUSG00000007659.18", "ENSMUSG00000075224.10", "ENSMUSG00000027375.14", "ENSMUSG00000026976.15", "ENSMUSG00000032698.15", "ENSMUSG00000026807.8", "ENSMUSG00000027525.17", "ENSMUSG00000026866.16", "ENSMUSG00000039356.15", 
"ENSMUSG00000085862.2", "ENSMUSG00000023094.14", "ENSMUSG00000084798.1", "ENSMUSG00000026821.16", "ENSMUSG00000075376.10", "ENSMUSG00000085001.1", "ENSMUSG00000048486.6", "ENSMUSG00000056738.11", "ENSMUSG00000035829.13", "ENSMUSG00000087086.7", "ENSMUSG00000086449.1", "ENSMUSG00000085357.1", "ENSMUSG00000017817.11", "ENSMUSG00000035576.13", "ENSMUSG00000074627.11", "ENSMUSG00000044083.12", "ENSMUSG00000027469.16", "ENSMUSG00000034848.17", "ENSMUSG00000039050.8", "ENSMUSG00000027208.14", "ENSMUSG00000027495.4", "ENSMUSG00000036833.16", "ENSMUSG00000085680.1", "ENSMUSG00000033256.14", "ENSMUSG00000034903.18", "ENSMUSG00000003283.14", "ENSMUSG00000027297.14", "ENSMUSG00000026740.12", "ENSMUSG00000069495.12", "ENSMUSG00000026770.5", "ENSMUSG00000062661.6", "ENSMUSG00000042670.5", "ENSMUSG00000086308.1", "ENSMUSG00000027496.15", "ENSMUSG00000087617.1", "ENSMUSG00000087226.1", "ENSMUSG00000000876.11", "ENSMUSG00000036949.16", "ENSMUSG00000085141.1", "ENSMUSG00000086358.2", "ENSMUSG00000086166.1", "ENSMUSG00000023236.7", "ENSMUSG00000026874.10", "ENSMUSG00000051379.12", "ENSMUSG00000015787.15", "ENSMUSG00000026718.17", "ENSMUSG00000027485.15", "ENSMUSG00000085247.1", "ENSMUSG00000085057.7", "ENSMUSG00000042410.16", "ENSMUSG00000069132.3", "ENSMUSG00000026942.13", "ENSMUSG00000068966.10", "ENSMUSG00000074981.13", "ENSMUSG00000027394.13", "ENSMUSG00000043241.14", "ENSMUSG00000026828.11", "ENSMUSG00000027227.7", "ENSMUSG00000074749.10", "ENSMUSG00000027642.15", "ENSMUSG00000085483.1", "ENSMUSG00000100303.3", "ENSMUSG00000039117.16", "ENSMUSG00000074753.5", "ENSMUSG00000027546.15", "ENSMUSG00000109071.1", "ENSMUSG00000027522.15", "ENSMUSG00000027678.17", "ENSMUSG00000086669.1", "ENSMUSG00000085582.7", "ENSMUSG00000055612.15", "ENSMUSG00000040093.15", "ENSMUSG00000075217.1", "ENSMUSG00000027202.12", "ENSMUSG00000041762.16", "ENSMUSG00000085950.1", "ENSMUSG00000026869.12", "ENSMUSG00000085845.7", "ENSMUSG00000057133.14", "ENSMUSG00000001999.15", "ENSMUSG00000018459.15", 
"ENSMUSG00000035236.17", "ENSMUSG00000002731.6", "ENSMUSG00000087549.1", "ENSMUSG00000086418.1", "ENSMUSG00000026840.14", "ENSMUSG00000085008.1", "ENSMUSG00000086450.1", "ENSMUSG00000040152.8", "ENSMUSG00000053675.8", "ENSMUSG00000037259.15", "ENSMUSG00000026841.7", "ENSMUSG00000087309.1", "ENSMUSG00000039155.15", "ENSMUSG00000027166.14", "ENSMUSG00000035778.17", "ENSMUSG00000074783.10", "ENSMUSG00000074628.9", "ENSMUSG00000018209.15", "ENSMUSG00000027198.16", "ENSMUSG00000040174.14", "ENSMUSG00000085252.1", "ENSMUSG00000100046.1", "ENSMUSG00000075027.10", "ENSMUSG00000037820.15", "ENSMUSG00000114963.1", "ENSMUSG00000027651.16", "ENSMUSG00000106994.1", "ENSMUSG00000040061.17", "ENSMUSG00000078998.3", "ENSMUSG00000027359.16", "ENSMUSG00000055897.13", "ENSMUSG00000027012.15", "ENSMUSG00000083674.3", "ENSMUSG00000061136.14", "ENSMUSG00000084840.1", "ENSMUSG00000027177.14", "ENSMUSG00000027605.18", "ENSMUSG00000084791.1", "ENSMUSG00000038685.18", "ENSMUSG00000100135.1", "ENSMUSG00000026894.4", "ENSMUSG00000087467.1", "ENSMUSG00000025815.13", "ENSMUSG00000026904.17", "ENSMUSG00000027173.7", "ENSMUSG00000026645.11", "ENSMUSG00000085089.1", "ENSMUSG00000027551.15", "ENSMUSG00000009214.9", "ENSMUSG00000086954.1", "ENSMUSG00000074994.5", "ENSMUSG00000086379.7", "ENSMUSG00000026792.16", "ENSMUSG00000015839.6", "ENSMUSG00000076441.9", "ENSMUSG00000087061.1", "ENSMUSG00000005973.6", "ENSMUSG00000040035.14", "ENSMUSG00000026915.16", "ENSMUSG00000027481.2", "ENSMUSG00000038765.13", "ENSMUSG00000026774.11", "ENSMUSG00000085448.1", "ENSMUSG00000075062.4", "ENSMUSG00000027273.13", "ENSMUSG00000086980.1", "ENSMUSG00000086819.1", "ENSMUSG00000027425.18", "ENSMUSG00000086447.2", "ENSMUSG00000036249.16", "ENSMUSG00000068079.5", "ENSMUSG00000038831.16", "ENSMUSG00000085591.1", "ENSMUSG00000050558.13", "ENSMUSG00000027524.9", "ENSMUSG00000027244.14", "ENSMUSG00000070883.3", "ENSMUSG00000044349.15", "ENSMUSG00000027488.12", "ENSMUSG00000086141.7", "ENSMUSG00000036924.3", 
"ENSMUSG00000082315.2", "ENSMUSG00000027475.9", "ENSMUSG00000084785.1", "ENSMUSG00000059013.12", "ENSMUSG00000084919.1", "ENSMUSG00000084839.1", "ENSMUSG00000027291.15", "ENSMUSG00000068615.4", "ENSMUSG00000107955.1", "ENSMUSG00000086735.1", "ENSMUSG00000038523.10", "ENSMUSG00000027346.15", "ENSMUSG00000035000.8", "ENSMUSG00000026983.10", "ENSMUSG00000063972.13", "ENSMUSG00000059588.13", "ENSMUSG00000085852.1", "ENSMUSG00000080846.1", "ENSMUSG00000103662.1", "ENSMUSG00000107586.1", "ENSMUSG00000026980.15", "ENSMUSG00000082879.1", "ENSMUSG00000033808.16", "ENSMUSG00000039033.11", "ENSMUSG00000050592.8", "ENSMUSG00000025839.3", "ENSMUSG00000086872.1", "ENSMUSG00000027243.8", "ENSMUSG00000103747.1", "ENSMUSG00000097843.1", "ENSMUSG00000027345.6", "ENSMUSG00000026971.15", "ENSMUSG00000039063.5", "ENSMUSG00000039263.16", "ENSMUSG00000048911.15", "ENSMUSG00000084901.6", "ENSMUSG00000027429.16", "ENSMUSG00000027570.15", "ENSMUSG00000074665.10", "ENSMUSG00000000194.13", "ENSMUSG00000053475.5", "ENSMUSG00000054580.14", "ENSMUSG00000013465.19", "ENSMUSG00000084826.7", "ENSMUSG00000085428.1", "ENSMUSG00000095332.2", "ENSMUSG00000027163.13", "ENSMUSG00000082457.1", "ENSMUSG00000015647.9", "ENSMUSG00000026999.14", "ENSMUSG00000068009.11", "ENSMUSG00000086007.1", "ENSMUSG00000067998.11", "ENSMUSG00000107351.1", "ENSMUSG00000027509.11", "ENSMUSG00000070476.10", "ENSMUSG00000086177.1", "ENSMUSG00000040687.16", "ENSMUSG00000085504.1", "ENSMUSG00000034701.9", "ENSMUSG00000000826.16", "ENSMUSG00000074899.6", "ENSMUSG00000040549.16", "ENSMUSG00000085958.1", "ENSMUSG00000017740.17", "ENSMUSG00000040495.4", "ENSMUSG00000027540.13", "ENSMUSG00000062319.2", "ENSMUSG00000027420.13", "ENSMUSG00000035877.17", "ENSMUSG00000026879.14", "ENSMUSG00000075284.10", "ENSMUSG00000027001.10", "ENSMUSG00000032046.15", "ENSMUSG00000087355.7", "ENSMUSG00000026960.6", "ENSMUSG00000045838.8", "ENSMUSG00000087196.1", "ENSMUSG00000017670.16", "ENSMUSG00000045339.7", "ENSMUSG00000046085.7", 
"ENSMUSG00000061531.8", "ENSMUSG00000102258.1", "ENSMUSG00000085257.2", "ENSMUSG00000074570.14", "ENSMUSG00000104153.1", "ENSMUSG00000086908.1", "ENSMUSG00000099161.1", "ENSMUSG00000061809.8", "ENSMUSG00000057738.13", "ENSMUSG00000027253.15", "ENSMUSG00000027015.4", "ENSMUSG00000087515.1", "ENSMUSG00000032852.3", "ENSMUSG00000075314.4", "ENSMUSG00000075249.12", "ENSMUSG00000027360.5", "ENSMUSG00000098575.1", "ENSMUSG00000085496.1", "ENSMUSG00000027353.14", "ENSMUSG00000086840.1", "ENSMUSG00000079324.11", "ENSMUSG00000079103.3", "ENSMUSG00000027502.11", "ENSMUSG00000078137.1", "ENSMUSG00000094747.3"]
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# Per-gene results are written into a directory named after the chromosome,
# relative to the current working directory; an existing directory is reused.
output_dir = pathlib.Path(chrom)
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# NOTE(review): pd.read_msgpack is believed to be unavailable from pandas 1.0 on —
# confirm the pinned pandas version for this notebook.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Keep the sub-types of the 'Exc' and 'Inh' cell classes, drop any whose name
# contains 'Outlier', and replace spaces with underscores.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
neuron_subtypes = cell_tidy_data.loc[neuron_mask, 'SubType'].unique()
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in neuron_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# Row labels carry the chromosome as a 'Sub<chrom>_' prefix; keep only this chromosome.
atac_on_chrom = atac_peak.index.str.startswith(f'Sub{chrom}_')
atac_peak = atac_peak.loc[atac_on_chrom].copy()

## Gene Info

In [8]:
# Gene annotation table keyed by gene_id, limited to the chromosome being processed.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
).loc[lambda table: table['chrom'] == chrom].copy()

In [9]:
# Header-less, tab-separated exon table; column names are assigned after loading.
exon_bed_columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
)
exon_bed.columns = exon_bed_columns

## DMR Info

In [10]:
# --- 'Rate' table from the DMR HDF store, restricted to this chromosome ---
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
rate_on_chrom = dmr_rate.index.str.startswith(f'Sub{chrom}_')
dmr_rate = dmr_rate.loc[rate_on_chrom].copy()

# --- DMR-gene correlation table, indexed by (DMR, Gene), same DMRs only ---
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
corr_dmr_ids = dmr_corr.index.get_level_values('DMR')
dmr_corr = dmr_corr.loc[corr_dmr_ids.isin(dmr_rate.index)].copy()

# --- DMR coordinates; the 4th BED column becomes the index ---
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    header=None,
    index_col=3,
    sep='\t',
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'] == chrom].copy()

# --- HypoDMR hits matrix: rows = DMRs on this chromosome, columns = selected clusters ---
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# --- DMR annotation matrix for the same DMRs ---
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of DMRs (rows of the hits matrix) kept for this chromosome.
dmr_hits.n_obs

356755

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)`` taken from the matching row.
    """
    record = gene_meta.loc[gene_id]
    fields = [record[column] for column in ('chrom', 'start', 'end', 'strand')]
    return (record.name, *fields)

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs surrounding one gene by their methylation-rate profile.

    DMRs lying entirely within ``slop`` bp of the gene (``start`` after
    ``gene_start - slop`` and ``end`` before ``gene_end + slop``) are selected.
    Their rates across ``use_clusters`` are scaled, reduced with PCA and grouped
    by Leiden clustering on a nearest-neighbor graph.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: one row per selected DMR, carrying the cluster label
        written by ``sc.tl.leiden``.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs; only their index is needed below
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]

    # missing rates are replaced by the mean of the same column (cluster)
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())

    # construct Adata: observations are DMRs, variables are cell clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    The DMRs around the gene are clustered with ``calculate_gene`` and each DMR
    is then annotated with:

    * ``Corr``: its correlation with this gene from ``dmr_corr`` (0 when absent);
    * ``reldist_tss`` / ``in_gene_body``: strand-aware position of the DMR center
      relative to the gene, in units of gene length;
    * ``is_*_te``, ``in_intron`` / ``in_exon`` / ``in_utr3`` / ``in_utr5``,
      ``feDMR`` and ``adultDMR`` flags derived from ``dmr_annot``;
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>``
      columns taken from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR; all annotation blocks concatenated column-wise.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation for this gene; DMRs without a record get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # relative distance of the DMR center to the gene's 5' end, measured along
    # the transcription direction: 0 = gene start, 1 = gene end
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the TE families are addressed by column position, so this
    # assumes a fixed column order in dmr_annot — confirm against the h5ad file.
    te_column_ranges = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag, column_range in te_column_ranges:
        te_columns = annot_df.columns[column_range]
        gene_cluster[flag] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the feature flag only counts when the
    # DMR center also falls inside this gene's body
    gene_features = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag, feature in gene_features:
        gene_cluster[flag] = annot_df[feature].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    for flag in ('feDMR', 'adultDMR'):
        gene_cluster[flag] = annot_df[flag].astype(bool)

    other_profiles = []
    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by leiden cluster, NaN replaced by the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [None]:
# Annotate every requested gene. The per-cluster summary is written last, so
# its presence marks a gene as finished and lets a re-run skip it.
for gene in genes:
    print(gene)
    cluster_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not cluster_path.exists():
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # mean of every annotation column within each leiden cluster
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(cluster_path, compress='zlib')

ENSMUSG00000005089.15
ENSMUSG00000027194.16
ENSMUSG00000038860.15
ENSMUSG00000027238.17
ENSMUSG00000026787.3
ENSMUSG00000026799.15
ENSMUSG00000035226.5
ENSMUSG00000026883.17
ENSMUSG00000070880.10
ENSMUSG00000092201.7
ENSMUSG00000026827.12
ENSMUSG00000042631.7
ENSMUSG00000055485.6
ENSMUSG00000039046.15
ENSMUSG00000027210.20
ENSMUSG00000027422.15
ENSMUSG00000035513.19
ENSMUSG00000050896.12
ENSMUSG00000027347.18
ENSMUSG00000068373.15
ENSMUSG00000026932.14
ENSMUSG00000009621.18
ENSMUSG00000026872.18
ENSMUSG00000074607.11
ENSMUSG00000042359.18
ENSMUSG00000027195.10
ENSMUSG00000074736.10
ENSMUSG00000027593.15
ENSMUSG00000026970.16
ENSMUSG00000087694.8
ENSMUSG00000006800.14
ENSMUSG00000040133.2
ENSMUSG00000087125.1
ENSMUSG00000026888.14
ENSMUSG00000026824.11
ENSMUSG00000026885.13
ENSMUSG00000026950.18
ENSMUSG00000016386.15
ENSMUSG00000050211.14
ENSMUSG00000027207.15
ENSMUSG00000086353.1
ENSMUSG00000027560.4
ENSMUSG00000027217.13
ENSMUSG00000026768.10
ENSMUSG00000074575.4
ENSMUSG00000040282.13