In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical warnings
# raised by the libraries used in this notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Shared figure styling: one label size and one stroke width for everything.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)

# Both tick axes receive identical settings; minor ticks are drawn 0.2 thinner
# than major ticks.
for _tick_axis in ('xtick', 'ytick'):
    rc(_tick_axis, labelsize=labelsize)
    rc(f'{_tick_axis}.major', width=linewidth)
    rc(f'{_tick_axis}.minor', width=linewidth - 0.2)

In [3]:
# Default run parameters (re-assigned by the "# Parameters" cell that follows).
chrom = "chr2"      # chromosome whose tables are loaded and filtered
genes = ["Lhx6"]    # genes to process; each one is looked up in the gene_meta index
slop = 250_000      # margin added on each side of a gene when selecting DMRs (presumably bp)
n_pc = 10           # number of principal components used for the neighbour graph
resolution = 1      # resolution passed to the Leiden clustering

In [4]:
# Parameters
# NOTE(review): this cell re-assigns the defaults from the previous cell; it looks
# like it was injected by a parameterised-run tool (e.g. papermill) — confirm.
chrom = "chr5"
genes = ["ENSMUSG00000029705.17", "ENSMUSG00000086220.1", "ENSMUSG00000029094.12", "ENSMUSG00000023079.14", "ENSMUSG00000037736.19", "ENSMUSG00000060961.15", "ENSMUSG00000090061.9", "ENSMUSG00000089809.9", "ENSMUSG00000033316.14", "ENSMUSG00000039706.11", "ENSMUSG00000034040.16", "ENSMUSG00000040407.18", "ENSMUSG00000056004.16", "ENSMUSG00000029168.14", "ENSMUSG00000003974.6", "ENSMUSG00000051339.10", "ENSMUSG00000029287.14", "ENSMUSG00000036377.18", "ENSMUSG00000028944.14", "ENSMUSG00000028926.15", "ENSMUSG00000043635.12", "ENSMUSG00000029174.18", "ENSMUSG00000040118.15", "ENSMUSG00000037999.13", "ENSMUSG00000029554.15", "ENSMUSG00000038319.14", "ENSMUSG00000029189.10", "ENSMUSG00000029560.12", "ENSMUSG00000029516.19", "ENSMUSG00000104369.1", "ENSMUSG00000033209.17", "ENSMUSG00000106379.1", "ENSMUSG00000042249.11", "ENSMUSG00000029093.14", "ENSMUSG00000029101.16", "ENSMUSG00000039156.19", "ENSMUSG00000029636.11", "ENSMUSG00000029673.17", "ENSMUSG00000029245.16", "ENSMUSG00000056367.14", "ENSMUSG00000037795.14", "ENSMUSG00000097207.7", "ENSMUSG00000000560.9", "ENSMUSG00000029167.13", "ENSMUSG00000050640.16", "ENSMUSG00000092060.4", "ENSMUSG00000034687.8", "ENSMUSG00000042589.18", "ENSMUSG00000029361.18", "ENSMUSG00000045078.12", "ENSMUSG00000058153.15", "ENSMUSG00000072720.9", "ENSMUSG00000037224.15", "ENSMUSG00000029207.16", "ENSMUSG00000029420.13", "ENSMUSG00000037605.16", "ENSMUSG00000060371.12", "ENSMUSG00000029123.8", "ENSMUSG00000061576.15", "ENSMUSG00000104894.4", "ENSMUSG00000018001.18", "ENSMUSG00000029178.14", "ENSMUSG00000004530.12", "ENSMUSG00000005672.12", "ENSMUSG00000077437.1", "ENSMUSG00000063919.11", "ENSMUSG00000037685.15", "ENSMUSG00000005107.13", "ENSMUSG00000029674.13", "ENSMUSG00000029092.9", "ENSMUSG00000063430.9", "ENSMUSG00000070576.4", "ENSMUSG00000034663.13", "ENSMUSG00000097726.7", "ENSMUSG00000029088.16", "ENSMUSG00000018076.12", "ENSMUSG00000032867.8", "ENSMUSG00000070498.3", "ENSMUSG00000042453.14", "ENSMUSG00000029608.10", 
"ENSMUSG00000029406.15", "ENSMUSG00000052139.18", "ENSMUSG00000029338.13", "ENSMUSG00000050248.11", "ENSMUSG00000048988.8", "ENSMUSG00000016520.7", "ENSMUSG00000039683.16", "ENSMUSG00000102415.1", "ENSMUSG00000034310.8", "ENSMUSG00000029330.8", "ENSMUSG00000029095.17", "ENSMUSG00000085673.1", "ENSMUSG00000011884.13", "ENSMUSG00000029228.15", "ENSMUSG00000015653.13", "ENSMUSG00000034118.15", "ENSMUSG00000039959.13", "ENSMUSG00000106892.1", "ENSMUSG00000028780.13", "ENSMUSG00000107132.1", "ENSMUSG00000041930.7", "ENSMUSG00000089743.7", "ENSMUSG00000023353.14", "ENSMUSG00000029108.14", "ENSMUSG00000104781.1", "ENSMUSG00000029602.11", "ENSMUSG00000028883.17", "ENSMUSG00000046808.17", "ENSMUSG00000029154.13", "ENSMUSG00000062110.14", "ENSMUSG00000040003.18", "ENSMUSG00000025855.13", "ENSMUSG00000014932.15", "ENSMUSG00000058013.11", "ENSMUSG00000105579.2", "ENSMUSG00000015882.18", "ENSMUSG00000016128.14", "ENSMUSG00000029136.9", "ENSMUSG00000029475.17", "ENSMUSG00000039252.11", "ENSMUSG00000029191.16", "ENSMUSG00000041638.18", "ENSMUSG00000097145.5", "ENSMUSG00000029651.16", "ENSMUSG00000029727.7", "ENSMUSG00000097767.8", "ENSMUSG00000029206.13", "ENSMUSG00000047881.14", "ENSMUSG00000037653.7", "ENSMUSG00000029594.12", "ENSMUSG00000029090.12", "ENSMUSG00000054920.12", "ENSMUSG00000048271.14", "ENSMUSG00000025854.15", "ENSMUSG00000063531.7", "ENSMUSG00000049265.7", "ENSMUSG00000040537.17", "ENSMUSG00000106659.1", "ENSMUSG00000029217.16", "ENSMUSG00000000149.10", "ENSMUSG00000085915.1", "ENSMUSG00000040473.15", "ENSMUSG00000034981.9", "ENSMUSG00000029426.8", "ENSMUSG00000106479.1", "ENSMUSG00000029086.15", "ENSMUSG00000063646.18", "ENSMUSG00000105617.4", "ENSMUSG00000029125.14", "ENSMUSG00000066551.12", "ENSMUSG00000054892.14", "ENSMUSG00000040274.11", "ENSMUSG00000041890.17", "ENSMUSG00000106918.3", "ENSMUSG00000097516.7", "ENSMUSG00000097626.5", "ENSMUSG00000041609.16", "ENSMUSG00000039191.12", "ENSMUSG00000051022.7", "ENSMUSG00000050017.11", "ENSMUSG00000057315.14", 
"ENSMUSG00000029270.10", "ENSMUSG00000042121.16", "ENSMUSG00000028995.14", "ENSMUSG00000039860.19", "ENSMUSG00000051950.10", "ENSMUSG00000029313.18", "ENSMUSG00000045790.10", "ENSMUSG00000029428.13", "ENSMUSG00000097986.2", "ENSMUSG00000051246.3", "ENSMUSG00000029212.11", "ENSMUSG00000032850.16", "ENSMUSG00000046079.16", "ENSMUSG00000005220.10", "ENSMUSG00000042817.15", "ENSMUSG00000029376.8", "ENSMUSG00000041298.15", "ENSMUSG00000085376.1", "ENSMUSG00000054814.14", "ENSMUSG00000029134.14", "ENSMUSG00000029211.11", "ENSMUSG00000043323.17", "ENSMUSG00000036403.15", "ENSMUSG00000029576.17", "ENSMUSG00000053553.11", "ENSMUSG00000033805.12", "ENSMUSG00000056310.14", "ENSMUSG00000107314.1", "ENSMUSG00000105020.1", "ENSMUSG00000039782.14", "ENSMUSG00000029482.4", "ENSMUSG00000029185.14", "ENSMUSG00000097640.7", "ENSMUSG00000056493.9", "ENSMUSG00000057614.6", "ENSMUSG00000049387.10", "ENSMUSG00000085237.2", "ENSMUSG00000029334.14", "ENSMUSG00000031558.15", "ENSMUSG00000039533.8", "ENSMUSG00000029309.7", "ENSMUSG00000061535.11", "ENSMUSG00000044716.12", "ENSMUSG00000029528.19", "ENSMUSG00000034324.16", "ENSMUSG00000106909.1", "ENSMUSG00000085720.8", "ENSMUSG00000041870.16", "ENSMUSG00000039765.15", "ENSMUSG00000087606.1", "ENSMUSG00000108207.2", "ENSMUSG00000010721.15", "ENSMUSG00000029471.13", "ENSMUSG00000086259.7", "ENSMUSG00000001098.15", "ENSMUSG00000029658.17", "ENSMUSG00000005103.12", "ENSMUSG00000048520.16", "ENSMUSG00000061578.8", "ENSMUSG00000015879.8", "ENSMUSG00000056602.11", "ENSMUSG00000117286.1", "ENSMUSG00000105773.1", "ENSMUSG00000038780.14", "ENSMUSG00000090558.1", "ENSMUSG00000039315.15", "ENSMUSG00000029135.10", "ENSMUSG00000029413.14", "ENSMUSG00000106560.1", "ENSMUSG00000029436.9", "ENSMUSG00000082692.2", "ENSMUSG00000039747.11", "ENSMUSG00000042184.11", "ENSMUSG00000038770.17", "ENSMUSG00000086729.1", "ENSMUSG00000085271.7", "ENSMUSG00000079173.11", "ENSMUSG00000039477.16", "ENSMUSG00000057816.7", "ENSMUSG00000044017.16", "ENSMUSG00000105423.4", 
"ENSMUSG00000048578.11", "ENSMUSG00000029642.11", "ENSMUSG00000106749.1", "ENSMUSG00000104866.1", "ENSMUSG00000045102.12", "ENSMUSG00000073152.4", "ENSMUSG00000086964.1", "ENSMUSG00000046709.17", "ENSMUSG00000107011.1", "ENSMUSG00000029410.11", "ENSMUSG00000002486.15", "ENSMUSG00000036526.8", "ENSMUSG00000105224.1", "ENSMUSG00000029335.5", "ENSMUSG00000029408.13", "ENSMUSG00000085839.1", "ENSMUSG00000013629.16", "ENSMUSG00000036639.12", "ENSMUSG00000032741.9", "ENSMUSG00000038564.11", "ENSMUSG00000029171.12", "ENSMUSG00000085234.1", "ENSMUSG00000034573.14", "ENSMUSG00000072889.9", "ENSMUSG00000052848.7", "ENSMUSG00000029153.11", "ENSMUSG00000039474.13", "ENSMUSG00000105068.1", "ENSMUSG00000107078.1", "ENSMUSG00000105985.1", "ENSMUSG00000029337.2", "ENSMUSG00000029513.14", "ENSMUSG00000007207.10", "ENSMUSG00000106656.1", "ENSMUSG00000046798.14", "ENSMUSG00000029012.11", "ENSMUSG00000105157.1", "ENSMUSG00000106643.1", "ENSMUSG00000036928.14", "ENSMUSG00000044968.16", "ENSMUSG00000054720.12", "ENSMUSG00000097216.3", "ENSMUSG00000029370.10", "ENSMUSG00000076320.1", "ENSMUSG00000105105.1", "ENSMUSG00000072722.2", "ENSMUSG00000005057.13", "ENSMUSG00000036817.14", "ENSMUSG00000029442.18", "ENSMUSG00000067206.3", "ENSMUSG00000106292.1", "ENSMUSG00000029359.13", "ENSMUSG00000001687.15", "ENSMUSG00000073102.7", "ENSMUSG00000049907.8", "ENSMUSG00000029381.16", "ENSMUSG00000025856.15", "ENSMUSG00000007080.14", "ENSMUSG00000043913.14", "ENSMUSG00000072573.2", "ENSMUSG00000068082.12", "ENSMUSG00000048215.14", "ENSMUSG00000056413.16", "ENSMUSG00000051674.15", "ENSMUSG00000035946.7", "ENSMUSG00000061755.10", "ENSMUSG00000028949.13", "ENSMUSG00000029439.14", "ENSMUSG00000104886.1", "ENSMUSG00000028970.9", "ENSMUSG00000041147.10", "ENSMUSG00000029122.11", "ENSMUSG00000059631.6", "ENSMUSG00000106543.1", "ENSMUSG00000029314.14", "ENSMUSG00000040464.15", "ENSMUSG00000049606.16", "ENSMUSG00000106741.3", "ENSMUSG00000045294.11", "ENSMUSG00000099047.1", "ENSMUSG00000062604.11", 
"ENSMUSG00000029544.16", "ENSMUSG00000104753.1", "ENSMUSG00000043410.16", "ENSMUSG00000015942.9", "ENSMUSG00000062372.13", "ENSMUSG00000029130.12", "ENSMUSG00000041264.16", "ENSMUSG00000043430.5", "ENSMUSG00000090220.1", "ENSMUSG00000104717.1", "ENSMUSG00000105071.1", "ENSMUSG00000029468.17", "ENSMUSG00000106863.3", "ENSMUSG00000099605.1", "ENSMUSG00000070639.5", "ENSMUSG00000004415.15", "ENSMUSG00000105648.1", "ENSMUSG00000093400.1", "ENSMUSG00000041132.11", "ENSMUSG00000038970.7", "ENSMUSG00000046562.5", "ENSMUSG00000029659.16", "ENSMUSG00000107040.1", "ENSMUSG00000061601.15", "ENSMUSG00000039682.12", "ENSMUSG00000033773.14", "ENSMUSG00000105890.1", "ENSMUSG00000028864.7", "ENSMUSG00000105320.1", "ENSMUSG00000085971.4", "ENSMUSG00000070690.7", "ENSMUSG00000029205.12", "ENSMUSG00000041313.14", "ENSMUSG00000106570.1", "ENSMUSG00000079362.5", "ENSMUSG00000060512.7", "ENSMUSG00000104800.1", "ENSMUSG00000025745.12", "ENSMUSG00000087290.2", "ENSMUSG00000037108.13", "ENSMUSG00000098230.7", "ENSMUSG00000043733.14", "ENSMUSG00000105543.1", "ENSMUSG00000025857.10", "ENSMUSG00000105311.1", "ENSMUSG00000029470.15", "ENSMUSG00000104990.1", "ENSMUSG00000028954.11", "ENSMUSG00000098599.1", "ENSMUSG00000105274.1", "ENSMUSG00000079215.8", "ENSMUSG00000105439.1", "ENSMUSG00000106710.1", "ENSMUSG00000039987.15", "ENSMUSG00000061979.8", "ENSMUSG00000029344.14", "ENSMUSG00000037936.15", "ENSMUSG00000036718.17", "ENSMUSG00000087248.7", "ENSMUSG00000097191.2", "ENSMUSG00000000915.15", "ENSMUSG00000067724.5", "ENSMUSG00000034842.16", "ENSMUSG00000053214.7", "ENSMUSG00000105861.1", "ENSMUSG00000038582.15", "ENSMUSG00000028978.12", "ENSMUSG00000070733.13", "ENSMUSG00000029455.14", "ENSMUSG00000046245.13", "ENSMUSG00000109713.1", "ENSMUSG00000067220.12", "ENSMUSG00000034219.15", "ENSMUSG00000093479.1", "ENSMUSG00000029722.15", "ENSMUSG00000105282.1", "ENSMUSG00000029414.11", "ENSMUSG00000085919.1", "ENSMUSG00000092094.5", "ENSMUSG00000039883.5", "ENSMUSG00000104987.1", 
"ENSMUSG00000040013.11", "ENSMUSG00000049971.17", "ENSMUSG00000105335.1", "ENSMUSG00000029660.10", "ENSMUSG00000048703.9", "ENSMUSG00000029407.10", "ENSMUSG00000037426.18", "ENSMUSG00000042190.12", "ENSMUSG00000105218.4", "ENSMUSG00000104868.1", "ENSMUSG00000097321.5", "ENSMUSG00000107271.3", "ENSMUSG00000066278.6", "ENSMUSG00000029084.5", "ENSMUSG00000029484.12", "ENSMUSG00000062761.7", "ENSMUSG00000105673.1", "ENSMUSG00000029103.16", "ENSMUSG00000104559.1", "ENSMUSG00000055385.12", "ENSMUSG00000105686.1", "ENSMUSG00000105347.1", "ENSMUSG00000054256.11", "ENSMUSG00000106384.1", "ENSMUSG00000105490.1", "ENSMUSG00000038593.18", "ENSMUSG00000029298.15", "ENSMUSG00000039178.9", "ENSMUSG00000093097.1", "ENSMUSG00000092275.1", "ENSMUSG00000011831.16", "ENSMUSG00000053647.4", "ENSMUSG00000107033.1", "ENSMUSG00000060261.15", "ENSMUSG00000111375.4", "ENSMUSG00000047843.16", "ENSMUSG00000086219.7", "ENSMUSG00000029179.14", "ENSMUSG00000029044.9", "ENSMUSG00000090146.1", "ENSMUSG00000029151.14", "ENSMUSG00000105264.1", "ENSMUSG00000029120.10", "ENSMUSG00000054622.4", "ENSMUSG00000086769.7", "ENSMUSG00000090326.2", "ENSMUSG00000053368.13", "ENSMUSG00000029015.9", "ENSMUSG00000037905.13", "ENSMUSG00000086505.1", "ENSMUSG00000050822.11", "ENSMUSG00000061288.13", "ENSMUSG00000105942.1", "ENSMUSG00000044134.9", "ENSMUSG00000029452.18", "ENSMUSG00000062960.10", "ENSMUSG00000048764.16", "ENSMUSG00000029299.14", "ENSMUSG00000029161.7", "ENSMUSG00000105526.1", "ENSMUSG00000105912.1", "ENSMUSG00000106537.1", "ENSMUSG00000044827.10", "ENSMUSG00000099235.1", "ENSMUSG00000028777.8", "ENSMUSG00000106818.1", "ENSMUSG00000029465.14", "ENSMUSG00000085363.1", "ENSMUSG00000059991.7", "ENSMUSG00000029283.17", "ENSMUSG00000005514.14", "ENSMUSG00000040584.8", "ENSMUSG00000072845.3", "ENSMUSG00000070702.9", "ENSMUSG00000100704.1", "ENSMUSG00000053194.2", "ENSMUSG00000029121.16", "ENSMUSG00000034438.16", "ENSMUSG00000046985.11", "ENSMUSG00000029102.9", "ENSMUSG00000066682.11", 
"ENSMUSG00000040570.14", "ENSMUSG00000038859.7", "ENSMUSG00000046658.16", "ENSMUSG00000029474.7", "ENSMUSG00000029260.15", "ENSMUSG00000038342.15", "ENSMUSG00000075703.15", "ENSMUSG00000097976.1", "ENSMUSG00000058488.7", "ENSMUSG00000029415.4", "ENSMUSG00000003623.4", "ENSMUSG00000105704.1", "ENSMUSG00000051498.6", "ENSMUSG00000105152.1", "ENSMUSG00000016510.11", "ENSMUSG00000107105.1", "ENSMUSG00000104807.1", "ENSMUSG00000029512.11", "ENSMUSG00000056966.7", "ENSMUSG00000105726.1", "ENSMUSG00000049537.10", "ENSMUSG00000036968.15", "ENSMUSG00000107155.1", "ENSMUSG00000040351.11", "ENSMUSG00000106677.1", "ENSMUSG00000105340.5", "ENSMUSG00000095074.4", "ENSMUSG00000067285.4", "ENSMUSG00000085352.1", "ENSMUSG00000106320.1", "ENSMUSG00000032898.9", "ENSMUSG00000105795.1", "ENSMUSG00000037890.13", "ENSMUSG00000066684.10", "ENSMUSG00000090006.1", "ENSMUSG00000001260.10", "ENSMUSG00000072647.6", "ENSMUSG00000029233.15", "ENSMUSG00000104576.1", "ENSMUSG00000106859.1", "ENSMUSG00000042216.13", "ENSMUSG00000011034.6", "ENSMUSG00000104966.1", "ENSMUSG00000029281.13", "ENSMUSG00000037210.16", "ENSMUSG00000029128.12", "ENSMUSG00000014956.15", "ENSMUSG00000106764.1", "ENSMUSG00000106927.1", "ENSMUSG00000029223.12", "ENSMUSG00000106837.1", "ENSMUSG00000029469.14", "ENSMUSG00000029403.14", "ENSMUSG00000109903.1", "ENSMUSG00000036087.18", "ENSMUSG00000107207.1", "ENSMUSG00000105730.1", "ENSMUSG00000063146.11", "ENSMUSG00000105271.1", "ENSMUSG00000029577.13", "ENSMUSG00000105624.1", "ENSMUSG00000029229.8", "ENSMUSG00000046572.10", "ENSMUSG00000075591.2", "ENSMUSG00000100911.2", "ENSMUSG00000105637.1", "ENSMUSG00000025747.12", "ENSMUSG00000105946.1", "ENSMUSG00000029480.13", "ENSMUSG00000106694.1", "ENSMUSG00000086115.1", "ENSMUSG00000107096.3", "ENSMUSG00000090302.1", "ENSMUSG00000038056.15", "ENSMUSG00000067365.10", "ENSMUSG00000034021.15", "ENSMUSG00000107134.1", "ENSMUSG00000029454.15", "ENSMUSG00000105352.4", "ENSMUSG00000057068.15", "ENSMUSG00000029598.12", 
"ENSMUSG00000016503.17", "ENSMUSG00000079451.9", "ENSMUSG00000079363.7", "ENSMUSG00000029467.15", "ENSMUSG00000083771.1", "ENSMUSG00000105762.1", "ENSMUSG00000038569.13", "ENSMUSG00000065373.1", "ENSMUSG00000105622.1", "ENSMUSG00000104547.1", "ENSMUSG00000039968.9", "ENSMUSG00000038011.15", "ENSMUSG00000083012.9", "ENSMUSG00000059325.14", "ENSMUSG00000106794.1", "ENSMUSG00000105495.1", "ENSMUSG00000045348.16", "ENSMUSG00000105136.1", "ENSMUSG00000029478.16", "ENSMUSG00000098072.1", "ENSMUSG00000029472.13", "ENSMUSG00000056735.5", "ENSMUSG00000037355.14", "ENSMUSG00000097275.1", "ENSMUSG00000118332.1", "ENSMUSG00000042744.16", "ENSMUSG00000006642.3", "ENSMUSG00000047221.5", "ENSMUSG00000041740.16", "ENSMUSG00000037053.6", "ENSMUSG00000042010.16", "ENSMUSG00000061184.8", "ENSMUSG00000087104.7", "ENSMUSG00000049327.17", "ENSMUSG00000035528.4", "ENSMUSG00000104829.1", "ENSMUSG00000048450.10", "ENSMUSG00000029279.15", "ENSMUSG00000073226.9", "ENSMUSG00000049686.14", "ENSMUSG00000097962.7", "ENSMUSG00000079278.1"]
# Remaining run parameters; the values equal the defaults set in the earlier cell.
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# All per-gene result files are written into a directory named after the
# chromosome, resolved relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True lets the notebook be re-run when the directory already exists.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# Cell-level clustering summary table.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Distinct SubType labels among cells whose CellClass is 'Exc' or 'Inh'.
_selected_subtypes = cell_tidy_data.loc[
    cell_tidy_data['CellClass'].isin(['Exc', 'Inh']), 'SubType'
].unique()

# Skip labels containing 'Outlier'; spaces become underscores in the kept names.
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in _selected_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# Keep only rows whose index label starts with 'Sub<chrom>_'.
_atac_on_chrom = atac_peak.index.str.startswith(f'Sub{chrom}_')
atac_peak = atac_peak.loc[_atac_on_chrom].copy()

## Gene Info

In [8]:
# Gene annotation table indexed by gene_id, restricted to the current chromosome.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)
_gene_on_chrom = gene_meta['chrom'] == chrom
gene_meta = gene_meta.loc[_gene_on_chrom].copy()

In [9]:
# Header-less, tab-separated exon table; column names are assigned after loading.
_exon_bed_path = '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed'
exon_bed = pd.read_csv(_exon_bed_path, sep='\t', header=None)
exon_bed.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']

## DMR Info

In [10]:
# 'Rate' table from the DMR HDF store; keep rows whose label starts with 'Sub<chrom>_'.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
_rate_on_chrom = dmr_rate.index.str.startswith(f'Sub{chrom}_')
dmr_rate = dmr_rate.loc[_rate_on_chrom].copy()

# DMR-gene correlation table, indexed by (DMR, Gene) and limited to the DMRs kept above.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
_corr_dmr_ids = dmr_corr.index.get_level_values('DMR')
dmr_corr = dmr_corr.loc[_corr_dmr_ids.isin(dmr_rate.index)].copy()

# DMR coordinates: the fourth file column becomes the index, the other three are named here.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hit matrix: first restrict rows to the kept DMRs, then columns to use_clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix, restricted to the same DMR rows.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of DMR rows left after filtering.
dmr_hits.n_obs

283678

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta`` and return its basic coordinates.

    Parameters
    ----------
    gene_id
        Index label of the gene row in ``gene_meta``.

    Returns
    -------
    tuple
        ``(name, chrom, start, end, strand)``, where ``name`` is the ``.name``
        of the selected row (its index label) and the remaining items are the
        values of the equally named columns.

    Raises
    ------
    KeyError
        If ``gene_id`` is not present in the ``gene_meta`` index.
    """
    record = gene_meta.loc[gene_id]
    coordinate_fields = ('chrom', 'start', 'end', 'strand')
    return (record.name, *(record[field] for field in coordinate_fields))

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their per-cluster rate profile.

    DMRs are taken from ``dmr_bed`` when they lie strictly inside the window
    ``(gene_start - slop, gene_end + slop)``. Their ``dmr_rate`` rows (columns
    restricted to ``use_clusters``) are mean-imputed per column, scaled, reduced
    with PCA and clustered with Leiden on a neighbour graph built from the
    first ``n_pc`` components.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: one row per selected DMR, including the ``leiden``
        column written by ``sc.tl.leiden``.

    Raises
    ------
    ValueError
        If no DMR falls inside the window. Previously this case failed further
        down with an unrelated-looking error.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must fall strictly inside the padded gene window
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]
    if related_dmr.shape[0] == 0:
        raise ValueError(
            f'No DMR found within slop={slop} of gene {gene_id}; nothing to cluster.'
        )

    # missing rates are replaced by the column mean over the selected DMRs
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: observations are DMRs, variables are the clusters in use_clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighbourhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden assignment returned by ``calculate_gene`` and adds,
    for every DMR: its correlation with the gene (0 when absent from
    ``dmr_corr``), its position relative to the TSS in units of gene length,
    flags derived from ``dmr_annot`` columns, and three per-cluster profiles
    (``HypoDMR.*``, ``DMRRate.*``, ``ATACPeak.*``).

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR (index sorted by the final concat), containing the
        ``leiden`` label, the annotation flags and the profile columns.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    # the TSS is the start coordinate on '+' and the end coordinate otherwise
    tss = gene_start if strand == '+' else gene_end
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation for this gene; DMRs without an entry get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # signed distance from the TSS to the DMR centre, measured along the gene's
    # direction and divided by gene length; values in (0, 1) are inside the gene
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - tss) / gene_length
    else:
        gene_cluster['reldist_tss'] = (tss - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the TE groups are addressed by column position, so they
    # depend on the column order of dmr_annot — confirm against that file.
    te_column_ranges = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_range in te_column_ranges:
        te_columns = annot_df.columns[column_range]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the annotation flag only counts when
    # the DMR also lies inside this gene's body
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_feature_columns:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')

    # DMR rate, rows ordered by Leiden label, NaNs replaced by the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')

    # atac peak (columns are renamed before the frame is handed to concat)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')

    other_profiles = [hits_df, related_dmr_rate, atac_peak_df]
    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [16]:
for gene in genes:
    print(gene)
    # The cluster summary is written last, so its presence marks a finished gene
    # and the gene is skipped on re-runs.
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        detail_path = output_dir / f'{gene}.DMR_detail.msg'

        # per-DMR table
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(detail_path, compress='zlib')

        # per-Leiden-cluster mean of every column
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000029705.17
ENSMUSG00000086220.1
ENSMUSG00000029094.12
ENSMUSG00000023079.14
ENSMUSG00000037736.19
ENSMUSG00000060961.15
ENSMUSG00000090061.9
ENSMUSG00000089809.9
ENSMUSG00000033316.14
ENSMUSG00000039706.11
ENSMUSG00000034040.16
ENSMUSG00000040407.18
ENSMUSG00000056004.16
ENSMUSG00000029168.14
ENSMUSG00000003974.6
ENSMUSG00000051339.10
ENSMUSG00000029287.14
ENSMUSG00000036377.18
ENSMUSG00000028944.14
ENSMUSG00000028926.15
ENSMUSG00000043635.12
ENSMUSG00000029174.18
ENSMUSG00000040118.15
ENSMUSG00000037999.13
ENSMUSG00000029554.15
ENSMUSG00000038319.14
ENSMUSG00000029189.10
ENSMUSG00000029560.12
ENSMUSG00000029516.19
ENSMUSG00000104369.1
ENSMUSG00000033209.17
ENSMUSG00000106379.1
ENSMUSG00000042249.11
ENSMUSG00000029093.14
ENSMUSG00000029101.16
ENSMUSG00000039156.19
ENSMUSG00000029636.11
ENSMUSG00000029673.17
ENSMUSG00000029245.16
ENSMUSG00000056367.14
ENSMUSG00000037795.14
ENSMUSG00000097207.7
ENSMUSG00000000560.9
ENSMUSG00000029167.13
ENSMUSG00000050640.16
ENSMUSG00000092060