In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Shared figure styling: one label size and one stroke width for everything.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
# `rc` accepts a tuple of groups, so both axes are configured in one call each.
rc(('xtick', 'ytick'), labelsize=labelsize)
rc(('xtick.major', 'ytick.major'), width=linewidth)
rc(('xtick.minor', 'ytick.minor'), width=linewidth - 0.2)

In [3]:
# Default run parameters (every name here is re-assigned by the "# Parameters" cell that follows).
chrom = 'chr2'  # chromosome used to filter genes, DMRs and ATAC peaks in the loading cells
genes = ['Lhx6']  # gene identifiers processed one at a time by the final loop
slop = 250000  # flank added on each side of a gene when selecting its DMRs
n_pc = 10  # number of principal components passed to sc.pp.neighbors
resolution = 1  # resolution passed to sc.tl.leiden

In [4]:
# Parameters
# This cell re-assigns every name set in the defaults cell above, so the values
# here are the ones the rest of the notebook actually uses.
# NOTE(review): looks like an injected (papermill-style) parameters cell — confirm
# before editing it by hand.
chrom = "chr17"
genes = ["ENSMUSG00000024219.16", "ENSMUSG00000062202.14", "ENSMUSG00000039512.12", "ENSMUSG00000058392.13", "ENSMUSG00000000708.14", "ENSMUSG00000024172.10", "ENSMUSG00000024109.18", "ENSMUSG00000023927.15", "ENSMUSG00000023886.10", "ENSMUSG00000054640.15", "ENSMUSG00000014763.8", "ENSMUSG00000034265.8", "ENSMUSG00000024064.14", "ENSMUSG00000024143.15", "ENSMUSG00000040490.4", "ENSMUSG00000116718.1", "ENSMUSG00000039601.16", "ENSMUSG00000052105.17", "ENSMUSG00000014426.9", "ENSMUSG00000023868.17", "ENSMUSG00000024245.4", "ENSMUSG00000062078.15", "ENSMUSG00000024112.16", "ENSMUSG00000041119.12", "ENSMUSG00000003279.17", "ENSMUSG00000042256.5", "ENSMUSG00000063239.17", "ENSMUSG00000002076.12", "ENSMUSG00000050138.8", "ENSMUSG00000044477.12", "ENSMUSG00000024074.8", "ENSMUSG00000043557.16", "ENSMUSG00000055471.7", "ENSMUSG00000024145.6", "ENSMUSG00000037104.9", "ENSMUSG00000024044.18", "ENSMUSG00000036533.9", "ENSMUSG00000024008.16", "ENSMUSG00000024242.15", "ENSMUSG00000024222.17", "ENSMUSG00000117118.1", "ENSMUSG00000056515.9", "ENSMUSG00000117023.1", "ENSMUSG00000039153.17", "ENSMUSG00000023830.14", "ENSMUSG00000034413.14", "ENSMUSG00000023915.5", "ENSMUSG00000037196.6", "ENSMUSG00000002250.16", "ENSMUSG00000056121.16", "ENSMUSG00000023999.14", "ENSMUSG00000117254.1", "ENSMUSG00000055817.18", "ENSMUSG00000063952.16", "ENSMUSG00000042203.8", "ENSMUSG00000118341.1", "ENSMUSG00000001870.16", "ENSMUSG00000069729.14", "ENSMUSG00000000127.15", "ENSMUSG00000045038.14", "ENSMUSG00000024107.7", "ENSMUSG00000023965.13", "ENSMUSG00000024059.10", "ENSMUSG00000043705.7", "ENSMUSG00000071984.10", "ENSMUSG00000046991.12", "ENSMUSG00000023805.17", "ENSMUSG00000036918.16", "ENSMUSG00000024042.7", "ENSMUSG00000116988.1", "ENSMUSG00000118081.1", "ENSMUSG00000024063.13", "ENSMUSG00000023972.10", "ENSMUSG00000041565.17", "ENSMUSG00000117628.1", "ENSMUSG00000103779.1", "ENSMUSG00000023963.10", "ENSMUSG00000097305.2", "ENSMUSG00000002279.19", "ENSMUSG00000035580.10", 
"ENSMUSG00000057899.6", "ENSMUSG00000064043.13", "ENSMUSG00000038954.14", "ENSMUSG00000032937.5", "ENSMUSG00000093460.9", "ENSMUSG00000039316.14", "ENSMUSG00000052712.17", "ENSMUSG00000023940.14", "ENSMUSG00000040852.6", "ENSMUSG00000023827.8", "ENSMUSG00000024085.13", "ENSMUSG00000023460.13", "ENSMUSG00000024146.9", "ENSMUSG00000041130.10", "ENSMUSG00000024299.17", "ENSMUSG00000024151.13", "ENSMUSG00000099329.2", "ENSMUSG00000061950.17", "ENSMUSG00000023923.10", "ENSMUSG00000032624.16", "ENSMUSG00000117161.1", "ENSMUSG00000117231.1", "ENSMUSG00000045761.16", "ENSMUSG00000117243.1", "ENSMUSG00000038048.8", "ENSMUSG00000047786.12", "ENSMUSG00000062036.9", "ENSMUSG00000116801.1", "ENSMUSG00000024120.12", "ENSMUSG00000102973.1", "ENSMUSG00000089900.2", "ENSMUSG00000100150.2", "ENSMUSG00000054469.14", "ENSMUSG00000023873.13", "ENSMUSG00000090069.2", "ENSMUSG00000097103.1", "ENSMUSG00000023943.7", "ENSMUSG00000024078.7", "ENSMUSG00000102212.1", "ENSMUSG00000023931.5", "ENSMUSG00000024043.13", "ENSMUSG00000085846.1", "ENSMUSG00000102744.1", "ENSMUSG00000024300.17", "ENSMUSG00000038545.13", "ENSMUSG00000117264.1", "ENSMUSG00000117328.1", "ENSMUSG00000117351.1", "ENSMUSG00000114492.1", "ENSMUSG00000097049.1", "ENSMUSG00000078247.4", "ENSMUSG00000103216.1", "ENSMUSG00000024002.18", "ENSMUSG00000117835.1", "ENSMUSG00000024135.10", "ENSMUSG00000033327.17", "ENSMUSG00000089941.1", "ENSMUSG00000024036.16", "ENSMUSG00000024030.7", "ENSMUSG00000041057.10", "ENSMUSG00000101606.1", "ENSMUSG00000117019.1", "ENSMUSG00000023828.3", "ENSMUSG00000024070.15", "ENSMUSG00000089633.2", "ENSMUSG00000038805.10", "ENSMUSG00000117210.1", "ENSMUSG00000024169.16", "ENSMUSG00000024049.15", "ENSMUSG00000024154.11", "ENSMUSG00000024019.18", "ENSMUSG00000002504.15", "ENSMUSG00000117093.1", "ENSMUSG00000036983.7", "ENSMUSG00000034998.18", "ENSMUSG00000024246.9", "ENSMUSG00000117191.1", "ENSMUSG00000048027.9", "ENSMUSG00000056492.6", "ENSMUSG00000024066.9", "ENSMUSG00000023994.13", 
"ENSMUSG00000024457.16", "ENSMUSG00000024168.8", "ENSMUSG00000071042.12", "ENSMUSG00000097282.1", "ENSMUSG00000057246.16", "ENSMUSG00000043592.15", "ENSMUSG00000117647.1", "ENSMUSG00000024140.10", "ENSMUSG00000114291.1", "ENSMUSG00000117102.1", "ENSMUSG00000117026.1", "ENSMUSG00000045053.2", "ENSMUSG00000033826.10", "ENSMUSG00000085492.7", "ENSMUSG00000023802.4", "ENSMUSG00000035435.17", "ENSMUSG00000117100.1", "ENSMUSG00000052525.14", "ENSMUSG00000024084.8", "ENSMUSG00000117635.1", "ENSMUSG00000117511.1", "ENSMUSG00000061126.5", "ENSMUSG00000015127.14", "ENSMUSG00000117175.1", "ENSMUSG00000090168.1", "ENSMUSG00000034165.16", "ENSMUSG00000024018.18", "ENSMUSG00000024076.10", "ENSMUSG00000016946.6", "ENSMUSG00000038146.8", "ENSMUSG00000095407.2", "ENSMUSG00000068037.11", "ENSMUSG00000043972.8", "ENSMUSG00000098775.1", "ENSMUSG00000114563.1", "ENSMUSG00000117786.1", "ENSMUSG00000023935.10", "ENSMUSG00000040260.8", "ENSMUSG00000098374.1", "ENSMUSG00000089755.1", "ENSMUSG00000061544.13", "ENSMUSG00000117222.1", "ENSMUSG00000091504.2", "ENSMUSG00000117084.1", "ENSMUSG00000101133.1", "ENSMUSG00000117501.1", "ENSMUSG00000095687.2", "ENSMUSG00000024174.9", "ENSMUSG00000014773.13", "ENSMUSG00000024247.14", "ENSMUSG00000116644.1", "ENSMUSG00000038002.9", "ENSMUSG00000002791.17", "ENSMUSG00000023926.8", "ENSMUSG00000003534.17", "ENSMUSG00000004945.15", "ENSMUSG00000023885.9", "ENSMUSG00000023949.7", "ENSMUSG00000045036.15", "ENSMUSG00000091636.2", "ENSMUSG00000038347.15", "ENSMUSG00000117110.1", "ENSMUSG00000117065.1", "ENSMUSG00000024227.14", "ENSMUSG00000023918.12", "ENSMUSG00000116813.1", "ENSMUSG00000061665.7", "ENSMUSG00000024254.15", "ENSMUSG00000116858.1", "ENSMUSG00000097560.1", "ENSMUSG00000003546.10", "ENSMUSG00000117443.1", "ENSMUSG00000096975.1", "ENSMUSG00000089701.1", "ENSMUSG00000117333.1", "ENSMUSG00000117334.1", "ENSMUSG00000099332.1", "ENSMUSG00000096847.2", "ENSMUSG00000025433.7", "ENSMUSG00000023959.9", "ENSMUSG00000117444.1", "ENSMUSG00000116655.1", 
"ENSMUSG00000024253.9", "ENSMUSG00000024105.3", "ENSMUSG00000048826.7", "ENSMUSG00000023122.6", "ENSMUSG00000023930.14", "ENSMUSG00000024056.10", "ENSMUSG00000090083.11", "ENSMUSG00000090700.2", "ENSMUSG00000034709.8", "ENSMUSG00000033855.15", "ENSMUSG00000013236.17", "ENSMUSG00000058704.9", "ENSMUSG00000117124.1", "ENSMUSG00000117130.1", "ENSMUSG00000024054.14", "ENSMUSG00000118232.1", "ENSMUSG00000093686.1", "ENSMUSG00000089849.1", "ENSMUSG00000117033.1", "ENSMUSG00000117228.1", "ENSMUSG00000024175.2", "ENSMUSG00000036557.8", "ENSMUSG00000044375.8", "ENSMUSG00000054901.7", "ENSMUSG00000059811.13", "ENSMUSG00000003200.10", "ENSMUSG00000035473.10", "ENSMUSG00000117224.1", "ENSMUSG00000117233.1", "ENSMUSG00000024248.14", "ENSMUSG00000117393.1", "ENSMUSG00000024027.8", "ENSMUSG00000024091.9", "ENSMUSG00000098090.8", "ENSMUSG00000032915.6", "ENSMUSG00000024155.8", "ENSMUSG00000023945.7", "ENSMUSG00000043740.15", "ENSMUSG00000117417.1", "ENSMUSG00000117263.1", "ENSMUSG00000024213.14", "ENSMUSG00000023991.16", "ENSMUSG00000117227.1", "ENSMUSG00000025431.4", "ENSMUSG00000037246.6", "ENSMUSG00000092368.1", "ENSMUSG00000003199.16", "ENSMUSG00000024052.17", "ENSMUSG00000117338.1", "ENSMUSG00000023953.9", "ENSMUSG00000001576.15", "ENSMUSG00000015575.15", "ENSMUSG00000095377.2", "ENSMUSG00000117539.1", "ENSMUSG00000024077.15", "ENSMUSG00000117314.1", "ENSMUSG00000112121.1", "ENSMUSG00000054134.14", "ENSMUSG00000117569.1", "ENSMUSG00000024206.15", "ENSMUSG00000041293.5", "ENSMUSG00000073424.9", "ENSMUSG00000095961.2", "ENSMUSG00000092618.2", "ENSMUSG00000090556.5", "ENSMUSG00000024065.8", "ENSMUSG00000091475.3", "ENSMUSG00000024150.12", "ENSMUSG00000043286.12", "ENSMUSG00000093508.1", "ENSMUSG00000024462.17", "ENSMUSG00000116984.1", "ENSMUSG00000100402.1", "ENSMUSG00000037130.4", "ENSMUSG00000100847.1", "ENSMUSG00000085705.1", "ENSMUSG00000058435.6", "ENSMUSG00000090655.2", "ENSMUSG00000117433.1", "ENSMUSG00000024165.9", "ENSMUSG00000073420.10", "ENSMUSG00000024163.17", 
"ENSMUSG00000089222.1", "ENSMUSG00000116670.1", "ENSMUSG00000116851.1", "ENSMUSG00000096477.2", "ENSMUSG00000024176.10", "ENSMUSG00000092549.1", "ENSMUSG00000117301.1"]
# Remaining run parameters; same values as in the defaults cell.
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# Per-chromosome output directory, relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True keeps a re-run from failing when the directory is already there.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# NOTE: `pd.read_msgpack` was deprecated in pandas 0.25 and removed in 1.0,
# so this cell needs an older pandas to run.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types of excitatory / inhibitory cells, in order of first appearance.
use_clusters = cell_tidy_data.loc[
    cell_tidy_data['CellClass'].isin(['Exc', 'Inh']), 'SubType'
].unique()
# Drop outlier sub-types and replace spaces with underscores in the remaining names.
use_clusters = [
    name.replace(' ', '_')
    for name in use_clusters
    if 'Outlier' not in name
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# Keep only rows whose index label starts with the current chromosome prefix.
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(f'Sub{chrom}_')].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to genes on the current chromosome.
gene_meta = (
    pd.read_csv(
        '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
        sep='\t',
        index_col='gene_id',
    )
    .loc[lambda table: table['chrom'] == chrom]
    .copy()
)

In [9]:
# Headerless, tab-separated exon table; column names are supplied at load time.
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'gene_id', 'gene_name'],
)

## DMR Info

In [10]:
# Load every DMR-level table and restrict each one to the current chromosome.
# `dmr_rate` is filtered first because its index drives the filtering of the
# correlation table and of both AnnData objects below.

# 'Rate' table from the HDF store; keep rows whose index label starts with the
# chromosome prefix.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.map(lambda i: i.startswith(f'Sub{chrom}_'))].copy()

# DMR-gene correlation table, indexed by (DMR, Gene); keep only pairs whose DMR
# survived the chromosome filter.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
)
dmr_corr = dmr_corr.set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)].copy()

# Headerless BED file: the fourth column becomes the index, the first three are
# named chrom/start/end; keep rows on the current chromosome.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed[dmr_bed['chrom'] == chrom].copy()

# AnnData of DMR hits: rows restricted to the retained DMRs, columns to `use_clusters`.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# AnnData of DMR annotations, restricted to the retained DMRs.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of rows (DMRs) left in `dmr_hits` after restricting it to `dmr_rate.index`.
dmr_hits.shape[0]

179626

In [12]:
def get_gene(gene_id):
    """Look up one gene in the module-level ``gene_meta`` table.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)``, where the first element is
        the name of the selected row.
    """
    record = gene_meta.loc[gene_id]
    fields = tuple(record[column] for column in ('chrom', 'start', 'end', 'strand'))
    return (record.name,) + fields

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their per-cluster methylation rates.

    DMRs lying entirely inside the gene interval extended by ``slop`` on both
    sides are selected from ``dmr_bed``. Their rows of ``dmr_rate`` (columns
    limited to ``use_clusters``) are mean-imputed, scaled, reduced by PCA and
    clustered with Leiden on a nearest-neighbour graph.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: indexed by DMR id, holding the ``leiden`` label written
        by ``sc.tl.leiden``.

    Raises
    ------
    ValueError
        If no DMR falls inside the window, since there is nothing to cluster.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: strictly inside [gene_start - slop, gene_end + slop]
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]
    if related_dmr.empty:
        # Fail with a readable message instead of an opaque error from scanpy.
        raise ValueError(f'No DMR found within {slop} of gene {gene_id}; nothing to cluster.')

    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters].copy()
    # fill missing rates with the mean of the same column
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: one observation per DMR, one variable per cluster
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # Never ask the neighbour graph for more PCs than PCA actually produced.
    n_pcs_used = min(n_pc, adata.obsm['X_pca'].shape[1])
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs_used)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden labels returned by ``calculate_gene`` and adds, for
    every DMR:

    * ``Corr`` - value from ``dmr_corr`` for this gene, 0 when the pair is absent;
    * ``reldist_tss`` / ``in_gene_body`` - position of the DMR centre relative
      to the gene, measured in gene lengths from the strand-aware gene start;
    * ``is_*_te``, ``in_intron`` / ``in_exon`` / ``in_utr3`` / ``in_utr5``,
      ``feDMR``, ``adultDMR`` - boolean flags derived from ``dmr_annot``;
    * ``HypoDMR.*``, ``DMRRate.*``, ``ATACPeak.*`` - per-cluster profiles taken
      from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, all of the columns above concatenated side by side.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation for this gene; DMRs without an entry get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # relative distance of each DMR centre from the strand-aware gene start;
    # values strictly between 0 and 1 fall inside the gene body
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the TE families are picked by column POSITION in
    # `dmr_annot.var_names`; presumably 20:33 = DNA, 33:39 = LINE, 39:45 = LTR,
    # 45:52 = SINE - confirm against the annotation file before reusing.
    te_column_slices = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, positions in te_column_slices:
        te_columns = annot_df.columns[positions]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the annotation flag only counts when
    # the DMR also lies inside this gene's body
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_feature_columns:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by Leiden label, missing values mean-imputed per column
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters].copy()
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak (columns are renamed before the frame is handed over)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [None]:
# Annotate every requested gene. A gene is skipped when its cluster-level file
# already exists; that file is written last, so an interrupted gene is redone.
for gene in genes:
    print(gene)
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # per-DMR table
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # per-cluster means of every column
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000024219.16
ENSMUSG00000062202.14
ENSMUSG00000039512.12
ENSMUSG00000058392.13
ENSMUSG00000000708.14
ENSMUSG00000024172.10
ENSMUSG00000024109.18
ENSMUSG00000023927.15
ENSMUSG00000023886.10
ENSMUSG00000054640.15
ENSMUSG00000014763.8
ENSMUSG00000034265.8
ENSMUSG00000024064.14
ENSMUSG00000024143.15
ENSMUSG00000040490.4
ENSMUSG00000116718.1
ENSMUSG00000039601.16
ENSMUSG00000052105.17
ENSMUSG00000014426.9
ENSMUSG00000023868.17
ENSMUSG00000024245.4
ENSMUSG00000062078.15
ENSMUSG00000024112.16
ENSMUSG00000041119.12
ENSMUSG00000003279.17
ENSMUSG00000042256.5
ENSMUSG00000063239.17
ENSMUSG00000002076.12
ENSMUSG00000050138.8
ENSMUSG00000044477.12
ENSMUSG00000024074.8
ENSMUSG00000043557.16
ENSMUSG00000055471.7
ENSMUSG00000024145.6
ENSMUSG00000037104.9
ENSMUSG00000024044.18
ENSMUSG00000036533.9
ENSMUSG00000024008.16
ENSMUSG00000024242.15
ENSMUSG00000024222.17
ENSMUSG00000117118.1
ENSMUSG00000056515.9
ENSMUSG00000117023.1
ENSMUSG00000039153.17
ENSMUSG00000023830.14
ENSMUSG00000034413.14
ENSM