In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

# Suppress every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Global matplotlib styling: small labels and thin strokes everywhere.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)

# Both tick axes get the same label size; minor ticks are 0.2 thinner than major ones.
for _tick_axis in ('xtick', 'ytick'):
    rc(_tick_axis, labelsize=labelsize)
    rc(f'{_tick_axis}.major', width=linewidth)
    rc(f'{_tick_axis}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters. The "# Parameters" cell that follows reassigns every one of them.

# Chromosome used to filter the DMR, ATAC-peak and gene tables loaded below.
chrom = 'chr2'
# Genes to process in the final loop.
# NOTE(review): get_gene() looks each entry up in gene_meta, which is indexed by
# gene_id; a gene name such as 'Lhx6' would presumably not be found — confirm.
genes = ['Lhx6']
# Flank added on each side of a gene when selecting its DMRs (same units as the
# start/end coordinates in dmr_bed).
slop = 250000
# Passed as n_pcs to sc.pp.neighbors.
n_pc = 10
# Passed as resolution to sc.tl.leiden.
resolution = 1

In [4]:
# Parameters
# The assignments in this cell replace the defaults set in the previous cell.
chrom = "chr15"
genes = ["ENSMUSG00000022353.10", "ENSMUSG00000061731.9", "ENSMUSG00000036800.8", "ENSMUSG00000022377.16", "ENSMUSG00000041708.12", "ENSMUSG00000022270.16", "ENSMUSG00000016664.16", "ENSMUSG00000071757.10", "ENSMUSG00000019146.2", "ENSMUSG00000036158.12", "ENSMUSG00000051359.15", "ENSMUSG00000022441.17", "ENSMUSG00000022321.15", "ENSMUSG00000022376.8", "ENSMUSG00000115157.1", "ENSMUSG00000022261.6", "ENSMUSG00000022306.9", "ENSMUSG00000022231.10", "ENSMUSG00000038679.16", "ENSMUSG00000022237.17", "ENSMUSG00000015002.18", "ENSMUSG00000094296.1", "ENSMUSG00000062373.8", "ENSMUSG00000022416.15", "ENSMUSG00000022372.14", "ENSMUSG00000062760.10", "ENSMUSG00000115729.1", "ENSMUSG00000044250.8", "ENSMUSG00000060429.12", "ENSMUSG00000053469.14", "ENSMUSG00000023026.16", "ENSMUSG00000022378.14", "ENSMUSG00000054863.9", "ENSMUSG00000016552.13", "ENSMUSG00000050439.7", "ENSMUSG00000016763.15", "ENSMUSG00000016624.15", "ENSMUSG00000079022.10", "ENSMUSG00000022312.11", "ENSMUSG00000022362.13", "ENSMUSG00000000552.10", "ENSMUSG00000022419.16", "ENSMUSG00000022265.7", "ENSMUSG00000005268.20", "ENSMUSG00000013846.10", "ENSMUSG00000115850.1", "ENSMUSG00000039385.5", "ENSMUSG00000037106.8", "ENSMUSG00000022206.7", "ENSMUSG00000055022.14", "ENSMUSG00000022489.6", "ENSMUSG00000022272.17", "ENSMUSG00000097452.2", "ENSMUSG00000022361.14", "ENSMUSG00000054277.8", "ENSMUSG00000023022.14", "ENSMUSG00000023032.12", "ENSMUSG00000064210.7", "ENSMUSG00000023017.10", "ENSMUSG00000036661.14", "ENSMUSG00000022288.1", "ENSMUSG00000037579.7", "ENSMUSG00000036760.7", "ENSMUSG00000115654.1", "ENSMUSG00000022305.13", "ENSMUSG00000022438.6", "ENSMUSG00000086125.1", "ENSMUSG00000041852.15", "ENSMUSG00000086992.7", "ENSMUSG00000045763.8", "ENSMUSG00000115062.1", "ENSMUSG00000024479.3", "ENSMUSG00000051920.7", "ENSMUSG00000022324.15", "ENSMUSG00000047497.10", "ENSMUSG00000025370.7", "ENSMUSG00000044216.7", "ENSMUSG00000097232.1", "ENSMUSG00000054263.12", "ENSMUSG00000055782.9", "ENSMUSG00000092375.1", 
"ENSMUSG00000038591.3", "ENSMUSG00000049148.8", "ENSMUSG00000100603.6", "ENSMUSG00000022262.8", "ENSMUSG00000069996.3", "ENSMUSG00000035900.18", "ENSMUSG00000022421.19", "ENSMUSG00000022309.9", "ENSMUSG00000022257.4", "ENSMUSG00000042564.13", "ENSMUSG00000022358.7", "ENSMUSG00000022369.13", "ENSMUSG00000022994.9", "ENSMUSG00000043556.10", "ENSMUSG00000022442.16", "ENSMUSG00000051237.7", "ENSMUSG00000059895.13", "ENSMUSG00000023033.14", "ENSMUSG00000044933.3", "ENSMUSG00000072487.11", "ENSMUSG00000022488.9", "ENSMUSG00000023015.14", "ENSMUSG00000042351.9", "ENSMUSG00000033075.17", "ENSMUSG00000022558.16", "ENSMUSG00000037627.16", "ENSMUSG00000022443.17", "ENSMUSG00000050697.9", "ENSMUSG00000075600.3", "ENSMUSG00000043460.7", "ENSMUSG00000116130.1", "ENSMUSG00000022434.8", "ENSMUSG00000116068.1", "ENSMUSG00000038879.8", "ENSMUSG00000022296.9", "ENSMUSG00000022269.13", "ENSMUSG00000115337.1", "ENSMUSG00000036046.14", "ENSMUSG00000097003.1", "ENSMUSG00000034730.17", "ENSMUSG00000022425.16", "ENSMUSG00000115779.1", "ENSMUSG00000115560.1", "ENSMUSG00000033088.19", "ENSMUSG00000023008.18", "ENSMUSG00000016028.10", "ENSMUSG00000023009.14", "ENSMUSG00000116031.1", "ENSMUSG00000036606.17", "ENSMUSG00000015365.15", "ENSMUSG00000042961.13", "ENSMUSG00000022602.14", "ENSMUSG00000054619.7", "ENSMUSG00000023011.8", "ENSMUSG00000023034.7", "ENSMUSG00000058099.16", "ENSMUSG00000022375.6", "ENSMUSG00000022429.11", "ENSMUSG00000035891.16", "ENSMUSG00000000531.5", "ENSMUSG00000115918.1", "ENSMUSG00000115339.1", "ENSMUSG00000006369.14", "ENSMUSG00000116114.1", "ENSMUSG00000115820.1", "ENSMUSG00000116483.1", "ENSMUSG00000115204.1", "ENSMUSG00000023021.15", "ENSMUSG00000115169.1", "ENSMUSG00000022993.7", "ENSMUSG00000072568.4", "ENSMUSG00000078907.2", "ENSMUSG00000115355.1", "ENSMUSG00000005125.13", "ENSMUSG00000022342.6", "ENSMUSG00000033697.15", "ENSMUSG00000022181.16", "ENSMUSG00000036698.11", "ENSMUSG00000037362.8", "ENSMUSG00000022414.8", "ENSMUSG00000046034.8", 
"ENSMUSG00000039458.15", "ENSMUSG00000116493.1", "ENSMUSG00000037465.10", "ENSMUSG00000100220.1", "ENSMUSG00000016541.10", "ENSMUSG00000022475.19", "ENSMUSG00000036106.15", "ENSMUSG00000022286.16", "ENSMUSG00000047888.10", "ENSMUSG00000022263.10", "ENSMUSG00000022360.8", "ENSMUSG00000022634.9", "ENSMUSG00000115773.1", "ENSMUSG00000115478.1", "ENSMUSG00000114971.1", "ENSMUSG00000005360.14", "ENSMUSG00000022629.17", "ENSMUSG00000022404.8", "ENSMUSG00000000489.7", "ENSMUSG00000022469.17", "ENSMUSG00000033039.10", "ENSMUSG00000102069.1", "ENSMUSG00000022526.8", "ENSMUSG00000005124.10", "ENSMUSG00000022304.13", "ENSMUSG00000094814.1", "ENSMUSG00000022253.15", "ENSMUSG00000009035.15", "ENSMUSG00000116294.1", "ENSMUSG00000033237.19", "ENSMUSG00000016637.7", "ENSMUSG00000089837.9", "ENSMUSG00000099013.1", "ENSMUSG00000022463.8", "ENSMUSG00000022408.7", "ENSMUSG00000039168.15", "ENSMUSG00000086801.1", "ENSMUSG00000086361.1", "ENSMUSG00000086541.8", "ENSMUSG00000056332.4", "ENSMUSG00000115680.1", "ENSMUSG00000036944.6", "ENSMUSG00000023169.15", "ENSMUSG00000041653.5", "ENSMUSG00000037617.12", "ENSMUSG00000115912.1", "ENSMUSG00000101671.1", "ENSMUSG00000116021.1", "ENSMUSG00000115137.1", "ENSMUSG00000047807.10", "ENSMUSG00000015001.17", "ENSMUSG00000115485.1", "ENSMUSG00000115620.1", "ENSMUSG00000042428.6", "ENSMUSG00000022155.9", "ENSMUSG00000116360.1", "ENSMUSG00000018865.9", "ENSMUSG00000096607.1", "ENSMUSG00000056069.10", "ENSMUSG00000022249.14", "ENSMUSG00000022300.10", "ENSMUSG00000068086.6", "ENSMUSG00000022146.12", "ENSMUSG00000090500.1", "ENSMUSG00000022464.14", "ENSMUSG00000023010.15", "ENSMUSG00000115970.1", "ENSMUSG00000115591.1", "ENSMUSG00000097536.2", "ENSMUSG00000022371.16", "ENSMUSG00000101892.1", "ENSMUSG00000072663.12", "ENSMUSG00000097259.2", "ENSMUSG00000000934.9", "ENSMUSG00000068206.13", "ENSMUSG00000093695.2", "ENSMUSG00000079105.4", "ENSMUSG00000022295.8", "ENSMUSG00000116505.1", "ENSMUSG00000090002.1", "ENSMUSG00000022297.15", "ENSMUSG00000071714.7", 
"ENSMUSG00000115164.2", "ENSMUSG00000022439.9", "ENSMUSG00000116311.1", "ENSMUSG00000000555.8", "ENSMUSG00000116256.1", "ENSMUSG00000115345.1", "ENSMUSG00000091119.2", "ENSMUSG00000116121.1", "ENSMUSG00000115752.1", "ENSMUSG00000022462.7", "ENSMUSG00000110018.2", "ENSMUSG00000089680.1", "ENSMUSG00000022415.12", "ENSMUSG00000022299.9", "ENSMUSG00000022141.7", "ENSMUSG00000115986.1", "ENSMUSG00000037487.7", "ENSMUSG00000116004.1", "ENSMUSG00000115009.1", "ENSMUSG00000116345.1", "ENSMUSG00000071713.6", "ENSMUSG00000115908.1", "ENSMUSG00000114994.1", "ENSMUSG00000022350.7", "ENSMUSG00000058441.7", "ENSMUSG00000078932.8", "ENSMUSG00000022200.8", "ENSMUSG00000050310.9", "ENSMUSG00000022432.7", "ENSMUSG00000037185.9", "ENSMUSG00000079024.2", "ENSMUSG00000054115.11", "ENSMUSG00000015377.10", "ENSMUSG00000090237.1", "ENSMUSG00000022987.12", "ENSMUSG00000094112.2", "ENSMUSG00000115667.1", "ENSMUSG00000036197.16", "ENSMUSG00000116305.1", "ENSMUSG00000108748.1", "ENSMUSG00000093677.1", "ENSMUSG00000050963.7", "ENSMUSG00000018008.8", "ENSMUSG00000094447.2", "ENSMUSG00000115433.1", "ENSMUSG00000116000.1", "ENSMUSG00000005716.16", "ENSMUSG00000116299.1", "ENSMUSG00000065936.1", "ENSMUSG00000065049.1", "ENSMUSG00000063727.3", "ENSMUSG00000116081.1", "ENSMUSG00000115522.1", "ENSMUSG00000022565.15", "ENSMUSG00000115880.1", "ENSMUSG00000023007.15", "ENSMUSG00000050891.10", "ENSMUSG00000055114.9", "ENSMUSG00000115463.1", "ENSMUSG00000115177.1", "ENSMUSG00000079020.9", "ENSMUSG00000086313.1", "ENSMUSG00000022466.6", "ENSMUSG00000115063.1"]
# Flank added on each side of a gene when selecting its DMRs (see calculate_gene).
slop = 250000
# Passed as n_pcs to sc.pp.neighbors.
n_pc = 10
# Passed as resolution to sc.tl.leiden.
resolution = 1


In [5]:
# Results go to a directory named after the chromosome, relative to the working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True lets the notebook be re-run without failing on an existing directory.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# Clustering summary table; the code below reads its CellClass and SubType columns.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Keep the SubType labels of rows whose CellClass is 'Exc' or 'Inh', drop any label
# containing 'Outlier', and replace spaces with underscores.
is_exc_or_inh = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
subtype_names = cell_tidy_data[is_exc_or_inh]['SubType'].unique()
use_clusters = []
for subtype_name in subtype_names:
    if 'Outlier' in subtype_name:
        continue
    use_clusters.append(subtype_name.replace(' ', '_'))
len(use_clusters)

145

## ATAC peaks

In [7]:
# Merged ATAC peak table, restricted to rows whose index label starts with "Sub<chrom>_".
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
chrom_prefix = f'Sub{chrom}_'
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(chrom_prefix)].copy()

## Gene Info

In [8]:
# Flat gene annotation table indexed by gene_id, reduced to genes on the selected chromosome.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)
gene_meta = gene_meta.loc[lambda table: table['chrom'] == chrom].copy()

In [9]:
# Header-less, tab-separated exon table; column names are assigned after loading.
# NOTE(review): exon_bed is not referenced again in the visible part of this notebook.
exon_bed_columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
)
exon_bed.columns = exon_bed_columns

## DMR Info

In [10]:
# 'Rate' table from the DMR info store, limited to DMR ids starting with "Sub<chrom>_".
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.str.startswith(f'Sub{chrom}_')].copy()

# DMR-gene correlation records, indexed by (DMR, Gene) and limited to the DMRs kept above.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
corr_dmr_ids = dmr_corr.index.get_level_values('DMR')
dmr_corr = dmr_corr.loc[corr_dmr_ids.isin(dmr_rate.index)].copy()

# DMR coordinates: the 4th bed column becomes the index, the first three are named here.
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    sep='\t',
    header=None,
    index_col=3,
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hits: rows restricted to the retained DMRs, then columns to use_clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix, rows restricted to the retained DMRs.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of rows in dmr_hits, i.e. DMRs kept after subsetting to dmr_rate.index.
dmr_hits.shape[0]

193576

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta``.

    Returns a 5-tuple ``(row label, chrom, start, end, strand)`` taken from the
    row of ``gene_meta`` selected by ``gene_id``.
    """
    record = gene_meta.loc[gene_id]
    fields = [record[column] for column in ('chrom', 'start', 'end', 'strand')]
    return (record.name, *fields)

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their rate profile across clusters.

    DMRs whose start and end both fall strictly inside the window
    ``(gene_start - slop, gene_end + slop)`` are selected from ``dmr_bed``.
    Their ``dmr_rate`` values over ``use_clusters`` are mean-imputed per column,
    scaled, reduced with PCA and partitioned with Leiden clustering.

    Parameters
    ----------
    gene_id
        Row label of the gene in ``gene_meta`` (resolved through ``get_gene``).

    Returns
    -------
    pandas.DataFrame
        ``adata.obs`` after clustering: one row per selected DMR, including the
        ``leiden`` assignment that ``get_annotation`` reads.

    Raises
    ------
    ValueError
        If no DMR lies inside the window. Very small DMR sets may still fail
        further down in scanpy; only the empty case is checked here.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]
    if related_dmr.empty:
        # Fail early with a readable message instead of letting an empty matrix
        # reach scanpy (log2(0) below would not even yield a valid neighbor count).
        raise ValueError(
            f'No DMR found within {slop} of gene {gene_id} '
            f'({gene_start}-{gene_end}); nothing to cluster.'
        )

    # NaN rates are replaced by the mean of the same column over the selected DMRs
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())

    # construct Adata: observations are DMRs, variables are the clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the clustering returned by ``calculate_gene`` and adds, for every
    DMR: its correlation with the gene, its position relative to the gene, flags
    derived from ``dmr_annot``, and the per-cluster hypo-DMR hits, DMR rates and
    ATAC peak values.

    Parameters
    ----------
    gene_id
        Row label of the gene in ``gene_meta`` (resolved through ``get_gene``).

    Returns
    -------
    pandas.DataFrame
        One row per DMR. Columns are the ``leiden`` assignment and the flags built
        here, followed by ``HypoDMR.*``, ``DMRRate.*`` and ``ATACPeak.*`` columns
        (one of each per entry of ``use_clusters``).
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation; DMRs without a record for this gene get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # DMR center relative to the gene, in units of gene length and oriented by
    # strand: 0 is the strand-aware gene start, 1 is the gene end
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols: a DMR is flagged when any column of the group is non-zero
    # NOTE(review): the groups are addressed by column position — assumes the
    # column order of dmr_annot matches these ranges; confirm against the file.
    te_column_groups = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_range in te_column_groups:
        te_columns = annot_df.columns[column_range]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: annotation flag AND inside this gene's body
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_feature_columns:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by leiden cluster, NaN filled with the column mean
    leiden_order = gene_cluster['leiden'].sort_values().index
    related_dmr_rate = dmr_rate.loc[leiden_order, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean())
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak (columns are renamed before the frame is collected)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [None]:
# Annotate every requested gene. A gene is skipped when its cluster-level file
# already exists; that file is written last, after the per-DMR detail file.
for gene in genes:
    print(gene)
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        detail_path = output_dir / f'{gene}.DMR_detail.msg'

        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(detail_path, compress='zlib')

        # per-leiden-cluster mean of every column
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000022353.10
ENSMUSG00000061731.9
ENSMUSG00000036800.8
ENSMUSG00000022377.16
ENSMUSG00000041708.12
ENSMUSG00000022270.16
ENSMUSG00000016664.16
ENSMUSG00000071757.10
ENSMUSG00000019146.2
ENSMUSG00000036158.12
ENSMUSG00000051359.15
ENSMUSG00000022441.17
ENSMUSG00000022321.15
ENSMUSG00000022376.8
ENSMUSG00000115157.1
ENSMUSG00000022261.6
ENSMUSG00000022306.9
ENSMUSG00000022231.10
ENSMUSG00000038679.16
ENSMUSG00000022237.17
ENSMUSG00000015002.18
ENSMUSG00000094296.1
ENSMUSG00000062373.8
ENSMUSG00000022416.15
ENSMUSG00000022372.14
ENSMUSG00000062760.10
ENSMUSG00000115729.1
ENSMUSG00000044250.8
ENSMUSG00000060429.12
ENSMUSG00000053469.14
ENSMUSG00000023026.16
ENSMUSG00000022378.14
ENSMUSG00000054863.9
ENSMUSG00000016552.13
ENSMUSG00000050439.7
ENSMUSG00000016763.15
ENSMUSG00000016624.15
ENSMUSG00000079022.10
ENSMUSG00000022312.11
ENSMUSG00000022362.13
ENSMUSG00000000552.10
ENSMUSG00000022419.16
ENSMUSG00000022265.7
ENSMUSG00000005268.20
ENSMUSG00000013846.10
ENSMUSG00000115850.1
ENS