In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Global figure styling: small labels and thin strokes.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)

# x and y ticks share the same settings; minor ticks are drawn 0.2 thinner than major ticks.
for tick_group in ('xtick', 'ytick'):
    rc(tick_group, labelsize=labelsize)
    rc(f'{tick_group}.major', width=linewidth)
    rc(f'{tick_group}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters. Every name here is reassigned by the "# Parameters" cell that follows.
# Chromosome used to subset gene_meta, dmr_bed, dmr_rate and atac_peak.
chrom = 'chr2'
# Genes to process; each entry is looked up in gene_meta's index.
# NOTE(review): gene_meta is indexed by 'gene_id' — confirm a symbol such as 'Lhx6' resolves there.
genes = ['Lhx6']
# Flank added on each side of a gene (in BED coordinate units) when selecting its DMRs.
slop = 250000
# Passed as n_pcs to sc.pp.neighbors.
n_pc = 10
# Passed as resolution to sc.tl.leiden.
resolution = 1

In [4]:
# Parameters
# NOTE(review): this looks like an injected parameters cell (e.g. papermill) — confirm.
# It reassigns chrom / genes / slop / n_pc / resolution, overriding the defaults set in the previous cell.
chrom = "chr12"
genes = ["ENSMUSG00000021061.15", "ENSMUSG00000033854.10", "ENSMUSG00000035954.10", "ENSMUSG00000084883.2", "ENSMUSG00000041992.9", "ENSMUSG00000046314.16", "ENSMUSG00000079055.10", "ENSMUSG00000033713.12", "ENSMUSG00000020576.10", "ENSMUSG00000048251.15", "ENSMUSG00000034574.10", "ENSMUSG00000021221.15", "ENSMUSG00000057963.9", "ENSMUSG00000044712.15", "ENSMUSG00000020598.16", "ENSMUSG00000020672.14", "ENSMUSG00000020961.15", "ENSMUSG00000044912.10", "ENSMUSG00000021224.15", "ENSMUSG00000034402.3", "ENSMUSG00000066392.11", "ENSMUSG00000021277.16", "ENSMUSG00000041771.14", "ENSMUSG00000021143.10", "ENSMUSG00000021115.15", "ENSMUSG00000097929.9", "ENSMUSG00000021262.15", "ENSMUSG00000020656.16", "ENSMUSG00000113550.1", "ENSMUSG00000020593.15", "ENSMUSG00000021182.16", "ENSMUSG00000021209.12", "ENSMUSG00000020955.9", "ENSMUSG00000020601.8", "ENSMUSG00000042724.8", "ENSMUSG00000052632.16", "ENSMUSG00000002900.16", "ENSMUSG00000021177.16", "ENSMUSG00000101930.1", "ENSMUSG00000042350.13", "ENSMUSG00000036095.11", "ENSMUSG00000058070.14", "ENSMUSG00000020673.14", "ENSMUSG00000096751.3", "ENSMUSG00000048285.9", "ENSMUSG00000071265.10", "ENSMUSG00000012609.19", "ENSMUSG00000002020.15", "ENSMUSG00000015143.15", "ENSMUSG00000021130.8", "ENSMUSG00000113586.1", "ENSMUSG00000025321.14", "ENSMUSG00000021255.17", "ENSMUSG00000021097.15", "ENSMUSG00000047414.6", "ENSMUSG00000098128.1", "ENSMUSG00000037679.9", "ENSMUSG00000042700.16", "ENSMUSG00000052125.16", "ENSMUSG00000054459.8", "ENSMUSG00000021010.8", "ENSMUSG00000021048.7", "ENSMUSG00000020564.17", "ENSMUSG00000021087.18", "ENSMUSG00000055917.15", "ENSMUSG00000071454.13", "ENSMUSG00000021254.9", "ENSMUSG00000004151.17", "ENSMUSG00000113115.1", "ENSMUSG00000020651.8", "ENSMUSG00000113669.1", "ENSMUSG00000040877.16", "ENSMUSG00000112967.1", "ENSMUSG00000042734.6", "ENSMUSG00000035860.9", "ENSMUSG00000021136.13", "ENSMUSG00000020589.17", "ENSMUSG00000035653.16", "ENSMUSG00000035021.13", "ENSMUSG00000020627.10", 
"ENSMUSG00000020674.17", "ENSMUSG00000035933.5", "ENSMUSG00000020642.13", "ENSMUSG00000052593.16", "ENSMUSG00000060716.7", "ENSMUSG00000071379.2", "ENSMUSG00000021294.7", "ENSMUSG00000084979.1", "ENSMUSG00000112410.1", "ENSMUSG00000021013.16", "ENSMUSG00000112404.1", "ENSMUSG00000094910.1", "ENSMUSG00000021133.9", "ENSMUSG00000037896.17", "ENSMUSG00000034271.16", "ENSMUSG00000018581.15", "ENSMUSG00000071234.3", "ENSMUSG00000113634.1", "ENSMUSG00000021180.9", "ENSMUSG00000043673.11", "ENSMUSG00000063450.14", "ENSMUSG00000113913.1", "ENSMUSG00000021259.5", "ENSMUSG00000111986.1", "ENSMUSG00000112054.1", "ENSMUSG00000113285.1", "ENSMUSG00000020577.17", "ENSMUSG00000021068.16", "ENSMUSG00000098158.2", "ENSMUSG00000034111.6", "ENSMUSG00000034601.17", "ENSMUSG00000102098.1", "ENSMUSG00000021108.18", "ENSMUSG00000112343.1", "ENSMUSG00000007867.9", "ENSMUSG00000037735.7", "ENSMUSG00000113425.1", "ENSMUSG00000046157.13", "ENSMUSG00000112685.1", "ENSMUSG00000021279.5", "ENSMUSG00000020646.17", "ENSMUSG00000041886.7", "ENSMUSG00000046782.14", "ENSMUSG00000020654.15", "ENSMUSG00000021176.6", "ENSMUSG00000047415.12", "ENSMUSG00000002688.8", "ENSMUSG00000021186.9", "ENSMUSG00000046768.13", "ENSMUSG00000086012.2", "ENSMUSG00000045404.16", "ENSMUSG00000048483.6", "ENSMUSG00000001225.12", "ENSMUSG00000020661.15", "ENSMUSG00000091803.7", "ENSMUSG00000112310.1", "ENSMUSG00000020658.10", "ENSMUSG00000021044.15", "ENSMUSG00000035181.6", "ENSMUSG00000021036.10", "ENSMUSG00000029878.6", "ENSMUSG00000036333.11", "ENSMUSG00000100192.1", "ENSMUSG00000047022.18", "ENSMUSG00000048387.8", "ENSMUSG00000020647.10", "ENSMUSG00000048004.15", "ENSMUSG00000055884.8", "ENSMUSG00000036613.8", "ENSMUSG00000092006.1", "ENSMUSG00000042507.15", "ENSMUSG00000041669.15", "ENSMUSG00000021187.14", "ENSMUSG00000112694.1", "ENSMUSG00000061947.10", "ENSMUSG00000079012.11", "ENSMUSG00000112784.1", "ENSMUSG00000021245.15", "ENSMUSG00000085820.7", "ENSMUSG00000113441.1", "ENSMUSG00000112112.1", 
"ENSMUSG00000097917.2", "ENSMUSG00000050103.18", "ENSMUSG00000060807.7", "ENSMUSG00000061911.15", "ENSMUSG00000114048.1", "ENSMUSG00000083193.1", "ENSMUSG00000113630.1", "ENSMUSG00000097071.2", "ENSMUSG00000113447.1", "ENSMUSG00000095174.1", "ENSMUSG00000021071.16", "ENSMUSG00000113986.1", "ENSMUSG00000062352.14", "ENSMUSG00000112421.1", "ENSMUSG00000012076.8", "ENSMUSG00000035133.9", "ENSMUSG00000021103.12", "ENSMUSG00000011171.11", "ENSMUSG00000021256.5", "ENSMUSG00000113134.1", "ENSMUSG00000021257.14", "ENSMUSG00000099907.1", "ENSMUSG00000021198.16", "ENSMUSG00000113272.1", "ENSMUSG00000071179.8", "ENSMUSG00000034145.14", "ENSMUSG00000034258.4", "ENSMUSG00000087700.2", "ENSMUSG00000020990.5", "ENSMUSG00000020609.14", "ENSMUSG00000113735.1", "ENSMUSG00000097157.2", "ENSMUSG00000002799.6", "ENSMUSG00000044456.16", "ENSMUSG00000113900.1", "ENSMUSG00000021253.7", "ENSMUSG00000112704.1", "ENSMUSG00000097160.2", "ENSMUSG00000047446.18", "ENSMUSG00000044067.7", "ENSMUSG00000033731.9", "ENSMUSG00000113867.1", "ENSMUSG00000079645.2", "ENSMUSG00000113235.1", "ENSMUSG00000113548.1", "ENSMUSG00000094002.3", "ENSMUSG00000066364.2", "ENSMUSG00000032705.9", "ENSMUSG00000113587.1", "ENSMUSG00000113087.1", "ENSMUSG00000020962.14", "ENSMUSG00000019256.17", "ENSMUSG00000066363.12", "ENSMUSG00000021033.11", "ENSMUSG00000100890.1", "ENSMUSG00000112792.1", "ENSMUSG00000114149.1", "ENSMUSG00000051726.6", "ENSMUSG00000085801.1", "ENSMUSG00000072791.11", "ENSMUSG00000021009.15", "ENSMUSG00000112139.1", "ENSMUSG00000113732.1", "ENSMUSG00000054302.15", "ENSMUSG00000079143.3", "ENSMUSG00000056359.6", "ENSMUSG00000113330.1", "ENSMUSG00000021139.17", "ENSMUSG00000021062.15", "ENSMUSG00000072849.10", "ENSMUSG00000021112.9", "ENSMUSG00000113575.1", "ENSMUSG00000093765.7", "ENSMUSG00000073242.4", "ENSMUSG00000113179.1", "ENSMUSG00000111925.1", "ENSMUSG00000113620.1", "ENSMUSG00000043061.10", "ENSMUSG00000054003.13", "ENSMUSG00000020638.8", "ENSMUSG00000021194.6", "ENSMUSG00000058260.2", 
"ENSMUSG00000114150.1", "ENSMUSG00000021051.10", "ENSMUSG00000113975.1", "ENSMUSG00000112852.1", "ENSMUSG00000113679.1", "ENSMUSG00000095953.2", "ENSMUSG00000086975.1", "ENSMUSG00000025323.10", "ENSMUSG00000071177.4", "ENSMUSG00000112865.1", "ENSMUSG00000097543.1", "ENSMUSG00000112258.1", "ENSMUSG00000113868.1", "ENSMUSG00000020608.7", "ENSMUSG00000041536.13", "ENSMUSG00000085004.2", "ENSMUSG00000111971.1", "ENSMUSG00000112483.1", "ENSMUSG00000020640.10", "ENSMUSG00000113694.1", "ENSMUSG00000020607.7", "ENSMUSG00000114001.1", "ENSMUSG00000112664.1", "ENSMUSG00000099407.1", "ENSMUSG00000059060.15", "ENSMUSG00000098530.1", "ENSMUSG00000021039.9", "ENSMUSG00000097902.2", "ENSMUSG00000059669.8", "ENSMUSG00000020650.15", "ENSMUSG00000113895.1", "ENSMUSG00000021057.15", "ENSMUSG00000044548.11", "ENSMUSG00000097494.7", "ENSMUSG00000113427.1", "ENSMUSG00000020669.15", "ENSMUSG00000020671.9", "ENSMUSG00000021244.15", "ENSMUSG00000044573.16", "ENSMUSG00000113927.1", "ENSMUSG00000033454.6", "ENSMUSG00000061458.9", "ENSMUSG00000113193.1", "ENSMUSG00000072825.11", "ENSMUSG00000098029.2", "ENSMUSG00000113930.1", "ENSMUSG00000097488.1", "ENSMUSG00000036295.5", "ENSMUSG00000056770.15", "ENSMUSG00000021275.16", "ENSMUSG00000113950.1", "ENSMUSG00000011148.14", "ENSMUSG00000113218.1", "ENSMUSG00000048833.9", "ENSMUSG00000079014.4", "ENSMUSG00000098172.7", "ENSMUSG00000021192.16", "ENSMUSG00000041481.16", "ENSMUSG00000020572.8", "ENSMUSG00000091583.1", "ENSMUSG00000085120.1", "ENSMUSG00000054150.12", "ENSMUSG00000086319.1", "ENSMUSG00000113268.1", "ENSMUSG00000113320.1", "ENSMUSG00000113434.1", "ENSMUSG00000112138.1", "ENSMUSG00000112071.1", "ENSMUSG00000091157.1", "ENSMUSG00000113666.1", "ENSMUSG00000021091.8", "ENSMUSG00000021242.9", "ENSMUSG00000113093.1", "ENSMUSG00000066643.12", "ENSMUSG00000089441.1", "ENSMUSG00000020628.16", "ENSMUSG00000112812.1", "ENSMUSG00000021285.15", "ENSMUSG00000098757.1", "ENSMUSG00000020566.19", "ENSMUSG00000072946.12", "ENSMUSG00000036655.8", 
"ENSMUSG00000020964.14", "ENSMUSG00000087109.1", "ENSMUSG00000040867.12", "ENSMUSG00000020580.10", "ENSMUSG00000102953.1", "ENSMUSG00000021090.16", "ENSMUSG00000041567.3", "ENSMUSG00000052609.9", "ENSMUSG00000021027.17", "ENSMUSG00000113208.1", "ENSMUSG00000098480.1", "ENSMUSG00000045690.8", "ENSMUSG00000093465.1"]
# Flank added on each side of a gene when selecting DMRs in calculate_gene.
slop = 250000
# Passed as n_pcs to sc.pp.neighbors in calculate_gene.
n_pc = 10
# Passed as resolution to sc.tl.leiden in calculate_gene.
resolution = 1


In [5]:
# Per-chromosome output directory, relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True makes re-running the notebook safe when the directory is already there.
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0;
# this notebook therefore needs an older pandas to run.
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types of excitatory / inhibitory cells, with spaces turned into underscores
# and any sub-type whose name contains 'Outlier' left out.
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in cell_tidy_data.loc[cell_tidy_data['CellClass'].isin(['Exc', 'Inh']), 'SubType'].unique()
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
# Load the merged ATAC peak table and keep the rows whose ID starts with 'Sub<chrom>_'.
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(f'Sub{chrom}_')].copy()

## Gene Info

In [8]:
# Gene annotation table indexed by gene_id, restricted to the chromosome under study.
gene_meta = (
    pd.read_csv(
        '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
        sep='\t',
        index_col='gene_id',
    )
    .loc[lambda table: table['chrom'] == chrom]
    .copy()
)

In [9]:
# Header-less exon BED file; column names are supplied at read time.
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'gene_id', 'gene_name'],
)

## DMR Info

In [10]:
# Per-DMR rate table, read from the 'Rate' key of the HDF store (opened read-only).
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
# Keep only the rows whose ID starts with 'Sub<chrom>_'; this index drives every subset below.
dmr_rate = dmr_rate.loc[dmr_rate.index.map(lambda i: i.startswith(f'Sub{chrom}_'))].copy()

# DMR-gene correlation table.
# NOTE(review): pd.read_msgpack was removed in pandas 1.0 — this requires an older pandas.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
)
# Index by (DMR, Gene) and keep only DMRs that are present in dmr_rate.
dmr_corr = dmr_corr.set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)].copy()

# Header-less DMR BED file: column 3 becomes the index, the remaining three are chrom/start/end.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hit matrix, subset to the rows of dmr_rate and then to the columns in use_clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix, subset to the rows of dmr_rate (all annotation columns kept).
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
dmr_hits.shape[0]

220589

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta``.

    Returns a 5-tuple ``(gene_id, chrom, start, end, strand)``; the first item
    is the index label of the matched row.
    """
    record = gene_meta.loc[gene_id]
    coordinates = tuple(record[field] for field in ('chrom', 'start', 'end', 'strand'))
    return (record.name,) + coordinates

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their rate profile across ``use_clusters``.

    DMRs are taken from ``dmr_bed`` when they lie entirely inside the window
    ``(gene_start - slop, gene_end + slop)``. Their rows of ``dmr_rate`` are
    scaled, reduced by PCA, turned into a neighbor graph and clustered with
    Leiden, using the module-level ``slop``, ``n_pc`` and ``resolution``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs`` of the constructed AnnData: indexed by DMR ID, carrying
        the ``leiden`` column written by ``sc.tl.leiden``.

    Raises
    ------
    ValueError
        If no DMR falls inside the window, since nothing can be clustered.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must fall strictly inside the slopped gene window
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]
    if related_dmr.shape[0] == 0:
        # Fail early with a readable message instead of an obscure PCA / log2(0) error.
        raise ValueError(f'No DMR found within {slop} bp of gene {gene_id}')

    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    # missing rates are replaced by the mean of the same column
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: DMRs are observations, clusters are variables
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden assignment returned by ``calculate_gene`` and adds,
    for every DMR in it:

    * ``Corr``: value from ``dmr_corr`` for this gene, 0 when the pair is absent;
    * ``reldist_tss``: (DMR center - TSS) / gene length, oriented along the
      gene's strand, so values strictly between 0 and 1 are inside the gene body;
    * ``in_gene_body``: whether ``reldist_tss`` lies strictly between 0 and 1;
    * ``is_*_te``: whether any column of the matching positional block of
      ``dmr_annot`` is non-zero;
    * ``in_intron`` / ``in_exon`` / ``in_utr3`` / ``in_utr5``: the annotation
      flag AND ``in_gene_body``;
    * ``feDMR`` / ``adultDMR``: annotation flags taken as-is;
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>``, ``ATACPeak.<cluster>``:
      per-cluster profiles from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, all of the columns above concatenated on the DMR index.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # correlation of each DMR with this gene; DMRs without an entry get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # strand-aware relative position of the DMR center, in units of gene length
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the TE families are addressed by column position in dmr_annot;
    # confirm these ranges against the annotation file's column order.
    te_column_blocks = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag, block in te_column_blocks:
        te_columns = annot_df.columns[block]
        gene_cluster[flag] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: annotation flag restricted to this gene's body
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag, annot_col in gene_feature_columns:
        gene_cluster[flag] = annot_df[annot_col].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by leiden label, missing values filled with the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak (columns are renamed before the frame is collected)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    # align everything on the DMR index
    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [14]:
# Annotate every requested gene and write two files per gene into output_dir.
# NOTE(review): DataFrame.to_msgpack was removed in pandas 1.0 — this requires an older pandas.
for gene in genes:
    print(gene)
    # The cluster summary is written last, so its presence marks a finished gene.
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # per-DMR table
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # per-leiden-cluster mean of every column
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000021061.15
ENSMUSG00000033854.10
ENSMUSG00000035954.10
ENSMUSG00000084883.2
ENSMUSG00000041992.9
ENSMUSG00000046314.16
ENSMUSG00000079055.10
ENSMUSG00000033713.12
ENSMUSG00000020576.10
ENSMUSG00000048251.15
ENSMUSG00000034574.10
ENSMUSG00000021221.15
ENSMUSG00000057963.9
ENSMUSG00000044712.15
ENSMUSG00000020598.16
ENSMUSG00000020672.14
ENSMUSG00000020961.15
ENSMUSG00000044912.10
ENSMUSG00000021224.15
ENSMUSG00000034402.3
ENSMUSG00000066392.11
ENSMUSG00000021277.16
ENSMUSG00000041771.14
ENSMUSG00000021143.10
ENSMUSG00000021115.15
ENSMUSG00000097929.9
ENSMUSG00000021262.15
ENSMUSG00000020656.16
ENSMUSG00000113550.1
ENSMUSG00000020593.15
ENSMUSG00000021182.16
ENSMUSG00000021209.12
ENSMUSG00000020955.9
ENSMUSG00000020601.8
ENSMUSG00000042724.8
ENSMUSG00000052632.16
ENSMUSG00000002900.16
ENSMUSG00000021177.16
ENSMUSG00000101930.1
ENSMUSG00000042350.13
ENSMUSG00000036095.11
ENSMUSG00000058070.14
ENSMUSG00000020673.14
ENSMUSG00000096751.3
ENSMUSG00000048285.9
ENSMUSG00000071265.10
