In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc

# Shared figure styling: small labels and thin lines.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
# x and y ticks get identical settings; minor ticks are drawn slightly thinner.
for _tick in ('xtick', 'ytick'):
    rc(_tick, labelsize=labelsize)
    rc(f'{_tick}.major', width=linewidth)
    rc(f'{_tick}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters; the "# Parameters" cell that follows re-assigns all of them.
# NOTE(review): get_gene() looks ids up in gene_meta, which is indexed by
# `gene_id`; a gene symbol such as 'Lhx6' presumably will not match - confirm.
chrom = 'chr2'  # chromosome used to subset the DMR, ATAC-peak and gene tables
genes = ['Lhx6']  # genes to process, one output pair per gene
slop = 250000  # bp added on each side of a gene when selecting nearby DMRs
n_pc = 10  # number of principal components passed to the neighbor graph
resolution = 1  # resolution passed to Leiden clustering

In [4]:
# Parameters
# NOTE(review): this cell looks tool-injected (e.g. by a notebook runner such
# as papermill - confirm); its values override the defaults assigned above.
chrom = "chr14"
genes = ["ENSMUSG00000035067.9", "ENSMUSG00000025555.14", "ENSMUSG00000037824.6", "ENSMUSG00000072294.5", "ENSMUSG00000071531.3", "ENSMUSG00000056296.17", "ENSMUSG00000063142.15", "ENSMUSG00000025558.18", "ENSMUSG00000021838.17", "ENSMUSG00000021823.10", "ENSMUSG00000007817.15", "ENSMUSG00000055639.16", "ENSMUSG00000017491.9", "ENSMUSG00000054509.7", "ENSMUSG00000059456.13", "ENSMUSG00000007989.7", "ENSMUSG00000021779.17", "ENSMUSG00000045201.6", "ENSMUSG00000032849.14", "ENSMUSG00000067995.4", "ENSMUSG00000060548.13", "ENSMUSG00000021782.14", "ENSMUSG00000033885.15", "ENSMUSG00000093528.1", "ENSMUSG00000091722.2", "ENSMUSG00000021990.15", "ENSMUSG00000015759.11", "ENSMUSG00000021820.18", "ENSMUSG00000022012.9", "ENSMUSG00000034532.9", "ENSMUSG00000022085.4", "ENSMUSG00000034522.10", "ENSMUSG00000045776.3", "ENSMUSG00000037628.10", "ENSMUSG00000041471.9", "ENSMUSG00000021947.11", "ENSMUSG00000033060.15", "ENSMUSG00000034731.11", "ENSMUSG00000068015.8", "ENSMUSG00000021745.14", "ENSMUSG00000035184.15", "ENSMUSG00000043004.13", "ENSMUSG00000021974.15", "ENSMUSG00000114504.1", "ENSMUSG00000089887.2", "ENSMUSG00000037572.17", "ENSMUSG00000063410.8", "ENSMUSG00000022139.17", "ENSMUSG00000045875.13", "ENSMUSG00000115363.1", "ENSMUSG00000087535.2", "ENSMUSG00000014547.10", "ENSMUSG00000063506.14", "ENSMUSG00000033083.16", "ENSMUSG00000021932.13", "ENSMUSG00000101797.1", "ENSMUSG00000114704.1", "ENSMUSG00000042156.16", "ENSMUSG00000040640.11", "ENSMUSG00000054391.12", "ENSMUSG00000021959.16", "ENSMUSG00000076049.4", "ENSMUSG00000021846.9", "ENSMUSG00000058317.12", "ENSMUSG00000035078.6", "ENSMUSG00000022014.15", "ENSMUSG00000021852.14", "ENSMUSG00000114942.1", "ENSMUSG00000090038.7", "ENSMUSG00000022002.2", "ENSMUSG00000093383.1", "ENSMUSG00000108841.1", "ENSMUSG00000041765.6", "ENSMUSG00000022105.6", "ENSMUSG00000022013.4", "ENSMUSG00000048281.9", "ENSMUSG00000115099.1", "ENSMUSG00000036242.15", "ENSMUSG00000115801.1", "ENSMUSG00000060012.9", "ENSMUSG00000022211.9", 
"ENSMUSG00000034997.5", "ENSMUSG00000092165.9", "ENSMUSG00000095493.2", "ENSMUSG00000064128.8", "ENSMUSG00000021794.16", "ENSMUSG00000021775.11", "ENSMUSG00000109446.1", "ENSMUSG00000035566.7", "ENSMUSG00000097136.1", "ENSMUSG00000005148.8", "ENSMUSG00000004562.16", "ENSMUSG00000115517.1", "ENSMUSG00000114436.1", "ENSMUSG00000021890.6", "ENSMUSG00000022053.13", "ENSMUSG00000048279.19", "ENSMUSG00000115424.1", "ENSMUSG00000050505.7", "ENSMUSG00000114774.1", "ENSMUSG00000079265.10", "ENSMUSG00000067973.11", "ENSMUSG00000075592.10", "ENSMUSG00000033487.14", "ENSMUSG00000071262.10", "ENSMUSG00000022021.14", "ENSMUSG00000045731.6", "ENSMUSG00000084902.2", "ENSMUSG00000115149.1", "ENSMUSG00000115016.1", "ENSMUSG00000037712.16", "ENSMUSG00000035095.11", "ENSMUSG00000022098.10", "ENSMUSG00000002325.15", "ENSMUSG00000072624.3", "ENSMUSG00000097601.2", "ENSMUSG00000044447.5", "ENSMUSG00000051506.17", "ENSMUSG00000085883.2", "ENSMUSG00000021796.15", "ENSMUSG00000075512.5", "ENSMUSG00000021913.8", "ENSMUSG00000115821.1", "ENSMUSG00000093497.1", "ENSMUSG00000051615.14", "ENSMUSG00000042567.20", "ENSMUSG00000085500.9", "ENSMUSG00000114794.1", "ENSMUSG00000022031.6", "ENSMUSG00000021958.5", "ENSMUSG00000104658.4", "ENSMUSG00000115639.1", "ENSMUSG00000055128.15", "ENSMUSG00000021892.14", "ENSMUSG00000063821.6", "ENSMUSG00000097589.9", "ENSMUSG00000085133.2", "ENSMUSG00000021998.16", "ENSMUSG00000072589.4", "ENSMUSG00000036339.18", "ENSMUSG00000091923.8", "ENSMUSG00000115338.2", "ENSMUSG00000021973.9", "ENSMUSG00000021943.7", "ENSMUSG00000115623.1", "ENSMUSG00000079244.3", "ENSMUSG00000086112.1", "ENSMUSG00000041594.18", "ENSMUSG00000021918.10", "ENSMUSG00000055717.13", "ENSMUSG00000103188.1", "ENSMUSG00000114230.1", "ENSMUSG00000040717.6", "ENSMUSG00000021798.14", "ENSMUSG00000021871.19", "ENSMUSG00000042104.18", "ENSMUSG00000037536.14", "ENSMUSG00000022092.11", "ENSMUSG00000114680.1", "ENSMUSG00000014496.8", "ENSMUSG00000114615.1", "ENSMUSG00000097927.2", "ENSMUSG00000022020.15", 
"ENSMUSG00000021767.18", "ENSMUSG00000021993.10", "ENSMUSG00000114595.1", "ENSMUSG00000021948.17", "ENSMUSG00000047441.4", "ENSMUSG00000115627.1", "ENSMUSG00000115257.1", "ENSMUSG00000021919.8", "ENSMUSG00000021975.8", "ENSMUSG00000072595.3", "ENSMUSG00000115527.1", "ENSMUSG00000046523.5", "ENSMUSG00000115479.1", "ENSMUSG00000021996.16", "ENSMUSG00000022100.14", "ENSMUSG00000030662.10", "ENSMUSG00000114378.1", "ENSMUSG00000041625.15", "ENSMUSG00000072686.2", "ENSMUSG00000115529.1", "ENSMUSG00000114760.1", "ENSMUSG00000021895.10", "ENSMUSG00000053253.16", "ENSMUSG00000022010.19", "ENSMUSG00000015968.18", "ENSMUSG00000053093.16", "ENSMUSG00000114818.1", "ENSMUSG00000093472.1", "ENSMUSG00000058690.14", "ENSMUSG00000097542.1", "ENSMUSG00000021733.11", "ENSMUSG00000022111.9", "ENSMUSG00000114786.1", "ENSMUSG00000114446.1", "ENSMUSG00000115532.1", "ENSMUSG00000022180.7", "ENSMUSG00000021751.13", "ENSMUSG00000035296.14", "ENSMUSG00000021750.15", "ENSMUSG00000021743.6", "ENSMUSG00000114878.1", "ENSMUSG00000110080.1", "ENSMUSG00000114412.1", "ENSMUSG00000037833.14", "ENSMUSG00000021879.13", "ENSMUSG00000093081.1", "ENSMUSG00000100486.7", "ENSMUSG00000006522.17", "ENSMUSG00000115002.1", "ENSMUSG00000022019.16", "ENSMUSG00000115186.1", "ENSMUSG00000048379.9", "ENSMUSG00000022108.8", "ENSMUSG00000098609.1", "ENSMUSG00000114996.1", "ENSMUSG00000115615.1", "ENSMUSG00000021953.14", "ENSMUSG00000098479.1", "ENSMUSG00000115043.1", "ENSMUSG00000022032.14", "ENSMUSG00000021792.15", "ENSMUSG00000022026.7", "ENSMUSG00000022185.19", "ENSMUSG00000037697.19", "ENSMUSG00000040040.16", "ENSMUSG00000109849.1", "ENSMUSG00000091809.4", "ENSMUSG00000115304.1", "ENSMUSG00000115726.1", "ENSMUSG00000054051.7", "ENSMUSG00000002324.8", "ENSMUSG00000114720.1", "ENSMUSG00000021752.13", "ENSMUSG00000057606.14", "ENSMUSG00000037544.14", "ENSMUSG00000033111.16", "ENSMUSG00000114924.1", "ENSMUSG00000051729.12", "ENSMUSG00000114758.1", "ENSMUSG00000022123.9", "ENSMUSG00000021994.15", 
"ENSMUSG00000114232.1", "ENSMUSG00000022015.8", "ENSMUSG00000094692.2", "ENSMUSG00000040651.9", "ENSMUSG00000022106.14", "ENSMUSG00000090706.1", "ENSMUSG00000115709.1", "ENSMUSG00000003469.9", "ENSMUSG00000021884.18", "ENSMUSG00000114690.1", "ENSMUSG00000022056.9", "ENSMUSG00000022096.14", "ENSMUSG00000115580.1", "ENSMUSG00000115611.1", "ENSMUSG00000114267.1", "ENSMUSG00000115735.1", "ENSMUSG00000115022.1", "ENSMUSG00000022052.9", "ENSMUSG00000006526.13", "ENSMUSG00000115121.1", "ENSMUSG00000115781.1", "ENSMUSG00000114271.1", "ENSMUSG00000006281.8", "ENSMUSG00000115610.1", "ENSMUSG00000114463.1", "ENSMUSG00000014453.4", "ENSMUSG00000025545.10", "ENSMUSG00000114925.1", "ENSMUSG00000022176.11", "ENSMUSG00000033644.5", "ENSMUSG00000037798.7", "ENSMUSG00000034959.8", "ENSMUSG00000092272.1", "ENSMUSG00000114539.1", "ENSMUSG00000048628.5", "ENSMUSG00000098630.1", "ENSMUSG00000114898.1", "ENSMUSG00000022018.7", "ENSMUSG00000040618.7", "ENSMUSG00000057903.4", "ENSMUSG00000061356.13", "ENSMUSG00000052584.10", "ENSMUSG00000021843.17", "ENSMUSG00000098682.8", "ENSMUSG00000061244.14", "ENSMUSG00000022054.11", "ENSMUSG00000055538.7", "ENSMUSG00000022212.15", "ENSMUSG00000044819.5", "ENSMUSG00000115092.1", "ENSMUSG00000021806.4", "ENSMUSG00000022048.8", "ENSMUSG00000115001.1", "ENSMUSG00000097775.1", "ENSMUSG00000099994.1", "ENSMUSG00000114697.1", "ENSMUSG00000021933.11", "ENSMUSG00000046049.7", "ENSMUSG00000037580.10", "ENSMUSG00000114586.1", "ENSMUSG00000021922.17", "ENSMUSG00000021930.14", "ENSMUSG00000046204.14", "ENSMUSG00000052395.17", "ENSMUSG00000022124.15", "ENSMUSG00000014813.9", "ENSMUSG00000021903.11", "ENSMUSG00000068680.1", "ENSMUSG00000114298.1", "ENSMUSG00000115311.1", "ENSMUSG00000021824.13", "ENSMUSG00000095765.3", "ENSMUSG00000115718.1", "ENSMUSG00000086341.2", "ENSMUSG00000044772.2", "ENSMUSG00000021273.11", "ENSMUSG00000021936.14", "ENSMUSG00000091079.1", "ENSMUSG00000114441.1", "ENSMUSG00000099528.1", "ENSMUSG00000034893.8", "ENSMUSG00000114390.1", 
"ENSMUSG00000072145.3", "ENSMUSG00000115449.1", "ENSMUSG00000115787.1", "ENSMUSG00000068245.14", "ENSMUSG00000114284.1", "ENSMUSG00000022094.15", "ENSMUSG00000021972.14", "ENSMUSG00000089761.3", "ENSMUSG00000091649.3", "ENSMUSG00000051969.7", "ENSMUSG00000083606.3", "ENSMUSG00000022024.10", "ENSMUSG00000025279.7", "ENSMUSG00000068417.8", "ENSMUSG00000091132.1", "ENSMUSG00000090374.1", "ENSMUSG00000021770.11", "ENSMUSG00000115596.1", "ENSMUSG00000006529.16", "ENSMUSG00000033595.7", "ENSMUSG00000021940.10", "ENSMUSG00000021950.15", "ENSMUSG00000021738.5"]
slop = 250000  # bp added on each side of a gene when selecting nearby DMRs
n_pc = 10  # number of principal components passed to the neighbor graph
resolution = 1  # resolution passed to Leiden clustering


In [5]:
# All per-gene result files are written into a folder named after the chromosome.
output_dir = pathlib.Path(chrom)
output_dir.mkdir(parents=False, exist_ok=True)

## Cell Meta

In [6]:
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types of the 'Exc' and 'Inh' cell classes, skipping outlier groups;
# spaces become underscores so the names can be used as column labels.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in cell_tidy_data.loc[neuron_mask, 'SubType'].unique()
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# Keep only the rows whose id starts with this chromosome's prefix.
chrom_prefix = f'Sub{chrom}_'
atac_peak = atac_peak.loc[
    [peak_id.startswith(chrom_prefix) for peak_id in atac_peak.index]
].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to the chromosome being processed.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)
gene_meta = gene_meta.loc[gene_meta['chrom'].eq(chrom)].copy()

In [9]:
# Header-less exon BED file; column names are assigned after loading.
exon_bed_path = '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed'
exon_bed = pd.read_csv(exon_bed_path, sep='\t', header=None)
exon_bed.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']

## DMR Info

In [10]:
# --- DMR rate table, limited to ids carrying this chromosome's prefix ---
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_prefix = f'Sub{chrom}_'
dmr_rate = dmr_rate.loc[
    [dmr_id.startswith(dmr_prefix) for dmr_id in dmr_rate.index]
].copy()

# --- DMR-gene correlation records, limited to DMRs kept in dmr_rate ---
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
corr_has_rate = dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)
dmr_corr = dmr_corr.loc[corr_has_rate].copy()

# --- DMR coordinates (4th BED column is the DMR id), this chromosome only ---
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    sep='\t',
    header=None,
    index_col=3,
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'].eq(chrom)].copy()

# --- Hypo-DMR hits: rows follow dmr_rate, columns follow use_clusters ---
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()[:, use_clusters].copy()

# --- DMR annotation matrix, rows follow dmr_rate ---
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of DMRs (observations) kept for this chromosome.
dmr_hits.n_obs

213045

In [12]:
def get_gene(gene_id):
    """Return ``(gene_id, chrom, start, end, strand)`` for one row of ``gene_meta``.

    The first element is the row's index label; the rest are read from the
    columns of the same name.
    """
    record = gene_meta.loc[gene_id]
    fields = ('chrom', 'start', 'end', 'strand')
    return (record.name,) + tuple(record[field] for field in fields)

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their per-cluster rate profile.

    DMRs from ``dmr_bed`` whose start and end both lie within ``slop`` bp of
    the gene are selected. Their rates (columns ``use_clusters`` of
    ``dmr_rate``) are imputed with the column mean, scaled, reduced by PCA and
    grouped with Leiden clustering on a neighbor graph built from ``n_pc``
    principal components.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: indexed by DMR id, with the cluster label in ``leiden``.

    Raises
    ------
    ValueError
        If no DMR with a row in ``dmr_rate`` falls inside the window.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: fully inside [gene_start - slop, gene_end + slop]
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_ids = dmr_bed.loc[in_window].index
    # dmr_bed may list DMRs that have no row in dmr_rate; dropping them keeps
    # .loc from raising KeyError (or silently adding all-NaN rows that the
    # mean-imputation below would turn into fake profiles).
    related_ids = related_ids[related_ids.isin(dmr_rate.index)]
    if len(related_ids) == 0:
        raise ValueError(
            f'No DMR with a rate record within {slop} bp of gene {gene_id}')

    related_dmr_rate = dmr_rate.loc[related_ids, use_clusters]
    # fill missing rates with the mean of the same column
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: DMRs are observations, cell clusters are variables
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame(index=related_dmr_rate.index),
                            var=pd.DataFrame(index=related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden labels returned by ``calculate_gene`` and adds, for
    every DMR: its correlation with the gene (0 when ``dmr_corr`` has no
    record), its position relative to the gene, annotation flags taken from
    ``dmr_annot``, and the per-cluster hypo-DMR calls, DMR rates and ATAC
    peak values.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        Indexed by DMR id. Columns: ``leiden``, ``Corr``, ``reldist_tss``,
        ``in_gene_body``, the ``is_*_te`` / ``in_*`` / ``feDMR`` /
        ``adultDMR`` flags, followed by ``HypoDMR.<cluster>``,
        ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>`` columns.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation; DMRs without a record for this gene get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # DMR center relative to the gene, measured from the strand-aware start
    # and expressed in units of gene length (0 = start, 1 = end of the gene)
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        reldist_tss = (dmr_center - gene_start) / gene_length
    else:
        reldist_tss = (gene_end - dmr_center) / gene_length
    gene_cluster['reldist_tss'] = reldist_tss
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names,
                            columns=this_annot.var_names)

    # annotate TE cols: a DMR is flagged when any column of the group is non-zero.
    # NOTE(review): the groups are positional slices of dmr_annot's columns,
    # presumably DNA / LINE / LTR / SINE families - confirm against var_names.
    te_groups = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag, column_slice in te_groups:
        te_columns = annot_df.columns[column_slice]
        gene_cluster[flag] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature: annotation flag AND inside the gene body
    gene_features = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag, feature in gene_features:
        gene_cluster[flag] = annot_df[feature].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    for flag in ('feDMR', 'adultDMR'):
        gene_cluster[flag] = annot_df[flag].astype(bool)

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names,
                           columns=this_hypo_hits.var_names)
    hits_df = hits_df.add_prefix('HypoDMR.')

    # DMR rate, missing values filled with the mean of the same column
    ordered_ids = gene_cluster['leiden'].sort_values().index
    related_dmr_rate = dmr_rate.loc[ordered_ids, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate = related_dmr_rate.add_prefix('DMRRate.')

    # atac peak; columns are renamed before the frame is collected
    atac_peak_df = atac_peak.loc[ordered_ids, use_clusters].add_prefix('ATACPeak.')

    dmr_annotation = pd.concat(
        [gene_cluster, hits_df, related_dmr_rate, atac_peak_df],
        axis=1, sort=True)
    return dmr_annotation

In [None]:
for gene in genes:
    print(gene)
    # The cluster-level file is written last, so its presence marks a gene
    # that was fully processed by an earlier run.
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # per-DMR table
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # per-cluster table: mean of every column within each Leiden cluster
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(check_path, compress='zlib')

ENSMUSG00000035067.9
ENSMUSG00000025555.14
ENSMUSG00000037824.6
ENSMUSG00000072294.5
ENSMUSG00000071531.3
ENSMUSG00000056296.17
ENSMUSG00000063142.15
ENSMUSG00000025558.18
ENSMUSG00000021838.17
ENSMUSG00000021823.10
ENSMUSG00000007817.15
ENSMUSG00000055639.16
ENSMUSG00000017491.9
ENSMUSG00000054509.7
ENSMUSG00000059456.13
ENSMUSG00000007989.7
ENSMUSG00000021779.17
ENSMUSG00000045201.6
ENSMUSG00000032849.14
ENSMUSG00000067995.4
ENSMUSG00000060548.13
ENSMUSG00000021782.14
ENSMUSG00000033885.15
ENSMUSG00000093528.1
ENSMUSG00000091722.2
ENSMUSG00000021990.15
ENSMUSG00000015759.11
ENSMUSG00000021820.18
ENSMUSG00000022012.9
ENSMUSG00000034532.9
ENSMUSG00000022085.4
ENSMUSG00000034522.10
ENSMUSG00000045776.3
ENSMUSG00000037628.10
ENSMUSG00000041471.9
ENSMUSG00000021947.11
ENSMUSG00000033060.15
ENSMUSG00000034731.11
ENSMUSG00000068015.8
ENSMUSG00000021745.14
ENSMUSG00000035184.15
ENSMUSG00000043004.13
ENSMUSG00000021974.15
ENSMUSG00000114504.1
ENSMUSG00000089887.2
ENSMUSG00000037572.17
ENSMUSG