In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc
# Global matplotlib styling: small labels and thin lines for compact figures.
labelsize = 6
linewidth = 0.6
rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)
# x and y ticks share the same settings; minor ticks are drawn slightly thinner
for tick_axis in ('xtick', 'ytick'):
    rc(tick_axis, labelsize=labelsize)
    rc(f'{tick_axis}.major', width=linewidth)
    rc(f'{tick_axis}.minor', width=linewidth - 0.2)

In [3]:
# Default parameters. The "# Parameters" cell that follows reassigns every one
# of these names, so the values here only matter when that cell is absent.
chrom = 'chr2'  # chromosome whose genes / DMRs are processed
# NOTE(review): get_gene() looks genes up with gene_meta.loc[...] on the
# 'gene_id' index, so a gene symbol such as 'Lhx6' presumably will not resolve
# here -- confirm and replace with a gene_id if this default is ever used.
genes = ['Lhx6']
slop = 250000  # flank added on each side of a gene when selecting its DMRs
n_pc = 10  # number of principal components used for the neighbor graph
resolution = 1  # resolution passed to leiden clustering

In [4]:
# Parameters
# Run-specific values: every assignment in this cell replaces the default of
# the same name set in the previous cell.
# NOTE(review): presumably injected by a notebook-parameterization tool
# (e.g. papermill) -- confirm before editing by hand.
chrom = "chr18"
genes = ["ENSMUSG00000053477.17", "ENSMUSG00000032735.15", "ENSMUSG00000052713.9", "ENSMUSG00000046318.16", "ENSMUSG00000024524.17", "ENSMUSG00000024238.15", "ENSMUSG00000118257.1", "ENSMUSG00000060275.13", "ENSMUSG00000026322.9", "ENSMUSG00000038128.7", "ENSMUSG00000058881.13", "ENSMUSG00000025425.18", "ENSMUSG00000024566.10", "ENSMUSG00000041607.17", "ENSMUSG00000117764.1", "ENSMUSG00000044252.18", "ENSMUSG00000045215.16", "ENSMUSG00000024424.15", "ENSMUSG00000024236.18", "ENSMUSG00000024544.9", "ENSMUSG00000024347.16", "ENSMUSG00000040957.15", "ENSMUSG00000049411.15", "ENSMUSG00000036103.9", "ENSMUSG00000117450.1", "ENSMUSG00000024431.15", "ENSMUSG00000019647.16", "ENSMUSG00000050321.3", "ENSMUSG00000024304.14", "ENSMUSG00000117521.1", "ENSMUSG00000052928.9", "ENSMUSG00000024511.15", "ENSMUSG00000034006.17", "ENSMUSG00000048410.17", "ENSMUSG00000043079.17", "ENSMUSG00000086607.7", "ENSMUSG00000071856.10", "ENSMUSG00000024558.12", "ENSMUSG00000061013.6", "ENSMUSG00000036452.18", "ENSMUSG00000036225.15", "ENSMUSG00000042942.18", "ENSMUSG00000024590.8", "ENSMUSG00000057719.11", "ENSMUSG00000047989.11", "ENSMUSG00000024593.15", "ENSMUSG00000024597.11", "ENSMUSG00000024270.11", "ENSMUSG00000061802.6", "ENSMUSG00000046668.9", "ENSMUSG00000024601.9", "ENSMUSG00000024302.16", "ENSMUSG00000024427.7", "ENSMUSG00000024268.16", "ENSMUSG00000042705.9", "ENSMUSG00000117841.1", "ENSMUSG00000041923.15", "ENSMUSG00000036585.16", "ENSMUSG00000073565.5", "ENSMUSG00000033871.14", "ENSMUSG00000057766.14", "ENSMUSG00000052026.8", "ENSMUSG00000039529.9", "ENSMUSG00000044646.15", "ENSMUSG00000118193.1", "ENSMUSG00000096948.2", "ENSMUSG00000073551.5", "ENSMUSG00000046982.11", "ENSMUSG00000039954.9", "ENSMUSG00000024535.16", "ENSMUSG00000024507.6", "ENSMUSG00000056124.5", "ENSMUSG00000118056.1", "ENSMUSG00000117792.1", "ENSMUSG00000117891.1", "ENSMUSG00000000420.15", "ENSMUSG00000024421.16", "ENSMUSG00000024592.15", "ENSMUSG00000037013.16", "ENSMUSG00000056214.9", "ENSMUSG00000024376.7", 
"ENSMUSG00000032818.16", "ENSMUSG00000097043.1", "ENSMUSG00000024598.9", "ENSMUSG00000114891.1", "ENSMUSG00000033016.16", "ENSMUSG00000110211.1", "ENSMUSG00000097707.1", "ENSMUSG00000024513.16", "ENSMUSG00000050945.8", "ENSMUSG00000086312.1", "ENSMUSG00000037815.7", "ENSMUSG00000041482.17", "ENSMUSG00000056153.15", "ENSMUSG00000036880.10", "ENSMUSG00000117733.1", "ENSMUSG00000024420.9", "ENSMUSG00000025420.13", "ENSMUSG00000024546.6", "ENSMUSG00000024617.16", "ENSMUSG00000024388.10", "ENSMUSG00000024594.9", "ENSMUSG00000024260.14", "ENSMUSG00000051375.15", "ENSMUSG00000048799.8", "ENSMUSG00000117840.1", "ENSMUSG00000087597.1", "ENSMUSG00000118120.1", "ENSMUSG00000033628.15", "ENSMUSG00000045629.8", "ENSMUSG00000042834.15", "ENSMUSG00000024563.16", "ENSMUSG00000118380.1", "ENSMUSG00000118002.1", "ENSMUSG00000025880.11", "ENSMUSG00000024533.17", "ENSMUSG00000117984.1", "ENSMUSG00000118013.1", "ENSMUSG00000117968.1", "ENSMUSG00000117701.1", "ENSMUSG00000024472.9", "ENSMUSG00000039616.10", "ENSMUSG00000025427.15", "ENSMUSG00000117719.1", "ENSMUSG00000117574.1", "ENSMUSG00000024600.8", "ENSMUSG00000085800.7", "ENSMUSG00000040560.10", "ENSMUSG00000117994.1", "ENSMUSG00000056671.7", "ENSMUSG00000117895.1", "ENSMUSG00000041915.9", "ENSMUSG00000109232.1", "ENSMUSG00000118102.1", "ENSMUSG00000117852.1", "ENSMUSG00000035765.10", "ENSMUSG00000117652.1", "ENSMUSG00000024613.16", "ENSMUSG00000097222.2", "ENSMUSG00000118154.1", "ENSMUSG00000045094.8", "ENSMUSG00000024575.16", "ENSMUSG00000085055.2", "ENSMUSG00000024539.17", "ENSMUSG00000024481.5", "ENSMUSG00000024646.14", "ENSMUSG00000071847.13", "ENSMUSG00000024471.12", "ENSMUSG00000117618.1", "ENSMUSG00000118227.1", "ENSMUSG00000117919.1", "ENSMUSG00000096982.2", "ENSMUSG00000088659.1", "ENSMUSG00000044393.15", "ENSMUSG00000024542.9", "ENSMUSG00000118131.1", "ENSMUSG00000032688.9", "ENSMUSG00000024525.7", "ENSMUSG00000086333.1", "ENSMUSG00000024261.6", "ENSMUSG00000118315.1", "ENSMUSG00000102959.1", "ENSMUSG00000042514.11", 
"ENSMUSG00000090523.2", "ENSMUSG00000024395.9", "ENSMUSG00000038059.7", "ENSMUSG00000024528.8", "ENSMUSG00000102697.1", "ENSMUSG00000118391.1", "ENSMUSG00000025421.15", "ENSMUSG00000117732.1", "ENSMUSG00000054008.9", "ENSMUSG00000054321.7", "ENSMUSG00000118300.1", "ENSMUSG00000117478.1", "ENSMUSG00000097440.2", "ENSMUSG00000118347.1", "ENSMUSG00000024502.10", "ENSMUSG00000024413.14", "ENSMUSG00000025423.16", "ENSMUSG00000118254.1", "ENSMUSG00000117456.1", "ENSMUSG00000049090.3", "ENSMUSG00000118024.1", "ENSMUSG00000024313.8", "ENSMUSG00000117926.1", "ENSMUSG00000024383.9", "ENSMUSG00000036412.5", "ENSMUSG00000062210.13", "ENSMUSG00000117430.1", "ENSMUSG00000118202.1", "ENSMUSG00000055561.9", "ENSMUSG00000082784.1", "ENSMUSG00000062526.4", "ENSMUSG00000118365.1", "ENSMUSG00000118361.1", "ENSMUSG00000053846.5", "ENSMUSG00000117943.1", "ENSMUSG00000041258.18", "ENSMUSG00000024516.13", "ENSMUSG00000117727.1", "ENSMUSG00000096934.1", "ENSMUSG00000097593.2", "ENSMUSG00000002475.16", "ENSMUSG00000033022.8", "ENSMUSG00000103255.1", "ENSMUSG00000117864.1", "ENSMUSG00000117573.1", "ENSMUSG00000117923.1", "ENSMUSG00000117410.1", "ENSMUSG00000073532.3", "ENSMUSG00000118362.1", "ENSMUSG00000089744.2", "ENSMUSG00000044176.12", "ENSMUSG00000099157.1", "ENSMUSG00000024647.14", "ENSMUSG00000118108.1", "ENSMUSG00000024556.4", "ENSMUSG00000024480.9", "ENSMUSG00000085289.1", "ENSMUSG00000050875.11", "ENSMUSG00000024486.6", "ENSMUSG00000104521.1", "ENSMUSG00000024515.13", "ENSMUSG00000118094.1", "ENSMUSG00000024505.16"]
slop = 250000
n_pc = 10
resolution = 1


In [5]:
# Per-chromosome output directory, relative to the current working directory.
output_dir = pathlib.Path(chrom)
# exist_ok=True makes re-running the notebook safe when the directory is already there
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Sub-types belonging to the 'Exc' / 'Inh' cell classes, skipping any whose
# name contains 'Outlier'; spaces become underscores in the final labels.
exc_inh_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
exc_inh_subtypes = cell_tidy_data.loc[exc_inh_mask, 'SubType'].unique()
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in exc_inh_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# keep only the rows whose index label starts with this chromosome's prefix
chrom_prefix = f'Sub{chrom}_'
atac_peak = atac_peak.loc[atac_peak.index.str.startswith(chrom_prefix)].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to genes on the current chromosome.
gene_meta = (
    pd.read_csv(
        '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
        index_col='gene_id', sep='\t'
    )
    .loc[lambda table: table['chrom'] == chrom]
    .copy()
)

In [9]:
# Header-less exon BED file; column names are supplied at read time.
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    header=None,
    sep='\t',
    names=['chrom', 'start', 'end', 'gene_id', 'gene_name'],
)

## DMR Info

In [10]:
# Load every DMR-level table and restrict each one to the current chromosome.
# dmr_rate is filtered first because its index is reused below to subset
# dmr_corr, dmr_hits and dmr_annot.

# 'Rate' table from the DMR HDF5 store; keep rows whose index label starts
# with 'Sub<chrom>_'.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_rate = dmr_rate.loc[dmr_rate.index.map(lambda i: i.startswith(f'Sub{chrom}_'))].copy()

# DMR-gene correlation records, indexed by (DMR, Gene) and limited to the DMRs
# kept in dmr_rate.
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
)
dmr_corr = dmr_corr.set_index(['DMR', 'Gene'])
dmr_corr = dmr_corr.loc[dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)].copy()

# DMR coordinates: the 4th BED column becomes the index, the remaining three
# are named chrom/start/end; rows are filtered on the chrom column.
dmr_bed = pd.read_csv('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
                      sep='\t', header=None, index_col=3)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed[dmr_bed['chrom'] == chrom].copy()

# Hypo-DMR hits: rows subset to dmr_rate's index, columns to use_clusters.
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# DMR annotation matrix: rows subset to dmr_rate's index, all columns kept.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# number of rows left in dmr_hits after subsetting to dmr_rate's index
dmr_hits.shape[0]

159776

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta``.

    Returns a 5-tuple ``(gene_id, chrom, start, end, strand)`` where the first
    item is the index label of the selected row and the others are read from
    the columns of the same name.
    """
    record = gene_meta.loc[gene_id]
    chrom_name, start, end, strand = (
        record[column] for column in ('chrom', 'start', 'end', 'strand')
    )
    return record.name, chrom_name, start, end, strand

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs surrounding one gene by their per-cluster rate profile.

    DMRs are selected from ``dmr_bed`` when they lie strictly inside the gene
    interval extended by ``slop`` on both sides. Their rows of ``dmr_rate``
    (columns ``use_clusters``) are NaN-filled with the column mean, scaled,
    reduced with PCA and clustered with leiden at ``resolution``.

    Parameters
    ----------
    gene_id
        Label looked up through ``get_gene``.

    Returns
    -------
    pandas.DataFrame
        ``adata.obs`` indexed by DMR, carrying the ``leiden`` column added by
        ``sc.tl.leiden``.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: fully contained in [gene_start - slop, gene_end + slop]
    in_window = ((dmr_bed['start'] > gene_start - slop) &
                 (dmr_bed['end'] < gene_end + slop))
    related_dmr = dmr_bed[in_window]

    # missing rates are replaced by the mean of the same column
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata (DMRs as observations, clusters as variables)
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # PCA may return fewer components than n_pc when only a handful of DMRs
    # are selected; never request more PCs than were actually computed.
    n_pcs_available = adata.obsm['X_pca'].shape[1]
    sc.pp.neighbors(adata,
                    n_neighbors=int(round(np.log2(adata.shape[0]))),
                    n_pcs=min(n_pc, n_pcs_available))
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the leiden assignment returned by ``calculate_gene`` and adds,
    for every DMR: its correlation with the gene, its position relative to the
    gene, transposable-element / gene-feature / previous-study flags taken
    from ``dmr_annot``, and the per-cluster hypo-DMR hits, DMR rates and ATAC
    values.

    Parameters
    ----------
    gene_id
        Label looked up through ``get_gene``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR. Columns are the annotation columns followed by
        ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>``
        blocks.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation for this gene; DMRs without a record get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # DMR centre relative to the gene, in units of gene length and measured
    # from gene_start on '+' and from gene_end otherwise; values strictly
    # between 0 and 1 fall inside the gene interval
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): the slices below are positional and assume a fixed column
    # order in DMRAnnotation.h5ad -- confirm against dmr_annot.var_names.
    dna_te = annot_df.columns[20:33]
    gene_cluster['is_dna_te'] = annot_df[dna_te].sum(axis=1) != 0

    line_te = annot_df.columns[33:39]
    gene_cluster['is_line_te'] = annot_df[line_te].sum(axis=1) != 0

    ltr_te = annot_df.columns[39:45]
    gene_cluster['is_ltr_te'] = annot_df[ltr_te].sum(axis=1) != 0

    sine_te = annot_df.columns[45:52]
    gene_cluster['is_sine_te'] = annot_df[sine_te].sum(axis=1) != 0

    # this dmr within GOI's gene feature: the feature flag only counts when
    # the DMR also lies inside this gene's interval
    gene_cluster['in_intron'] = annot_df['intron'].astype(bool) & gene_cluster['in_gene_body']
    gene_cluster['in_exon'] = annot_df['exon'].astype(bool) & gene_cluster['in_gene_body']
    gene_cluster['in_utr3'] = annot_df['UTR3'].astype(bool) & gene_cluster['in_gene_body']
    gene_cluster['in_utr5'] = annot_df['UTR5'].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    gene_cluster['feDMR'] = annot_df['feDMR'].astype(bool)
    gene_cluster['adultDMR'] = annot_df['adultDMR'].astype(bool)

    other_profiles = []
    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by leiden label, NaN replaced by the column mean
    related_dmr_rate = dmr_rate.loc[gene_cluster['leiden'].sort_values().index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak: prefix the columns before appending so the list never holds
    # a frame with un-prefixed cluster names
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    # align everything on the DMR index
    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [15]:
# Annotate every requested gene and write two files per gene:
# the per-DMR table and its per-leiden-cluster mean.
for gene in genes:
    print(gene)
    detail_path = output_dir / f'{gene}.DMR_detail.msg'
    cluster_path = output_dir / f'{gene}.DMR_cluster.msg'

    # the cluster summary is written last, so its presence marks a finished gene
    if not cluster_path.exists():
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(detail_path, compress='zlib')

        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(cluster_path, compress='zlib')

ENSMUSG00000053477.17
ENSMUSG00000032735.15
ENSMUSG00000052713.9
ENSMUSG00000046318.16
ENSMUSG00000024524.17
ENSMUSG00000024238.15
ENSMUSG00000118257.1
ENSMUSG00000060275.13
ENSMUSG00000026322.9
ENSMUSG00000038128.7
ENSMUSG00000058881.13
ENSMUSG00000025425.18
ENSMUSG00000024566.10
ENSMUSG00000041607.17
ENSMUSG00000117764.1
ENSMUSG00000044252.18
ENSMUSG00000045215.16
ENSMUSG00000024424.15
ENSMUSG00000024236.18
ENSMUSG00000024544.9
ENSMUSG00000024347.16
ENSMUSG00000040957.15
ENSMUSG00000049411.15
ENSMUSG00000036103.9
ENSMUSG00000117450.1
ENSMUSG00000024431.15
ENSMUSG00000019647.16
ENSMUSG00000050321.3
ENSMUSG00000024304.14
ENSMUSG00000117521.1
ENSMUSG00000052928.9
ENSMUSG00000024511.15
ENSMUSG00000034006.17
ENSMUSG00000048410.17
ENSMUSG00000043079.17
ENSMUSG00000086607.7
ENSMUSG00000071856.10
ENSMUSG00000024558.12
ENSMUSG00000061013.6
ENSMUSG00000036452.18
ENSMUSG00000036225.15
ENSMUSG00000042942.18
ENSMUSG00000024590.8
ENSMUSG00000057719.11
ENSMUSG00000047989.11
ENSMUSG00000024593.15
EN