In [1]:
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
import pathlib

warnings.filterwarnings('ignore')

In [2]:
from matplotlib import rc
# Shared figure styling: small tick/axis labels and thin strokes.
labelsize = 6
linewidth = 0.6

rc('lines', linewidth=linewidth)
rc('axes', labelsize=labelsize, linewidth=linewidth)

# Both tick axes get identical settings; minor ticks are drawn 0.2 thinner
# than major ticks.
for tick_axis in ('xtick', 'ytick'):
    rc(tick_axis, labelsize=labelsize)
    rc(f'{tick_axis}.major', width=linewidth)
    rc(f'{tick_axis}.minor', width=linewidth - 0.2)

In [3]:
# Default run parameters. The next cell ("# Parameters") reassigns every one of
# these names, so the values here only matter when that cell is absent.
# NOTE(review): this looks like a papermill-style default-parameters cell - confirm.
chrom = 'chr2'  # chromosome whose DMRs, ATAC peaks and genes are loaded
genes = ['Lhx6']  # genes to process; each entry is used as a key into gene_meta
slop = 250000  # bp added on each side of the gene span when selecting DMRs
n_pc = 10  # number of principal components used to build the neighbor graph
resolution = 1  # resolution passed to Leiden clustering

In [4]:
# Parameters
# These assignments override the defaults set in the previous cell.
# NOTE(review): presumably injected by a notebook parameterization tool
# (e.g. papermill) - confirm before editing by hand.
chrom = "chr16"
genes = ["ENSMUSG00000057880.12", "ENSMUSG00000022685.9", "ENSMUSG00000043811.5", "ENSMUSG00000022840.9", "ENSMUSG00000047434.14", "ENSMUSG00000033306.15", "ENSMUSG00000050272.10", "ENSMUSG00000022790.14", "ENSMUSG00000053414.8", "ENSMUSG00000022494.15", "ENSMUSG00000086175.2", "ENSMUSG00000043301.16", "ENSMUSG00000090386.10", "ENSMUSG00000090223.2", "ENSMUSG00000051065.9", "ENSMUSG00000025612.5", "ENSMUSG00000035258.15", "ENSMUSG00000022656.15", "ENSMUSG00000022514.14", "ENSMUSG00000022861.17", "ENSMUSG00000035270.15", "ENSMUSG00000022762.18", "ENSMUSG00000048399.4", "ENSMUSG00000022687.12", "ENSMUSG00000047261.9", "ENSMUSG00000074892.9", "ENSMUSG00000117081.1", "ENSMUSG00000065979.12", "ENSMUSG00000055540.15", "ENSMUSG00000117000.1", "ENSMUSG00000022844.8", "ENSMUSG00000022701.18", "ENSMUSG00000022641.15", "ENSMUSG00000022637.11", "ENSMUSG00000115293.2", "ENSMUSG00000022537.18", "ENSMUSG00000022636.13", "ENSMUSG00000013089.15", "ENSMUSG00000005615.15", "ENSMUSG00000008658.16", "ENSMUSG00000115852.2", "ENSMUSG00000116029.1", "ENSMUSG00000052516.19", "ENSMUSG00000116720.1", "ENSMUSG00000022780.5", "ENSMUSG00000052504.7", "ENSMUSG00000022696.17", "ENSMUSG00000033653.18", "ENSMUSG00000022538.20", "ENSMUSG00000043391.10", "ENSMUSG00000022867.10", "ENSMUSG00000005958.15", "ENSMUSG00000022842.18", "ENSMUSG00000035376.10", "ENSMUSG00000038168.5", "ENSMUSG00000068196.5", "ENSMUSG00000033149.17", "ENSMUSG00000116574.1", "ENSMUSG00000093575.1", "ENSMUSG00000022519.14", "ENSMUSG00000040732.20", "ENSMUSG00000022887.9", "ENSMUSG00000050783.5", "ENSMUSG00000087526.2", "ENSMUSG00000035107.13", "ENSMUSG00000022885.16", "ENSMUSG00000055370.3", "ENSMUSG00000005982.14", "ENSMUSG00000116324.1", "ENSMUSG00000022723.16", "ENSMUSG00000117179.1", "ENSMUSG00000061751.16", "ENSMUSG00000116555.1", "ENSMUSG00000048490.14", "ENSMUSG00000035629.19", "ENSMUSG00000041957.15", "ENSMUSG00000033618.7", "ENSMUSG00000022836.11", "ENSMUSG00000036304.14", "ENSMUSG00000085499.1", 
"ENSMUSG00000022957.20", "ENSMUSG00000040605.7", "ENSMUSG00000022951.16", "ENSMUSG00000033355.6", "ENSMUSG00000022508.5", "ENSMUSG00000022510.14", "ENSMUSG00000068617.5", "ENSMUSG00000116930.1", "ENSMUSG00000101481.1", "ENSMUSG00000018830.10", "ENSMUSG00000032965.11", "ENSMUSG00000009569.14", "ENSMUSG00000052133.16", "ENSMUSG00000039680.10", "ENSMUSG00000022802.2", "ENSMUSG00000087141.1", "ENSMUSG00000062901.3", "ENSMUSG00000043008.9", "ENSMUSG00000092545.7", "ENSMUSG00000116933.1", "ENSMUSG00000116976.1", "ENSMUSG00000071552.5", "ENSMUSG00000115869.1", "ENSMUSG00000116610.1", "ENSMUSG00000022952.17", "ENSMUSG00000022755.4", "ENSMUSG00000005983.15", "ENSMUSG00000022533.14", "ENSMUSG00000022657.9", "ENSMUSG00000116885.1", "ENSMUSG00000022894.6", "ENSMUSG00000116469.1", "ENSMUSG00000022892.11", "ENSMUSG00000000325.16", "ENSMUSG00000022865.14", "ENSMUSG00000116588.1", "ENSMUSG00000022665.15", "ENSMUSG00000062082.16", "ENSMUSG00000022791.18", "ENSMUSG00000097307.1", "ENSMUSG00000033210.16", "ENSMUSG00000022848.8", "ENSMUSG00000035506.16", "ENSMUSG00000062609.14", "ENSMUSG00000075254.12", "ENSMUSG00000022747.17", "ENSMUSG00000116290.1", "ENSMUSG00000085826.1", "ENSMUSG00000116641.1", "ENSMUSG00000022704.15", "ENSMUSG00000116589.1", "ENSMUSG00000051980.13", "ENSMUSG00000022895.16", "ENSMUSG00000079546.1", "ENSMUSG00000116848.1", "ENSMUSG00000075704.16", "ENSMUSG00000097551.1", "ENSMUSG00000117324.1", "ENSMUSG00000033157.17", "ENSMUSG00000022876.18", "ENSMUSG00000022801.13", "ENSMUSG00000046961.8", "ENSMUSG00000046613.19", "ENSMUSG00000023088.17", "ENSMUSG00000078489.2", "ENSMUSG00000087605.1", "ENSMUSG00000030775.10", "ENSMUSG00000039179.13", "ENSMUSG00000116755.1", "ENSMUSG00000087384.1", "ENSMUSG00000022816.11", "ENSMUSG00000068284.14", "ENSMUSG00000022793.17", "ENSMUSG00000102967.1", "ENSMUSG00000083505.1", "ENSMUSG00000039903.17", "ENSMUSG00000116894.1", "ENSMUSG00000115219.2", "ENSMUSG00000116865.1", "ENSMUSG00000022863.15", "ENSMUSG00000022753.15", 
"ENSMUSG00000033581.17", "ENSMUSG00000116654.1", "ENSMUSG00000089874.1", "ENSMUSG00000022898.13", "ENSMUSG00000035356.17", "ENSMUSG00000049076.11", "ENSMUSG00000076039.1", "ENSMUSG00000055972.5", "ENSMUSG00000116689.1", "ENSMUSG00000085732.1", "ENSMUSG00000022964.14", "ENSMUSG00000003531.15", "ENSMUSG00000022799.4", "ENSMUSG00000022711.16", "ENSMUSG00000109783.1", "ENSMUSG00000043065.12", "ENSMUSG00000076372.2", "ENSMUSG00000068167.6", "ENSMUSG00000096960.1", "ENSMUSG00000022661.14", "ENSMUSG00000109857.1", "ENSMUSG00000116747.1", "ENSMUSG00000108903.1", "ENSMUSG00000022938.8", "ENSMUSG00000003526.12", "ENSMUSG00000022832.11", "ENSMUSG00000045275.16", "ENSMUSG00000048939.13", "ENSMUSG00000022821.13", "ENSMUSG00000038094.15", "ENSMUSG00000065494.1", "ENSMUSG00000117309.1", "ENSMUSG00000008393.9", "ENSMUSG00000103049.1", "ENSMUSG00000022512.3", "ENSMUSG00000000157.16", "ENSMUSG00000101463.1", "ENSMUSG00000099968.7", "ENSMUSG00000116937.1", "ENSMUSG00000116665.1", "ENSMUSG00000096918.2", "ENSMUSG00000022663.3", "ENSMUSG00000075395.11", "ENSMUSG00000102854.1", "ENSMUSG00000103932.1", "ENSMUSG00000022639.14", "ENSMUSG00000055447.19", "ENSMUSG00000096870.2", "ENSMUSG00000099293.1", "ENSMUSG00000025610.7", "ENSMUSG00000022864.14", "ENSMUSG00000068428.7", "ENSMUSG00000089774.2", "ENSMUSG00000022664.11", "ENSMUSG00000116684.1", "ENSMUSG00000116173.1", "ENSMUSG00000022754.6", "ENSMUSG00000034473.14", "ENSMUSG00000116536.1", "ENSMUSG00000102344.1", "ENSMUSG00000022658.10", "ENSMUSG00000003955.8", "ENSMUSG00000005899.14", "ENSMUSG00000075033.4", "ENSMUSG00000097764.1", "ENSMUSG00000045178.10", "ENSMUSG00000076064.1", "ENSMUSG00000116714.1", "ENSMUSG00000004366.4", "ENSMUSG00000038127.14", "ENSMUSG00000039789.7", "ENSMUSG00000075265.4", "ENSMUSG00000000386.16", "ENSMUSG00000115936.1", "ENSMUSG00000022680.14", "ENSMUSG00000044117.12", "ENSMUSG00000116752.1", "ENSMUSG00000116557.1", "ENSMUSG00000022900.14", "ENSMUSG00000022822.16", "ENSMUSG00000103916.1", "ENSMUSG00000116831.1", 
"ENSMUSG00000116766.1", "ENSMUSG00000071550.14", "ENSMUSG00000022949.9", "ENSMUSG00000089607.1", "ENSMUSG00000037991.9", "ENSMUSG00000004069.17", "ENSMUSG00000022548.14", "ENSMUSG00000022525.13", "ENSMUSG00000072419.5", "ENSMUSG00000000028.15", "ENSMUSG00000000385.8", "ENSMUSG00000022827.14", "ENSMUSG00000022962.15", "ENSMUSG00000091243.2", "ENSMUSG00000116964.1", "ENSMUSG00000039637.15", "ENSMUSG00000046598.15", "ENSMUSG00000022897.15", "ENSMUSG00000051669.6", "ENSMUSG00000022875.18", "ENSMUSG00000022544.14", "ENSMUSG00000046748.8", "ENSMUSG00000079620.13"]
# bp added on each side of the gene span when selecting DMRs (see calculate_gene)
slop = 250000
# number of principal components used when building the neighbor graph
n_pc = 10
# resolution passed to Leiden clustering
resolution = 1


In [5]:
# Per-gene results are written into a directory named after the chromosome,
# relative to the current working directory; an existing directory is reused.
output_dir = pathlib.Path(chrom)
output_dir.mkdir(exist_ok=True)

## Cell Meta

In [6]:
cell_tidy_data = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/Summary/TotalClusteringResults.msg'
)

# Keep the SubType labels of cells whose CellClass is 'Exc' or 'Inh', drop any
# label containing 'Outlier', and replace spaces with underscores.
neuron_mask = cell_tidy_data['CellClass'].isin(['Exc', 'Inh'])
neuron_subtypes = cell_tidy_data.loc[neuron_mask, 'SubType'].unique()
use_clusters = [
    subtype.replace(' ', '_')
    for subtype in neuron_subtypes
    if 'Outlier' not in subtype
]
len(use_clusters)

145

## ATAC peaks

In [7]:
atac_peak = pd.read_msgpack('/home/hanliu/project/mouse_rostral_brain/study/DMRCluster/SubType.ATAC_peak_merged.msg')
# Keep only rows whose id starts with 'Sub<chrom>_'; the trailing underscore
# keeps e.g. chr1 from also matching chr10.
atac_on_chrom = [peak_id.startswith(f'Sub{chrom}_') for peak_id in atac_peak.index]
atac_peak = atac_peak.loc[atac_on_chrom].copy()

## Gene Info

In [8]:
# Gene table indexed by gene_id, restricted to genes on the selected chromosome.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)
gene_on_chrom = gene_meta['chrom'] == chrom
gene_meta = gene_meta.loc[gene_on_chrom].copy()

In [9]:
# Headerless, tab-separated exon table; column names are assigned after loading.
exon_columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name']
exon_bed = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/genome_anno/exon.all.bed',
    sep='\t',
    header=None,
)
exon_bed.columns = exon_columns

## DMR Info

In [10]:
# --- DMR rate table ('Rate' key), restricted to ids starting with 'Sub<chrom>_' ---
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5', 'r') as hdf:
    dmr_rate = hdf['Rate']
dmr_on_chrom = [dmr_id.startswith(f'Sub{chrom}_') for dmr_id in dmr_rate.index]
dmr_rate = dmr_rate.loc[dmr_on_chrom].copy()

# --- DMR-gene correlation table, indexed by (DMR, Gene), limited to retained DMRs ---
dmr_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg'
).set_index(['DMR', 'Gene'])
corr_dmr_retained = dmr_corr.index.get_level_values('DMR').isin(dmr_rate.index)
dmr_corr = dmr_corr.loc[corr_dmr_retained].copy()

# --- DMR coordinates: headerless BED, 4th column becomes the index ---
dmr_bed = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalDMR.nofilter.bed',
    sep='\t',
    header=None,
    index_col=3,
)
dmr_bed.columns = ['chrom', 'start', 'end']
dmr_bed = dmr_bed.loc[dmr_bed['chrom'] == chrom].copy()

# --- Hypo-DMR hit matrix: rows limited to retained DMRs, columns to use_clusters ---
dmr_hits = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad')
dmr_hits = dmr_hits[dmr_rate.index, :].copy()
dmr_hits = dmr_hits[:, use_clusters].copy()

# --- DMR annotation matrix: rows limited to retained DMRs ---
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[dmr_rate.index, :].copy()

In [11]:
# Number of rows (DMRs) left in the hit matrix after subsetting to dmr_rate.index.
dmr_hits.shape[0]

184650

In [12]:
def get_gene(gene_id):
    """Look up one gene in ``gene_meta`` and return its coordinates.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    tuple
        ``(gene_id, chrom, start, end, strand)``, where the first element is
        the index label of the matched row.
    """
    record = gene_meta.loc[gene_id]
    chrom_name, start, end, strand = (
        record[field] for field in ('chrom', 'start', 'end', 'strand')
    )
    return record.name, chrom_name, start, end, strand

## Gene's DMR clustering

In [13]:
def calculate_gene(gene_id):
    """Cluster the DMRs around one gene by their rate profile across clusters.

    DMRs from ``dmr_bed`` that lie entirely inside the gene span extended by
    ``slop`` bp on each side are selected. Their ``dmr_rate`` rows (columns
    ``use_clusters``) are imputed with the per-column mean, scaled, reduced by
    PCA and clustered with Leiden on a neighbor graph built from the first
    ``n_pc`` components.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta`` (resolved via ``get_gene``).

    Returns
    -------
    pandas.DataFrame
        ``adata.obs``: one row per selected DMR, carrying the ``leiden``
        cluster label.

    Raises
    ------
    ValueError
        If no DMR falls inside the window. Previously this case failed later
        with an opaque error when ``log2(0)`` was converted to an integer.
    """
    # get gene information
    gene_id, _, gene_start, gene_end, _ = get_gene(gene_id)

    # select related DMRs: both ends must fall strictly inside the window
    window_start = gene_start - slop
    window_end = gene_end + slop
    related_dmr = dmr_bed[(dmr_bed['start'] > window_start) &
                          (dmr_bed['end'] < window_end)]
    if related_dmr.empty:
        raise ValueError(
            f'No DMR found within {slop} bp of gene {gene_id}; nothing to cluster.'
        )

    # missing rates are replaced by the mean of the same column
    related_dmr_rate = dmr_rate.loc[related_dmr.index, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)

    # construct Adata: observations are DMRs, variables are clusters
    adata = anndata.AnnData(X=related_dmr_rate.values.copy(),
                            obs=pd.DataFrame([], related_dmr_rate.index),
                            var=pd.DataFrame([], related_dmr_rate.columns))
    sc.pp.scale(adata)
    sc.pp.pca(adata)

    # neighborhood size grows with log2 of the number of DMRs
    n_neighbors = int(round(np.log2(adata.shape[0])))
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pc)
    sc.tl.leiden(adata, resolution=resolution)
    return adata.obs


def get_annotation(gene_id):
    """Build the per-DMR annotation table for one gene.

    Starts from the Leiden clustering returned by ``calculate_gene`` and adds,
    for every DMR in it:

    * ``Corr``: the value from ``dmr_corr`` for this (DMR, gene) pair, 0 when
      the pair is absent;
    * ``reldist_tss``: signed distance from the strand-aware gene start to the
      DMR center, as a fraction of gene length, and ``in_gene_body`` (strictly
      between 0 and 1);
    * ``is_*_te`` flags derived from fixed column ranges of ``dmr_annot``;
    * ``in_intron`` / ``in_exon`` / ``in_utr3`` / ``in_utr5``: the matching
      ``dmr_annot`` column AND ``in_gene_body``;
    * ``feDMR`` / ``adultDMR`` copied from ``dmr_annot``;
    * ``HypoDMR.<cluster>``, ``DMRRate.<cluster>`` and ``ATACPeak.<cluster>``
      columns taken from ``dmr_hits``, ``dmr_rate`` and ``atac_peak``.

    Parameters
    ----------
    gene_id
        Index label of the gene in ``gene_meta``.

    Returns
    -------
    pandas.DataFrame
        One row per DMR, all of the above concatenated column-wise.
    """
    gene_id, _, gene_start, gene_end, strand = get_gene(gene_id)
    gene_cluster = calculate_gene(gene_id)

    # DMR-gene correlation for this gene; DMRs without an entry get 0
    this_corr = dmr_corr[dmr_corr.index.get_level_values('Gene') == gene_id]['Corr']
    this_corr.index = this_corr.index.droplevel('Gene')
    gene_cluster['Corr'] = gene_cluster.index.map(this_corr).fillna(0)

    # relative position of each DMR center, measured from the strand-aware
    # gene start and expressed in units of gene length
    this_dmr_bed = dmr_bed.loc[gene_cluster.index]
    dmr_center = (this_dmr_bed['end'] + this_dmr_bed['start']) / 2
    gene_length = gene_end - gene_start
    if strand == '+':
        gene_cluster['reldist_tss'] = (dmr_center - gene_start) / gene_length
    else:
        gene_cluster['reldist_tss'] = (gene_end - dmr_center) / gene_length
    gene_cluster['in_gene_body'] = (gene_cluster['reldist_tss'] > 0) & (gene_cluster['reldist_tss'] < 1)

    this_annot = dmr_annot[gene_cluster.index]
    annot_df = pd.DataFrame(this_annot.X.todense(),
                            index=this_annot.obs_names, columns=this_annot.var_names)

    # annotate TE cols
    # NOTE(review): these positional ranges depend on the column order of
    # DMRAnnotation.h5ad - confirm they still match if that file is rebuilt.
    te_column_ranges = (
        ('is_dna_te', slice(20, 33)),
        ('is_line_te', slice(33, 39)),
        ('is_ltr_te', slice(39, 45)),
        ('is_sine_te', slice(45, 52)),
    )
    for flag_name, column_range in te_column_ranges:
        te_columns = annot_df.columns[column_range]
        gene_cluster[flag_name] = annot_df[te_columns].sum(axis=1) != 0

    # this dmr within GOI's gene feature
    gene_feature_columns = (
        ('in_intron', 'intron'),
        ('in_exon', 'exon'),
        ('in_utr3', 'UTR3'),
        ('in_utr5', 'UTR5'),
    )
    for flag_name, annot_column in gene_feature_columns:
        gene_cluster[flag_name] = annot_df[annot_column].astype(bool) & gene_cluster['in_gene_body']

    # previous mC study
    for annot_column in ('feDMR', 'adultDMR'):
        gene_cluster[annot_column] = annot_df[annot_column].astype(bool)

    other_profiles = []

    # DMR hypo call in each cluster
    this_hypo_hits = dmr_hits[gene_cluster.index]
    hits_df = pd.DataFrame(this_hypo_hits.X.todense(),
                           index=this_hypo_hits.obs_names, columns=this_hypo_hits.var_names)
    hits_df.columns = hits_df.columns.map(lambda i: f'HypoDMR.{i}')
    other_profiles.append(hits_df)

    # DMR rate, rows ordered by leiden label, missing values mean-imputed per column
    leiden_order = gene_cluster['leiden'].sort_values().index
    related_dmr_rate = dmr_rate.loc[leiden_order, use_clusters]
    related_dmr_rate = related_dmr_rate.fillna(related_dmr_rate.mean(), axis=0)
    related_dmr_rate.columns = related_dmr_rate.columns.map(lambda i: f'DMRRate.{i}')
    other_profiles.append(related_dmr_rate)

    # atac peak (columns are renamed before the frame is collected)
    atac_peak_df = atac_peak.loc[related_dmr_rate.index, use_clusters].copy()
    atac_peak_df.columns = atac_peak_df.columns.map(lambda i: f'ATACPeak.{i}')
    other_profiles.append(atac_peak_df)

    dmr_annotation = pd.concat([gene_cluster] + other_profiles, axis=1, sort=True)
    return dmr_annotation

In [None]:
# Process every requested gene. A gene is skipped when its cluster-level
# output already exists, so an interrupted run can be resumed.
for gene in genes:
    print(gene)
    check_path = output_dir / f'{gene}.DMR_cluster.msg'
    if not check_path.exists():
        # per-DMR table first ...
        dmr_annotation = get_annotation(gene)
        dmr_annotation.to_msgpack(output_dir / f'{gene}.DMR_detail.msg', compress='zlib')

        # ... then the per-leiden-cluster means, written last because this
        # file doubles as the completion marker checked above
        cluster_annotation = dmr_annotation.groupby('leiden').mean()
        cluster_annotation.to_msgpack(output_dir / f'{gene}.DMR_cluster.msg', compress='zlib')

ENSMUSG00000057880.12
ENSMUSG00000022685.9
ENSMUSG00000043811.5
ENSMUSG00000022840.9
ENSMUSG00000047434.14
ENSMUSG00000033306.15
ENSMUSG00000050272.10
ENSMUSG00000022790.14
ENSMUSG00000053414.8
ENSMUSG00000022494.15
ENSMUSG00000086175.2
ENSMUSG00000043301.16
ENSMUSG00000090386.10
ENSMUSG00000090223.2
ENSMUSG00000051065.9
ENSMUSG00000025612.5
ENSMUSG00000035258.15
ENSMUSG00000022656.15
ENSMUSG00000022514.14
ENSMUSG00000022861.17
ENSMUSG00000035270.15
ENSMUSG00000022762.18
ENSMUSG00000048399.4
ENSMUSG00000022687.12
ENSMUSG00000047261.9
ENSMUSG00000074892.9
ENSMUSG00000117081.1
ENSMUSG00000065979.12
ENSMUSG00000055540.15
ENSMUSG00000117000.1
ENSMUSG00000022844.8
ENSMUSG00000022701.18
ENSMUSG00000022641.15
ENSMUSG00000022637.11
ENSMUSG00000115293.2
ENSMUSG00000022537.18
ENSMUSG00000022636.13
ENSMUSG00000013089.15
ENSMUSG00000005615.15
ENSMUSG00000008658.16
ENSMUSG00000115852.2
ENSMUSG00000116029.1
ENSMUSG00000052516.19
ENSMUSG00000116720.1
ENSMUSG00000022780.5
ENSMUSG00000052504.7
ENSMUSG0