In [1]:
import pandas as pd
import pybedtools
import pathlib
import anndata
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import numpy as np

In [2]:
# Region-specific DMR BED files: every `*.bed` file found in the overlap directory.
region_dmr_dir = pathlib.Path('ITSpatialDMR_overlap_region_specific_dmr')
dmr_list = [bed_file for bed_file in region_dmr_dir.glob('*.bed')]

# Gene-DMR correlation results: every `*h5ad` file in the Corr directory is
# read into memory as an AnnData object.
gene_corr_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/gene_dmr_corr/Corr/'
)
corr_adata_list = [h5ad_file for h5ad_file in gene_corr_dir.glob('*h5ad')]
adata_list = list(map(anndata.read_h5ad, corr_adata_list))

## Get DMRs with hits for selected TF motifs

In [3]:
# Motif scan results, loaded as an AnnData object; later cells index its
# columns by motif UID and its rows by DMR name.
motif_scan_path = '/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial/MotifScan.h5ad'
motif_hits = anndata.read_h5ad(motif_scan_path)

In [4]:
# Motif metadata table, indexed by motif UID.
motif_meta_with_cluster = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/MotifClustering/JASPAR2020_CORE_vertebrates_non-redundant.mouse_genes.with_motif_group.199.csv',
    index_col='motif_uid'
)
# TFClass table; the columns used below are 'EnsemblID', 'Family' and 'SubFamily'.
tf_class = pd.read_csv('/home/hanliu/ref/TFGene/TFClass/TFClass.with_mouse_gene_id.csv')


def _build_gene_to_tf_class(tf_class_table, level):
    """Map every Ensembl gene ID in ``tf_class_table`` to its ``level`` value.

    Parameters
    ----------
    tf_class_table : pandas.DataFrame
        Table with an 'EnsemblID' column holding comma-separated gene IDs and
        a column named ``level``.
    level : str
        Name of the column whose value is assigned to each gene ID.

    Returns
    -------
    dict
        ``{gene_id: row[level]}``. When a gene ID appears in several rows, the
        last row wins.
    """
    mapping = {}
    for _, row in tf_class_table.iterrows():
        ensembl_ids = row['EnsemblID']
        # Rows whose 'EnsemblID' is a float (i.e. a missing value) carry no gene IDs.
        if isinstance(ensembl_ids, float):
            continue
        for gene_id in ensembl_ids.split(','):
            mapping[gene_id] = row[level]
    return mapping


def _annotate_motif_class(gene_ids, gene_to_class):
    """Look up the class of each motif from its ``gene_ids`` entry.

    Only the text before the first '.' of each entry is used as the lookup
    key; entries without a match get an empty string.
    """
    return gene_ids.map(lambda i: gene_to_class.get(i.split('.')[0], ''))


# Annotate each motif at both TFClass levels with the same procedure.
# After the loop, `level` and `gene_to_tf_class` hold the 'SubFamily' values.
for level, column in (('Family', 'motif_class'), ('SubFamily', 'motif_class_sub')):
    gene_to_tf_class = _build_gene_to_tf_class(tf_class, level)
    motif_meta_with_cluster[column] = _annotate_motif_class(
        motif_meta_with_cluster['gene_ids'], gene_to_tf_class)

In [76]:
# TF classes (values of the 'motif_class' column) to select motifs from.
motif_classes = ['MyoD-ASC-related']

# Motif UIDs (the table index) whose class is one of `motif_classes`.
_class_mask = motif_meta_with_cluster['motif_class'].isin(motif_classes)
motif_of_interest = motif_meta_with_cluster.loc[_class_mask].index

In [77]:
# Restrict the motif scan to the selected motif columns; `.copy()` detaches the
# subset so the assignment to `.X` below does not touch `motif_hits`.
use_hits = motif_hits[:, motif_of_interest].copy()
# Replace X with the result of `.todense()`; the later cells call `.A1` on
# reductions of this matrix.
use_hits.X = use_hits.X.todense()

In [78]:
# mask small motif scores
# Scaling factor applied to each motif's 0.99 quantile to obtain its cutoff.
mask_quantile_to99 = 0.9

# Per-motif cutoff, computed column by column (axis 0): the 0.99 quantile of
# the strictly positive scores in that column, multiplied by mask_quantile_to99.
# NOTE(review): a column without any positive score passes an empty array to
# np.quantile — confirm every selected motif has at least one hit.
motif_cutoff = pd.Series(np.apply_along_axis(lambda i: np.quantile(i[i>0], 0.99) * mask_quantile_to99, 
                                             0, use_hits.X),
                         index=use_hits.var_names)
# only keep value larger than the cutoff for each motif
# (entries at or below the cutoff become 0). X is overwritten in place, so
# re-running this cell recomputes the cutoffs from the already-masked scores.
use_hits.X = np.multiply(use_hits.X, (use_hits.X > motif_cutoff.values[None, :]))

In [79]:
# One flag per row of `use_hits`: True when the row's summed (masked) motif
# scores are greater than zero.
_row_score_totals = use_hits.X.sum(axis=1).A1
hits = pd.Series(_row_score_totals > 0, index=use_hits.obs_names)

In [80]:
# Names of the rows flagged True in `hits`.
hits_dmr = hits.loc[hits].index

In [81]:
# Display the selected DMR names (sanity check of how many passed the motif filter).
hits_dmr

Index(['ITSpatial_31', 'ITSpatial_43', 'ITSpatial_164', 'ITSpatial_215',
       'ITSpatial_304', 'ITSpatial_309', 'ITSpatial_310', 'ITSpatial_315',
       'ITSpatial_430', 'ITSpatial_438',
       ...
       'ITSpatial_1588093', 'ITSpatial_1588172', 'ITSpatial_1588183',
       'ITSpatial_1588184', 'ITSpatial_1588186', 'ITSpatial_1588206',
       'ITSpatial_1588288', 'ITSpatial_1588300', 'ITSpatial_1588329',
       'ITSpatial_1588388'],
      dtype='object', name='index', length=103264)

In [82]:
# Per-motif (per-column) count of rows that still have a positive score after masking.
(use_hits.X > 0).sum(axis=0).A1

array([32244, 22330, 16575, 23243,  7670, 16411, 35396])

## Intersect DMR-correlated genes with region marker genes

In [67]:
# Region marker genes; later cells use this object as a mapping from region
# name to a collection of gene IDs.
# NOTE: joblib.load unpickles the file, so only load it from a trusted source.
region_markers_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/DMGAnalysis/region_hypo_genes.IT-L23.obj'
)
region_markers = joblib.load(region_markers_path)

In [68]:
# Build one long table of positively correlated (gene, DMR) pairs, labelled
# with the DMR group each BED file represents.
records = []
for bed_path in dmr_list:
    # The 4th BED column (position 3) is used as the index and is matched
    # against the AnnData row names below.
    dmr_df = pd.read_csv(bed_path, header=None, index_col=3, sep='\t')
    # Group label = file name up to the first '.'.
    group = bed_path.name.split('.')[0]
    print(group, dmr_df.shape[0])
    for adata in adata_list:
        # Use an explicit set intersection: `Index & Index` as a set operation
        # is deprecated in pandas 1.x and is an element-wise logical operation
        # in pandas 2.0.
        shared_dmrs = adata.obs_names.intersection(dmr_df.index)
        _this_adata = adata[shared_dmrs, :].copy()
        # Dense (rows x columns) frame, unstacked into a Series indexed by
        # (column name, row name), which is labelled (gene, dmr) below.
        _this_loops = pd.DataFrame(_this_adata.X.todense(),
                                   index=_this_adata.obs_names,
                                   columns=_this_adata.var_names).unstack()
        # Keep strictly positive values only.
        _this_loops = _this_loops[_this_loops > 0].copy()
        _this_loops.index.names = ['gene', 'dmr']
        _this_loops.name = 'corr'
        _this_loops = _this_loops.reset_index()
        _this_loops['group'] = group
        records.append(_this_loops)
total_df = pd.concat(records)
# Release the per-file frames once they are concatenated.
del records

IT-L23+ACA 42423
IT-L23+AI 15777
IT-L23+MOp 98802
IT-L23+ORB 23678
IT-L23+PFC 38838
IT-L23+SSp 104134
IT-L23+MOs 55983
IT-L23+SSs 96864


In [88]:
# Gene-DMR pairs are kept only when their 'corr' value is strictly greater than this.
corr_cutoff = 0.4
# Genes are kept only when strictly more than this many passing DMRs are linked
# to them; the comparison in the per-group loop is `>`, so with 2 a gene needs
# at least 3 DMRs.
# NOTE(review): confirm `>` (rather than `>=`) is intended given the "min" name.
min_dmr_cutoff = 2

In [89]:
# For each DMR group, print: group name, number of region marker genes, number
# of genes linked to enough motif-hit DMRs, and the size of their overlap.
for group, sub_df in total_df.groupby('group'):
    # The part of the group label after '+' is the key into `region_markers`.
    region_key = group.split('+')[1]
    group_region_marker = set(region_markers[region_key])

    # Keep pairs above the correlation cutoff whose DMR is in `hits_dmr`.
    passes_corr = sub_df['corr'] > corr_cutoff
    has_motif_hit = sub_df['dmr'].isin(hits_dmr)
    sub_df = sub_df[passes_corr & has_motif_hit].copy()

    # Keep genes that occur in more than `min_dmr_cutoff` of the remaining rows.
    gene_dmr_counts = sub_df['gene'].value_counts()
    gene_dmr_counts = gene_dmr_counts[gene_dmr_counts > min_dmr_cutoff].index
    sub_df = sub_df[sub_df['gene'].isin(gene_dmr_counts)]

    corr_gene_set = set(sub_df['gene'].unique())
    n_shared = len(group_region_marker & corr_gene_set)
    print(group, len(group_region_marker), len(corr_gene_set), n_shared)

IT-L23+ACA 68 368 24
IT-L23+AI 56 131 9
IT-L23+MOp 67 1715 35
IT-L23+MOs 44 658 16
IT-L23+ORB 69 409 9
IT-L23+PFC 90 391 23
IT-L23+SSp 83 1811 43
IT-L23+SSs 72 1554 39


In [22]:
# Gene annotation table (tab-separated), indexed by its 'gene_id' column.
gene_meta_path = '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'
gene_meta = pd.read_csv(gene_meta_path, sep='\t', index_col='gene_id')

In [23]:
# Gene names of the region markers that are also in the correlated gene set.
# `group_region_marker` and `corr_gene_set` are the values left over from the
# last iteration of the per-group loop, so this cell only works after that loop
# has run.
# The set is converted to a list because `.loc` does not accept a set indexer
# (deprecated in pandas 1.5, an error in pandas 2.0).
gene_meta.loc[list(group_region_marker & corr_gene_set), 'gene_name']

gene_id
ENSMUSG00000036510.17             Cdh8
ENSMUSG00000085456.2           Gm15398
ENSMUSG00000030199.16             Etv6
ENSMUSG00000003746.16            Man1a
ENSMUSG00000022376.8             Adcy8
ENSMUSG00000032076.19            Cadm1
ENSMUSG00000000247.11             Lhx2
ENSMUSG00000034312.14           Iqsec1
ENSMUSG00000105511.1           Gm33758
ENSMUSG00000042671.12             Rgs8
ENSMUSG00000085792.2           Gm15414
ENSMUSG00000048078.16            Tenm4
ENSMUSG00000097695.1           Gm26905
ENSMUSG00000035357.16           Pdzrn3
ENSMUSG00000050271.12            Prag1
ENSMUSG00000049336.16            Tenm2
ENSMUSG00000103779.1           Gm36931
ENSMUSG00000032452.12           Clstn2
ENSMUSG00000105279.1            Gm6260
ENSMUSG00000031608.13           Galnt7
ENSMUSG00000027273.13           Snap25
ENSMUSG00000034275.18           Igsf9b
ENSMUSG00000040943.12             Tet2
ENSMUSG00000027210.20            Meis2
ENSMUSG00000031990.15             Jam3
ENSMUSG0000000191

In [107]:
# TF classes of interest, in display order (commented entries are excluded).
motif_class_order = [
    # 'More than 3 adjacent zinc fingers',
    'MyoD-ASC-related',
    'RFX',
    'POU',
    'PD+HD',
    'Paired-related HD',
    'HOX',
    'HD-LIM',
    'NK',
    # 'DMRT',
    'Thyroid hormone receptor-related factors',
    'Regulators of differentiation',
    'bHLH-ZIP',
    'Tal-related',
    'SMAD',
    'Jun-related',
    'Fos-related',
    'B-ATF-related',
    'CEBP-related',
]

# Unique gene IDs of all motifs whose class is listed above; a motif's
# 'gene_ids' entry may hold several comma-separated IDs.
_in_ordered_classes = motif_meta_with_cluster['motif_class'].isin(motif_class_order)
tf_genes = {
    gene_id
    for gene_ids in motif_meta_with_cluster.loc[_in_ordered_classes, 'gene_ids']
    for gene_id in gene_ids.split(',')
}

In [112]:
# For each region, print the names of marker genes that are also TF genes of
# the selected motif classes.
for group, markers in region_markers.items():
    # Convert the intersection to a list: `.loc` does not accept a set indexer
    # (deprecated in pandas 1.5, an error in pandas 2.0).
    shared_tf_genes = list(tf_genes.intersection(markers))
    print(group, gene_meta.loc[shared_tf_genes, 'gene_name'].tolist())
    

MOs ['Lhx2']
SSp ['Lhx2', 'Jdp2', 'Emx1', 'Barx2']
ACA ['Lhx2']
MOp ['Lhx2', 'Emx1']
SSs ['Lhx2', 'Jdp2', 'Emx1', 'Barx2']
PFC []
ORB []
AI []


In [115]:
# Show the motif metadata rows whose 'gene_names' value is exactly 'Lhx2'.
is_lhx2 = motif_meta_with_cluster['gene_names'] == 'Lhx2'
motif_meta_with_cluster.loc[is_lhx2]

Unnamed: 0_level_0,motif_name,motif_genes,gene_ids,gene_names,motif_group,motif_class,motif_class_sub
motif_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MA0700.2,LHX2,LHX2,ENSMUSG00000000247.11,Lhx2,MotifGroup3,HD-LIM,LHX2-like
