In [106]:
import pandas as pd
import pybedtools
import pathlib
import anndata
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [5]:
region_dmr_dir = pathlib.Path('ITSpatialDMR_overlap_region_specific_dmr')
dmr_list = list(region_dmr_dir.glob('*.bed'))

gene_corr_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/gene_dmr_corr/Corr/'
)
corr_adata_list = list(gene_corr_dir.glob('*h5ad'))
adata_list = [anndata.read_h5ad(p) for p in corr_adata_list]

In [91]:
region_markers = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/DMGAnalysis/region_hypo_genes.IT-L23.obj'
)

In [60]:
records = []
for bed_path in dmr_list:
    dmr_df = pd.read_csv(bed_path, header=None, index_col=3, sep='\t')
    group = bed_path.name.split('.')[0]
    print(group, dmr_df.shape[0])
    for adata in adata_list:
        _this_adata = adata[adata.obs_names & dmr_df.index, :].copy()
        _this_loops = pd.DataFrame(_this_adata.X.todense(), 
                                   index=_this_adata.obs_names, 
                                   columns=_this_adata.var_names).unstack()
        _this_loops = _this_loops[_this_loops > 0].copy()
        _this_loops.index.names = ['gene', 'dmr']
        _this_loops.name = 'corr'
        _this_loops = _this_loops.reset_index()
        _this_loops['group'] = group
        records.append(_this_loops)
total_df = pd.concat(records)
del records

IT-L23+ACA 42423
IT-L23+AI 15777
IT-L23+MOp 98802
IT-L23+ORB 23678
IT-L23+PFC 38838
IT-L23+SSp 104134
IT-L23+MOs 55983
IT-L23+SSs 96864


In [81]:
corr_cutoff = 0.65
min_dmr_cutoff = 15

In [100]:
for group, sub_df in total_df.groupby('group'):
    group_region_marker = set(region_markers[group.split('+')[1]])
    
    sub_df = sub_df[sub_df['corr'] > corr_cutoff]
    gene_dmr_counts = sub_df['gene'].value_counts()
    gene_dmr_counts = gene_dmr_counts[gene_dmr_counts > min_dmr_cutoff].index
    sub_df = sub_df[sub_df['gene'].isin(gene_dmr_counts)]
    corr_gene_set = set(sub_df.gene.unique())
    break

In [102]:
len(group_region_marker)

68

In [103]:
len(corr_gene_set)

459

In [108]:
gene_meta = pd.read_csv('/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
                        sep='\t', index_col='gene_id')

In [118]:
gene_meta.loc[group_region_marker & corr_gene_set]['gene_name']

gene_id
ENSMUSG00000036510.17             Cdh8
ENSMUSG00000104093.1     A330015K06Rik
ENSMUSG00000115529.1     9630013A20Rik
ENSMUSG00000039706.11             Ldb2
ENSMUSG00000028266.17             Lmo4
ENSMUSG00000032452.12           Clstn2
ENSMUSG00000094296.1           Gm21798
ENSMUSG00000035357.16           Pdzrn3
ENSMUSG00000030199.16             Etv6
ENSMUSG00000103779.1           Gm36931
ENSMUSG00000104894.4           Gm43507
ENSMUSG00000108238.1           Gm43984
ENSMUSG00000006586.15          Runx1t1
ENSMUSG00000048078.16            Tenm4
ENSMUSG00000105511.1           Gm33758
ENSMUSG00000027784.10            Ppm1l
ENSMUSG00000097695.1           Gm26905
ENSMUSG00000092134.1           Gm17089
ENSMUSG00000049336.16            Tenm2
ENSMUSG00000021217.7             Tshz3
ENSMUSG00000085456.2           Gm15398
ENSMUSG00000085792.2           Gm15414
ENSMUSG00000022376.8             Adcy8
ENSMUSG00000061013.6               Mkx
ENSMUSG00000034312.14           Iqsec1
ENSMUSG0000002721