In [1]:
import joblib
import xarray as xr
import pandas as pd
import anndata
import numpy as np
import seaborn as sns
import anndata
from scipy.sparse import csr_matrix, vstack
import matplotlib.pyplot as plt

from concurrent.futures import as_completed, ProcessPoolExecutor

## Load Data

### Cell type phylogeny

In [2]:
# Artifacts of the cell-type tree: node dictionary, dendrogram, linkage matrix
# and the ordered cluster labels that go with the linkage.
# NOTE(review): joblib.load unpickles its input, so only trusted files belong here.
node_cluster_dict = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.non_singleton_node_dict.lib'
)
cluster_dendro = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.dendrogram.lib'
)

# Only the raw values of the linkage table are kept.
cluster_linkage = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.linkage.csv',
    index_col=0,
).values

# The order file has no header; its first column holds the cluster labels.
raw_linkage_order = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.linkage.orders.txt',
    index_col=0,
    header=None,
).index

# Spaces in the labels are replaced by underscores.
cluster_linkage_order = [label.replace(' ', '_') for label in raw_linkage_order]

# The dendrogram leaves use '.' wherever the cluster label has '_' or '-';
# build the dotted-name -> label lookup and relabel the leaves with it.
dotted_name_table = str.maketrans('_-', '..')
name_map = {label.translate(dotted_name_table): label for label in cluster_linkage_order}
cluster_dendro['ivl'] = list(map(name_map.__getitem__, cluster_dendro['ivl']))

### DMR

#### DMG DMR Corr

In [3]:
# Disabled: `dmr_gene_corr` is not referenced by any other cell shown in this notebook.
# NOTE: `pd.read_msgpack` was removed in pandas 1.0, so re-enabling this load
# requires a different reader (or an older pandas).
# dmr_gene_corr = pd.read_msgpack(
#     '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg')

#### DMR Rate

In [4]:
# 'Rate' table of the DMR info store, restricted to (and ordered by) the
# dendrogram leaves; the copy detaches the subset from the full table.
dmr_info_path = '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5'
dmr_rate = pd.read_hdf(dmr_info_path, key='Rate').loc[:, cluster_dendro['ivl']].copy()

#### DMR Hits

In [5]:
# Hypo-DMR hit matrix with its variables restricted to (and ordered by) the
# dendrogram leaves; the copy materialises the subset instead of keeping a view.
hypo_hits_path = '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad'
hypo_hits = anndata.read_h5ad(hypo_hits_path)[:, cluster_dendro['ivl']].copy()

### Make pairs

In [6]:
from itertools import combinations

In [7]:
# Every unordered pair of dendrogram leaves, in dendrogram order.
leaf_clusters = cluster_dendro['ivl']
cluster_pairs = [pair for pair in combinations(leaf_clusters, 2)]
n_pairs = len(cluster_pairs)
print(n_pairs, 'pairwise comparison to test.')

2278 pairwise comparison to test.


## Select DMR Per Pair

In [8]:
# Threshold on the between-cluster difference in `dmr_rate`: the pair loop
# keeps a DMR only when the difference is beyond this value in either direction.
dmr_rate_delta = 0.4

In [None]:
# Keep at most this many DMRs per direction for every cluster pair.
max_dmr_per_side = 100000


def _select_pair_dmrs(rate_a, rate_b, hits_a, hits_b, min_delta, top_n):
    """Pick the most differential one-sided hypo DMRs between two clusters.

    Parameters
    ----------
    rate_a, rate_b : pd.Series
        Columns of ``dmr_rate`` for cluster a and cluster b (same index).
    hits_a, hits_b : array-like
        Hypo-hit flags of cluster a and cluster b, one value per DMR, applied
        positionally to the rate series.
    min_delta : float
        A DMR is kept only when ``|rate_a - rate_b|`` is strictly larger.
    top_n : int
        Maximum number of DMRs kept per direction.

    Returns
    -------
    tuple of pd.Series
        ``(a_hypo, b_hypo)``: the kept ``rate_a - rate_b`` values. ``a_hypo``
        holds the most negative deltas in ascending order, ``b_hypo`` the most
        positive deltas in descending order. Either one may be empty.
    """
    # Cast before summing: with boolean flags, True + True stays True and would
    # wrongly pass the "exactly one cluster is hit" test below.
    hit_count = np.asarray(hits_a, dtype=float) + np.asarray(hits_b, dtype=float)
    # only one of the two clusters is flagged: one hypo, the other not
    one_sided = hit_count == 1
    rate_delta = (rate_a - rate_b)[one_sided]
    a_hypo = rate_delta[rate_delta < -min_delta].sort_values().iloc[:top_n]
    b_hypo = rate_delta[rate_delta > min_delta].sort_values(ascending=False).iloc[:top_n]
    return a_hypo, b_hypo


def _format_cutoff(selected):
    """Format the last (weakest) kept delta, or 'nan' when nothing was kept."""
    if selected.size == 0:
        return 'nan'
    return f'{selected.iat[-1]:.2f}'


pairs = []
records = []
dmr_index = dmr_rate.index

# NOTE(review): the hit flags are used as a positional mask on `dmr_rate`, which
# assumes `hypo_hits.obs_names` lists the DMRs in the same order as
# `dmr_rate.index` — confirm against how both files were written.
for i, (a, b) in enumerate(cluster_pairs):
    a_hypo, b_hypo = _select_pair_dmrs(
        dmr_rate[a], dmr_rate[b],
        hypo_hits.obs_vector(a), hypo_hits.obs_vector(b),
        min_delta=dmr_rate_delta, top_n=max_dmr_per_side,
    )

    # `Index.union` replaces the `|` operator, which no longer performs a set
    # union on pandas >= 2.0.
    selected_dmrs = a_hypo.index.union(b_hypo.index)
    # 1 x n_dmr boolean row marking the DMRs selected for this pair
    sparse_data = csr_matrix(dmr_index.isin(selected_dmrs)[None, :])

    pairs.append([a, b])
    records.append(sparse_data)
    print(i, a, a_hypo.size, _format_cutoff(a_hypo), b, b_hypo.size, _format_cutoff(b_hypo), sep='\t')

0	Gfra1_Gfra1	100000	-0.50	CA1_Kif26a	32207	0.40
1	Gfra1_Gfra1	100000	-0.63	PT-L5_Unc5b	57868	0.40
2	Gfra1_Gfra1	100000	-0.56	PT-L5_Nectin1	100000	0.44
3	Gfra1_Gfra1	100000	-0.58	PT-L5_Tmtc2	100000	0.41
4	Gfra1_Gfra1	100000	-0.62	PT-L5_Plcb4	66005	0.40
5	Gfra1_Gfra1	100000	-0.57	PT-L5_Tenm2	63392	0.40
6	Gfra1_Gfra1	100000	-0.65	PT-L5_Ptprt	54215	0.40
7	Gfra1_Gfra1	100000	-0.52	PT-L5_Kcnh1	100000	0.45
8	Gfra1_Gfra1	100000	-0.59	PT-L5_Astn2	68788	0.40
9	Gfra1_Gfra1	100000	-0.64	CA1_Ak5	73830	0.40
10	Gfra1_Gfra1	100000	-0.52	CA1_Chrm3	100000	0.58
11	Gfra1_Gfra1	100000	-0.59	CA1_Ptprg	100000	0.45
12	Gfra1_Gfra1	100000	-0.61	CA1_Lingo2	100000	0.41
13	Gfra1_Gfra1	100000	-0.68	PT-L5_Abca12	81971	0.40
14	Gfra1_Gfra1	100000	-0.73	CA3_Cadm2	100000	0.63
15	Gfra1_Gfra1	100000	-0.80	DG-po_Calb2	100000	0.67
16	Gfra1_Gfra1	100000	-0.86	DG-po_Kctd8	100000	0.60
17	Gfra1_Gfra1	100000	-0.79	CA3_Efnb2	100000	0.52
18	Gfra1_Gfra1	100000	-0.78	CA3-St18_Nuak1	100000	0.50
19	Gfra1_Gfra1	100000	-0.77	CA3-St18_T

160	PT-L5_Unc5b	100000	-0.76	NP-L6_Cntnap5a	51444	0.40
161	PT-L5_Unc5b	100000	-0.75	NP-L6_Boc	58843	0.40
162	PT-L5_Unc5b	100000	-0.77	NP-L6_Cyp7b1	71015	0.40
163	PT-L5_Unc5b	100000	-0.68	L6b_Nrp2	56585	0.40
164	PT-L5_Unc5b	100000	-0.64	L6b_Adcy8	55442	0.40
165	PT-L5_Unc5b	100000	-0.59	L6b_Kcnk2	100000	0.46
166	PT-L5_Unc5b	100000	-0.53	L6b_Pkhd1	66508	0.40
167	PT-L5_Unc5b	100000	-0.47	CT-L6_Megf9	100000	0.41
168	PT-L5_Unc5b	100000	-0.50	CT-L6_Il1rap	100000	0.48
169	PT-L5_Unc5b	100000	-0.54	CT-L6_Hcrtr2	100000	0.56
170	PT-L5_Unc5b	100000	-0.51	CT-L6_Map4	100000	0.47
171	PT-L5_Unc5b	100000	-0.41	OLF-Exc_Bmpr1b	100000	0.43
172	PT-L5_Unc5b	100000	-0.59	OLF-Exc_Pld5	100000	0.66
173	PT-L5_Unc5b	100000	-0.55	OLF-Exc_Rmst	18002	0.40
174	PT-L5_Unc5b	100000	-0.49	OLF-Exc_Cdh9	100000	0.44
175	PT-L5_Unc5b	100000	-0.54	OLF-Exc_Cux2	100000	0.49
176	PT-L5_Unc5b	100000	-0.69	OLF-Exc_Sgcd	100000	0.57
177	PT-L5_Unc5b	100000	-0.56	OLF-Exc_Lrrtm3	100000	0.55
178	PT-L5_Unc5b	100000	-0.57	OLF-Exc_Unc13c	1000

314	PT-L5_Tmtc2	100000	-0.55	IT-L6_Man1c1	75416	0.40
315	PT-L5_Tmtc2	100000	-0.51	IT-L6_Fstl4	100000	0.42
316	PT-L5_Tmtc2	100000	-0.52	IT-L5_Etv1	86341	0.40
317	PT-L5_Tmtc2	100000	-0.48	IT-L5_Grik3	97909	0.40
318	PT-L5_Tmtc2	100000	-0.45	IT-L23_Cux1	100000	0.51
319	PT-L5_Tmtc2	100000	-0.53	IT-L23_Tenm2	100000	0.49
320	PT-L5_Tmtc2	100000	-0.57	IT-L23_Ptprt	100000	0.42
321	PT-L5_Tmtc2	100000	-0.50	IT-L4_Shc3	100000	0.48
322	PT-L5_Tmtc2	100000	-0.51	IT-L4_Astn2	100000	0.51
323	PT-L5_Tmtc2	100000	-0.53	IT-L23_Foxp1	100000	0.46
324	PT-L5_Tmtc2	100000	-0.50	IT-L5_Cdh8	100000	0.53
325	PT-L5_Plcb4	9361	-0.40	PT-L5_Tenm2	17581	0.40
326	PT-L5_Plcb4	13952	-0.40	PT-L5_Ptprt	5941	0.40
327	PT-L5_Plcb4	17493	-0.40	PT-L5_Kcnh1	100000	0.42
328	PT-L5_Plcb4	25085	-0.40	PT-L5_Astn2	45095	0.40
329	PT-L5_Plcb4	100000	-0.54	CA1_Ak5	100000	0.50
330	PT-L5_Plcb4	100000	-0.44	CA1_Chrm3	100000	0.74
331	PT-L5_Plcb4	100000	-0.46	CA1_Ptprg	100000	0.60
332	PT-L5_Plcb4	100000	-0.46	CA1_Lingo2	100000	0.57
333	PT-L5_Plc

471	PT-L5_Ptprt	100000	-0.68	NP-L6_Boc	55118	0.40
472	PT-L5_Ptprt	100000	-0.70	NP-L6_Cyp7b1	69007	0.40
473	PT-L5_Ptprt	100000	-0.59	L6b_Nrp2	49587	0.40
474	PT-L5_Ptprt	100000	-0.57	L6b_Adcy8	52876	0.40
475	PT-L5_Ptprt	100000	-0.50	L6b_Kcnk2	100000	0.45
476	PT-L5_Ptprt	100000	-0.43	L6b_Pkhd1	66952	0.40
477	PT-L5_Ptprt	100000	-0.41	CT-L6_Megf9	100000	0.41
478	PT-L5_Ptprt	100000	-0.43	CT-L6_Il1rap	100000	0.47
479	PT-L5_Ptprt	100000	-0.47	CT-L6_Hcrtr2	100000	0.54
480	PT-L5_Ptprt	100000	-0.41	CT-L6_Map4	100000	0.44
481	PT-L5_Ptprt	91674	-0.40	OLF-Exc_Bmpr1b	100000	0.45
482	PT-L5_Ptprt	100000	-0.52	OLF-Exc_Pld5	100000	0.65
483	PT-L5_Ptprt	100000	-0.50	OLF-Exc_Rmst	16786	0.40
484	PT-L5_Ptprt	100000	-0.42	OLF-Exc_Cdh9	100000	0.42
485	PT-L5_Ptprt	100000	-0.42	OLF-Exc_Cux2	100000	0.44
486	PT-L5_Ptprt	100000	-0.64	OLF-Exc_Sgcd	100000	0.58
487	PT-L5_Ptprt	100000	-0.50	OLF-Exc_Lrrtm3	100000	0.53
488	PT-L5_Ptprt	100000	-0.49	OLF-Exc_Unc13c	100000	0.50
489	PT-L5_Ptprt	100000	-0.42	EP_Adcy8	76690	0.40

628	CA1_Ak5	88946	-0.40	PT-L5_Abca12	99434	0.40
629	CA1_Ak5	32913	-0.40	CA3_Cadm2	100000	0.49
630	CA1_Ak5	86786	-0.40	DG-po_Calb2	100000	0.59
631	CA1_Ak5	100000	-0.46	DG-po_Kctd8	100000	0.54
632	CA1_Ak5	37801	-0.40	CA3_Efnb2	52945	0.40
633	CA1_Ak5	57815	-0.40	CA3-St18_Nuak1	81963	0.40
634	CA1_Ak5	53280	-0.40	CA3-St18_Tead1	100000	0.44
635	CA1_Ak5	55834	-0.40	CA3-St18_Epha5	100000	0.54
636	CA1_Ak5	37733	-0.40	IG-CA2_Chrm3	100000	0.54
637	CA1_Ak5	30355	-0.40	IG-CA2_Peak1	100000	0.51
638	CA1_Ak5	90153	-0.40	IG-CA2_Xpr1	100000	0.44
639	CA1_Ak5	76614	-0.40	DG_dg-all	100000	0.50
640	CA1_Ak5	43830	-0.40	DG-po_Bcl11a	100000	0.42
641	CA1_Ak5	100000	-0.57	NP-L6_Cntnap4	64971	0.40
642	CA1_Ak5	100000	-0.65	NP-L6_Olfml2b	57970	0.40
643	CA1_Ak5	100000	-0.69	NP-L6_Kcnab1	54207	0.40
644	CA1_Ak5	100000	-0.69	NP-L6_Cntnap5a	57023	0.40
645	CA1_Ak5	100000	-0.68	NP-L6_Boc	64382	0.40
646	CA1_Ak5	100000	-0.69	NP-L6_Cyp7b1	70916	0.40
647	CA1_Ak5	100000	-0.65	L6b_Nrp2	75903	0.40
648	CA1_Ak5	100000	-0.64	L6b_Ad

793	CA1_Lingo2	100000	-0.45	PT-L5_Abca12	56330	0.40
794	CA1_Lingo2	100000	-0.41	CA3_Cadm2	100000	0.54
795	CA1_Lingo2	100000	-0.50	DG-po_Calb2	100000	0.59
796	CA1_Lingo2	100000	-0.54	DG-po_Kctd8	100000	0.51
797	CA1_Lingo2	100000	-0.50	CA3_Efnb2	100000	0.42
798	CA1_Lingo2	100000	-0.50	CA3-St18_Nuak1	100000	0.45
799	CA1_Lingo2	100000	-0.47	CA3-St18_Tead1	100000	0.52
800	CA1_Lingo2	100000	-0.44	CA3-St18_Epha5	100000	0.59
801	CA1_Lingo2	100000	-0.44	IG-CA2_Chrm3	100000	0.62
802	CA1_Lingo2	100000	-0.44	IG-CA2_Peak1	100000	0.62
803	CA1_Lingo2	100000	-0.52	IG-CA2_Xpr1	100000	0.47
804	CA1_Lingo2	100000	-0.48	DG_dg-all	100000	0.57
805	CA1_Lingo2	94187	-0.40	DG-po_Bcl11a	100000	0.43
806	CA1_Lingo2	100000	-0.62	NP-L6_Cntnap4	51870	0.40
807	CA1_Lingo2	100000	-0.69	NP-L6_Olfml2b	42952	0.40
808	CA1_Lingo2	100000	-0.73	NP-L6_Kcnab1	40377	0.40
809	CA1_Lingo2	100000	-0.73	NP-L6_Cntnap5a	43578	0.40
810	CA1_Lingo2	100000	-0.72	NP-L6_Boc	50313	0.40
811	CA1_Lingo2	100000	-0.72	NP-L6_Cyp7b1	57752	0.40
812	CA

In [None]:
# Assemble the pair x DMR selection matrix: one stacked row per cluster pair,
# annotated with the two cluster names, and one variable per DMR.
pair_annotation = pd.DataFrame(pairs, columns=['cluster_a', 'cluster_b'])
dmr_annotation = pd.DataFrame([], index=dmr_rate.index)
pair_dmr_matrix = vstack(records)
adata = anndata.AnnData(X=pair_dmr_matrix, obs=pair_annotation, var=dmr_annotation)

In [None]:
# Save the pairwise selection matrix next to the notebook.
pairwise_dmr_path = 'PairwiseDMR.h5ad'
adata.write_h5ad(pairwise_dmr_path)