In [1]:
import joblib
import xarray as xr
import pandas as pd
import anndata
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix, vstack
import matplotlib.pyplot as plt

## Load Data

### Cell type phylogeny

In [2]:
node_cluster_dict = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Inh.non_singleton_node_dict.lib'
)

cluster_dendro = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Inh.dendrogram.lib'
)
cluster_linkage = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Inh.linkage.csv', index_col=0).values
cluster_linkage_order = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Inh.linkage.orders.txt', 
    index_col=0, header=None
).index

cluster_linkage_order = [i.replace(' ', '_') for i in cluster_linkage_order]
name_map = {i.replace('_', '.').replace('-', '.'):i for i in cluster_linkage_order}
cluster_dendro['ivl'] = [name_map[i] for i in cluster_dendro['ivl']]

### Gene

#### DMG

In [3]:
related_dmg = pd.read_msgpack('RelatedDMG.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


#### Gene meta

In [4]:
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id')
gene_name_to_id = {v: k for k, v in gene_meta['gene_name'].items()}

#### Subtype gene rate

In [5]:
subtype_rate = xr.open_dataset(
    '/home/hanliu/project/mouse_rostral_brain/study/mCClustermCLevel/SubType.geneslop2k.mcds'
)['gene_cluster_da_rate'].sel(mc_type='CHN').to_pandas().T
subtype_rate.columns = subtype_rate.columns.str.replace(' ', '_')
subtype_rate.head()

SubType,MGE-Sst_Rxra,CA3_Cadm2,CA1_Chrm3,CA3-St18_Tead1,Unc5c_Unc5c,Gfra1_Gfra1,ODC_odc-small,PC_pc-all,ODC_odc-large,ANP_anp-dg,...,D1L-PAL_Plcxd3,PAL-Inh_Onecut2,LSX-Inh_Foxp2,LSX-Inh_Enox1,MSN-D1_Outlier,LSX-Inh_Dock10,LSX-Inh_Nxph1,LSX-Inh_Zeb2,LSX-Inh_Lats2,PT-L5_Outlier
geneslop2k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000102693.1,0.036028,0.033156,0.023127,0.027134,0.024149,0.030705,0.007401,0.004408,0.006504,0.005405,...,0.011507,0.0429,0.02867,0.025057,0.026316,0.022981,0.02237,0.0197,0.018092,0.018018
ENSMUSG00000064842.1,0.043546,0.029851,0.024722,0.027768,0.030352,0.039891,0.009691,0.004859,0.009342,0.006471,...,0.014913,0.049733,0.032969,0.029144,,0.028612,0.030037,0.026626,0.031396,0.0
ENSMUSG00000051951.5,0.011518,0.00963,0.008233,0.017092,0.011801,0.014484,0.018628,0.00479,0.015895,0.005185,...,0.012604,0.023654,0.026305,0.021451,0.013525,0.024743,0.022012,0.0253,0.021889,0.022814
ENSMUSG00000102851.1,0.012793,0.008696,0.008266,0.013822,0.010621,0.012788,0.013043,0.005251,0.013161,0.004871,...,0.012873,0.025467,0.030113,0.022554,0.0,0.024056,0.025327,0.024327,0.015444,
ENSMUSG00000103377.1,0.008664,0.009502,0.009479,0.024804,0.010696,0.016256,0.049207,0.004797,0.03858,0.005653,...,0.014659,0.025282,0.034705,0.026059,0.0,0.032258,0.030009,0.032695,0.029836,0.015038


### Gene assign to node result

In [6]:
total_result = pd.read_msgpack('NodeGeneResults.msg')

### DMR

#### DMG DMR Corr

In [7]:
dmr_gene_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg')

#### DMR Rate

In [8]:
dmr_rate = pd.read_hdf(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5',
    key='Rate').loc[:, cluster_dendro['ivl']].copy()

#### DMR Hits

In [9]:
hypo_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad'
)[:, cluster_dendro['ivl']].copy()

## Select gene per node

In [10]:
cutoff = 0.1

node_dict = {}
for node, row in total_result.iterrows():
    node_record = total_result.loc[node]
    left_gene = node_record[node_record < -cutoff].abs()
    right_gene = node_record[node_record > cutoff]
    node_dict[node] = {'left': left_gene, 'right': right_gene}
    # print(node, left_gene.size, right_gene.size)

In [11]:
# sns.clustermap(subtype_rate.loc[left_gene.index, cluster_linkage_order].fillna(0.8), 
#                col_linkage=cluster_linkage, vmin=0, vmax=0.03)
# sns.clustermap(subtype_rate.loc[right_gene.index, cluster_linkage_order].fillna(0.8), 
#                col_linkage=cluster_linkage, vmin=0, vmax=0.03)

## Select DMR

- DMR is correlated to gene
- DMR is also relavent to the node

In [12]:
def get_node_relavent_dmr(node, abs_cutoff=0.1):
    left_gene = node_dict[node]['left']
    right_gene = node_dict[node]['right']

    left_nodes = [i.replace(' ', '_') for i in node_cluster_dict[node]['left']]
    right_nodes = [
        i.replace(' ', '_') for i in node_cluster_dict[node]['right']
    ]

    n_left = len(left_nodes)
    n_right = len(right_nodes)

    left_dmr = dmr_gene_corr[dmr_gene_corr['Gene'].isin(
        left_gene.index)]['DMR'].drop_duplicates()
    right_dmr = dmr_gene_corr[dmr_gene_corr['Gene'].isin(
        right_gene.index)]['DMR'].drop_duplicates()
    total_node_dmr = pd.concat([left_dmr, right_dmr])
    print(f'{node} gene', left_gene.size, right_gene.size)
    print(f'{node} total DMR', total_node_dmr.unique().size)
    
    related_hits = hypo_hits[total_node_dmr.unique()].copy()
    if len(left_nodes) > 1:
        left_hits = related_hits[:, left_nodes].X.sum(axis=1).A1
    else:
        left_hits = related_hits[:, left_nodes].X
    if len(right_nodes) > 1:
        right_hits = related_hits[:, right_nodes].X.sum(axis=1).A1
    else:
        right_hits = related_hits[:, left_nodes].X

    # approximate relavent score, prevent redo test: (N * (M-m) - M * (N-n)) / N * M
    # positive score is left hypo DMR, negative score is right hypo DMR
    dmr_relavent_score = pd.Series(
        (left_hits * (n_right - right_hits) - right_hits *
         (n_left - left_hits)) / (n_left * n_right),
        index=related_hits.obs_names)
    dmr_relavent_score.name = node
    return dmr_relavent_score

In [13]:
data_list = []
node_list = []
for node in node_cluster_dict.keys():
    use_dmr = get_node_relavent_dmr(node, abs_cutoff=0)
    use_dmr = use_dmr[use_dmr.abs() > 0.1]
    sparse_data = csr_matrix(use_dmr.reindex(dmr_rate.index).fillna(0).values[None, :])
    node_list.append(node)
    data_list.append(sparse_data)
    print(f'{node} related dmr', use_dmr.size)

77 gene 0 0
77 total DMR 0


  warn_flatten()


77 related dmr 0
78 gene 13 3
78 total DMR 3024
78 related dmr 0
79 gene 8 0
79 total DMR 3067
79 related dmr 0
80 gene 1 4
80 total DMR 2774
80 related dmr 937
81 gene 0 2
81 total DMR 659
81 related dmr 0


  warn_flatten()


82 gene 5 0
82 total DMR 2156
82 related dmr 0
83 gene 2 5
83 total DMR 2242
83 related dmr 0
84 gene 0 0
84 total DMR 0
84 related dmr 0
85 gene 3 2
85 total DMR 2474
85 related dmr 737
86 gene 1 2
86 total DMR 978
86 related dmr 0


  warn_flatten()


87 gene 0 3
87 total DMR 1741
87 related dmr 0
88 gene 0 6
88 total DMR 1174
88 related dmr 0
89 gene 18 26
89 total DMR 10602
89 related dmr 0
90 gene 4 7
90 total DMR 2594
90 related dmr 634
91 gene 1 0
91 total DMR 437


  warn_flatten()


91 related dmr 0
92 gene 5 2
92 total DMR 1964
92 related dmr 0
93 gene 10 11
93 total DMR 8083
93 related dmr 2184
94 gene 13 17
94 total DMR 9917


  warn_flatten()


94 related dmr 2929
95 gene 3 2
95 total DMR 1031
95 related dmr 0


  warn_flatten()


96 gene 8 34
96 total DMR 16236
96 related dmr 4638
97 gene 2 25
97 total DMR 4219
97 related dmr 0


  warn_flatten()


98 gene 5 28
98 total DMR 14223
98 related dmr 5384
99 gene 17 3
99 total DMR 6398


  warn_flatten()


99 related dmr 2263
100 gene 0 6
100 total DMR 2296
100 related dmr 0


  warn_flatten()


101 gene 0 2
101 total DMR 1004
101 related dmr 0
102 gene 2 2
102 total DMR 2354
102 related dmr 0
103 gene 3 1
103 total DMR 1453
103 related dmr 259
104 gene 5 5
104 total DMR 3007


  warn_flatten()


104 related dmr 225
105 gene 16 35
105 total DMR 20050
105 related dmr 7479
106 gene 17 22
106 total DMR 9394
106 related dmr 2304
107 gene 1 1
107 total DMR 584


  warn_flatten()


107 related dmr 0
108 gene 9 9
108 total DMR 7458
108 related dmr 0
109 gene 22 32
109 total DMR 22000
109 related dmr 6704
110 gene 17 34
110 total DMR 18605


  warn_flatten()


110 related dmr 4663
111 gene 6 15
111 total DMR 7876
111 related dmr 0


  warn_flatten()


112 gene 35 6
112 total DMR 16018
112 related dmr 3657
113 gene 6 5
113 total DMR 3471
113 related dmr 0


  warn_flatten()


114 gene 38 34
114 total DMR 24808
114 related dmr 8718
115 gene 38 27
115 total DMR 19504
115 related dmr 4592
116 gene 0 2
116 total DMR 1697
116 related dmr 0


  warn_flatten()


117 gene 25 71
117 total DMR 27510
117 related dmr 7678
118 gene 8 29
118 total DMR 11928
118 related dmr 0


  warn_flatten()


119 gene 53 38
119 total DMR 33583
119 related dmr 15065
120 gene 18 20
120 total DMR 8684


  warn_flatten()


120 related dmr 1836
121 gene 13 41
121 total DMR 17121
121 related dmr 7675
122 gene 6 48
122 total DMR 20215


  warn_flatten()


122 related dmr 6965
123 gene 14 17
123 total DMR 11162


  warn_flatten()


123 related dmr 2134
124 gene 311 126
124 total DMR 101522


  warn_flatten()


124 related dmr 10987
125 gene 91 44
125 total DMR 38073
125 related dmr 8491
126 gene 42 25
126 total DMR 17972
126 related dmr 4389
127 gene 60 42
127 total DMR 30226


  warn_flatten()


127 related dmr 10619
128 gene 11 89
128 total DMR 34939
128 related dmr 11686
129 gene 46 88
129 total DMR 48361


  warn_flatten()


129 related dmr 15903
130 gene 57 157
130 total DMR 57520
130 related dmr 15986
131 gene 17 9
131 total DMR 11017
131 related dmr 0


  warn_flatten()


132 gene 87 67
132 total DMR 44081
132 related dmr 12499
133 gene 86 102
133 total DMR 48423


  warn_flatten()


133 related dmr 13694
134 gene 318 820
134 total DMR 259866


  warn_flatten()


134 related dmr 38848
135 gene 740 431
135 total DMR 263624


  warn_flatten()


135 related dmr 46475
136 gene 170 73
136 total DMR 80709
136 related dmr 43183
137 gene 84 261
137 total DMR 92196


  warn_flatten()


137 related dmr 11991
138 gene 78 262
138 total DMR 103440
138 related dmr 46581
139 gene 137 42
139 total DMR 50822
139 related dmr 16543
140 gene 70 115
140 total DMR 66702


  warn_flatten()


140 related dmr 29107
141 gene 618 1459
141 total DMR 432116


  warn_flatten()


141 related dmr 93788
142 gene 77 567
142 total DMR 105889


  warn_flatten()


142 related dmr 46334
143 gene 170 607
143 total DMR 188527
143 related dmr 41454
144 gene 47 187
144 total DMR 75090
144 related dmr 21820
145 gene 1117 787
145 total DMR 403682
145 related dmr 122238
146 gene 271 84
146 total DMR 94114
146 related dmr 38609
147 gene 252 121
147 total DMR 123444
147 related dmr 62688
148 gene 1484 261
148 total DMR 351469
148 related dmr 91995
149 gene 177 127
149 total DMR 93403


  warn_flatten()


149 related dmr 22918
150 gene 257 110
150 total DMR 124220
150 related dmr 57121
151 gene 383 110
151 total DMR 118542
151 related dmr 51685
152 gene 206 1196
152 total DMR 254340
152 related dmr 95459


In [14]:
adata = anndata.AnnData(X=vstack(data_list),
                        obs=pd.DataFrame([], index=node_list),
                        var=pd.DataFrame([], index=dmr_rate.index))

Transforming to str index.


In [15]:
adata.write_h5ad('NodeDMRResults.h5ad')