In [31]:
import joblib
import xarray as xr
import pandas as pd
import anndata
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix, vstack
import matplotlib.pyplot as plt

## Load Data

### Cell type phylogeny

In [2]:
# Per-node cluster membership of the excitatory tree; each entry is later read
# through its 'left' and 'right' keys.
node_cluster_dict = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.non_singleton_node_dict.lib'
)

# Dendrogram description; only its 'ivl' entry (label list) is touched here.
cluster_dendro = joblib.load(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.dendrogram.lib'
)

# Linkage matrix as a plain ndarray (first CSV column is the index).
cluster_linkage = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.linkage.csv',
    index_col=0,
).values

# Cluster labels in linkage order, with spaces normalised to underscores.
cluster_linkage_order = [
    raw_label.replace(' ', '_')
    for raw_label in pd.read_csv(
        '/home/hanliu/project/mouse_rostral_brain/study/AssignGeneToTree/RTree/Exc.linkage.orders.txt', 
        index_col=0, header=None
    ).index
]

# The dendrogram labels use '.' where the cluster names have '_' or '-'
# (presumably mangled by the R export -- TODO confirm); map them back.
name_map = {
    label.replace('_', '.').replace('-', '.'): label
    for label in cluster_linkage_order
}
cluster_dendro['ivl'] = list(map(name_map.__getitem__, cluster_dendro['ivl']))

### Gene

#### DMG

In [3]:
# Load the table stored in RelatedDMG.msg (path is relative to the notebook's
# working directory).
# NOTE(review): `pd.read_msgpack` is deprecated -- the warning captured under
# this cell comes from it -- and is no longer available in pandas >= 1.0, so
# this file must be re-exported to another format before upgrading pandas.
related_dmg = pd.read_msgpack('RelatedDMG.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


#### Gene meta

In [4]:
# Gene annotation table keyed by gene_id.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t',
    index_col='gene_id',
)

# Reverse lookup gene_name -> gene_id; when a name occurs more than once the
# last gene_id in file order wins.
gene_name_to_id = dict(zip(gene_meta['gene_name'], gene_meta.index))

#### Subtype gene rate

In [12]:
# CHN-context rate from the 'gene_cluster_da_rate' variable, transposed so
# that genes are rows and subtypes are columns (see the preview below).
subtype_rate = (
    xr.open_dataset(
        '/home/hanliu/project/mouse_rostral_brain/study/mCClustermCLevel/SubType.geneslop2k.mcds'
    )['gene_cluster_da_rate']
    .sel(mc_type='CHN')
    .to_pandas()
    .transpose()
)

# Use the same underscore-separated subtype names as the tree labels.
subtype_rate.columns = subtype_rate.columns.str.replace(' ', '_')
subtype_rate.head()

SubType,MGE-Sst_Rxra,CA3_Cadm2,CA1_Chrm3,CA3-St18_Tead1,Unc5c_Unc5c,Gfra1_Gfra1,ODC_odc-small,PC_pc-all,ODC_odc-large,ANP_anp-dg,...,D1L-PAL_Plcxd3,PAL-Inh_Onecut2,LSX-Inh_Foxp2,LSX-Inh_Enox1,MSN-D1_Outlier,LSX-Inh_Dock10,LSX-Inh_Nxph1,LSX-Inh_Zeb2,LSX-Inh_Lats2,PT-L5_Outlier
geneslop2k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000102693.1,0.036028,0.033156,0.023127,0.027134,0.024149,0.030705,0.007401,0.004408,0.006504,0.005405,...,0.011507,0.0429,0.02867,0.025057,0.026316,0.022981,0.02237,0.0197,0.018092,0.018018
ENSMUSG00000064842.1,0.043546,0.029851,0.024722,0.027768,0.030352,0.039891,0.009691,0.004859,0.009342,0.006471,...,0.014913,0.049733,0.032969,0.029144,,0.028612,0.030037,0.026626,0.031396,0.0
ENSMUSG00000051951.5,0.011518,0.00963,0.008233,0.017092,0.011801,0.014484,0.018628,0.00479,0.015895,0.005185,...,0.012604,0.023654,0.026305,0.021451,0.013525,0.024743,0.022012,0.0253,0.021889,0.022814
ENSMUSG00000102851.1,0.012793,0.008696,0.008266,0.013822,0.010621,0.012788,0.013043,0.005251,0.013161,0.004871,...,0.012873,0.025467,0.030113,0.022554,0.0,0.024056,0.025327,0.024327,0.015444,
ENSMUSG00000103377.1,0.008664,0.009502,0.009479,0.024804,0.010696,0.016256,0.049207,0.004797,0.03858,0.005653,...,0.014659,0.025282,0.034705,0.026059,0.0,0.032258,0.030009,0.032695,0.029836,0.015038


### Gene assign to node result

In [6]:
# Node-by-gene assignment scores: rows are iterated as tree nodes below, and
# each row's values are thresholded at +/- cutoff to pick right/left genes.
# NOTE(review): `pd.read_msgpack` is deprecated and no longer available in
# pandas >= 1.0; re-export this file before upgrading pandas.
total_result = pd.read_msgpack('NodeGeneResults.msg')

### DMR

#### DMG DMR Corr

In [7]:
# DMR-gene correlation table; the code below reads its 'Gene' and 'DMR' columns.
# NOTE(review): the "0.3" in the file name presumably is the correlation
# cutoff used to build it -- confirm against the producing notebook.
# NOTE(review): `pd.read_msgpack` is deprecated and no longer available in
# pandas >= 1.0; re-export this file before upgrading pandas.
dmr_gene_corr = pd.read_msgpack(
    '/home/hanliu/project/mouse_rostral_brain/study/DMRGeneCorr/TotalGeneDMRCorrLoop.0.3.msg')

#### DMR Rate

In [8]:
# 'Rate' table from the DMR info store, restricted to (and ordered by) the
# dendrogram labels in cluster_dendro['ivl']. Its index is reused below as the
# DMR axis of the output matrix. `.copy()` detaches the selection from the
# frame returned by read_hdf.
dmr_rate = pd.read_hdf(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRInfo.h5',
    key='Rate').loc[:, cluster_dendro['ivl']].copy()

#### DMR Hits

In [9]:
# Hypo-DMR hit matrix as AnnData, with variables restricted to (and ordered
# by) the dendrogram labels; observations are later selected by DMR id.
# `.copy()` materialises the view so later slicing works on an actual object.
hypo_hits = anndata.read_h5ad(
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/TotalHits.HypoDMR.h5ad'
)[:, cluster_dendro['ivl']].copy()

## Select gene per node

In [10]:
# Absolute score a gene must exceed to be assigned to one side of a node.
cutoff = 0.3

# node -> {'left': Series of genes scoring below -cutoff (stored as absolute
#                  values), 'right': Series of genes scoring above +cutoff}
node_dict = {}
for node, node_record in total_result.iterrows():
    # Use the row yielded by iterrows() directly: re-fetching it with
    # total_result.loc[node] is a redundant lookup and would return a
    # DataFrame (not a Series) if a node label were ever duplicated.
    left_gene = node_record[node_record < -cutoff].abs()
    right_gene = node_record[node_record > cutoff]
    node_dict[node] = {'left': left_gene, 'right': right_gene}

In [24]:
# sns.clustermap(subtype_rate.loc[left_gene.index, cluster_linkage_order].fillna(0.8), 
#                col_linkage=cluster_linkage, vmin=0, vmax=0.03)
# sns.clustermap(subtype_rate.loc[right_gene.index, cluster_linkage_order].fillna(0.8), 
#                col_linkage=cluster_linkage, vmin=0, vmax=0.03)

## Select DMR

- The DMR is correlated with a gene
- The DMR is also relevant to the node

In [26]:
def _hits_per_dmr(hits):
    """Return a 1-D float array with the summed hits of every row (DMR) in ``hits``.

    Works for sparse and dense ``.X`` and for any number of columns, so the
    single-cluster and multi-cluster cases go through the same code path.
    """
    mat = hits.X
    if isinstance(mat, np.ndarray):
        # Some anndata releases flatten a one-column / one-row view to 1-D
        # (the source of the `warn_flatten()` warnings); restore 2-D first.
        mat = mat.reshape(hits.shape[0], -1)
    # Sparse sums come back as an (n, 1) np.matrix; force a flat float vector
    # so the score arithmetic below is plain element-wise float math.
    return np.asarray(mat.sum(axis=1), dtype=float).ravel()


def get_node_relavent_dmr(node, abs_cutoff=0.3):
    """Score how node-specific each gene-correlated DMR is for one tree node.

    DMRs are collected from ``dmr_gene_corr`` for every gene assigned to the
    left or right side of ``node`` (``node_dict``). For each such DMR the hits
    in ``hypo_hits`` are summed over the node's left clusters and over its
    right clusters (``node_cluster_dict``), and the two counts are turned into
    a signed score.

    Parameters
    ----------
    node
        Key present in both ``node_dict`` and ``node_cluster_dict``.
    abs_cutoff
        Currently unused; kept so existing callers keep working. Callers
        filter the returned scores themselves.

    Returns
    -------
    pandas.Series
        Score per DMR, indexed by DMR id and named ``node``. Positive values
        mean the DMR is hit more on the left side, negative values more on
        the right side.

    Notes
    -----
    Reads the module-level ``node_dict``, ``node_cluster_dict``,
    ``dmr_gene_corr`` and ``hypo_hits`` and prints two progress lines.
    """
    left_gene = node_dict[node]['left']
    right_gene = node_dict[node]['right']

    left_nodes = [i.replace(' ', '_') for i in node_cluster_dict[node]['left']]
    right_nodes = [
        i.replace(' ', '_') for i in node_cluster_dict[node]['right']
    ]

    n_left = len(left_nodes)
    n_right = len(right_nodes)

    # DMRs linked to any gene on either side of the node, de-duplicated once.
    gene_column = dmr_gene_corr['Gene']
    left_dmr = dmr_gene_corr.loc[gene_column.isin(left_gene.index), 'DMR']
    right_dmr = dmr_gene_corr.loc[gene_column.isin(right_gene.index), 'DMR']
    node_dmr = pd.concat([left_dmr, right_dmr]).unique()
    print(f'{node} gene', left_gene.size, right_gene.size)
    print(f'{node} total DMR', node_dmr.size)

    related_hits = hypo_hits[node_dmr].copy()
    # Always reduce to a 1-D per-DMR count, whatever the number of clusters
    # on a side; the old single-cluster branch returned the raw matrix.
    left_hits = _hits_per_dmr(related_hits[:, left_nodes])
    right_hits = _hits_per_dmr(related_hits[:, right_nodes])

    # Approximate relevance score that avoids redoing a statistical test.
    # With l / r = hits on the left / right side and L / R = number of
    # clusters on each side:
    #     (l * (R - r) - r * (L - l)) / (L * R)   ==   l / L - r / R
    # Positive score: left-side hypo DMR; negative score: right-side hypo DMR.
    dmr_relavent_score = pd.Series(
        (left_hits * (n_right - right_hits) - right_hits *
         (n_left - left_hits)) / (n_left * n_right),
        index=related_hits.obs_names,
        name=node)
    return dmr_relavent_score

In [38]:
# One sparse (1 x n_dmr) row of scores per tree node, kept in lock-step with
# node_list so the rows can be stacked into a node x DMR matrix afterwards.
data_list = []
node_list = []
for node in node_cluster_dict:
    node_score = get_node_relavent_dmr(node, abs_cutoff=0)

    # Keep only DMRs whose absolute score is above 0.1.
    is_strong = node_score.abs() > 0.1
    use_dmr = node_score[is_strong]

    # Align to the full DMR axis; DMRs without a kept score become 0.
    aligned_row = use_dmr.reindex(dmr_rate.index).fillna(0).values.reshape(1, -1)
    sparse_data = csr_matrix(aligned_row)

    node_list.append(node)
    data_list.append(sparse_data)
    print(f'{node} related dmr', use_dmr.size)

68 gene 1 6
68 total DMR 3019
68 related dmr 0


  warn_flatten()


69 gene 24 18
69 total DMR 18525
69 related dmr 0
70 gene 26 14
70 total DMR 17066
70 related dmr 0
71 gene 122 109
71 total DMR 81392
71 related dmr 0
72 gene 45 37
72 total DMR 39633
72 related dmr 0
73 gene 7 24
73 total DMR 14083
73 related dmr 0
74 gene 32 30
74 total DMR 28461
74 related dmr 4121
75 gene 74 119
75 total DMR 74046


  warn_flatten()


75 related dmr 0
76 gene 86 25
76 total DMR 48126
76 related dmr 2824
77 gene 24 22
77 total DMR 20802
77 related dmr 0


  warn_flatten()


78 gene 73 78
78 total DMR 68964
78 related dmr 9361
79 gene 57 43
79 total DMR 36143
79 related dmr 0


  warn_flatten()


80 gene 87 109
80 total DMR 74431
80 related dmr 0
81 gene 151 72
81 total DMR 77864
81 related dmr 0
82 gene 39 12
82 total DMR 21527
82 related dmr 0
83 gene 70 135
83 total DMR 81471
83 related dmr 0
84 gene 12 25
84 total DMR 14587
84 related dmr 0
85 gene 80 65
85 total DMR 59350
85 related dmr 0
86 gene 106 93
86 total DMR 67514
86 related dmr 0
87 gene 103 130
87 total DMR 87849
87 related dmr 0
88 gene 204 278
88 total DMR 154931
88 related dmr 16260
89 gene 102 63
89 total DMR 68518


  warn_flatten()


89 related dmr 10301
90 gene 11 44
90 total DMR 23837
90 related dmr 0


  warn_flatten()


91 gene 370 118
91 total DMR 156390
91 related dmr 13447
92 gene 254 262
92 total DMR 169380
92 related dmr 35707
93 gene 387 157
93 total DMR 165796


  warn_flatten()


93 related dmr 23725
94 gene 26 138
94 total DMR 57532


  warn_flatten()


94 related dmr 0
95 gene 279 434
95 total DMR 222293
95 related dmr 27433
96 gene 116 68
96 total DMR 72776


  warn_flatten()


96 related dmr 0
97 gene 350 228
97 total DMR 182374
97 related dmr 20680
98 gene 93 53
98 total DMR 65915


  warn_flatten()


98 related dmr 0
99 gene 389 241
99 total DMR 168519
99 related dmr 0
100 gene 59 98
100 total DMR 55513
100 related dmr 0
101 gene 230 252
101 total DMR 161166
101 related dmr 44421
102 gene 245 119
102 total DMR 146937


  warn_flatten()


102 related dmr 28561
103 gene 283 131
103 total DMR 149867


  warn_flatten()


103 related dmr 35985
104 gene 178 448
104 total DMR 182613


  warn_flatten()


104 related dmr 0
105 gene 101 101
105 total DMR 84524
105 related dmr 13605
106 gene 135 150
106 total DMR 120189
106 related dmr 16994
107 gene 273 237
107 total DMR 169494


  warn_flatten()


107 related dmr 55136
108 gene 384 275
108 total DMR 199284


  warn_flatten()


108 related dmr 0
109 gene 512 376
109 total DMR 263633
109 related dmr 32992
110 gene 247 580
110 total DMR 231014


  warn_flatten()


110 related dmr 0
111 gene 515 770
111 total DMR 303253
111 related dmr 63585
112 gene 1658 812
112 total DMR 492255


  warn_flatten()


112 related dmr 110955
113 gene 612 528
113 total DMR 321226


  warn_flatten()


113 related dmr 51437
114 gene 718 260
114 total DMR 278880


  warn_flatten()


114 related dmr 63850
115 gene 851 684
115 total DMR 380134


  warn_flatten()


115 related dmr 70924
116 gene 548 688
116 total DMR 321482
116 related dmr 121167
117 gene 375 618
117 total DMR 261006
117 related dmr 53226
118 gene 1247 121
118 total DMR 336974
118 related dmr 94135
119 gene 218 359
119 total DMR 204718
119 related dmr 49830
120 gene 774 591
120 total DMR 384259
120 related dmr 98523
121 gene 634 519
121 total DMR 300081
121 related dmr 86759
122 gene 753 213
122 total DMR 262273
122 related dmr 79328
123 gene 590 679
123 total DMR 350979
123 related dmr 101351
124 gene 683 735
124 total DMR 380569
124 related dmr 123871
125 gene 608 747
125 total DMR 332770
125 related dmr 80639
126 gene 315 203
126 total DMR 167738
126 related dmr 47910
127 gene 1050 343
127 total DMR 299078
127 related dmr 112558
128 gene 684 936
128 total DMR 417342
128 related dmr 120270
129 gene 397 592
129 total DMR 283721
129 related dmr 112116
130 gene 530 257
130 total DMR 245758
130 related dmr 81744
131 gene 471 525
131 total DMR 308939
131 related dmr 127992
132 gene 

In [39]:
# Stack the per-node sparse rows into one node x DMR matrix and wrap it in an
# AnnData: observations are the nodes (in loop order), variables are the DMRs
# of dmr_rate.index. Both annotation frames carry an index only, no columns.
# NOTE(review): the "Transforming to str index." message below suggests the
# node labels are not strings and get converted by anndata -- confirm that
# downstream readers expect string node ids.
adata = anndata.AnnData(X=vstack(data_list),
                        obs=pd.DataFrame([], index=node_list),
                        var=pd.DataFrame([], index=dmr_rate.index))

Transforming to str index.


In [41]:
# Persist the node x DMR score matrix; the path is relative to the notebook's
# working directory.
adata.write_h5ad('NodeDMRResults.h5ad')