In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import anndata
import xarray as xr
import pybedtools
import json
from scipy.sparse import csr_matrix, save_npz

# Start from a clean slate by deleting leftover pybedtools temporary files.
# NOTE(review): remove_all=True is understood to delete every pybedtools temp
# file in the temp directory, not just this session's -- confirm no other
# pybedtools job is running on this machine when this cell is executed.
pybedtools.cleanup(remove_all=True)

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  'DataArray', pd.Series, pd.DataFrame, pd.Panel]:


In [2]:
# The 145 neuronal subtype labels analysed in this notebook.
# The list is used as the `SubType` selection when the gene-level rates are
# loaded, so its order becomes the column order of the resulting table.
use_clusters = [
    'CA1 Ak5', 'CA1 Chrm3', 'CA1 Kif26a', 'CA1 Lingo2', 'CA1 Ptprg',
    'CA3 Cadm2', 'CA3 Efnb2', 'CA3-St18 Epha5', 'CA3-St18 Nuak1',
    'CA3-St18 Tead1', 'CGE-Lamp5 Dock5', 'CGE-Lamp5 Grid1', 'CGE-Lamp5 Grk5',
    'CGE-Lamp5 Nrxn3', 'CGE-Lamp5 Sorcs1', 'CGE-Vip Ccser1', 'CGE-Vip Clstn2',
    'CGE-Vip Fstl4', 'CGE-Vip Galnt17', 'CGE-Vip Grm8', 'CGE-Vip Ntng1',
    'CGE-Vip Ptprm', 'CGE-Vip Robo1', 'CLA Bcl11a', 'CLA Cdh8', 'CLA Nrp2',
    'CT-L6 Hcrtr2', 'CT-L6 Il1rap', 'CT-L6 Map4', 'CT-L6 Megf9', 'Chd7 Kcnc2',
    'Chd7 Megf11', 'Chd7 Trpc7', 'D1L-Fstl4 Cadm1', 'D1L-Fstl4 Crim1',
    'D1L-Fstl4 Grm3', 'D1L-Fstl4 Sipa1l2', 'D1L-Fstl4 Trps1', 'D1L-PAL Flrt2',
    'D1L-PAL Plcxd3', 'DG dg-all', 'DG-po Bcl11a', 'DG-po Calb2',
    'DG-po Kctd8', 'EP Adcy8', 'EP Rgs8', 'EP Tspan5', 'Foxp2 Dchs2',
    'Foxp2 Homer2', 'Foxp2 Inpp4b', 'Foxp2 Trpc7', 'Gfra1 Gfra1',
    'IG-CA2 Chrm3', 'IG-CA2 Peak1', 'IG-CA2 Xpr1', 'IT-L23 Cux1',
    'IT-L23 Foxp1', 'IT-L23 Ptprt', 'IT-L23 Tenm2', 'IT-L4 Astn2',
    'IT-L4 Shc3', 'IT-L5 Cdh8', 'IT-L5 Etv1', 'IT-L5 Grik3', 'IT-L6 Cadps2',
    'IT-L6 Fstl4', 'IT-L6 Man1c1', 'IT-L6 Oxr1', 'L6b Adcy8', 'L6b Kcnk2',
    'L6b Nrp2', 'L6b Pkhd1', 'LSX-Inh Cacna1i', 'LSX-Inh Dock10',
    'LSX-Inh Enox1', 'LSX-Inh Foxp2', 'LSX-Inh Lats2', 'LSX-Inh Nxph1',
    'LSX-Inh Zeb2', 'MGE-Pvalb Cacna1i', 'MGE-Pvalb Cnih3', 'MGE-Pvalb Entpd3',
    'MGE-Pvalb Gfra2', 'MGE-Pvalb Ptprk', 'MGE-Pvalb Sema5a',
    'MGE-Pvalb Thsd7a', 'MGE-Sst Bmper', 'MGE-Sst Chodl', 'MGE-Sst Dock4',
    'MGE-Sst Etv1', 'MGE-Sst Frmd6', 'MGE-Sst Kcnip4', 'MGE-Sst Ptpre',
    'MGE-Sst Rerg', 'MGE-Sst Rxra', 'MGE-Sst Ubtd1', 'MGE-Sst Unc5b',
    'MSN-D1 Hrh1', 'MSN-D1 Khdrbs3', 'MSN-D1 Ntn1', 'MSN-D1 Plxnc1',
    'MSN-D2 Casz1', 'MSN-D2 Col14a1', 'MSN-D2 Nrp2', 'MSN-D2 Slc24a2',
    'NP-L6 Boc', 'NP-L6 Cntnap4', 'NP-L6 Cntnap5a', 'NP-L6 Cyp7b1',
    'NP-L6 Kcnab1', 'NP-L6 Olfml2b', 'OLF Gabbr2', 'OLF Kcnd3', 'OLF Mapk10',
    'OLF Pag1', 'OLF Trpc4', 'OLF Xkr6', 'OLF-Exc Bmpr1b', 'OLF-Exc Cdh9',
    'OLF-Exc Cux2', 'OLF-Exc Lrrtm3', 'OLF-Exc Pld5', 'OLF-Exc Rmst',
    'OLF-Exc Sgcd', 'OLF-Exc Unc13c', 'PAL-Inh Chat', 'PAL-Inh Deptor',
    'PAL-Inh Igdcc3', 'PAL-Inh Meis1', 'PAL-Inh Meis2', 'PAL-Inh Onecut2',
    'PAL-Inh Ptprd', 'PAL-Inh Rarb', 'PAL-Inh Tcf7l2', 'PAL-Inh Tmem178',
    'PT-L5 Abca12', 'PT-L5 Astn2', 'PT-L5 Kcnh1', 'PT-L5 Nectin1',
    'PT-L5 Plcb4', 'PT-L5 Ptprt', 'PT-L5 Tenm2', 'PT-L5 Tmtc2', 'PT-L5 Unc5b',
    'Unc5c Unc5c'
]

## DMR mCG

### Fill missing values with the per-DMR median

In [None]:
# Load the filtered subtype DMR tables from the HDF5 store: the 'Rate' table
# and the matching 'Bed' table (which carries the '#chr' column).
# The store is opened read-only. HDFStore's default mode is 'a' (append), which
# requires write access and would create an empty file for a mistyped path.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5', mode='r') as hdf:
    dmr_rate = hdf['Rate']
    dmr_bed = hdf['Bed']
# Relabel the rate rows as 'Sub<chrom>_<Bed index>'.
# NOTE(review): this assigns labels positionally, so it assumes 'Rate' and
# 'Bed' list the DMRs in the same row order -- confirm.
dmr_rate.index = 'Sub' + dmr_bed['#chr'] + '_' + dmr_bed.index.astype(str)

In [None]:
# Row-wise median of every DMR across all columns (NaNs are ignored).
dmr_median = dmr_rate.median(axis='columns')
# DataFrame.fillna with a Series fills per *column*, so the table is flipped to
# put DMRs on the columns, filled with each DMR's own median, and flipped back.
no_na_rate = dmr_rate.transpose().fillna(value=dmr_median).transpose()

### Save each chromosome

In [None]:
# Chromosome label of every DMR, re-indexed with the row labels of no_na_rate
# so it can serve as the grouping key below.
# NOTE(review): labels are assigned positionally -- assumes dmr_bed and
# no_na_rate list the DMRs in the same order; confirm.
chrs = dmr_bed['#chr'].copy()
chrs.index = no_na_rate.index

# Write one file per chromosome into the 'DMR_rate' directory.
# NOTE(review): the directory is not created here and presumably must exist.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell needs an old pandas; switching format would also
# require updating whatever reads these files.
for chrom, sub_df in no_na_rate.groupby(chrs):
    print(chrom)
    sub_df.to_msgpack(f'DMR_rate/SubDMR_rate_no_na.{chrom}.msg')

## Gene mCH

### Save gene rates per chromosome

In [3]:
# Pull the 'gene_cluster_da_rate' variable out of the dataset and keep only the
# 'CHN' entry of `mc_type` and the subtypes listed in `use_clusters`.
gene_rate = xr.open_dataset(
    '/home/hanliu/project/mouse_rostral_brain/study/mCClustermCLevel/SubType.geneslop2k.mcds'
)['gene_cluster_da_rate'].sel(mc_type='CHN', SubType=use_clusters)
# Convert to a DataFrame and flip it; the shape printed in the next cell shows
# the 145 subtypes end up on the columns.
gene_rate_df = gene_rate.to_pandas().transpose()

  3: pd.Panel}


In [4]:
# Sanity check: the second dimension should equal len(use_clusters).
gene_rate_df.shape

(55487, 145)

## Cluster Pairwise DMG

In [5]:
# Read the header-less gene id list; the first column becomes the index and
# only that index is kept.
# NOTE(review): total_dmg is only referenced by the commented-out filter in the
# following cell, so it currently has no effect on the results shown here.
total_dmg = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/SubTypePairwiseDEG/TotalGeneID.txt',
    index_col=0, header=None
).index

In [6]:
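# Restriction to the genes listed in total_dmg is disabled: every gene in
# gene_rate_df is carried forward. Uncomment the line below to re-enable it.
# gene_rate_df = gene_rate_df.loc[total_dmg]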

In [16]:
# Row-wise median of every gene across the subtype columns (NaNs are ignored).
gene_rate_median = gene_rate_df.median(axis='columns')
# Fill each gene's missing values with that gene's own median: fillna with a
# Series works per column, hence the transpose round trip. Rows that still
# contain NaN afterwards (e.g. a gene with no value at all, whose median is NaN)
# are dropped.
# NOTE(review): this rebinds `no_na_rate`, which the DMR section also uses for
# the DMR-level table.
no_na_rate = (
    gene_rate_df.transpose()
    .fillna(value=gene_rate_median)
    .transpose()
    .dropna(how='any')
)
assert not no_na_rate.isna().any().any()

  overwrite_input=overwrite_input)


In [18]:
# Rows remaining after the fill-and-drop step; compare with gene_rate_df.shape.
no_na_rate.shape

(55486, 145)

In [37]:
# Gene annotation table (tab-separated), indexed by its 'gene_id' column.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# Lookup tables built from the *full* annotation, before it is subset below.
# Series.items() is used instead of Series.iteritems(): the latter was removed
# in pandas 2.0, while items() yields the same (index, value) pairs and is
# also available in older pandas.
# If several gene ids share a gene name, the last one encountered wins.
gene_name_to_id = {v: k for k, v in gene_meta['gene_name'].items()}
# Maps the part of the gene id before the first '.' to the full gene id.
gene_idbase_to_id = {i.split('.')[0]: i for i in gene_meta.index}

# Keep only (and in the order of) the genes present in no_na_rate; genes that
# are missing from the annotation become all-NaN rows.
gene_meta = gene_meta.reindex(no_na_rate.index)

In [40]:
# Write the gene-level table as one file per chromosome, grouping rows by the
# 'chrom' annotation of each gene (gene_meta shares its index with no_na_rate).
# NOTE(review): the file prefix 'SubDMR_rate_no_na' is the same one used for the
# DMR-level files although these are gene-level rates -- confirm it is intended.
# NOTE(review): the 'Gene-slop2k-mCH' directory is not created here and
# presumably must exist; to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0.
for chrom, sub_df in no_na_rate.groupby(gene_meta['chrom']):
    print(chrom)
    sub_df.to_msgpack(f'Gene-slop2k-mCH/SubDMR_rate_no_na.{chrom}.msg')

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  This is separate from the ipykernel package so we can avoid doing imports until


## DMR Gene Distance

In [41]:
# Load the DMR coordinate table ('Bed') from the HDF5 store.
# The store is opened read-only. HDFStore's default mode is 'a' (append), which
# requires write access and would create an empty file for a mistyped path.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5', mode='r') as hdf:
    dmr_bed = hdf['Bed']
# Relabel the rows as 'Sub<chrom>_<original index>'.
dmr_bed.index = 'Sub' + dmr_bed['#chr'] + '_' + dmr_bed.index.astype(str)

In [42]:
def split_row(string):
    """Extract the integer suffixes from a comma-separated list of ids.

    Each id is expected to end in ``_<integer>`` (e.g. ``'Subchr1_12'``); the
    integer after the last underscore is returned for every id, in order.

    Parameters
    ----------
    string : str
        Comma-separated ids, e.g. ``'Subchr1_5,Subchr1_12'``. The bedtools
        null value ``'.'`` (reported when nothing overlaps) and the empty
        string are treated as "no ids".

    Returns
    -------
    tuple of int
        One integer per id; an empty tuple for ``'.'`` or ``''``.

    Raises
    ------
    ValueError
        If an id does not end in an integer.
    """
    # A record without any overlapping feature carries no ids to parse.
    if string in ('', '.'):
        return ()
    return tuple(int(item.split('_')[-1]) for item in string.split(','))

In [46]:
# Number of bases (10 Mb) by which intervals are extended on each side.
distance_cutoff = 10_000_000
# Directory that receives the per-chromosome output files.
output_dir = 'DMR_gene_10M_mask'

In [47]:
# For every chromosome, build a boolean gene x DMR mask marking which DMRs
# overlap each gene once the gene has been extended by `distance_cutoff` bases
# on both sides, then write two files into `output_dir`:
#   '<chrom>.npz'            -- the sparse mask
#   '<chrom>.gene_int.json'  -- gene id -> row number of the mask
# NOTE(review): `output_dir` is not created here and presumably must exist.
for chrom in dmr_bed['#chr'].unique():
    print(chrom)
    # chrY is skipped; its name is still printed by the line above.
    if chrom == 'chrY':
        continue
    # DMRs of this chromosome sorted by start. Only the first three columns are
    # kept (presumably chrom/start/end -- confirm), then the DMR id (the former
    # index) is moved to the fourth column to get a BED-like layout.
    this_dmr = dmr_bed[dmr_bed['#chr'] == chrom].sort_values('start').iloc[:, :3].copy()
    this_dmr = this_dmr.reset_index().iloc[:, [1, 2, 3, 0]]
    # Same layout for the genes of this chromosome.
    this_gene = gene_meta[gene_meta['chrom'] == chrom].sort_values('start').loc[:, ['chrom', 'start', 'end']]
    # Row number of each gene = its position in the start-sorted gene table.
    gene_int_dict = {g: i for i, g in enumerate(this_gene.index)}
    this_gene = this_gene.reset_index().iloc[:, [1, 2, 3, 0]]
    
    _gene_bed = pybedtools.BedTool.from_dataframe(this_gene)
    _dmr_bed = pybedtools.BedTool.from_dataframe(this_dmr)
    
    # Extend every gene by `distance_cutoff` on both sides (using the given
    # chromosome-sizes file), then collect, per extended gene, column 4 (the DMR
    # id) of all overlapping DMRs into one collapsed string.
    mapped = _gene_bed.slop(b=distance_cutoff, g='/home/hanliu/ref/mouse/genome/mm10.main.chrom.sizes'
                      ).map(_dmr_bed, c=4, o='collapse')
    # The resulting frame is read with the gene id in 'name' and the collapsed
    # DMR ids in 'score' (presumably pybedtools' default BED column names).
    df = mapped.to_dataframe()
    df['gene_int'] = df['name'].map(gene_int_dict)
    # NOTE(review): a gene with no DMR in range would carry bedtools' null value
    # in 'score' -- confirm split_row copes with it.
    df['dmr_ids'] = df['score'].apply(split_row)

    # Collect COO-style triplets (row, column, value), one array per gene.
    row_records = []
    col_records = []
    data_records = []
    for _, row in df.iterrows():
        gene_int = row['gene_int']
        dmr_ids = row['dmr_ids']
        # The integers parsed from the DMR ids are used directly as column
        # indices.
        # NOTE(review): this assumes they lie in [0, number of DMRs on this
        # chromosome) -- confirm how the DMR ids are numbered.
        _col = np.array(dmr_ids)
        col_records.append(_col)
        data_records.append(np.ones_like(_col))
        # Same row index repeated once per DMR of this gene.
        row_records.append((np.ones_like(_col) * gene_int).astype(np.int64))
        
    row_records = np.concatenate(row_records)
    col_records = np.concatenate(col_records)
    data_records = np.concatenate(data_records)
    # Mask shape: (genes on this chromosome, DMRs on this chromosome).
    final_mat = csr_matrix((data_records, (row_records, col_records)), 
                           shape=(this_gene.shape[0], this_dmr.shape[0]))
    save_npz(f'{output_dir}/{chrom}.npz', final_mat.astype(bool))
    
    # Persist the gene id -> row number mapping needed to interpret the mask.
    with open(f'{output_dir}/{chrom}.gene_int.json', 'w') as f:
        json.dump(gene_int_dict, f)


chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX
chrY


## DMR ChromBin Distance

In [3]:
# Load the DMR coordinate table ('Bed') from the HDF5 store.
# The store is opened read-only. HDFStore's default mode is 'a' (append), which
# requires write access and would create an empty file for a mistyped path.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5', mode='r') as hdf:
    dmr_bed = hdf['Bed']
# Relabel the rows as 'Sub<chrom>_<original index>'.
dmr_bed.index = 'Sub' + dmr_bed['#chr'] + '_' + dmr_bed.index.astype(str)

In [5]:
def split_row(string):
    """Extract the integer suffixes from a comma-separated list of ids.

    Each id is expected to end in ``_<integer>`` (e.g. ``'Subchr1_12'``); the
    integer after the last underscore is returned for every id, in order.

    Parameters
    ----------
    string : str
        Comma-separated ids, e.g. ``'Subchr1_5,Subchr1_12'``. The bedtools
        null value ``'.'`` (reported when nothing overlaps) and the empty
        string are treated as "no ids".

    Returns
    -------
    tuple of int
        One integer per id; an empty tuple for ``'.'`` or ``''``.

    Raises
    ------
    ValueError
        If an id does not end in an integer.
    """
    # A record without any overlapping feature carries no ids to parse.
    if string in ('', '.'):
        return ()
    return tuple(int(item.split('_')[-1]) for item in string.split(','))

In [8]:
# Distance cutoff in bases (1 Mb) for this section.
# NOTE(review): this rebinds `distance_cutoff` and `output_dir`, which an
# earlier section also sets; re-running earlier cells afterwards would pick up
# these values.
distance_cutoff = 1_000_000
# Directory intended for this section's output files.
output_dir = 'DMR_5kbin_1M_mask'

In [10]:
# NOTE(review): this loop looks unfinished. It prepares the start-sorted,
# BED-like DMR table for the first chromosome that is not chrY and then stops
# at `break`; nothing is computed from it or written to `output_dir`.
for chrom in dmr_bed['#chr'].unique():
    print(chrom)
    if chrom == 'chrY':
        continue
    # First three columns of the DMR table (presumably chrom/start/end --
    # confirm), followed by the DMR id taken from the former index.
    this_dmr = dmr_bed[dmr_bed['#chr'] == chrom].sort_values('start').iloc[:, :3].copy()
    this_dmr = this_dmr.reset_index().iloc[:, [1, 2, 3, 0]]
    break

chr1
