In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import anndata
import xarray as xr
import pybedtools
import json
from scipy.sparse import csr_matrix, save_npz

pybedtools.cleanup(remove_all=True)

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  'DataArray', pd.Series, pd.DataFrame, pd.Panel]:


In [2]:
# 145 neuronal subtypes
use_clusters = [
    'CA1 Ak5', 'CA1 Chrm3', 'CA1 Kif26a', 'CA1 Lingo2', 'CA1 Ptprg',
    'CA3 Cadm2', 'CA3 Efnb2', 'CA3-St18 Epha5', 'CA3-St18 Nuak1',
    'CA3-St18 Tead1', 'CGE-Lamp5 Dock5', 'CGE-Lamp5 Grid1', 'CGE-Lamp5 Grk5',
    'CGE-Lamp5 Nrxn3', 'CGE-Lamp5 Sorcs1', 'CGE-Vip Ccser1', 'CGE-Vip Clstn2',
    'CGE-Vip Fstl4', 'CGE-Vip Galnt17', 'CGE-Vip Grm8', 'CGE-Vip Ntng1',
    'CGE-Vip Ptprm', 'CGE-Vip Robo1', 'CLA Bcl11a', 'CLA Cdh8', 'CLA Nrp2',
    'CT-L6 Hcrtr2', 'CT-L6 Il1rap', 'CT-L6 Map4', 'CT-L6 Megf9', 'Chd7 Kcnc2',
    'Chd7 Megf11', 'Chd7 Trpc7', 'D1L-Fstl4 Cadm1', 'D1L-Fstl4 Crim1',
    'D1L-Fstl4 Grm3', 'D1L-Fstl4 Sipa1l2', 'D1L-Fstl4 Trps1', 'D1L-PAL Flrt2',
    'D1L-PAL Plcxd3', 'DG dg-all', 'DG-po Bcl11a', 'DG-po Calb2',
    'DG-po Kctd8', 'EP Adcy8', 'EP Rgs8', 'EP Tspan5', 'Foxp2 Dchs2',
    'Foxp2 Homer2', 'Foxp2 Inpp4b', 'Foxp2 Trpc7', 'Gfra1 Gfra1',
    'IG-CA2 Chrm3', 'IG-CA2 Peak1', 'IG-CA2 Xpr1', 'IT-L23 Cux1',
    'IT-L23 Foxp1', 'IT-L23 Ptprt', 'IT-L23 Tenm2', 'IT-L4 Astn2',
    'IT-L4 Shc3', 'IT-L5 Cdh8', 'IT-L5 Etv1', 'IT-L5 Grik3', 'IT-L6 Cadps2',
    'IT-L6 Fstl4', 'IT-L6 Man1c1', 'IT-L6 Oxr1', 'L6b Adcy8', 'L6b Kcnk2',
    'L6b Nrp2', 'L6b Pkhd1', 'LSX-Inh Cacna1i', 'LSX-Inh Dock10',
    'LSX-Inh Enox1', 'LSX-Inh Foxp2', 'LSX-Inh Lats2', 'LSX-Inh Nxph1',
    'LSX-Inh Zeb2', 'MGE-Pvalb Cacna1i', 'MGE-Pvalb Cnih3', 'MGE-Pvalb Entpd3',
    'MGE-Pvalb Gfra2', 'MGE-Pvalb Ptprk', 'MGE-Pvalb Sema5a',
    'MGE-Pvalb Thsd7a', 'MGE-Sst Bmper', 'MGE-Sst Chodl', 'MGE-Sst Dock4',
    'MGE-Sst Etv1', 'MGE-Sst Frmd6', 'MGE-Sst Kcnip4', 'MGE-Sst Ptpre',
    'MGE-Sst Rerg', 'MGE-Sst Rxra', 'MGE-Sst Ubtd1', 'MGE-Sst Unc5b',
    'MSN-D1 Hrh1', 'MSN-D1 Khdrbs3', 'MSN-D1 Ntn1', 'MSN-D1 Plxnc1',
    'MSN-D2 Casz1', 'MSN-D2 Col14a1', 'MSN-D2 Nrp2', 'MSN-D2 Slc24a2',
    'NP-L6 Boc', 'NP-L6 Cntnap4', 'NP-L6 Cntnap5a', 'NP-L6 Cyp7b1',
    'NP-L6 Kcnab1', 'NP-L6 Olfml2b', 'OLF Gabbr2', 'OLF Kcnd3', 'OLF Mapk10',
    'OLF Pag1', 'OLF Trpc4', 'OLF Xkr6', 'OLF-Exc Bmpr1b', 'OLF-Exc Cdh9',
    'OLF-Exc Cux2', 'OLF-Exc Lrrtm3', 'OLF-Exc Pld5', 'OLF-Exc Rmst',
    'OLF-Exc Sgcd', 'OLF-Exc Unc13c', 'PAL-Inh Chat', 'PAL-Inh Deptor',
    'PAL-Inh Igdcc3', 'PAL-Inh Meis1', 'PAL-Inh Meis2', 'PAL-Inh Onecut2',
    'PAL-Inh Ptprd', 'PAL-Inh Rarb', 'PAL-Inh Tcf7l2', 'PAL-Inh Tmem178',
    'PT-L5 Abca12', 'PT-L5 Astn2', 'PT-L5 Kcnh1', 'PT-L5 Nectin1',
    'PT-L5 Plcb4', 'PT-L5 Ptprt', 'PT-L5 Tenm2', 'PT-L5 Tmtc2', 'PT-L5 Unc5b',
    'Unc5c Unc5c'
]

## DMR mCG

### Fillna

In [None]:
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5') as hdf:
    dmr_rate = hdf['Rate']
    dmr_bed = hdf['Bed']
dmr_rate.index = 'Sub' + dmr_bed['#chr'] + '_' + dmr_bed.index.astype(str)

In [None]:
dmr_median = dmr_rate.median(axis=1)
no_na_rate = dmr_rate.T.fillna(dmr_median).T

### Save each chromosom

In [None]:
chrs = dmr_bed['#chr'].copy()
chrs.index = no_na_rate.index

for chrom, sub_df in no_na_rate.groupby(chrs):
    print(chrom)
    sub_df.to_msgpack(f'DMR_rate/SubDMR_rate_no_na.{chrom}.msg')

## DMR Gene Distance

In [3]:
distance_cutoff = 1000000
output_dir = 'DMR_5kb_1M_mask'

In [4]:
bins = distance_cutoff // 5000

with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5') as hdf:
    dmr_bed = hdf['Bed']
dmr_bed.index = 'Sub' + dmr_bed['#chr'] + '_' + dmr_bed.index.astype(str)
dmr_bed['center'] = (dmr_bed['end'] + dmr_bed['start']) / 2
dmr_bed['dmr_int'] = dmr_bed.index.map(lambda i: int(i.split('_')[1]))

dmr_bed = dmr_bed[dmr_bed['start'] > distance_cutoff].copy()
dmr_bed['center_bin_id'] = (dmr_bed['center'] // 5000).astype(int)
dmr_bed['start_bin'] = dmr_bed['center_bin_id'].apply(lambda i: i - bins)
dmr_bed['end_bin'] = dmr_bed['center_bin_id'].apply(lambda i: i + bins)

In [5]:
dmr_bed.head()

Unnamed: 0,#chr,start,end,number_of_dms,center,dmr_int,center_bin_id,start_bin,end_bin
Subchr1_0,chr1,3001007,3001018,2,3001012.5,0,600,400,800
Subchr1_1,chr1,3001277,3001277,1,3001277.0,1,600,400,800
Subchr1_2,chr1,3001629,3001629,1,3001629.0,2,600,400,800
Subchr1_3,chr1,3003379,3003898,6,3003638.5,3,600,400,800
Subchr1_4,chr1,3004530,3004530,1,3004530.0,4,600,400,800


In [6]:
chrom_bin_bed = pd.read_csv('SubTypeBins/mm10.w5k.remove_black.bed', 
                            sep='\t', index_col=3, header=None)

In [7]:
for chrom, sub_df in dmr_bed.groupby('#chr'):
    print(chrom)
    # dmr_bed = dmr_bed[dmr_bed['start'] > distance_cutoff].copy()
    # so i-bins >= 0
    indices = np.concatenate(sub_df['center_bin_id'].apply(lambda i: np.arange(
        i - bins, i + bins)).tolist())
    indptr = np.arange(0, indices.size + bins * 2, bins * 2)
    data = np.ones_like(indices)

    chrom_csr = csr_matrix((data, indices, indptr),
                           shape=(sub_df.shape[0], sub_df['end_bin'].max()))

    chrom_adata = anndata.AnnData(
        X=chrom_csr,
        obs=sub_df[['#chr', 'start', 'end', 'number_of_dms']],
        var=pd.DataFrame([],
                         pd.Index((f'{chrom}_{i+1}'
                                   for i in range(chrom_csr.shape[1])))))
    use_index = chrom_bin_bed[chrom_bin_bed[0] == chrom].index
    chrom_adata = chrom_adata[:, use_index]

    chrom_adata.write_h5ad(f'DMR_5kb_1M_mask/{chrom}.h5ad')
    

chr1


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr10


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr11


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr12


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr13


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr14


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr15


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr16


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr17


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr18


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr19


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr2


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr3


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr4


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr5


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr6


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr7


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr8


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chr9


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chrX


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical


chrY


Trying to set attribute `.obs` of view, making a copy.
... storing '#chr' as categorical
