In [13]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import anndata
import xarray as xr
import pybedtools
import json
from scipy.sparse import csr_matrix, save_npz
import pathlib
# pybedtools.cleanup(remove_all=True)

  PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
  'DataArray', pd.Series, pd.DataFrame, pd.Panel]:


In [14]:
# Root directory that receives the files written by this notebook.
output_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/gene_dmr_corr')

## DMR mCG

### Fillna

In [4]:
# Load the 'Rate' and 'Bed' tables from the DMR info store.
# The store is opened read-only: the default mode ('a') would create an empty
# file if the path were wrong and requires write access to the file.
with pd.HDFStore(
        '/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial/DMRInfo.h5',
        mode='r') as hdf:
    dmr_rate = hdf['Rate']
    dmr_bed = hdf['Bed']

In [5]:
# Row-wise imputation: each missing value is replaced by the median of its own row.
# fillna() matches a Series against column labels, so the frame is transposed
# before filling and flipped back afterwards.
dmr_median = dmr_rate.median(axis=1)
no_na_rate = dmr_rate.transpose().fillna(value=dmr_median).transpose()

### Save each chromosome

In [10]:
# Folder for the per-chromosome DMR rate tables (mkdir does not create parents,
# so output_dir itself must already exist).
dmr_rate_dir = output_dir.joinpath('DMRRate')
dmr_rate_dir.mkdir(exist_ok=True)

In [11]:
# Chromosome label of every DMR, re-labelled with the rate table's index so it
# can be used as the group key below.
# NOTE(review): this assumes dmr_bed and no_na_rate list the DMRs in the same
# row order -- confirm.
chrs = dmr_bed['#chr'].copy()
chrs.index = no_na_rate.index

# Write one msgpack file per chromosome.
for chrom_name, chrom_rates in no_na_rate.groupby(chrs):
    print(chrom_name)
    msg_path = dmr_rate_dir / f'SubDMR_rate_no_na.{chrom_name}.msg'
    chrom_rates.to_msgpack(msg_path)

chr1
chr10
chr11


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  


chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY


## Gene mCH

### Save gene rates for each chromosome

In [13]:
# Load the cluster-mean gene table and transpose it.
gene_rate_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/'
    'DMG Analysis/ITSpatialPairwiseDEG/ClusterMean.genes.msg')
gene_rate_df = pd.read_msgpack(gene_rate_path).transpose()

In [14]:
# Display the dimensions of the transposed gene table.
gene_rate_df.shape

(50231, 27)

## Cluster Pairwise DMG

In [15]:
# The file has no header row; its first column becomes the index and only that
# index is kept.
total_dmg_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/'
    'DMG Analysis/ITSpatialPairwiseDEG/TotalGeneID.txt')
total_dmg = pd.read_csv(total_dmg_path, header=None, index_col=0).index

In [16]:
# gene_rate_df = gene_rate_df.loc[total_dmg]

In [17]:
# Fill each missing value with the median of its own row (transpose so that
# fillna() can align the per-row medians on the column labels).
gene_rate_median = gene_rate_df.median(axis=1)
no_na_rate = gene_rate_df.transpose().fillna(value=gene_rate_median).transpose()
# Rows that still contain NaN after the fill are dropped.
no_na_rate = no_na_rate.dropna(how='any')
assert not no_na_rate.isna().values.any()

In [18]:
# Display the dimensions of the gene table after filling and dropping NaN rows.
no_na_rate.shape

(50231, 27)

In [None]:
# Gene annotation table, indexed by gene id.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# Series.items() is used instead of iteritems(), which is deprecated and no
# longer exists in current pandas; items() is available in old versions too.
# If several ids share a gene name, the last one encountered wins.
gene_name_to_id = {
    name: gene_id
    for gene_id, name in gene_meta['gene_name'].items()
}
# Map the id prefix (text before the first '.') back to the full id.
gene_idbase_to_id = {
    gene_id.split('.')[0]: gene_id
    for gene_id in gene_meta.index
}

# Align the annotation rows with the rows of the gene rate table.
gene_meta = gene_meta.reindex(no_na_rate.index)

In [21]:
# Folder for the per-chromosome gene rate tables (mkdir does not create parents,
# so output_dir itself must already exist).
gene_rate_dir = output_dir.joinpath('GeneRate')
gene_rate_dir.mkdir(exist_ok=True)

In [22]:
# Group the gene rate rows by the chromosome recorded in gene_meta and write
# one msgpack file per chromosome.
for chrom_name, chrom_genes in no_na_rate.groupby(gene_meta['chrom']):
    print(chrom_name)
    msg_path = gene_rate_dir / f'gene_rate_no_na.{chrom_name}.msg'
    chrom_genes.to_msgpack(msg_path)

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  This is separate from the ipykernel package so we can avoid doing imports until


## DMR Gene Distance

In [6]:
# Number of base pairs added on each side of a gene interval when pairing
# genes with DMRs.
distance_cutoff = 1000000
# output_dir is intentionally NOT rebound here. It used to be overwritten with
# the plain string 'DMR_gene_1M_mask', which breaks every later
# `output_dir / ...` expression because str does not support the `/` operator.
# The pathlib.Path assigned near the top of the notebook stays in effect.

In [4]:
# Load the 'Bed' table from the DMR info store.
# Opened read-only: the default mode ('a') would create an empty file if the
# path were wrong and requires write access to the file.
with pd.HDFStore(
        '/home/hanliu/project/mouse_rostral_brain/DMR/ITSpatial/DMRInfo.h5',
        mode='r') as hdf:
    dmr_bed = hdf['Bed']

In [9]:
# Reload the full gene annotation table, indexed by gene id.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# Series.items() is used instead of iteritems(), which is deprecated and no
# longer exists in current pandas; items() is available in old versions too.
# If several ids share a gene name, the last one encountered wins.
gene_name_to_id = {
    name: gene_id
    for gene_id, name in gene_meta['gene_name'].items()
}
# Map the id prefix (text before the first '.') back to the full id.
gene_idbase_to_id = {
    gene_id.split('.')[0]: gene_id
    for gene_id in gene_meta.index
}

In [10]:
def split_row(string):
    """Parse a comma-separated list of ids into a tuple of ints.

    For every comma-separated entry, only the text after its last underscore
    is converted, so both ``'12'`` and ``'chr1_12'`` yield ``12``.
    """
    numeric_ids = []
    for entry in string.split(','):
        # rpartition leaves the whole entry in the last slot when there is no '_'.
        _, _, suffix = entry.rpartition('_')
        numeric_ids.append(int(suffix))
    return tuple(numeric_ids)

In [14]:
# Folder for the per-chromosome gene-by-DMR mask files (mkdir does not create
# parents, so output_dir itself must already exist).
mask_dir = output_dir.joinpath('GeneDMRDistanceMask')
mask_dir.mkdir(exist_ok=True)

In [44]:
# For every chromosome, build a sparse gene-by-DMR matrix whose non-zero entries
# mark the DMRs that bedtools maps onto the gene interval widened by
# `distance_cutoff` bp on both sides, and save it as one h5ad file per chromosome.
for chrom in dmr_bed['#chr'].unique():
    print(chrom)
    if chrom in ['chrY', 'chrM']:
        continue

    # DMRs on this chromosome, sorted by start; only the first three columns are kept.
    this_dmr = dmr_bed[dmr_bed['#chr'] == chrom].sort_values(
        'start').iloc[:, :3].copy()
    # Replace the DMR index by consecutive integers so that every chromosome
    # starts from 0; these integers double as the matrix column positions.
    dmr_int_dict = {d: i for i, d in enumerate(this_dmr.index)}
    this_dmr.index = this_dmr.index.map(dmr_int_dict)
    # Move the integer id from the index into the 4th column.
    this_dmr = this_dmr.reset_index().iloc[:, [1, 2, 3, 0]]

    # Genes on this chromosome, sorted by start; the position in this order is
    # the matrix row of the gene.
    this_gene = gene_meta[gene_meta['chrom'] == chrom].sort_values(
        'start').loc[:, ['chrom', 'start', 'end']]
    gene_int_dict = {g: i for i, g in enumerate(this_gene.index)}
    this_gene = this_gene.reset_index().iloc[:, [1, 2, 3, 0]]

    _gene_bed = pybedtools.BedTool.from_dataframe(this_gene)
    _dmr_bed = pybedtools.BedTool.from_dataframe(this_dmr)

    # Widen every gene interval, then collect (comma-joined) the integer ids of
    # the DMRs that fall on it.
    mapped = _gene_bed.slop(
        b=distance_cutoff,
        g='/home/hanliu/ref/mouse/genome/mm10.main.chrom.sizes').map(
            _dmr_bed, c=4, o='collapse')
    df = mapped.to_dataframe()
    # Rows whose collapsed-id column is '.' carry no DMR and are dropped.
    # The column is compared as text so the filter and split_row() keep working
    # even if pandas parsed it as integers (every gene hitting exactly one DMR).
    df = df[df['score'].astype(str) != '.'].copy()
    df['gene_int'] = df['name'].map(gene_int_dict)
    dmr_id_lists = [split_row(ids) for ids in df['score'].astype(str)]

    # Build the COO triplets in one go. Unlike concatenating per-row arrays,
    # this also works when no gene on the chromosome has any DMR in range
    # (np.concatenate raises ValueError on an empty list).
    hits_per_gene = np.array([len(ids) for ids in dmr_id_lists],
                             dtype=np.int64)
    row_records = np.repeat(df['gene_int'].values.astype(np.int64),
                            hits_per_gene)
    col_records = np.array(
        [dmr_id for ids in dmr_id_lists for dmr_id in ids], dtype=np.int64)
    data_records = np.ones_like(col_records)
    final_mat = csr_matrix((data_records, (row_records, col_records)),
                           shape=(this_gene.shape[0], this_dmr.shape[0]))

    # obs = original gene ids, var = original DMR ids, both ordered by the
    # integer position used in the matrix.
    final_adata = anndata.AnnData(
        X=final_mat,
        var=pd.DataFrame([],
                         index=pd.Series(dmr_int_dict).sort_values().index),
        obs=pd.DataFrame([],
                         index=pd.Series(gene_int_dict).sort_values().index))
    final_adata.write_h5ad(mask_dir / f'{chrom}.h5ad')


chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY


## Calculate global mCH mCG

In [15]:
import pandas as pd

In [16]:
# Clustering summary table and the ITSpatial group design table (first CSV
# column used as index).
clustering_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ClusteringSummary/'
    'Summary/TotalClusteringResults.msg')
group_design_path = (
    '/home/hanliu/project/mouse_rostral_brain/study/ITSpatial/'
    'Group Design/ITSpatial.cell_group_design.csv')

cell_tidy_data = pd.read_msgpack(clustering_path)
group_design = pd.read_csv(group_design_path, index_col=0)


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [17]:
# Attach the ITSpatial group of each row (aligned on the index), then take the
# per-group median of the two rate columns and write each as a header-less CSV.
cell_tidy_data['ITSpatial'] = group_design['ITSpatial']

cells_by_group = cell_tidy_data.groupby('ITSpatial')
median_cg = cells_by_group['CG_RateAdj'].median()
median_ch = cells_by_group['CH_RateAdj'].median()

for csv_name, group_medians in (('ClusterGlobalmCG.csv', median_cg),
                                ('ClusterGlobalmCH.csv', median_ch)):
    group_medians.to_csv(output_dir / csv_name, header=False)