In [1]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import pathlib
import anndata

In [2]:
# Keep only DMR-gene pairs whose 'Corr' value is strictly above this.
corr_cutoff = 0.5
# Keep only DMR-gene pairs whose occurrence count is strictly above this.
# (Spelling of the name is left as-is: a later cell reads it by this name.)
occur_cotoff = 0
# Directory the ensemble's input files are read from and the result is
# written to.
output_dir = (
    '/home/hanliu/project/mouse_rostral_brain/study/'
    'ClustersEnsemble/CA1CA3DGEnsemble/'
)

In [3]:
output_dir = pathlib.Path(output_dir)

# Collect every stored table whose key ends with 'Corr'. Each key is split
# on '/' into four parts: '', cluster_from, cluster_to and the table name.
# The store is opened read-only: HDFStore defaults to append mode, which
# needs write access and creates an empty file when the path is wrong.
store_path = output_dir / 'FinalDMGDMR.h5'
corr_frames = []
with pd.HDFStore(store_path, mode='r') as hdf:
    for key in hdf.keys():
        if not key.endswith('Corr'):
            continue
        _, cluster_from, cluster_to, _ = key.split('/')
        frame = hdf[key]
        frame['cluster_from'] = cluster_from
        frame['cluster_to'] = cluster_to
        corr_frames.append(frame)

if not corr_frames:
    # pd.concat would raise a ValueError here anyway; say which file is at fault.
    raise ValueError("no '*Corr' tables found in {}".format(store_path))

corr_total = pd.concat(corr_frames)
corr_total = corr_total[corr_total['Corr'] > corr_cutoff]

# (DMR, Gene) x cluster_from boolean table: True where the pair has a
# (positive) Corr for that cluster. Combinations absent from the pivot are
# NaN and compare as False.
relate_to_cluster = corr_total.pivot_table(
    index=['DMR', 'Gene'], columns='cluster_from', values='Corr') > 0

corr_total = corr_total.set_index(['DMR', 'Gene'])
# Number of rows (i.e. stored comparisons) each pair appears in.
occur_count = corr_total.index.value_counts()
corr_total['occur_count'] = occur_count
corr_total = corr_total[corr_total['occur_count'] > occur_cotoff]
# Collapse to one row per pair; the first occurrence's columns are the ones kept.
corr_total = corr_total[~corr_total.index.duplicated()]
# Attach the per-cluster flags only for the surviving pairs. A plain outer
# concat would bring back pairs removed by the occur_count filter as
# all-NaN rows whenever occur_cotoff > 0. Rows end up sorted by (DMR, Gene).
corr_total = pd.concat(
    [corr_total, relate_to_cluster.reindex(corr_total.index)],
    axis=1).sort_index()
corr_total = corr_total.reset_index()

In [4]:
# Gene annotation table read from the GENCODE vM22 flat file, indexed by gene_id.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    sep='\t', index_col='gene_id')
# NOTE(review): computed as end - start with no +1; confirm this matches the
# coordinate convention of the flat file.
gene_meta['length'] = gene_meta['end'] - gene_meta['start']
# gene_name -> gene_id lookup. When a name occurs on several rows, the last
# row's gene_id wins.
gene_name_to_id = dict(zip(gene_meta['gene_name'], gene_meta.index))
# TSS is 'start' for '+' strand genes and 'end' for everything else.
# Vectorised instead of a row-wise DataFrame.apply, which is slow on a
# genome-sized table.
gene_meta['tss'] = np.where(gene_meta['strand'] == '+',
                            gene_meta['start'],
                            gene_meta['end'])

In [5]:
# gene info
# NOTE(review): pandas msgpack I/O is deprecated (it emits a warning that
# recommends pyarrow); switching format also requires changing whatever
# wrote this file.
marker_gene_info = pd.read_msgpack(output_dir / 'RelatedDMG.msg')
# {cluster_from: array of that cluster's gene_id values}
genes_by_cluster = marker_gene_info.groupby('cluster_from')
marker_dict = genes_by_cluster.apply(
    lambda group: group['gene_id'].values).to_dict()

# dmr info
# Load the hypo-hits AnnData and keep only the DMRs present in corr_total.
related_dmr_ids = corr_total['DMR'].unique()
dmr_info = anndata.read_h5ad(
    output_dir / 'RelatedClusterAndDMR.hypo_hits.h5ad'
)[related_dmr_ids, :].copy()
# Per-DMR observation table of the subset.
dmr_bed = dmr_info.obs

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# DMR annotation AnnData, restricted to the DMRs present in corr_total.
dmr_annot = anndata.read_h5ad('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/DMRAnnotation.h5ad')
dmr_annot = dmr_annot[corr_total['DMR'].unique(), :].copy()
# Both matrices are cast to bool while still sparse and only then densified,
# so the dense array is allocated as bool rather than as the stored dtype.
# The two frames are now built the same way.
dmr_annot = pd.DataFrame(dmr_annot.X.astype(bool).toarray(),
                         index=dmr_annot.obs_names,
                         columns=dmr_annot.var_names)
# Same layout for the hit matrix, with '-Hypo' appended to every column
# name.
dmr_hits_df = pd.DataFrame(dmr_info.X.astype(bool).toarray(),
                           index=dmr_info.obs_names,
                           columns=[name + '-Hypo' for name in dmr_info.var_names])

In [7]:
def _rows_for(table, keys):
    """Reindex `table` to `keys` (one output row per key, in key order) and drop the index."""
    return table.reindex(keys).reset_index(drop=True)


# Row keys of the pair table; every frame below is laid out in this order.
pair_dmr_ids = corr_total['DMR'].values
pair_gene_ids = corr_total['Gene'].values

# First three columns of the DMR table, renamed as the DMR side of the pair.
use_dmr_bed = _rows_for(dmr_bed, pair_dmr_ids).iloc[:, :3]
use_dmr_bed.columns = ['dmr_chr', 'dmr_start', 'dmr_end']

use_dmr_annot = _rows_for(dmr_annot, pair_dmr_ids)

use_hits_df = _rows_for(dmr_hits_df, pair_dmr_ids)

# Gene side of the pair: 'tss' is selected twice so it fills both the
# start and the end column.
use_gene_bed = _rows_for(gene_meta, pair_gene_ids).loc[
    :, ['chrom', 'tss', 'tss', 'strand']]
use_gene_bed.columns = ['tss_chr', 'tss_start', 'tss_end', 'gene_strand']

In [8]:
# Side-by-side table: DMR coordinates | TSS coordinates | correlation
# columns | DMR annotation flags.
# NOTE(review): use_hits_df is built in the previous cell but is not part of
# this concat - confirm that is intended.
pair_tables = [use_dmr_bed, use_gene_bed, corr_total, use_dmr_annot]
bed_pe = pd.concat(pair_tables, axis=1)

# Offset of the DMR midpoint from the TSS, sign-flipped for genes whose
# strand is not '+'.
dmr_center = (bed_pe['dmr_start'] + bed_pe['dmr_end']) / 2
strand_sign = np.where(bed_pe['gene_strand'] == '+', 1, -1)
bed_pe['DMR_to_gene_dist'] = (dmr_center - bed_pe['tss_start']) * strand_sign

In [9]:
# Write the assembled pair table next to the inputs.
# NOTE(review): to_msgpack is deprecated in pandas (it emits a warning that
# recommends pyarrow); changing the format would also affect whatever reads
# this file.
bedpe_path = output_dir / 'TotalEnsembl.bedpe.msg'
bed_pe.to_msgpack(bedpe_path)

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.
