In [1]:
import pandas as pd
import numpy as np
import pybedtools
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Cluster labels considered in this notebook, one per line,
# kept in the original order.
use_clusters = [
    'CA1_Ak5',
    'CA1_Chrm3',
    'CA1_Kif26a',
    'CA1_Ptprg',
    'CA3_Cadm2',
    'CA3_Efnb2',
    'CA3-St18_Epha5',
    'CA3-St18_Nuak1',
    'CA3-St18_Tead1',
    'DG_dg-all',
    'DG-po_Bcl11a',
    'DG-po_Calb2',
    'DG-po_Kctd8',
    'Gfra1_Gfra1',
    'IG-CA2_Chrm3',
    'IG-CA2_Peak1',
    'IG-CA2_Xpr1',
]

## Final Results

In [3]:
# Cluster pair whose stored results are inspected in the rest of the notebook.
cluster_from = 'CA1_Chrm3'
cluster_to = 'CA3_Cadm2'

# Open the store read-only: the default mode ('a') needs write access and
# silently creates an empty file when the path is wrong.
with pd.HDFStore('FinalDMGDMR.h5', mode='r') as f:
    # Objects stored under '<cluster_from>/<cluster_to>/{DMR,Gene,Corr}'.
    dmr = f[f'{cluster_from}/{cluster_to}/DMR']
    dmg = f[f'{cluster_from}/{cluster_to}/Gene']
    corr = f[f'{cluster_from}/{cluster_to}/Corr']

## DMR Bed

In [4]:
# Read the DMR BED table from the filtered DMR store. Opened read-only
# because the default mode ('a') would require write access to this file
# and would create an empty one if the path were wrong.
with pd.HDFStore('/home/hanliu/project/mouse_rostral_brain/DMR/SubType/FilteredDMR.h5', mode='r') as hdf:
    dmr_bed_df = hdf['Bed']
# Rebuild the row labels as 'Sub<chrom>_<original index>' so they can be
# looked up with the identifiers held in `dmr`.
dmr_bed_df.index = 'Sub' + dmr_bed_df['#chr'] + '_' + dmr_bed_df.index.astype(str)
# Keep only the rows listed in `dmr` (loaded from FinalDMGDMR.h5).
dmr_bed_df = dmr_bed_df.loc[dmr].copy()
dmr_bed = pybedtools.BedTool.from_dataframe(dmr_bed_df)

In [5]:
# Sanity check: (rows, columns) of the DMR table after it was restricted to `dmr`.
dmr_bed_df.shape

(13038, 4)

## Gene metadata

In [7]:
# Load the flat gene annotation table, indexed by its 'gene_id' column.
gene_meta = pd.read_csv(
    '/home/hanliu/ref/mouse/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# Map gene name -> gene ID. Series.iteritems() was removed in pandas 2.0;
# Series.items() yields the same (index, value) pairs on old and new versions.
# If a gene name occurs more than once, the last gene ID wins.
gene_name_to_id = {v: k for k, v in gene_meta['gene_name'].items()}
# Map the gene ID without its '.<suffix>' part -> full gene ID.
gene_idbase_to_id = {i.split('.')[0]: i for i in gene_meta.index}

# Translate the entries of `dmg` into gene names via the annotation table.
# NOTE(review): assumes `dmg` holds gene IDs matching gene_meta's index - confirm.
dmg_names = pd.Index(dmg.map(gene_meta['gene_name']))

## snm3C loops

In [24]:
# Read the CA1 merged loop calls (BEDPE, tab separated), drop the first
# data row and prefix both chromosome columns with 'chr'.
# NOTE(review): the first row is presumably a non-data line in the file - confirm.
ca1_loop = (
    pd.read_csv(
        '/home/hanliu/project/mouse_rostral_brain/snm3C/loop/HICCUPS_CA1/10k/merged_loops.bedpe',
        sep='\t',
    )
    .iloc[1:]
    .assign(**{
        '#chr1': lambda loops: 'chr' + loops['#chr1'],
        'chr2': lambda loops: 'chr' + loops['chr2'],
    })
)

In [51]:
# Read the CA3 merged loop calls (BEDPE, tab separated), drop the first
# data row and prefix both chromosome columns with 'chr'.
# NOTE(review): the first row is presumably a non-data line in the file - confirm.
ca3_loop = (
    pd.read_csv(
        '/home/hanliu/project/mouse_rostral_brain/snm3C/loop/HICCUPS_CA3/10k/merged_loops.bedpe',
        sep='\t',
    )
    .iloc[1:]
    .assign(**{
        '#chr1': lambda loops: 'chr' + loops['#chr1'],
        'chr2': lambda loops: 'chr' + loops['chr2'],
    })
)

In [50]:
def get_loop_anchor_bed_df(loop_df):
    """Collect the unique anchors of a loop table as a BED-like DataFrame.

    Parameters
    ----------
    loop_df : pandas.DataFrame
        Loop table with the columns '#chr1', 'x1', 'x2' (first anchor) and
        'chr2', 'y1', 'y2' (second anchor).

    Returns
    -------
    pandas.DataFrame
        Columns 'chr', 'start', 'end', with 'start' and 'end' cast to int.
        First anchors come before second anchors, exact duplicate rows are
        dropped (first occurrence kept), and the row labels of ``loop_df``
        are preserved, so the two anchors of one loop share a label.
    """
    left = loop_df[['#chr1', 'x1', 'x2']].rename(
        columns={'#chr1': 'chr', 'x1': 'start', 'x2': 'end'})
    right = loop_df[['chr2', 'y1', 'y2']].rename(
        columns={'chr2': 'chr', 'y1': 'start', 'y2': 'end'})
    # Build the result as one non-mutating chain: assigning columns onto a
    # boolean-filtered frame (as done previously) triggers pandas'
    # SettingWithCopyWarning.
    loop_anchor = (
        pd.concat([left, right])
        .drop_duplicates()
        .astype({'start': int, 'end': int})
    )
    return loop_anchor

In [53]:
# Unique anchor intervals for each loop set ...
ca1_loop_anchor, ca3_loop_anchor = (
    get_loop_anchor_bed_df(loops) for loops in (ca1_loop, ca3_loop)
)

# ... and the same intervals as BedTool objects for interval operations.
ca1_loop_anchor_bed, ca3_loop_anchor_bed = (
    pybedtools.BedTool.from_dataframe(anchors)
    for anchors in (ca1_loop_anchor, ca3_loop_anchor)
)

In [64]:
# Number of DMRs that overlap a CA1 loop anchor after extending each DMR
# by 10,000 bp on both sides (each DMR reported at most once via u=True).
(
    dmr_bed
    .slop(b=10000, g='/home/hanliu/ref/mouse/genome/mm10.main.chrom.sizes')
    .intersect(ca1_loop_anchor_bed, wa=True, u=True)
    .count()
)

1856

In [65]:
# Total number of intervals in dmr_bed.
dmr_bed.count()

13038

In [66]:
# Number of unique CA1 loop anchors.
len(ca1_loop_anchor)

7579

In [72]:
# CA1 loop anchors on chr3 with start > 31,000,000 and end < 34,000,000,
# ordered by start coordinate.
ca1_loop_anchor.loc[
    ca1_loop_anchor['chr'].eq('chr3')
    & ca1_loop_anchor['start'].gt(31000000)
    & ca1_loop_anchor['end'].lt(34000000)
].sort_values('start')

Unnamed: 0,chr,start,end
2464,chr3,31050000,31060000
2464,chr3,31130000,31140000
2496,chr3,31140000,31150000
2496,chr3,31590000,31600000
2677,chr3,31610000,31620000
2671,chr3,31680000,31690000
2569,chr3,31790000,31800000
2569,chr3,32310000,32320000
2590,chr3,32330000,32340000
2590,chr3,32500000,32510000


In [79]:
# Selected DMRs on chr3 with start > 31,000,000 and end < 34,000,000,
# ordered by start coordinate.
dmr_bed_df.loc[
    dmr_bed_df['#chr'].eq('chr3')
    & dmr_bed_df['start'].gt(31000000)
    & dmr_bed_df['end'].lt(34000000)
].sort_values('start')

Unnamed: 0,#chr,start,end,number_of_dms
Subchr3_47607,chr3,32334694,32334710,2
Subchr3_47608,chr3,32334774,32334789,3
Subchr3_47612,chr3,32335173,32335247,4
Subchr3_47616,chr3,32336672,32336727,2
Subchr3_47617,chr3,32336745,32336757,2
Subchr3_47618,chr3,32336894,32336947,3
Subchr3_47645,chr3,32346295,32346383,4
Subchr3_48017,chr3,32516726,32516775,2
Subchr3_48407,chr3,32692315,32692337,2
Subchr3_48471,chr3,32727694,32727698,2
