In [1]:
from papermill import execute_notebook, PapermillExecutionError
import pandas as pd
import pathlib
import numpy as np
from scipy.sparse import csr_matrix, vstack
import anndata

## Runner

In [2]:
def _run_recipe_notebook(notebook_name, recipe_dir, params):
    """Execute one recipe notebook with papermill.

    Parameters
    ----------
    notebook_name : str
        File name of the template notebook. It is handed to papermill as a
        bare relative path, and the executed copy is saved under the same
        name inside ``recipe_dir``.
    recipe_dir : pathlib.Path
        Directory that receives the executed notebook; also passed to
        papermill as ``cwd``.
    params : dict
        Values passed to papermill as the notebook ``parameters``.
    """
    execute_notebook(str(notebook_name),
                     str(recipe_dir / notebook_name),
                     parameters=params,
                     engine_name=None,
                     prepare_only=False,
                     kernel_name=None,
                     progress_bar=True,
                     log_output=False,
                     start_timeout=60,
                     report_mode=False,
                     cwd=str(recipe_dir))


def runner(chrom):
    """Run the three DMR recipe notebooks for one chromosome.

    The steps run in order, each only when the file it is checked against
    does not exist yet, so finished steps are not repeated on a re-run:

    1. ``1.FilterDMRRecipe_robust_mean_delta.ipynb`` -- skipped when
       ``HyperDMR/TotalHits.h5ad`` exists.
    2. ``2.MotifScan.ipynb`` on ``TotalDMR.nofilter.bed`` -- skipped when
       ``MotifScan.h5ad`` exists.
    3. ``3.DMRAnnotation.ipynb`` on ``TotalDMR.nofilter.bed`` -- skipped when
       ``Annotation/DMRAnnotation.h5ad`` exists.

    Parameters
    ----------
    chrom : str
        Chromosome name used to build the ``SubType-{chrom}`` paths,
        e.g. ``'chr1'``.

    Returns
    -------
    None
    """
    dmr_prefix = 'Sub'
    dmr_dir = pathlib.Path(
        f'/home/hanliu/project/mouse_rostral_brain/DMR/SubType/raw/SubType-{chrom}/')
    dmr_path = f'/home/hanliu/project/mouse_rostral_brain/DMR/SubType/raw/SubType-{chrom}/SubType-{chrom}_rms_results_collapsed.tsv'

    # filtering thresholds forwarded unchanged to the step 1 notebook
    skip_quantile = 0.25
    delta_to_mean = 0.3
    dms_cutoff = 1

    black_list_path = '/home/hanliu/ref/blacklist/mm10-blacklist.v2.bed.gz'
    motif_file_path = '/home/hanliu/ref/MEME/selected_motifs_sets/JASPAR2020_CORE_vertebrates_non-redundant_pfms.has_mouse_id.meme'
    genome_fasta_path = '/home/hanliu/ref/mouse/genome/fasta/raw/mm10.fa'
    chrom_size_path = '/home/hanliu/ref/mouse/genome/mm10.main.chrom.sizes'

    # executed notebooks are kept next to the data they produced
    recipe_dir = dmr_dir / 'Recipe'
    recipe_dir.mkdir(exist_ok=True)

    total_dmr_bed = str(dmr_dir / 'TotalDMR.nofilter.bed')

    # (template notebook, file whose existence marks the step as done, parameters)
    steps = [
        # step 1 filter; the sentinel is the last file generated by this notebook
        ('1.FilterDMRRecipe_robust_mean_delta.ipynb',
         'HyperDMR/TotalHits.h5ad',
         dict(dmr_path=dmr_path,
              dmr_prefix=dmr_prefix,
              black_list_path=black_list_path,
              skip_quantile=skip_quantile,
              delta_to_mean=delta_to_mean,
              dms_cutoff=dms_cutoff)),
        # step 2 motif scan of total DMR
        ('2.MotifScan.ipynb',
         'MotifScan.h5ad',
         dict(bed_path=total_dmr_bed,
              motif_file_path=motif_file_path,
              genome_fasta_path=genome_fasta_path,
              chrom_size_path=chrom_size_path)),
        # step 3 annotation of total DMR
        ('3.DMRAnnotation.ipynb',
         'Annotation/DMRAnnotation.h5ad',
         dict(dmr_bed=total_dmr_bed)),
    ]

    for notebook_name, sentinel, params in steps:
        # the check happens right before each step, after earlier steps ran
        if not (dmr_dir / sentinel).exists():
            _run_recipe_notebook(notebook_name, recipe_dir, params)

## Run each chrom

In [3]:
# Chromosomes processed by this notebook, in order: the autosomes chr1-chr19
# followed by the two sex chromosomes.
chroms = [f'chr{number}' for number in range(1, 20)] + ['chrX', 'chrY']

In [4]:
# Process the chromosomes one after another, in the order given by `chroms`;
# each call runs the recipe notebooks for that single chromosome.
for chrom in chroms:
    runner(chrom)

## Combine Chrom Results

In [5]:
# `input_dir` is globbed below for the per-chromosome `SubType-chr*` folders;
# `output_dir` is the root for the combined (all-chromosome) results.
input_dir, output_dir = map(pathlib.Path, (
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/raw/',
    '/home/hanliu/project/mouse_rostral_brain/DMR/SubType/Total/',
))

### Aggregate DMR bed

In [6]:
# Chromosome name (the part after '-' in the 'SubType-chr*' folder name)
# -> that folder's TotalDMR.nofilter.bed.
dmr_bed_paths = dict(
    (bed_path.parent.name.split('-')[1], bed_path)
    for bed_path in input_dir.glob('SubType-chr*/TotalDMR.nofilter.bed'))

In [14]:
def _read_chrom_bed(chrom):
    """Load one chromosome's bed (indexed by its fourth column) with renamed IDs.

    Every ID becomes ``Sub{chrom}_<second '_'-separated field of the old ID>``
    (presumably to keep IDs unique once chromosomes are concatenated).
    """
    bed = pd.read_csv(dmr_bed_paths[chrom], sep='\t', index_col=3, header=None)
    bed.index = bed.index.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])
    return bed


# Concatenate the chromosomes in `chroms` order.
total_df = pd.concat([_read_chrom_bed(chrom) for chrom in chroms])

# After reset_index() the ID sits in position 0; move it behind the first
# three data columns before writing a header-less, tab-separated file.
bed_columns = total_df.reset_index().iloc[:, [1, 2, 3, 0]]
bed_columns.to_csv(output_dir / 'TotalDMR.nofilter.bed',
                   sep='\t',
                   header=None,
                   index=None)

## Aggregate Motif Scan

In [7]:
# Chromosome name -> per-chromosome MotifScan.h5ad; the chromosome is the
# part after '-' in the 'SubType-chr*' folder name.
adata_paths = {}
for motif_h5ad in input_dir.glob('SubType-chr*/MotifScan.h5ad'):
    adata_paths[motif_h5ad.parent.name.split('-')[1]] = motif_h5ad

In [8]:
# Stack the per-chromosome motif-scan matrices row-wise into one AnnData.
obs_frames = []
matrices = []
motif_order = None
for chrom in chroms:
    adata = anndata.read_h5ad(adata_paths[chrom])
    # Rename rows to Sub{chrom}_<second '_'-separated field of the old name>.
    adata.obs_names = adata.obs_names.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])

    if motif_order is None:
        # The first chromosome fixes the column order ...
        motif_order = adata.var_names
    else:
        # ... and every later one is re-ordered to match it before stacking.
        adata = adata[:, motif_order].copy()
    obs_frames.append(adata.obs)
    matrices.append(adata.X)

# NOTE: only the var names are carried over; per-file `var` columns are dropped.
# .tocsr() matches the TotalHits aggregation cells of this notebook.
total_adata = anndata.AnnData(X=vstack(matrices).tocsr(),
                              obs=pd.concat(obs_frames),
                              var=pd.DataFrame([], index=motif_order))
# Write next to the other combined results instead of the kernel's cwd.
total_adata.write_h5ad(output_dir / 'MotifScan.h5ad')

## Aggregate Annotation

In [8]:
# Chromosome name -> per-chromosome Annotation/DMRAnnotation.h5ad. The file
# sits one level below the 'SubType-chr*' folder, hence parent.parent.
adata_paths = dict(
    (annot_h5ad.parent.parent.name.split('-')[1], annot_h5ad)
    for annot_h5ad in input_dir.glob('SubType-chr*/Annotation/DMRAnnotation.h5ad'))

In [12]:
# Stack the per-chromosome DMR annotation matrices row-wise into one AnnData.
obs_frames = []
matrices = []
feature_order = None
for chrom in chroms:
    adata = anndata.read_h5ad(adata_paths[chrom])
    # Rename rows to Sub{chrom}_<second '_'-separated field of the old name>.
    adata.obs_names = adata.obs_names.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])

    if feature_order is None:
        # The first chromosome fixes the column order ...
        feature_order = adata.var_names
    else:
        # ... and every later one is re-ordered to match it before stacking.
        adata = adata[:, feature_order].copy()
    obs_frames.append(adata.obs)
    matrices.append(adata.X)

# NOTE: only the var names are carried over; per-file `var` columns are dropped.
# .tocsr() matches the TotalHits aggregation cells of this notebook.
total_adata = anndata.AnnData(X=vstack(matrices).tocsr(),
                              obs=pd.concat(obs_frames),
                              var=pd.DataFrame([], index=feature_order))
total_adata.write_h5ad(output_dir / 'DMRAnnotation.h5ad')

## Hypo and Hyper DMR

### Aggregate hypo bed

In [9]:
# One HypoDMR folder per chromosome, in `chroms` order.
hypo_dirs = [input_dir / f'SubType-{chrom}/HypoDMR/' for chrom in chroms]

# Every file in those folders whose name ends in 'bed' ...
dmr_beds = [bed_file
            for hypo_dir in hypo_dirs
            for bed_file in hypo_dir.glob('*bed')]
# ... and the path of the TotalHits.h5ad of each folder.
adata_list = [hypo_dir / 'TotalHits.h5ad' for hypo_dir in hypo_dirs]

In [10]:
# One row per bed file: the cluster is the file name up to its first '.',
# the chromosome comes from the 'SubType-chr*' folder two levels up.
bed_path_df = pd.DataFrame(
    [[bed_file.name.split('.')[0],
      bed_file.parent.parent.name.split('-')[1],
      str(bed_file)] for bed_file in dmr_beds],
    columns=['cluster', 'chrom', 'path'])

In [11]:
this_output = output_dir / 'HypoDMR'
# parents=True: do not fail on a fresh run where `output_dir` is still missing.
this_output.mkdir(parents=True, exist_ok=True)

# For every cluster, concatenate its per-chromosome bed files into one bed.
for cluster, sub_df in bed_path_df.groupby('cluster'):
    print(cluster)
    sub_df = sub_df.set_index('chrom')
    out_path = this_output / f'{cluster}.DMS1.bed'
    dfs = []
    for chrom in chroms:
        try:
            df = pd.read_csv(sub_df.at[chrom, 'path'],
                             header=None,
                             sep='\t',
                             index_col=3)
        except pd.errors.EmptyDataError:
            # An empty bed file means nothing to add for this chromosome;
            # same handling as the HyperDMR aggregation.
            continue
        # Rename IDs to Sub{chrom}_<second '_'-separated field of the old ID>.
        df.index = df.index.map(
            lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])
        dfs.append(df)

    if not dfs:
        # pd.concat([]) raises ValueError; mirror the inputs and emit an
        # empty bed file instead.
        print(f'{cluster}: every chromosome file is empty, writing empty bed')
        out_path.write_text('')
        continue

    total_df = pd.concat(dfs)
    # After reset_index() the ID is in position 0; move it behind the first
    # three data columns.
    total_df.reset_index().iloc[:, [1, 2, 3, 0]].to_csv(out_path,
                                                        header=None,
                                                        index=None,
                                                        sep='\t')
    

ANP_anp-dg
ANP_anp-olf-cnu
ASC_cortex-olf
ASC_mid
ASC_str-hpf
CA1_Ak5
CA1_Chrm3
CA1_Kif26a
CA1_Lingo2
CA1_Ptprg
CA3-St18_Epha5
CA3-St18_Nuak1
CA3-St18_Tead1
CA3_Cadm2
CA3_Efnb2
CGE-Lamp5_Dock5
CGE-Lamp5_Grid1
CGE-Lamp5_Grk5
CGE-Lamp5_Nrxn3
CGE-Lamp5_Sorcs1
CGE-Vip_Ccser1
CGE-Vip_Clstn2
CGE-Vip_Fstl4
CGE-Vip_Galnt17
CGE-Vip_Grm8
CGE-Vip_Ntng1
CGE-Vip_Ptprm
CGE-Vip_Robo1
CLA_Bcl11a
CLA_Cdh8
CLA_Nrp2
CT-L6_Hcrtr2
CT-L6_Il1rap
CT-L6_Map4
CT-L6_Megf9
Chd7_Kcnc2
Chd7_Megf11
Chd7_Trpc7
D1L-Fstl4_Cadm1
D1L-Fstl4_Crim1
D1L-Fstl4_Grm3
D1L-Fstl4_Sipa1l2
D1L-Fstl4_Trps1
D1L-PAL_Flrt2
D1L-PAL_Plcxd3
DG-po_Bcl11a
DG-po_Calb2
DG-po_Kctd8
DG_dg-all
EC_Abhd2
EC_Sema3g
EP_Adcy8
EP_Rgs8
EP_Tspan5
Foxp2_Dchs2
Foxp2_Homer2
Foxp2_Inpp4b
Foxp2_Trpc7
Gfra1_Gfra1
IG-CA2_Chrm3
IG-CA2_Peak1
IG-CA2_Xpr1
IT-L23_Cux1
IT-L23_Foxp1
IT-L23_Ptprt
IT-L23_Tenm2
IT-L4_Astn2
IT-L4_Shc3
IT-L5_Cdh8
IT-L5_Etv1
IT-L5_Grik3
IT-L6_Cadps2
IT-L6_Fstl4
IT-L6_Man1c1
IT-L6_Oxr1
L6b_Adcy8
L6b_Kcnk2
L6b_Nrp2
L6b_Pkhd1
LSX-Inh_Cacna1i
L

### Aggregate hyper bed

In [14]:
# One HyperDMR folder per chromosome, in `chroms` order.
hyper_dirs = [input_dir / f'SubType-{chrom}/HyperDMR/' for chrom in chroms]

# Every file in those folders whose name ends in 'bed' ...
dmr_beds = [bed_file
            for hyper_dir in hyper_dirs
            for bed_file in hyper_dir.glob('*bed')]
# ... and the path of the TotalHits.h5ad of each folder.
adata_list = [hyper_dir / 'TotalHits.h5ad' for hyper_dir in hyper_dirs]

In [15]:
# One row per bed file: the cluster is the file name up to its first '.',
# the chromosome comes from the 'SubType-chr*' folder two levels up.
bed_path_df = pd.DataFrame(
    [[bed_file.name.split('.')[0],
      bed_file.parent.parent.name.split('-')[1],
      str(bed_file)] for bed_file in dmr_beds],
    columns=['cluster', 'chrom', 'path'])

In [16]:
this_output = output_dir / 'HyperDMR'
# parents=True: do not fail on a fresh run where `output_dir` is still missing.
this_output.mkdir(parents=True, exist_ok=True)

# For every cluster, concatenate its per-chromosome bed files into one bed.
for cluster, sub_df in bed_path_df.groupby('cluster'):
    print(cluster)
    sub_df = sub_df.set_index('chrom')
    out_path = this_output / f'{cluster}.DMS1.bed'
    dfs = []
    for chrom in chroms:
        try:
            df = pd.read_csv(sub_df.at[chrom, 'path'],
                             header=None,
                             sep='\t',
                             index_col=3)
        except pd.errors.EmptyDataError:
            # An empty bed file means nothing to add for this chromosome.
            continue
        # Rename IDs to Sub{chrom}_<second '_'-separated field of the old ID>.
        df.index = df.index.map(
            lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])
        dfs.append(df)

    if not dfs:
        # All inputs were skipped as empty: pd.concat([]) would raise
        # ValueError, so mirror the inputs and emit an empty bed file.
        print(f'{cluster}: every chromosome file is empty, writing empty bed')
        out_path.write_text('')
        continue

    total_df = pd.concat(dfs)
    # After reset_index() the ID is in position 0; move it behind the first
    # three data columns.
    total_df.reset_index().iloc[:, [1, 2, 3, 0]].to_csv(out_path,
                                                        header=None,
                                                        index=None,
                                                        sep='\t')
    

ANP_anp-dg
ANP_anp-olf-cnu
ASC_cortex-olf
ASC_mid
ASC_str-hpf
CA1_Ak5
CA1_Chrm3
CA1_Kif26a
CA1_Lingo2
CA1_Ptprg
CA3-St18_Epha5
CA3-St18_Nuak1
CA3-St18_Tead1
CA3_Cadm2
CA3_Efnb2
CGE-Lamp5_Dock5
CGE-Lamp5_Grid1
CGE-Lamp5_Grk5
CGE-Lamp5_Nrxn3
CGE-Lamp5_Sorcs1
CGE-Vip_Ccser1
CGE-Vip_Clstn2
CGE-Vip_Fstl4
CGE-Vip_Galnt17
CGE-Vip_Grm8
CGE-Vip_Ntng1
CGE-Vip_Ptprm
CGE-Vip_Robo1
CLA_Bcl11a
CLA_Cdh8
CLA_Nrp2
CT-L6_Hcrtr2
CT-L6_Il1rap
CT-L6_Map4
CT-L6_Megf9
Chd7_Kcnc2
Chd7_Megf11
Chd7_Trpc7
D1L-Fstl4_Cadm1
D1L-Fstl4_Crim1
D1L-Fstl4_Grm3
D1L-Fstl4_Sipa1l2
D1L-Fstl4_Trps1
D1L-PAL_Flrt2
D1L-PAL_Plcxd3
DG-po_Bcl11a
DG-po_Calb2
DG-po_Kctd8
DG_dg-all
EC_Abhd2
EC_Sema3g
EP_Adcy8
EP_Rgs8
EP_Tspan5
Foxp2_Dchs2
Foxp2_Homer2
Foxp2_Inpp4b
Foxp2_Trpc7
Gfra1_Gfra1
IG-CA2_Chrm3
IG-CA2_Peak1
IG-CA2_Xpr1
IT-L23_Cux1
IT-L23_Foxp1
IT-L23_Ptprt
IT-L23_Tenm2
IT-L4_Astn2
IT-L4_Shc3
IT-L5_Cdh8
IT-L5_Etv1
IT-L5_Grik3
IT-L6_Cadps2
IT-L6_Fstl4
IT-L6_Man1c1
IT-L6_Oxr1
L6b_Adcy8
L6b_Kcnk2
L6b_Nrp2
L6b_Pkhd1
LSX-Inh_Cacna1i
L

### Aggregate hypo Adata

In [17]:
# Chromosome name -> per-chromosome HypoDMR/TotalHits.h5ad. The file sits one
# level below the 'SubType-chr*' folder, hence parent.parent.
adata_paths = {}
for hits_h5ad in input_dir.glob('SubType-chr*/HypoDMR/TotalHits.h5ad'):
    adata_paths[hits_h5ad.parent.parent.name.split('-')[1]] = hits_h5ad

In [27]:
# Stack the per-chromosome HypoDMR TotalHits matrices row-wise into one AnnData.
obs_frames = []
matrices = []
column_order = None
for chrom in chroms:
    adata = anndata.read_h5ad(adata_paths[chrom])
    # Rename rows to Sub{chrom}_<second '_'-separated field of the old name>.
    adata.obs_names = adata.obs_names.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])

    if column_order is None:
        # The first chromosome fixes the column order ...
        column_order = adata.var_names
    else:
        # ... and every later one is re-ordered to match it before stacking.
        adata = adata[:, column_order].copy()
    obs_frames.append(adata.obs)
    matrices.append(adata.X)

# NOTE: only the var names are carried over; per-file `var` columns are dropped.
total_adata = anndata.AnnData(X=vstack(matrices).tocsr(),
                              obs=pd.concat(obs_frames),
                              var=pd.DataFrame([], index=column_order))
total_adata.write_h5ad(output_dir / 'TotalHits.HypoDMR.h5ad')

... storing '#chr' as categorical


### Aggregate hyper Adata

In [15]:
# Chromosome name -> per-chromosome HyperDMR/TotalHits.h5ad. The file sits one
# level below the 'SubType-chr*' folder, hence parent.parent.
adata_paths = {}
for hits_h5ad in input_dir.glob('SubType-chr*/HyperDMR/TotalHits.h5ad'):
    adata_paths[hits_h5ad.parent.parent.name.split('-')[1]] = hits_h5ad

In [17]:
# Stack the per-chromosome HyperDMR TotalHits matrices row-wise into one AnnData.
obs_frames = []
matrices = []
column_order = None
for chrom in chroms:
    adata = anndata.read_h5ad(adata_paths[chrom])
    # Rename rows to Sub{chrom}_<second '_'-separated field of the old name>.
    adata.obs_names = adata.obs_names.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])

    if column_order is None:
        # The first chromosome fixes the column order ...
        column_order = adata.var_names
    else:
        # ... and every later one is re-ordered to match it before stacking.
        adata = adata[:, column_order].copy()
    obs_frames.append(adata.obs)
    matrices.append(adata.X)

# NOTE: only the var names are carried over; per-file `var` columns are dropped.
total_adata = anndata.AnnData(X=vstack(matrices).tocsr(),
                              obs=pd.concat(obs_frames),
                              var=pd.DataFrame([], index=column_order))
total_adata.write_h5ad(output_dir / 'TotalHits.HyperDMR.h5ad')

... storing '#chr' as categorical


## Aggregate DMR info

In [18]:
# Chromosome name (the part after '-' in the 'SubType-chr*' folder name)
# -> that folder's DMRInfo.h5.
hdf_paths = dict(
    (info_h5.parent.name.split('-')[1], info_h5)
    for info_h5 in input_dir.glob('SubType-chr*/DMRInfo.h5'))

In [22]:
# Concatenate the 'Rate' and 'Bed' tables of every chromosome's DMRInfo.h5.
rates = []
beds = []
for chrom in chroms:
    # mode='r': the per-chromosome stores are only read here, so do not open
    # them in HDFStore's default append mode.
    with pd.HDFStore(hdf_paths[chrom], mode='r') as hdf:
        rate = hdf['Rate']
        bed = hdf['Bed']

    # Rename rows to Sub{chrom}_<second '_'-separated field of the old name>.
    rate.index = rate.index.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])
    bed.index = bed.index.map(
        lambda dmr_id: f'Sub{chrom}_' + dmr_id.split('_')[1])

    rates.append(rate)
    beds.append(bed)

rate = pd.concat(rates)
bed = pd.concat(beds)
with pd.HDFStore(output_dir / 'DMRInfo.h5') as hdf:
    hdf['Rate'] = rate
    # NOTE(review): the inputs use the key 'Bed' but the combined store is
    # written with 'bed'. Left unchanged because readers of the combined file
    # may already rely on it -- confirm and unify.
    hdf['bed'] = bed