In [1]:
# notebook config (optional)
# None of these extensions is needed by the analysis itself; they only affect
# the editing experience.
# lab_black: code-formatting extension for notebook cells (presumably
# Black-based — confirm it is installed before running this cell).
%load_ext lab_black
# autoreload mode 2: re-import edited modules before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Step 0: MACS2 call peaks from HiChIP data
This step is optional if pre-defined peaks are supplied.

To call peaks from religation, dangling-end, and dump pairs, we use the mapped and paired bam file as input. The pipeline also provides options for including short cis validpairs, as determined by the `local_range` parameter, and for choosing whether to exclude interchromosomal validpairs (`exclude_interchro`).

In [2]:
# Standard library first, then third-party, then the project package.
import logging

import ray

from hichip_object.hichip_callpeaks import call_anchors_from_hichip

# Timestamped INFO-level logging so pipeline progress shows up in cell output.
logging.basicConfig(format="%(asctime)s: %(message)s", level=logging.INFO)

Fill in the parameters

In [3]:
# --- Input data ---------------------------------------------------------
# Digestion-site pattern; this one is for the MiD HiChIP digestion enzymes.
digestion_site = "(CT[ATCG]AG)|(TTAA)"
genome_fa = "/home/software/mm9.fa"
# For original HiChIP, use the merged and paired bam in the bwt2/ folder
# generated by HiC-Pro.
bam_file = (
    "/Extension_HDD2/Hanbin/ES_Cell/E14/HiC3_HL/HL28_Smc1_MiDHiChIP_Test/"
    "HL28_Smc1_MiDHiChIP_out/bowtie_results/bwt2_multipass/data/"
    "data.mergePasses.bam"
)

# --- PET selection ------------------------------------------------------
# Controls which PETs are included. Consider including more validpairs if
# the sequencing depth is low.
exclude_interchro = True
local_range = 1000
mapq = 10

# --- MACS2 --------------------------------------------------------------
# Relax the stringency (q-value cutoff) if the sequencing depth is low.
macs2_path = "/home/coco/miniconda3/envs/hichip-loop/bin/macs2"
macs2_genome = "mm"
macs2_qval = 0.01
macs2_out_dir = (
    "/Extension_HDD2/Hanbin/ES_Cell/E14/HiC3_HL/HL28_Smc1_MiDHiChIP_Test/"
    "HL28_Smc1_MiDHiChIP_out/"
)
prefix = "Smc1_HiChIP_1kb"

# --- Runtime ------------------------------------------------------------
# Number of CPUs handed to Ray and to the peak-calling step.
nprocs = 30

In [4]:
# Start the local Ray runtime with `nprocs` CPUs.
# `ignore_reinit_error=True` keeps this cell safe to re-run: without it, a
# second `ray.init()` in an already-initialised kernel raises a RuntimeError.
ray.init(num_cpus=nprocs, ignore_reinit_error=True)

2021-08-05 16:25:43,823	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


{'node_ip_address': '192.168.0.9',
 'raylet_ip_address': '192.168.0.9',
 'redis_address': '192.168.0.9:23885',
 'object_store_address': '/tmp/ray/session_2021-08-05_16-25-40_062093_38519/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-08-05_16-25-40_062093_38519/sockets/raylet',
 'webui_url': '127.0.0.1:8266',
 'session_dir': '/tmp/ray/session_2021-08-05_16-25-40_062093_38519',
 'metrics_export_port': 59287,
 'node_id': 'c9b60367a3a1bca30184f503ffb3ff0fb8b2067daba06b161167b692'}

In [5]:
# Run step 0 end to end. Per the log output below, the stages are: parse the
# bam into Hi-C pairs, remove duplicates, flatten PETs to reads, then call
# peaks with MACS2.
# All arguments are passed positionally, so their order must match the
# function signature (not visible in this notebook — TODO confirm against
# hichip_object.hichip_callpeaks before reordering anything).
# The value displayed by this cell is the path to the MACS2 narrowPeak file,
# which is the `peak_file` input of step 1.
call_anchors_from_hichip(
    bam_file,
    digestion_site,
    genome_fa,
    prefix,
    macs2_out_dir,
    exclude_interchro,
    local_range,
    macs2_path,
    macs2_qval,
    macs2_genome,
    mapq,
    nprocs,
)

2021-08-05 16:30:54,290: Parsing bam data to Hi-C pairs
2021-08-05 16:38:49,593: Removing duplications
2021-08-05 16:39:30,293: Processed 71414597 paired PETs. 24626682 paired PETs were kept after duplication removal
2021-08-05 16:39:30,296: Flattening to PETs to 100 bp reads
2021-08-05 16:41:07,868: 52188049 reads were dumped.
2021-08-05 16:41:07,870: Calling peaks by MACS2
2021-08-05 16:45:59,342: MACS2 called 143974 peaks at q < 0.01


'/Extension_HDD2/Hanbin/ES_Cell/E14/HiC3_HL/HL28_Smc1_MiDHiChIP_Test/HL28_Smc1_MiDHiChIP_out/Smc1_HiChIP_1kb_MACS2_results/Smc1_HiChIP_1kb_peaks.narrowPeak'

# Step 1: OBJECT call significant interactions

In [2]:
# Only needed when starting directly from step 1 (step 0 already imports
# these and starts Ray). If the notebook is run top to bottom anyway,
# `ignore_reinit_error=True` turns the second `ray.init()` into a no-op
# instead of a RuntimeError, so "Restart & Run All" does not stop here.
import logging

import ray

ray.init(num_cpus=30, ignore_reinit_error=True)
# No effect if step 0 has already configured the root logger.
logging.basicConfig(level=logging.INFO, format="%(asctime)s: %(message)s")

2021-08-04 23:03:31,424	INFO services.py:1267 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m


In [6]:
from hichip_object.load_loop_data import process_peak_to_anchor_bins
from hichip_object.glm_loop_model import Loop_ZIP

Parameters

In [7]:
# Peak calls: the narrowPeak file produced by step 0, or any pre-defined peaks.
peak_file = (
    "/Extension_HDD2/Hanbin/ES_Cell/E14/HiC3_HL/HL28_Smc1_MiDHiChIP_Test/"
    "HL28_Smc1_MiDHiChIP_out/Smc1_HiChIP_1kb_MACS2_results/"
    "Smc1_HiChIP_1kb_peaks.narrowPeak"
)
# Validpairs file that is loaded and mapped onto the anchor pairs.
vp = (
    "/Extension_HDD2/Hanbin/ES_Cell/E14/HiC3_HL/HL28_Smc1_MiDHiChIP_Test/"
    "HL28_Smc1_MiDHiChIP_out/hop_results/data.APASL.mppValidPairs"
)
chro_size = "/usr/local/bin/HiC-Pro_2.11.0-beta/annotation/chrom_mm9.sizes"
# The genome is binned at this size and peaks are assigned to the genomic bins.
bin_size = 2500
# NOTE(review): not passed anywhere at present (the Loop_ZIP call has it
# commented out); presumably the (min, max) loop span in bp — confirm.
inter_range = (5_000, 2_000_000)

In [8]:
# Assign the peaks to genomic bins of `bin_size` bp; the result is the anchor
# set handed to Loop_ZIP in the next step.
gb_anchors = process_peak_to_anchor_bins(
    resolution=bin_size, chro_size=chro_size, peak_file=peak_file
)

Load the validpair data and map it to putative anchor pairs

In [9]:
# Build the loop model input from the validpairs file and the anchor bins.
# Per the log output below, this loads the loop PETs, counts anchor depth and
# joins them into putative loops.
# NOTE(review): `inter_range` is defined in the parameter cell but left
# commented out here, so Loop_ZIP presumably falls back to its own default
# range — confirm that this is intended.
loop_zip = Loop_ZIP(
    vp,
    gb_anchors,
    # inter_range = inter_range
)

2021-08-05 16:48:02,273: Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-08-05 16:49:24,589: Loading loop PETs
2021-08-05 16:51:57,717: Counting anchor depth
2021-08-05 16:51:57,940: Joining data
2021-08-05 16:55:03,509: 82969 of anchors forms 7663615 putative mid-range loops
2021-08-05 16:55:03,570: 839454 observed loops that contains 2257595.0 PETs (avg = 2.6893611800050987)


Fit model and calculate p values

In [10]:
# Iteratively refit the model (the log reports a ZeroInflatedPoisson fit and
# the AIC for each round). `disp_glm_summary=False` presumably suppresses the
# per-round GLM summary printout — TODO confirm.
# Per the log timestamps below, each round took several minutes on this dataset.
loop_zip.iteratively_fit_model(disp_glm_summary=False)

2021-08-05 16:56:22,913: Fitting data to ZeroInflatedPoisson model
2021-08-05 16:59:14,504: AIC: 6650659.102784619
2021-08-05 16:59:15,164: 51379 interactions were called at qval <= 0.1, count >= 2.6893611800050987
2021-08-05 16:59:15,695: 9956 high confident interactions were removed from fitting
2021-08-05 17:06:00,473: AIC: 5876439.235001802
2021-08-05 17:06:02,778: 61760 interactions were called at qval <= 0.1, count >= 2.6893611800050987
2021-08-05 17:06:02,779: Sum square changes of E: 82425.70770671748
2021-08-05 17:06:03,347: 11995 high confident interactions were removed from fitting
2021-08-05 17:16:16,835: AIC: 5803013.3594795065
2021-08-05 17:16:17,410: 63305 interactions were called at qval <= 0.1, count >= 2.6893611800050987
2021-08-05 17:16:17,411: Sum square changes of E: 3001.3492110920974
2021-08-05 17:16:19,813: 12318 high confident interactions were removed from fitting
2021-08-05 17:26:28,205: AIC: 5792430.58363404
2021-08-05 17:26:28,734: 63552 interactions were c

In [11]:
# Destination for the interaction statistics, relative to the notebook's
# working directory; the overlap test further down reads this file back.
out = "loop_zip.self_1kb_peak.bedpe"
loop_zip.write_interaction_statistics(out)

# Test overlap with Micro-C loops

In [12]:
import pandas as pd
from hichip_object.loop_merge import (
    combine_two_loop_list,
    non_redundant_loops,
    significant_loops,
    mid_range_loops,
    drop_unplaced_chromosome_data,
    read_hicuups_loop,
)

In [13]:
# Reference loop set: Micro-C loops (merged_loops.bedpe from a hiccups run).
microc_loop_file = (
    "/Extension_HDD2/Hanbin/ES_Cell/Jm8.N4/Loops/"
    "WT_ALL_merge.allValidPairs_5_10kb_res_SCALE_hiccups/merged_loops.bedpe"
)

# Same pipeline as before, one named stage per step:
# load -> drop unplaced chromosomes -> keep mid-range loops -> de-duplicate.
microc_placed = drop_unplaced_chromosome_data(read_hicuups_loop(microc_loop_file))
microc_mid_range = mid_range_loops(microc_placed).reset_index(drop=True)
# 10000 is the second argument of non_redundant_loops (presumably the merge
# distance in bp — confirm against hichip_object.loop_merge).
microc_loops = non_redundant_loops(microc_mid_range, 10000)

2021-08-05 17:51:41,454: 21030 reduces to 21030


In [14]:
# Redefined here so this section can be run without re-running the write cell.
out = "loop_zip.self_1kb_peak.bedpe"

# Column layout of the headerless, tab-separated interaction file.
loop_columns = ["chr1", "x1", "y1", "chr2", "x2", "y2", "counts", "qval"]
hichip_calls = pd.read_csv(out, sep="\t", header=None, names=loop_columns)

# Drop unplaced chromosomes, then apply significant_loops with (0.05, 3)
# (presumably the q-value and count cutoffs — confirm against loop_merge).
hichip_significant = significant_loops(
    drop_unplaced_chromosome_data(hichip_calls), 0.05, 3
)
hichip_hightier_loops = non_redundant_loops(
    hichip_significant.reset_index(drop=True), 20000
)

# Compare the two loop sets; the overlap summary is written to the log.
combine_two_loop_list(
    microc_loops,
    hichip_hightier_loops,
    25000,
    ("Micro-C", "Smc1_MiDHiChIP"),
)

2021-08-05 17:51:55,584: 50946 reduces to 33732
2021-08-05 17:51:55,661: 21030 records in "Micro-C" loop set; 50946 records in "Smc1_MiDHiChIP" loop set
2021-08-05 17:51:55,684: Building Graph
2021-08-05 17:52:01,461: Assigning Components
2021-08-05 17:52:33,665: Combined to 38632 components (merged loops combined graph of both loop set)
2021-08-05 17:52:33,670: Only "Micro-C": 8194 component are consist of 8497 "Micro-C" non-redundant loops
2021-08-05 17:52:33,673: Only "Smc1_MiDHiChIP": 19978 component are consist of 20854 "Smc1_MiDHiChIP" non-redundant loops
2021-08-05 17:52:33,685: 10460 overlaping components are equal to 12533 non-redundant loops for "Micro-C" or 12878 for "Smc1_MiDHiChIP"
