In [1]:
import sys
import os
import numpy as np
sys.path.append("..")
from diachr import TadBoundarySet
from diachr import DiachromaticInteractionSet
from scipy import stats
import random

# Relationship between interactions and TAD boundaries

The aim here is to investigate the relationship between interactions and TADs. For this purpose, there is the module `TadBoundarySet`, which contains TAD boundaries and supports two functions:

1. `get_distance_to_nearest_tad_boundary(chr, pos) -> distance_to_next_tad`
2. `get_number_of_boundaries_spanned_by_region(chr, sta_pos, end_pos) -> number_of_spanned_tads`

The first function returns the distance to the next TAD boundary for a given position. The second function returns the number of TAD boundaries that are spanned by a given region. To process the interactions, the module `DiachromaticInteractionSet` is used.

## Input data

There is one CHC dataset for each of the 17 cell types and, for eight of the cell types, there are HC data and TAD boundaries.

In [2]:
# Select the cell type to analyze by leaving exactly one of the assignments below uncommented.
# Cell types marked with 'Has HC data' are the ones for which Hi-C (HC) data is available
# in addition to the capture Hi-C (CHC) data.
#CELL_TYPE_SHORT = 'MK'            # Has HC data
#CELL_TYPE_SHORT = 'ERY'           # Has HC data
#CELL_TYPE_SHORT = 'NEU'           # Has HC data
#CELL_TYPE_SHORT = 'MON'           # Has HC data
CELL_TYPE_SHORT = 'MAC_M0'        # Has HC data
#CELL_TYPE_SHORT = 'MAC_M1'
#CELL_TYPE_SHORT = 'MAC_M2'
#CELL_TYPE_SHORT = 'EP'
#CELL_TYPE_SHORT = 'NB'            # Has HC data
#CELL_TYPE_SHORT = 'TB'
#CELL_TYPE_SHORT = 'FOET'
#CELL_TYPE_SHORT = 'NCD4'          # Has HC data
#CELL_TYPE_SHORT = 'TCD4'
#CELL_TYPE_SHORT = 'NACD4'
#CELL_TYPE_SHORT = 'ACD4'
#CELL_TYPE_SHORT = 'NCD8'          # Has HC data
#CELL_TYPE_SHORT = 'TCD8'

A `TadBoundarySet` can be created with one of the eight BED files with the published TADs or a BED file with TAD boundaries from all eight cell types that was created using `BedTools`. See bash script in: `../additional_files/javierre_2016/tad_regions_hg38/`.

In [16]:
# Select the BED file with TAD boundaries by leaving exactly one assignment uncommented:
# either a cell-type-specific file or the file with the merged boundary centers.
#tad_boundary_bed_file = '../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_' + CELL_TYPE_SHORT + '_hg38.bed'
#tad_boundary_bed_file = '../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_' + 'NEU' + '_hg38.bed'
#tad_boundary_bed_file = '../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_' + 'MK' + '_hg38.bed'
tad_boundary_bed_file = '../additional_files/javierre_2016/tad_regions_hg38/merged_tad_boundary_centers.bed'

In [19]:
# File with chromosome sizes that is passed to the TAD boundary set
chr_size_file = '../additional_files/hg38.chrom.sizes.txt'

# Create the TAD boundary set from the selected BED file and report its size
tbs = TadBoundarySet(
    tad_boundary_bed_file=tad_boundary_bed_file,
    chr_size_file=chr_size_file)
print(f'TadBoundarySet has {tbs.n_tad_boundaries!s} boundaries.')

TadBoundarySet has 12949 boundaries.


Read an interaction file that was created with `DICer`.

In [5]:
# Settings that determine which interaction file is read
PROTOCOL = 'CHC'
MIN_I_DIST = '20000'
FDR = '05'
RPC_RULE = 'ht'

# Derive the path to the interaction file and the prefix for output from the selected protocol.
# NOTE(review): the CHC and HC_POOLED file paths hard-code '20000' (and HC_POOLED also 'HT'/'ht'
# and 'FDR005' in the prefix) instead of using MIN_I_DIST, RPC_RULE and FDR -- confirm that this
# is intended before changing these settings.
if PROTOCOL == 'CHC':
    INTERACTION_FILE = '../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/CHC/JAV_' + CELL_TYPE_SHORT + '_RALT_20000_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_CHC_RALT_' + MIN_I_DIST + '_' + RPC_RULE.lower()
elif PROTOCOL == 'HC':
    INTERACTION_FILE = '../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/HC/' + MIN_I_DIST + '/JAV_' + CELL_TYPE_SHORT + '_HC_RALT_' + MIN_I_DIST + '_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_HC_RALT_'  + MIN_I_DIST + '_' + RPC_RULE.lower()
elif PROTOCOL == 'HC_POOLED':
    INTERACTION_FILE = '../DICer_interactions/HT/FDR0' + FDR + '/HC/20000/JAV_ALL_HC_RALL_ALT_20000_ht_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + 'POOLED_HC_FDR005' + '_HC_RALT_20000_' + RPC_RULE.lower()
else:
    # Fail here with a clear message instead of with a NameError further down
    raise ValueError("Unknown PROTOCOL '" + str(PROTOCOL) + "'. Expected 'CHC', 'HC' or 'HC_POOLED'.")

# Read the interactions from the selected file
d11_interaction_set = DiachromaticInteractionSet(rpc_rule = RPC_RULE)
d11_interaction_set.parse_file(
    i_file = INTERACTION_FILE,
    verbose = True)

[INFO] Parsing Diachromatic interaction file ...
	[INFO] ../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
	[INFO] Parsed 1,000,000 interaction lines ...
	[INFO] Parsed 2,000,000 interaction lines ...
	[INFO] Parsed 3,000,000 interaction lines ...
	[INFO] Parsed 4,000,000 interaction lines ...
	[INFO] Parsed 5,000,000 interaction lines ...
	[INFO] Parsed 6,000,000 interaction lines ...
	[INFO] Parsed 7,000,000 interaction lines ...
	[INFO] Parsed 8,000,000 interaction lines ...
	[INFO] Parsed 9,000,000 interaction lines ...
	[INFO] Set size: 9,648,210
[INFO] ... done.


## Test whether interactions end closer to TAD boundaries than expected by chance

We compare the distances to the next TAD for the following interaction categories:

1. `DIX`: Imbalanced interactions with high read pair counts and without counterpart in the reference interactions
2. `DI`: Imbalanced interactions with counterpart in the reference interactions
3. `UIR`: Balanced reference interactions (comparable to `DI` with respect to total number and distribution of read pair numbers)
4. `UI`: Balanced interactions (remaining powered interactions)
5. `ALL`: All interaction categories combined

### Distances between interactions and next TAD boundaries

The following function determines the distances of CHC interactions to their next TAD boundary for all interactions and saves them to separate lists for the various interaction categories. For capture Hi-C data, we only consider the distances from the non-enriched (`N`) digest to the next TAD boundary. 

In [6]:
def determine_distances_to_next_tad_boundaries_chc(
    d11_interaction_set: DiachromaticInteractionSet = None,
    tbs: TadBoundarySet = None,
    random_range: int = 0):
    """
    Collect, per interaction category, the distances between capture Hi-C
    interactions and their nearest TAD boundary.

    Only interactions with the enrichment status 'NE' or 'EN' are taken into
    account. For 'NE' interactions the position `fromA` and for 'EN'
    interactions the position `toB` is passed to
    `tbs.get_distance_to_nearest_tad_boundary()`.

    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tbs: TAD boundary set used to look up the distances
    :param random_range: Accepted, but not used by this version of the function
    :return: Dictionary with the keys 'DIX', 'DI', 'UIR', 'UI' and 'ALL', each
        mapping to a list of distances; 'ALL' contains the distances of all
        evaluated interactions
    """

    # One list of distances for each interaction category and one for all categories combined
    ntb_dist_lists = {i_cat: [] for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL')}

    for interaction in d11_interaction_set._inter_dict.values():

        # This analysis is restricted to NE and EN interactions, which typically make up more than 90% of CHC data
        tag_pair = interaction.enrichment_status_tag_pair
        if tag_pair not in ('EN', 'NE'):
            continue

        # Outermost position of the 'N' digest
        n_digest_pos = interaction.fromA if tag_pair == 'NE' else interaction.toB

        # Distance from this position to the nearest TAD boundary
        dist = tbs.get_distance_to_nearest_tad_boundary(interaction.chrA, n_digest_pos)

        # Record the distance for the category of the interaction and for all interactions
        ntb_dist_lists[interaction.get_category()].append(dist)
        ntb_dist_lists['ALL'].append(dist)

    return ntb_dist_lists

The following function determines the distances of Hi-C interactions to the next TAD boundary for all interactions and saves them to separate lists for the various interaction categories. For Hi-C data, there are no enriched digests. Therefore, we determine both distances to the next TAD boundary and only add the shorter distance to the list.

In [7]:
def determine_distances_to_next_tad_boundaries_hc(
    d11_interaction_set: DiachromaticInteractionSet = None,
    tbs: TadBoundarySet = None):
    """
    Collect, per interaction category, the distances between Hi-C interactions
    and their nearest TAD boundary.

    For each interaction, the distance to the nearest TAD boundary is
    determined for the positions `fromA` and `toB`, and the smaller of the two
    distances is recorded.

    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tbs: TAD boundary set used to look up the distances
    :return: Dictionary with the keys 'DIX', 'DI', 'UIR', 'UI' and 'ALL', each
        mapping to a list of distances; 'ALL' contains the distances of all
        interactions
    """

    # One list of distances for each interaction category and one for all categories combined
    ntb_dist_lists = {i_cat: [] for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL')}

    for interaction in d11_interaction_set._inter_dict.values():

        # Distances from both outer ends of the interaction to the nearest TAD boundary
        end_distances = [
            tbs.get_distance_to_nearest_tad_boundary(interaction.chrA, end_pos)
            for end_pos in (interaction.fromA, interaction.toB)
        ]

        # Only the shorter of the two distances is kept
        dist = min(end_distances)

        # Record the distance for the category of the interaction and for all interactions
        ntb_dist_lists[interaction.get_category()].append(dist)
        ntb_dist_lists['ALL'].append(dist)

    return ntb_dist_lists

This function wraps the two functions from above into one function. Depending on which protocol has been selected, either one or the other function is used.

In [8]:
def determine_distances_to_next_tad_boundaries(
    d11_interaction_set: DiachromaticInteractionSet=None,
    tbs: TadBoundarySet = None,
    protocol: str = 'CHC',
    random_range: int = 0):
    """
    Determine the distances between interactions and their nearest TAD
    boundary with the function that matches the given protocol.

    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tbs: TAD boundary set used to look up the distances
    :param protocol: 'CHC' for capture Hi-C or 'HC' / 'HC_POOLED' for Hi-C
    :param random_range: Passed on to the capture Hi-C function only
    :return: Dictionary with lists of distances as returned by the
        protocol-specific function
    :raises ValueError: If `protocol` is not one of 'CHC', 'HC' or 'HC_POOLED'
    """

    if protocol == 'CHC':
        return determine_distances_to_next_tad_boundaries_chc(
            d11_interaction_set = d11_interaction_set,
            tbs = tbs,
            random_range = random_range)

    if protocol in ('HC', 'HC_POOLED'):
        return determine_distances_to_next_tad_boundaries_hc(
            d11_interaction_set = d11_interaction_set,
            tbs = tbs)

    # Previously, None was returned silently, which led to an obscure error in the caller
    raise ValueError("Unknown protocol '" + str(protocol) + "'. Expected 'CHC', 'HC' or 'HC_POOLED'.")

The following code determines the medians for the observed distances between interactions and the next TAD.

In [9]:
def determine_median_distances_to_next_tad_boundaries(
    d11_interaction_set: DiachromaticInteractionSet = None,
    tbs: TadBoundarySet = None,
    protocol: str = 'CHC',
    random_range: int = 0):
    """
    Determine, for each interaction category, the median distance to the
    nearest TAD boundary and the number of distances the median is based on.

    In addition to the categories 'DIX', 'DI', 'UIR', 'UI' and 'ALL', the
    returned dictionaries contain the key 'DI-UIR'. If DI and UIR have the same
    number of distances, it holds the difference of the two medians (DI minus
    UIR) and the shared number of distances. Otherwise, an error message is
    printed and 'DI-UIR' is set to NaN (median) and None (number), so that
    both dictionaries always have the same keys.

    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tbs: TAD boundary set used to look up the distances
    :param protocol: Protocol that selects how distances are determined
    :param random_range: Passed on to the function that determines the distances
    :return: Tuple of two dictionaries (medians, numbers of distances)
    """

    # Get lists of distances between interactions and their next TAD boundary
    ntb_dist_lists = determine_distances_to_next_tad_boundaries(
        d11_interaction_set = d11_interaction_set,
        tbs = tbs,
        protocol = protocol,
        random_range = random_range)

    # Determine median distances for all interaction categories
    ntb_dist_medians = {}
    ntb_dist_i_nums = {}
    for i_cat in ['DIX', 'DI', 'UIR', 'UI', 'ALL']:
        ntb_dist_medians[i_cat] = np.median(ntb_dist_lists[i_cat])
        ntb_dist_i_nums[i_cat] = len(ntb_dist_lists[i_cat])

    # Determine difference in median distances between DI and UIR (same number of interactions)
    if ntb_dist_i_nums['DI'] == ntb_dist_i_nums['UIR']:
        ntb_dist_i_nums['DI-UIR'] = ntb_dist_i_nums['DI']
        ntb_dist_medians['DI-UIR'] = ntb_dist_medians['DI'] - ntb_dist_medians['UIR']
    else:
        print("[ERROR] DI and UIR must have the same number of interactions!")
        # Keep the key available so that code that reads 'DI-UIR' does not fail with a KeyError
        ntb_dist_i_nums['DI-UIR'] = None
        ntb_dist_medians['DI-UIR'] = np.nan

    return ntb_dist_medians, ntb_dist_i_nums

In the following cell, a `TadBoundarySet` is created and the medians of distances to the next TAD boundaries are determined for each interaction category.

In [10]:
# Create a TAD boundary set
# Create the TAD boundary set from the selected BED file
tbs = TadBoundarySet(
    tad_boundary_bed_file=tad_boundary_bed_file,
    chr_size_file=chr_size_file)

# Medians of the observed distances to the next TAD boundaries
ntb_dist_medians, ntb_dist_i_nums = determine_median_distances_to_next_tad_boundaries(
    d11_interaction_set=d11_interaction_set,
    tbs=tbs,
    protocol=PROTOCOL)

# Report the observed medians, followed by an empty line
print('Observed', ntb_dist_medians, '', sep='\n')

Observed
{'DIX': 107465.0, 'DI': 118282.0, 'UIR': 124185.0, 'UI': 130412.0, 'ALL': 120208.0, 'DI-UIR': -5903.0}



### Randomize TAD boundaries

The aim here is to investigate whether capture Hi-C or Hi-C interactions for a given cell type are independent of the corresponding TAD boundaries or not. To do this, we first determine the distance from each interaction to the next TAD boundary, separately for each interaction category (using the function above). From these distances, we determine the median for each interaction category.

Then we randomize the TAD boundaries by selecting the appropriate number of random TAD boundary positions for each chromosome and again determine the median distances from interactions to the next (randomized) TAD boundary. We perform `iter_num` iterations of this procedure, which gives us `iter_num` median distances for each interaction category. For these median distances, we calculate the mean, the standard deviation and finally the Z-score.

In [11]:
# Number of randomization iterations
iter_num = 10

We have implemented two approaches for the randomization of TAD boundaries:

1. For each chromosome, a corresponding number of random positions is randomly selected from the entire sequence of the chromosome.
2. For each TAD boundary, a random position is selected from the surrounding sequence.

#### Approach 1: Randomly select from entire sequence

For each chromosome, a corresponding number of random TAD boundaries is randomly selected from the entire sequence of the chromosome. For this randomization procedure, we have to call the function `TadBoundarySet.get_randomized_tad_boundary_set()` with `random_range = 0` in each iteration.

In [12]:
# A random range of 0 is used for approach 1 (random positions from the entire chromosome)
random_range = 0

In [13]:
def perform_tad_boundary_randomization(iter_num,
                                       d11_interaction_set,
                                       tad_boundary_set,
                                       random_range,
                                       protocol,
                                       randomize_interaction_ends = False):
    """
    Repeatedly determine the median distances between interactions and their
    nearest TAD boundary after randomization.

    In each iteration, either the TAD boundaries are randomized (default) or,
    if `randomize_interaction_ends` is True, the TAD boundaries are left
    unchanged and `random_range` is passed on so that the interaction ends are
    randomized instead. The iteration index is used as random seed in both
    cases. The medians of each iteration are printed.

    :param iter_num: Number of iterations
    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tad_boundary_set: TAD boundary set that is randomized or used as is
    :param random_range: Random range for the selected kind of randomization
    :param protocol: Protocol that selects how distances are determined
    :param randomize_interaction_ends: If True, interaction ends are randomized
        instead of TAD boundaries
    :return: Dictionary that maps each of 'DIX', 'DI', 'UIR', 'UI', 'ALL' and
        'DI-UIR' to a dictionary with the number of distances ('I_NUMS') from
        the last iteration and the list of medians ('MEDIANS') from all
        iterations
    """

    i_cats = ['DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR']

    print('Randomization')
    ntb_dist_medians_random_dict = {
        i_cat: {'I_NUMS': None, 'MEDIANS': []} for i_cat in i_cats
    }
    for random_seed in range(0, iter_num):

        # Select type of randomization
        if not randomize_interaction_ends:
            # Randomize TAD boundaries and not interaction ends.
            # The passed TAD boundary set is used here and not a global variable.
            iter_tbs = tad_boundary_set.get_randomized_tad_boundary_set(
                random_seed=random_seed,
                random_range=random_range)
            random_range_interaction_ends = 0
        else:
            # Randomize interaction ends and not TAD boundaries
            iter_tbs = tad_boundary_set
            random_range_interaction_ends = random_range
            # Seed the global generator of the 'random' module so that the
            # randomized interaction ends can be reproduced for each iteration
            random.seed(random_seed)

        # Determine median distances to next randomized TAD boundaries.
        # The passed protocol is used here and not a global variable.
        ntb_dist_medians_random, ntb_dist_i_nums_random = determine_median_distances_to_next_tad_boundaries(
            d11_interaction_set = d11_interaction_set,
            tbs = iter_tbs,
            protocol = protocol,
            random_range = random_range_interaction_ends)

        # Append median distances and interaction numbers to lists
        for i_cat in i_cats:
            ntb_dist_medians_random_dict[i_cat]['I_NUMS'] = ntb_dist_i_nums_random[i_cat]
            ntb_dist_medians_random_dict[i_cat]['MEDIANS'].append(ntb_dist_medians_random[i_cat])

        print(ntb_dist_medians_random)

    print("Done.")
    return ntb_dist_medians_random_dict

In [14]:
# Report the observed medians, followed by an empty line
print('Observed', ntb_dist_medians, '', sep='\n')

# Approach 1: randomize TAD boundaries with a random range of 0
ntb_dist_medians_random_dict = perform_tad_boundary_randomization(
    iter_num=iter_num,
    d11_interaction_set=d11_interaction_set,
    tad_boundary_set=tbs,
    random_range=0,
    protocol=PROTOCOL)

Observed
{'DIX': 107465.0, 'DI': 118282.0, 'UIR': 124185.0, 'UI': 130412.0, 'ALL': 120208.0, 'DI-UIR': -5903.0}

Randomization
{'DIX': 149780.0, 'DI': 150126.0, 'UIR': 149055.0, 'UI': 149767.0, 'ALL': 149610.0, 'DI-UIR': 1071.0}
{'DIX': 146017.0, 'DI': 147986.5, 'UIR': 147368.0, 'UI': 147500.0, 'ALL': 147431.0, 'DI-UIR': 618.5}
{'DIX': 147512.0, 'DI': 147739.0, 'UIR': 147349.0, 'UI': 147125.0, 'ALL': 147501.0, 'DI-UIR': 390.0}
{'DIX': 148141.5, 'DI': 148589.0, 'UIR': 147193.0, 'UI': 147987.0, 'ALL': 147920.0, 'DI-UIR': 1396.0}
{'DIX': 146127.0, 'DI': 147328.0, 'UIR': 147160.0, 'UI': 146920.0, 'ALL': 147064.0, 'DI-UIR': 168.0}
{'DIX': 149312.0, 'DI': 148665.0, 'UIR': 149120.0, 'UI': 149388.0, 'ALL': 149017.0, 'DI-UIR': -455.0}
{'DIX': 149314.0, 'DI': 150238.5, 'UIR': 148993.0, 'UI': 148455.0, 'ALL': 149447.0, 'DI-UIR': 1245.5}
{'DIX': 148817.5, 'DI': 149612.5, 'UIR': 149131.0, 'UI': 149309.0, 'ALL': 149299.0, 'DI-UIR': 481.5}
{'DIX': 148339.5, 'DI': 148569.5, 'UIR': 148340.0, 'UI': 1479

Calculate Z-score and write out the results in table format.

In [15]:
# Report the settings that belong to the results table
print(OUT_PREFIX)
print(f'Interaction file:\n\t{INTERACTION_FILE}')
print(f'BED file with TAD boundaries:\n\t{tad_boundary_bed_file}')
print(f'Number of iterations: {iter_num!s}')
print(f'Random range: {random_range!s}')
print()

def get_randomization_results_table(ntb_dist_medians, ntb_dist_medians_random_dict):
    """
    Print a tab-separated table that compares the observed medians with the
    medians from the randomization.

    For each of the categories 'DIX', 'DI', 'UIR', 'UI', 'ALL' and 'DI-UIR',
    one row is printed with the number of interactions, the observed median,
    the mean and standard deviation of the randomized medians, the Z-score of
    the observed median and the number of randomized medians that are smaller
    than the observed median. The denominator of the last column is the number
    of randomized medians available for the category.

    :param ntb_dist_medians: Dictionary with the observed median for each category
    :param ntb_dist_medians_random_dict: Dictionary that maps each category to
        a dictionary with the keys 'I_NUMS' and 'MEDIANS' (list of randomized medians)
    """
    print('I_CAT\tI_NUMS\tOBS\tMEAN_RAND\tSTD_RAND\tZ_SCORE\tST_OBS')
    for i_cat in ['DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR']:
        i_num = ntb_dist_medians_random_dict[i_cat]['I_NUMS']
        random_medians = ntb_dist_medians_random_dict[i_cat]['MEDIANS']
        observed = ntb_dist_medians[i_cat]
        mean = np.mean(random_medians)
        std = np.std(random_medians)
        z_score = (observed - mean) / std

        # Number of randomized medians that are smaller than the observed median
        st_obs = sum(1 for median in random_medians if median < observed)

        # Use the actual number of randomized medians and not a global iteration count
        n_iter = len(random_medians)

        print(i_cat + '\t' + str(i_num) + '\t' + str(observed) + '\t' + "{:.1f}".format(mean) + '\t' + "{:.1f}".format(std) + '\t' + "{:.2f}".format(z_score) + '\t' + str(st_obs) + '/' + str(n_iter))

# Print the results table for the observed medians and the medians from the randomization
get_randomization_results_table(ntb_dist_medians = ntb_dist_medians,
                                ntb_dist_medians_random_dict = ntb_dist_medians_random_dict)


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
Random range: 0

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	147989.6	1313.1	-30.86	0/10
DI	3008766	118282.0	148444.8	1325.7	-22.75	0/10
UIR	3008766	124185.0	147981.3	1025.7	-23.20	0/10
UI	884785	130412.0	148127.2	1016.2	-17.43	0/10
ALL	7957451	120208.0	148169.2	1121.5	-24.93	0/10
DI-UIR	3008766	-5903.0	463.5	617.5	-10.31	0/10


#### Approach 2: Randomly select from surrounding sequence

Instead of picking random TAD boundary positions from the entire chromosome, we now do the following. For each TAD boundary, a random position is selected from the surrounding sequence. To do this, we create a new `TadBoundarySet` and change the `random_range`, which controls the width of the surrounding sequence, from `0` to `1000000`.

In [16]:
# Set random range
random_range = 1000000

print('Observed')
print(ntb_dist_medians)
print()

# Approach 2: randomize each TAD boundary within the surrounding sequence.
# The configured number of iterations is passed instead of a literal, so that
# the run always matches the number of iterations that is reported with the results.
ntb_dist_medians_random_dict = perform_tad_boundary_randomization(
    iter_num = iter_num,
    d11_interaction_set = d11_interaction_set,
    tad_boundary_set = tbs,
    random_range = random_range,
    protocol = PROTOCOL)

Observed
{'DIX': 107465.0, 'DI': 118282.0, 'UIR': 124185.0, 'UI': 130412.0, 'ALL': 120208.0, 'DI-UIR': -5903.0}

Randomization
{'DIX': 115537.5, 'DI': 121213.0, 'UIR': 124805.0, 'UI': 127423.0, 'ALL': 122391.0, 'DI-UIR': -3592.0}
{'DIX': 113138.0, 'DI': 120607.0, 'UIR': 123766.0, 'UI': 126450.0, 'ALL': 121305.0, 'DI-UIR': -3159.0}
{'DIX': 116518.0, 'DI': 122108.0, 'UIR': 125111.0, 'UI': 127726.0, 'ALL': 123048.0, 'DI-UIR': -3003.0}
{'DIX': 117069.0, 'DI': 123553.0, 'UIR': 125104.0, 'UI': 126511.0, 'ALL': 123609.0, 'DI-UIR': -1551.0}
{'DIX': 116091.0, 'DI': 122701.5, 'UIR': 125556.0, 'UI': 127675.0, 'ALL': 123400.0, 'DI-UIR': -2854.5}
{'DIX': 116766.5, 'DI': 122887.0, 'UIR': 125486.0, 'UI': 127180.0, 'ALL': 123483.0, 'DI-UIR': -2599.0}
{'DIX': 116318.5, 'DI': 122904.0, 'UIR': 125400.0, 'UI': 127808.0, 'ALL': 123445.0, 'DI-UIR': -2496.0}
{'DIX': 116029.0, 'DI': 122315.0, 'UIR': 125018.0, 'UI': 127603.0, 'ALL': 123018.0, 'DI-UIR': -2703.0}
{'DIX': 115053.0, 'DI': 121099.0, 'UIR': 123730.0

In [17]:
# Report the settings that belong to the results table
print(OUT_PREFIX)
print(f'Interaction file:\n\t{INTERACTION_FILE}')
print(f'BED file with TAD boundaries:\n\t{tad_boundary_bed_file}')
print(f'Number of iterations: {iter_num!s}')
print(f'Random range: {random_range!s}')
print()

# Print the results table for the observed medians and the medians from the randomization
get_randomization_results_table(
    ntb_dist_medians=ntb_dist_medians,
    ntb_dist_medians_random_dict=ntb_dist_medians_random_dict)

JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
Random range: 1000000

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	115907.4	1078.5	-7.83	0/10
DI	3008766	118282.0	122165.1	882.4	-4.40	0/10
UIR	3008766	124185.0	124837.3	632.6	-1.03	2/10
UI	884785	130412.0	127136.7	558.4	5.87	10/10
ALL	7957451	120208.0	122828.7	725.9	-3.61	0/10
DI-UIR	3008766	-5903.0	-2672.2	530.2	-6.09	0/10


The results of the randomization depend heavily on the `random_range` parameter. Here are the results for `1,000,000`, `500,000`, `400,000`, `300,000`, `200,000` and `100,000`.

```
JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 1000000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	115907.4	1078.5	-7.83	0/10
DI	3008766	118282.0	122165.1	882.4	-4.40	0/10
UIR	3008766	124185.0	124837.3	632.6	-1.03	2/10
UI	884785	130412.0	127136.7	558.4	5.87	10/10
ALL	7957451	120208.0	122828.7	725.9	-3.61	0/10
DI-UIR	3008766	-5903.0	-2672.2	530.2	-6.09	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 500000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	110309.4	994.3	-2.86	0/10
DI	3008766	118282.0	119625.8	829.8	-1.62	1/10
UIR	3008766	124185.0	123163.2	1015.2	1.01	9/10
UI	884785	130412.0	125785.6	915.6	5.05	10/10
ALL	7957451	120208.0	120289.4	909.9	-0.09	4/10
DI-UIR	3008766	-5903.0	-3537.4	377.6	-6.27	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 400000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	108612.1	778.4	-1.47	1/10
DI	3008766	118282.0	118252.9	644.4	0.05	5/10
UIR	3008766	124185.0	121547.0	744.4	3.54	10/10
UI	884785	130412.0	124244.3	895.3	6.89	10/10
ALL	7957451	120208.0	118765.2	697.9	2.07	9/10
DI-UIR	3008766	-5903.0	-3294.1	255.6	-10.21	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 300000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	106879.6	1330.0	0.44	7/10
DI	3008766	118282.0	116601.7	985.3	1.71	10/10
UIR	3008766	124185.0	120242.7	839.2	4.70	10/10
UI	884785	130412.0	123172.8	848.5	8.53	10/10
ALL	7957451	120208.0	117269.7	893.0	3.29	10/10
DI-UIR	3008766	-5903.0	-3641.0	463.2	-4.88	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 200000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	104065.1	522.7	6.50	10/10
DI	3008766	118282.0	113685.0	523.6	8.78	10/10
UIR	3008766	124185.0	117934.7	625.9	9.99	10/10
UI	884785	130412.0	122040.4	754.0	11.10	10/10
ALL	7957451	120208.0	114731.1	553.5	9.90	10/10
DI-UIR	3008766	-5903.0	-4249.7	309.9	-5.33	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 100000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	102608.4	314.0	15.47	10/10
DI	3008766	118282.0	113421.4	214.7	22.64	10/10
UIR	3008766	124185.0	118851.2	249.2	21.40	10/10
UI	884785	130412.0	124527.0	286.5	20.54	10/10
ALL	7957451	120208.0	115099.6	214.3	23.83	10/10
DI-UIR	3008766	-5903.0	-5429.8	222.5	-2.13	0/10
```

### Randomize interaction end positions

In another randomization approach, we leave the TAD boundaries unchanged and select random positions from the sequences that surround the interaction end positions.

To do this, we need to update the function `determine_distances_to_next_tad_boundaries_chc`. If the following updated function is called with `random_range = 0` it works as before. If `random_range` is greater than `0`, a position at the interaction end with the `N` digest is randomly drawn from the surrounding sequence for each interaction. Note that this works only for capture Hi-C interactions with enrichment states `NE` and `EN`. Finally, we need to take care of the case when we select positions outside a chromosome.

In [18]:
def determine_distances_to_next_tad_boundaries_chc(
    d11_interaction_set: DiachromaticInteractionSet = None,
    tbs: TadBoundarySet = None,
    random_range = 0):
    """
    Collect, per interaction category, the distances between capture Hi-C
    interactions and their nearest TAD boundary, optionally after randomizing
    the evaluated interaction end.

    Only interactions with the enrichment status 'NE' or 'EN' are taken into
    account. For 'NE' interactions the position `fromA` and for 'EN'
    interactions the position `toB` is evaluated. If `random_range` is not 0,
    the position is replaced by a random position drawn with
    `random.randint()` from the interval
    [position - random_range, position + random_range]. Positions below 0 or
    beyond the chromosome size from `tbs.chr_size_dict` are set to 0 or to the
    chromosome size, respectively.

    :param d11_interaction_set: Interaction set whose interactions are evaluated
    :param tbs: TAD boundary set used to look up distances and chromosome sizes
    :param random_range: Maximum offset for the randomization of positions;
        0 turns the randomization off
    :return: Dictionary with the keys 'DIX', 'DI', 'UIR', 'UI' and 'ALL', each
        mapping to a list of distances; 'ALL' contains the distances of all
        evaluated interactions
    """

    # One list of distances for each interaction category and one for all categories combined
    ntb_dist_lists = {i_cat: [] for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL')}

    for interaction in d11_interaction_set._inter_dict.values():

        # This analysis is restricted to NE and EN interactions, which typically make up more than 90% of CHC data
        tag_pair = interaction.enrichment_status_tag_pair
        if tag_pair not in ('EN', 'NE'):
            continue

        # Outermost position of the 'N' digest
        pos = interaction.fromA if tag_pair == 'NE' else interaction.toB

        # Replace the position by a random position from the surrounding sequence
        if random_range != 0:
            pos = random.randint(pos - random_range, pos + random_range)

        # Correct invalid positions that may result from randomization
        pos = max(0, min(pos, tbs.chr_size_dict[interaction.chrA]))

        # Distance from this position to the nearest TAD boundary
        dist = tbs.get_distance_to_nearest_tad_boundary(interaction.chrA, pos)

        # Record the distance for the category of the interaction and for all interactions
        ntb_dist_lists[interaction.get_category()].append(dist)
        ntb_dist_lists['ALL'].append(dist)

    return ntb_dist_lists

Now we can use the same functions as before for randomizing the TAD boundaries, but this time we leave the TAD boundaries unchanged and only randomize the ends of interactions (`randomize_interaction_ends = True`).

In [25]:
# Set random range
random_range = 300000

print('Observed')
print(ntb_dist_medians)
print()

# Leave the TAD boundaries unchanged and randomize the interaction ends.
# The configured number of iterations is passed instead of a literal, so that
# the run always matches the number of iterations that is reported with the results.
ntb_dist_medians_random_dict = perform_tad_boundary_randomization(
    iter_num = iter_num,
    d11_interaction_set = d11_interaction_set,
    tad_boundary_set = tbs,
    random_range = random_range,
    protocol = PROTOCOL,
    randomize_interaction_ends = True
)

Observed
{'DIX': 107465.0, 'DI': 118282.0, 'UIR': 124185.0, 'UI': 130412.0, 'ALL': 120208.0, 'DI-UIR': -5903.0}

Randomization
{'DIX': 112350.0, 'DI': 122636.5, 'UIR': 126962.5, 'UI': 129771.0, 'ALL': 123495.0, 'DI-UIR': -4326.0}
{'DIX': 112130.0, 'DI': 122624.0, 'UIR': 126887.0, 'UI': 129583.0, 'ALL': 123428.0, 'DI-UIR': -4263.0}
{'DIX': 112366.0, 'DI': 122512.0, 'UIR': 126829.0, 'UI': 129732.0, 'ALL': 123405.0, 'DI-UIR': -4317.0}
{'DIX': 112291.0, 'DI': 122553.0, 'UIR': 127102.0, 'UI': 129731.0, 'ALL': 123501.0, 'DI-UIR': -4549.0}
{'DIX': 112230.5, 'DI': 122437.0, 'UIR': 126770.0, 'UI': 129864.0, 'ALL': 123344.0, 'DI-UIR': -4333.0}
{'DIX': 112247.0, 'DI': 122519.0, 'UIR': 126773.0, 'UI': 129553.0, 'ALL': 123359.0, 'DI-UIR': -4254.0}
{'DIX': 112162.0, 'DI': 122621.0, 'UIR': 126820.0, 'UI': 129821.0, 'ALL': 123423.0, 'DI-UIR': -4199.0}
{'DIX': 112284.0, 'DI': 122552.0, 'UIR': 127044.0, 'UI': 129760.0, 'ALL': 123483.0, 'DI-UIR': -4492.0}
{'DIX': 112375.0, 'DI': 122561.0, 'UIR': 126767.0

In [26]:
# Report the settings that belong to the results table
print(OUT_PREFIX)
print(f'Interaction file:\n\t{INTERACTION_FILE}')
print(f'BED file with TAD boundaries:\n\t{tad_boundary_bed_file}')
print(f'Number of iterations: {iter_num!s}')
print(f'Random range: {random_range!s}')
print()

# Print the results table for the observed medians and the medians from the randomization
get_randomization_results_table(
    ntb_dist_medians=ntb_dist_medians,
    ntb_dist_medians_random_dict=ntb_dist_medians_random_dict)

JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
Random range: 300000

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	112256.2	89.1	-53.80	0/10
DI	3008766	118282.0	122569.4	67.8	-63.25	0/10
UIR	3008766	124185.0	126878.6	113.1	-23.82	0/10
UI	884785	130412.0	129717.7	92.3	7.52	10/10
ALL	7957451	120208.0	123426.3	50.7	-63.45	0/10
DI-UIR	3008766	-5903.0	-4309.3	120.0	-13.29	0/10


```
JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
Random range: 300000

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	112256.2	89.1	-53.80	0/10
DI	3008766	118282.0	122569.4	67.8	-63.25	0/10
UIR	3008766	124185.0	126878.6	113.1	-23.82	0/10
UI	884785	130412.0	129717.7	92.3	7.52	10/10
ALL	7957451	120208.0	123426.3	50.7	-63.45	0/10
DI-UIR	3008766	-5903.0	-4309.3	120.0	-13.29	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 200000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	110725.6	108.2	-30.15	0/10
DI	3008766	118282.0	120913.1	56.3	-46.70	0/10
UIR	3008766	124185.0	125553.8	87.2	-15.70	0/10
UI	884785	130412.0	129354.3	91.9	11.51	10/10
ALL	7957451	120208.0	122023.6	44.9	-40.42	0/10
DI-UIR	3008766	-5903.0	-4640.6	101.8	-12.41	0/10


JAV_MK_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MK_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed
Number of iterations: 10
-> Random range: 100000 <-

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1055134	107465.0	108646.9	87.9	-13.45	0/10
DI	3008766	118282.0	119359.6	49.5	-21.77	0/10
UIR	3008766	124185.0	124961.3	54.6	-14.23	0/10
UI	884785	130412.0	130315.1	123.3	0.79	9/10
ALL	7957451	120208.0	121049.1	33.7	-24.97	0/10
DI-UIR	3008766	-5903.0	-5601.6	78.7	-3.83	0/10

```

## Test whether interactions span TAD boundaries less often than expected by chance

In the second analysis, we investigate how often interactions span TAD boundaries. Because we already know that the interactions from the different categories differ in length, we normalize the number of boundaries spanned to the total length of the region that is spanned. In addition, we have seen that there are different levels of dependencies between interactions and TAD boundaries in the various interaction categories.

We randomize capture Hi-C interactions by randomly flipping half of all interactions at baits. For example, an `NE` interaction spans the region between the first position of the `N` digest and the last position of the `E` digest. The corresponding randomized interaction spans a region of the same length starting at the first position of the `E` digest. For `EN` interactions, the randomization works in the same way, but the interactions are flipped in the other direction.

For a given interaction set, the following function determines the total number of spanned boundaries, the total length of the spanned region and the total number of interactions for each category.

In [6]:
def get_sum_of_spanned_boundaries_and_total_length(
    d11_interaction_set: DiachromaticInteractionSet=None,
    tbs: TadBoundarySet=None,
    random_flip_interaction=False):
    """
    Sum up spanned TAD boundaries and spanned lengths per interaction category.

    Only interactions whose enrichment status tag pair is 'NE' or 'EN' are taken
    into account. For each of them, the number of TAD boundaries within the spanned
    region is determined via 'tbs.get_number_of_boundaries_spanned_by_region' and
    added to the sums of the interaction's category and of the combined category
    'ALL'.

    :param d11_interaction_set: Interaction set; its '_inter_dict' values are iterated
    :param tbs: TAD boundary set that provides 'chr_size_dict' and
        'get_number_of_boundaries_spanned_by_region'
    :param random_flip_interaction: If True, each interaction is flipped at its
        baited ('E') digest with a probability of about 0.5. Flipped regions that
        would protrude beyond a chromosome end are clipped. The Python module
        'random' is used, so call 'random.seed' beforehand for reproducible results.
    :return: Tuple of four dictionaries:
        STBPMB_dict: Spanned TAD boundaries per million bases for 'DIX', 'DI', 'UIR',
            'UI', 'ALL' and the difference 'DI-UIR'. A category without any spanned
            length gets NaN instead of raising a ZeroDivisionError.
        I_NUM_dict: Number of interactions for the same keys. 'DI-UIR' remains 0
            (and an error message is printed) if 'DI' and 'UIR' differ in size.
        spanned_boundary_length_dict: Raw sums ('SB_NUM', 'I_DIST', 'I_NUM') for
            each category.
        exception_count_dict: Number of interactions for which the length of the
            spanned region differs from 'i_dist'.
    """

    i_cats = ['DIX', 'DI', 'UIR', 'UI', 'ALL']

    # Sums of spanned boundaries, spanned lengths and interaction numbers
    spanned_boundary_length_dict = {
        i_cat: {'SB_NUM': 0, 'I_DIST': 0, 'I_NUM': 0} for i_cat in i_cats
    }

    # Count interactions whose spanned region had to be clipped at a chromosome end
    exception_count_dict = {i_cat: 0 for i_cat in i_cats}

    # Iterate interactions
    for d11_inter in d11_interaction_set._inter_dict.values():

        e_cat = d11_inter.enrichment_status_tag_pair
        if e_cat not in ('NE', 'EN'):
            continue

        i_cat = d11_inter.get_category()
        chrom = d11_inter.chrA
        i_dist = d11_inter.i_dist

        # Get the coordinates of the spanned region. The random number is only
        # drawn if flipping is requested, so that the sequence of random numbers
        # for a given seed is not changed.
        if random_flip_interaction and random.uniform(0, 1) <= 0.5:

            if e_cat == 'NE':

                # Flip interaction at the baited digest from the left to the right
                sta_pos = d11_inter.toB # last position of 'E' digest
                end_pos = sta_pos + i_dist

                # Flipped interaction protrudes beyond the end of the chromosome
                chrom_size = tbs.chr_size_dict[chrom]
                if chrom_size < end_pos:
                    end_pos = chrom_size

            else: # EN

                # Flip interaction at the baited digest from the right to the left
                end_pos = d11_inter.fromA # first position of 'E' digest
                sta_pos = end_pos - i_dist

                # Flipped interaction protrudes beyond the beginning of the chromosome
                if sta_pos < 0:
                    sta_pos = 0

        else:

            # Use the original coordinates
            sta_pos = d11_inter.toA
            end_pos = d11_inter.fromB

        # Determine length of spanned region (equal to 'i_dist' if no exceptions occurred)
        spanned_region_length = end_pos - sta_pos

        # Determine number of boundaries spanned by the (possibly flipped) interaction
        sb_num = tbs.get_number_of_boundaries_spanned_by_region(chrom, sta_pos, end_pos)

        # Update the interaction's own category and all categories combined
        for key in (i_cat, 'ALL'):
            if i_dist != spanned_region_length:
                exception_count_dict[key] += 1
            spanned_boundary_length_dict[key]['I_NUM'] += 1
            spanned_boundary_length_dict[key]['I_DIST'] += spanned_region_length
            spanned_boundary_length_dict[key]['SB_NUM'] += sb_num

    # Calculate spanned TAD boundaries per million bases
    STBPMB_dict = {'DIX': 0, 'DI': 0, 'UIR': 0, 'UI': 0, 'ALL': 0, 'DI-UIR': 0}
    I_NUM_dict = {'DIX': 0, 'DI': 0, 'UIR': 0, 'UI': 0, 'ALL': 0, 'DI-UIR': 0}
    for i_cat in i_cats:
        total_length = spanned_boundary_length_dict[i_cat]['I_DIST']
        if total_length:
            STBPMB_dict[i_cat] = 1000000*spanned_boundary_length_dict[i_cat]['SB_NUM']/total_length
        else:
            # Category without any spanned bases: the density is undefined
            STBPMB_dict[i_cat] = float('nan')
        I_NUM_dict[i_cat] = spanned_boundary_length_dict[i_cat]['I_NUM']

    STBPMB_dict['DI-UIR'] = STBPMB_dict['DI'] - STBPMB_dict['UIR']

    # Check whether DI and UIR have the same number of interactions
    if spanned_boundary_length_dict['DI']['I_NUM'] == spanned_boundary_length_dict['UIR']['I_NUM']:
        I_NUM_dict['DI-UIR'] = spanned_boundary_length_dict['DI']['I_NUM']
    else:
        print('[ERROR] DI and UIR must have the same number of interactions!')

    return STBPMB_dict, I_NUM_dict, spanned_boundary_length_dict, exception_count_dict

In [20]:
# Observed values: evaluate the interactions at their original coordinates (no flipping)
(STBPMB_dict,
 I_NUM_dict,
 spanned_boundary_length_dict,
 exception_count_dict) = get_sum_of_spanned_boundaries_and_total_length(
    d11_interaction_set=d11_interaction_set,
    tbs=tbs)

In [21]:
# Show the observed spanned TAD boundaries per million bases with four decimal places
print('Observed')
STBPMB_dict_print = {
    i_cat: "{:.4f}".format(STBPMB_dict[i_cat])
    for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR')
}
print(STBPMB_dict_print)
print()

Observed
{'DIX': '4.3767', 'DI': '4.8356', 'UIR': '4.8815', 'UI': '4.8951', 'ALL': '4.8583', 'DI-UIR': '-0.0459'}



In [22]:
# One randomized pass: interactions are randomly flipped at their baited digests.
# Note that the count dictionaries of the observed pass are overwritten here.
(STBPMB_random_dict,
 I_NUM_dict,
 spanned_boundary_length_dict,
 exception_count_dict) = get_sum_of_spanned_boundaries_and_total_length(
    d11_interaction_set=d11_interaction_set,
    tbs=tbs,
    random_flip_interaction=True)

In [23]:
# Per category: number of interactions whose spanned region length differs from 'i_dist'
# in the preceding randomized pass (e.g. flipped regions clipped at a chromosome end)
print(exception_count_dict)

{'DIX': 733, 'DI': 11239, 'UIR': 16019, 'UI': 10481, 'ALL': 38472}


In [24]:
# Show the result of the single randomized pass. The values are taken from
# 'STBPMB_random_dict', so they are labeled 'Randomized' and not 'Observed'.
print('Randomized')
STBPMB_dict_print = {}
for i_cat in ['DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR']:
    STBPMB_dict_print[i_cat] = "{:.4f}".format(STBPMB_random_dict[i_cat])
print(STBPMB_dict_print)
print()

Observed
{'DIX': '5.0952', 'DI': '4.9292', 'UIR': '4.8957', 'UI': '4.8516', 'ALL': '4.8993', 'DI-UIR': '0.0335'}



In [25]:
iter_num = 10

# Observed values (original coordinates) serve as reference for the randomizations
(STBPMB_dict,
 I_NUM_dict,
 spanned_boundary_length_dict,
 exception_count_dict) = get_sum_of_spanned_boundaries_and_total_length(
    d11_interaction_set=d11_interaction_set,
    tbs=tbs)

print('Observed')
STBPMB_dict_print = {
    i_cat: "{:.4f}".format(STBPMB_dict[i_cat])
    for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR')
}
print(STBPMB_dict_print)
print()

# Perform n randomizations and collect the results of all iterations

print('Randomized')

# One list of interaction numbers and one list of densities for each category
STBPMB_random_dict = {
    category: {'I_NUM': [], 'STBPMB': []}
    for category in ('DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR')
}

for random_seed in range(iter_num):

    # Use the iteration index as seed so that each iteration is reproducible
    random.seed(random_seed)

    (STBPMB_iter_dict,
     I_NUM_iter_dict,
     spanned_boundary_length_dict,
     exception_count_dict) = get_sum_of_spanned_boundaries_and_total_length(
        d11_interaction_set=d11_interaction_set,
        tbs=tbs,
        random_flip_interaction=True)

    # Print results of this iteration
    STBPMB_dict_print = {
        i_cat: "{:.4f}".format(STBPMB_iter_dict[i_cat])
        for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR')
    }
    print(STBPMB_dict_print)

    # Keep results of this iteration
    for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR'):
        STBPMB_random_dict[i_cat]['I_NUM'].append(I_NUM_iter_dict[i_cat])
        STBPMB_random_dict[i_cat]['STBPMB'].append(STBPMB_iter_dict[i_cat])

print("Done.")

Observed
{'DIX': '4.3767', 'DI': '4.8356', 'UIR': '4.8815', 'UI': '4.8951', 'ALL': '4.8583', 'DI-UIR': '-0.0459'}

Randomized
{'DIX': '5.0978', 'DI': '4.9307', 'UIR': '4.8942', 'UI': '4.8519', 'ALL': '4.8992', 'DI-UIR': '0.0364'}
{'DIX': '5.1006', 'DI': '4.9288', 'UIR': '4.8960', 'UI': '4.8510', 'ALL': '4.8994', 'DI-UIR': '0.0328'}
{'DIX': '5.0925', 'DI': '4.9288', 'UIR': '4.8938', 'UI': '4.8554', 'ALL': '4.8993', 'DI-UIR': '0.0350'}
{'DIX': '5.0939', 'DI': '4.9297', 'UIR': '4.8945', 'UI': '4.8515', 'ALL': '4.8989', 'DI-UIR': '0.0353'}
{'DIX': '5.0937', 'DI': '4.9289', 'UIR': '4.8939', 'UI': '4.8519', 'ALL': '4.8985', 'DI-UIR': '0.0351'}
{'DIX': '5.0919', 'DI': '4.9304', 'UIR': '4.8923', 'UI': '4.8519', 'ALL': '4.8982', 'DI-UIR': '0.0380'}
{'DIX': '5.0977', 'DI': '4.9286', 'UIR': '4.8950', 'UI': '4.8542', 'ALL': '4.8996', 'DI-UIR': '0.0337'}
{'DIX': '5.0940', 'DI': '4.9294', 'UIR': '4.8955', 'UI': '4.8503', 'ALL': '4.8989', 'DI-UIR': '0.0339'}
{'DIX': '5.0960', 'DI': '4.9320', 'UIR': '

In [26]:
# Report header: print the analysis settings so that the results table below can be
# copied together with the information on which input it was generated from
print(OUT_PREFIX)
print('Interaction file:\n\t' + INTERACTION_FILE)
print('BED file with TAD boundaries:\n\t' + tad_boundary_bed_file)
print('Number of iterations: ' + str(iter_num))
print()

def get_randomization_results_table_2(STBPMB_dict, STBPMB_random_dict):
    """
    Print a tab-separated table that compares observed and randomized values.

    For each of the categories 'DIX', 'DI', 'UIR', 'UI', 'ALL' and 'DI-UIR', one row
    is printed with the interaction number, the observed value, mean and standard
    deviation ('np.std') of the randomized values, the resulting Z-score and the
    number of randomized values that are smaller than the observed value.

    :param STBPMB_dict: Dictionary with one observed value for each category
    :param STBPMB_random_dict: Dictionary that contains for each category a
        dictionary with the lists 'I_NUM' and 'STBPMB' (one entry per iteration).
        Only the first entry of 'I_NUM' is reported.
    """
    print('I_CAT\tI_NUMS\tOBS\tMEAN_RAND\tSTD_RAND\tZ_SCORE\tST_OBS')
    for i_cat in ['DIX', 'DI', 'UIR', 'UI', 'ALL', 'DI-UIR']:
        random_values = STBPMB_random_dict[i_cat]['STBPMB']
        i_num = STBPMB_random_dict[i_cat]['I_NUM'][0]
        observed = STBPMB_dict[i_cat]
        mean = np.mean(random_values)
        std = np.std(random_values)
        z_score = (observed - mean) / std
        # Find number of randomized values smaller than observed
        st_obs = sum(1 for random_value in random_values if random_value < observed)
        # The denominator is derived from the data passed in and not from a notebook
        # global, so the function also works if no variable 'iter_num' is defined
        row = [
            i_cat,
            str(i_num),
            "{:.4f}".format(observed),
            "{:.4f}".format(mean),
            "{:.4f}".format(std),
            "{:.2f}".format(z_score),
            str(st_obs) + '/' + str(len(random_values))
        ]
        print('\t'.join(row))

# Print the summary table for the observed values and the randomizations from above
get_randomization_results_table_2(
    STBPMB_dict=STBPMB_dict,
    STBPMB_random_dict=STBPMB_random_dict)



JAV_MAC_M0_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/merged_tad_boundary_centers.bed
Number of iterations: 10

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1117215	4.3767	5.0952	0.0026	-274.36	0/10
DI	2985367	4.8356	4.9299	0.0011	-83.11	0/10
UIR	2985367	4.8815	4.8945	0.0010	-12.84	0/10
UI	1158445	4.8951	4.8523	0.0014	30.40	10/10
ALL	8246394	4.8583	4.8991	0.0005	-77.62	0/10
DI-UIR	2985367	-0.0459	0.0354	0.0016	-50.46	0/10


```
JAV_MAC_M0_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/merged_tad_boundary_centers.bed
Number of iterations: 10

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1117215	4.3767	5.0952	0.0026	-274.36	0/10
DI	2985367	4.8356	4.9299	0.0011	-83.11	0/10
UIR	2985367	4.8815	4.8945	0.0010	-12.84	0/10
UI	1158445	4.8951	4.8523	0.0014	30.40	10/10
ALL	8246394	4.8583	4.8991	0.0005	-77.62	0/10
DI-UIR	2985367	-0.0459	0.0354	0.0016	-50.46	0/10


JAV_MAC_M0_CHC_RALT_20000_ht
Interaction file:
	../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
BED file with TAD boundaries:
	../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MAC_M0_hg38.bed
Number of iterations: 10

I_CAT	I_NUMS	OBS	MEAN_RAND	STD_RAND	Z_SCORE	ST_OBS
DIX	1117215	2.5885	3.2342	0.0015	-432.99	0/10
DI	2985367	2.9073	2.9772	0.0008	-87.83	0/10
UIR	2985367	2.9095	2.9229	0.0006	-20.81	0/10
UI	1158445	2.9106	2.8817	0.0013	22.96	10/10
ALL	8246394	2.9004	2.9362	0.0005	-76.63	0/10
DI-UIR	2985367	-0.0023	0.0543	0.0012	-47.69	0/10
```