In [1]:
import sys
import os
import pandas
import copy
from math import log10
sys.path.append("../..")
from diachr import DiachromaticInteractionSet
from diachr import BaitedDigest
from diachr import BaitedDigestSet

# Select baited digests with imbalances in the configurations

We identified two types of baited digests with imbalances in the configurations of unbalanced interactions. For the one type, interactions with the configurations `03` and `13` predominate (BDC1) and, for the other type, interactions with the configurations `12` and `02` predominate (BDC2). We developed a score that rewards such imbalances and thus can be used to define corresponding baited digests at a chosen threshold. In this notebook, two BED files are created, one for each of the two baited digest types. These files are intended to be used to integrate signals from other sources, such as chromatin accessibility, across the ends of multiple baited digests.

## Input file

In [2]:
# Configuration of the input interaction file and of the prefix for output files.
# All values are strings because they are pasted into file names.
CELL_TYPE_SHORT = 'MAC_M0'

# Supported values: 'CHC', 'HC_POOLED' and 'HC'
PROTOCOL = 'CHC'

# Minimum interaction distance as it appears in the file names
MIN_I_DIST = '20000'
if PROTOCOL == 'CHC':
    # The CHC input path below contains a fixed '20000', so the value used for
    # the output prefix is pinned to the same distance
    MIN_I_DIST = '20000'

RPC_RULE = 'ht'

# Digits after 'FDR0' / 'fdr0.' in the file names, i.e. '05' yields 'FDR005' and 'fdr0.05'
FDR = '05'

if PROTOCOL == 'CHC':
    INTERACTION_FILE = '../../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/CHC/JAV_' + CELL_TYPE_SHORT + '_RALT_20000_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_CHC_RALT_' + MIN_I_DIST + '_' + RPC_RULE.lower()
elif PROTOCOL == 'HC_POOLED':
    # The pooled file has a fixed path; only the output prefix depends on RPC_RULE
    INTERACTION_FILE = '../../DICer_interactions/HT/FDR005/HC/20000/JAV_ALL_HC_RALL_ALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + 'POOLED_HC_FDR005' + '_HC_RALT_20000_' + RPC_RULE.lower()
elif PROTOCOL == 'HC':
    INTERACTION_FILE = '../../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/HC/' + MIN_I_DIST + '/JAV_' + CELL_TYPE_SHORT + '_HC_RALT_' + MIN_I_DIST + '_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_HC_RALT_'  + MIN_I_DIST + '_' + RPC_RULE.lower()
else:
    # Fail here instead of with a NameError for INTERACTION_FILE in a later cell
    raise ValueError("Unsupported PROTOCOL '" + PROTOCOL + "'. Expected 'CHC', 'HC_POOLED' or 'HC'.")

## Create  ``BaitedDigestSet``

In a `BaitedDigestSet` object, interactions are grouped by bait.

In [3]:
# Create DiachromaticInteractionSet
# The rule is taken from the RPC_RULE setting rather than from a second hard-coded 'ht',
# so that it cannot drift apart from the rule used to build INTERACTION_FILE.
# Lower case matches the form that was previously passed here.
d11_interaction_set = DiachromaticInteractionSet(rpc_rule = RPC_RULE.lower())
d11_interaction_set.parse_file(
    i_file = INTERACTION_FILE,
    verbose = True)
# Create BaitedDigestSet, in which the parsed interactions are grouped by bait
baited_digest_set = BaitedDigestSet()
read_interactions_info_dict = baited_digest_set.ingest_interaction_set(d11_interaction_set, verbose=True)
# Print the report on the ingestion step
print(baited_digest_set.get_ingest_interaction_set_info_report())

[INFO] Parsing Diachromatic interaction file ...
	[INFO] ../../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
	[INFO] Parsed 1,000,000 interaction lines ...
	[INFO] Parsed 2,000,000 interaction lines ...
	[INFO] Parsed 3,000,000 interaction lines ...
	[INFO] Parsed 4,000,000 interaction lines ...
	[INFO] Parsed 5,000,000 interaction lines ...
	[INFO] Parsed 6,000,000 interaction lines ...
	[INFO] Parsed 7,000,000 interaction lines ...
	[INFO] Parsed 8,000,000 interaction lines ...
	[INFO] Parsed 9,000,000 interaction lines ...
	[INFO] Set size: 9,648,210
[INFO] ... done.
[INFO] Reading interactions and group them according to chromosomes and baited digests ...
	[INFO] Read 1,000,000 interactions ...
	[INFO] Read 2,000,000 interactions ...
	[INFO] Read 3,000,000 interactions ...
	[INFO] Read 4,000,000 interactions ...
	[INFO] Read 5,000,000 interactions ...
	[INFO] Read 6,000,000 interactions ...
	[INFO] Read 7,000,000 int

## Get frequencies of HT configurations for a list of interactions

Get frequencies of configurations separately for each interaction category, enrichment status and configuration. The following function determines the frequencies of configurations for a given list of interactions. We will use this function to determine the frequencies of configurations at individual baits by passing all interactions that are associated with a specific bait.

In [4]:
def get_htc_freq_dicts(interaction_list):
    """Count HT configuration tags for a list of interactions.

    The counts are stored in a nested dictionary that is indexed by interaction
    category, enrichment status and configuration tag, in this order. Every
    interaction is counted four times: under its own category and enrichment
    status, and under the key 'ALL' at the category level, at the enrichment
    level and at both levels.

    :param interaction_list: Iterable of interaction objects that provide
        ``get_category()``, ``enrichment_status_tag_pair`` and ``get_ht_tag()``
    :return: Nested dictionary ``counts[category][enrichment status][configuration]``
        in which all combinations of keys are present, also those with count 0
    """
    categories = ('DIX', 'DI', 'UIR', 'UI', 'ALL')
    enrichment_states = ('NN', 'EE', 'NE', 'EN', 'ALL')
    configurations = ('0X', '1X', '2X', '3X', '01', '02', '03', '12', '13', '23')

    # Start with a zero count for every combination of keys
    counts = {
        category: {
            enrichment: {configuration: 0 for configuration in configurations}
            for enrichment in enrichment_states
        }
        for category in categories
    }

    # Add each interaction to its specific cell and to the three 'ALL' aggregates
    for interaction in interaction_list:
        category = interaction.get_category()
        enrichment = interaction.enrichment_status_tag_pair
        key_pairs = (
            (category, enrichment),
            ('ALL', enrichment),
            (category, 'ALL'),
            ('ALL', 'ALL'),
        )
        for category_key, enrichment_key in key_pairs:
            counts[category_key][enrichment_key][interaction.get_ht_tag()] += 1

    return counts

## Select baited digests with imbalances in configurations

We distinguish two classes of baited digests:

1. **BDC1:** Baited digests at which interactions predominate that have either enrichment status `NE` and configuration `03`, or enrichment status `EN` and configuration `13`.

2. **BDC2:** Baited digests at which interactions predominate that have either enrichment status `NE` and configuration `12`, or enrichment status `EN` and configuration `02`.

To calculate a score for a baited digest, we first determine the two sums of the counts for the configurations that are associated with the two classes. Then we divide the larger sum by the smaller sum. To avoid divisions by zero, we add a pseudo count of one to both sums. If the score is greater than or equal to a pre-specified threshold, then the baited digest is assigned to the BDC1 or BDC2 class. Depending on which of the two sums is larger, we assign the baited digest to one or the other class (if both sums are equal, BDC2 is chosen). If the score is smaller than the threshold, we assign the baited digest to the class BDC0, which is for digests with no imbalances in the configurations.

In [5]:
def get_ibc_score_and_class(NE_dict, EN_dict, ibcs_threshold):
    """Calculate the imbalanced configuration score and class of a baited digest.

    Two sums are formed: the count of '03' in ``NE_dict`` plus the count of '13'
    in ``EN_dict`` (associated with BDC1), and the count of '12' in ``NE_dict``
    plus the count of '02' in ``EN_dict`` (associated with BDC2). The score is
    the larger sum divided by the smaller sum, with one added to both.

    :param NE_dict: Configuration counts for NE interactions, needs keys '03' and '12'
    :param EN_dict: Configuration counts for EN interactions, needs keys '13' and '02'
    :param ibcs_threshold: Scores below this value result in class 'BDC0'
    :return: Tuple of score and class, where the class is 'BDC0', 'BDC1' or 'BDC2'
    """
    bdc1_count = NE_dict['03'] + EN_dict['13']
    bdc2_count = NE_dict['12'] + EN_dict['02']

    # The class with the strictly larger sum wins; ties go to BDC2
    if bdc1_count > bdc2_count:
        dominant_class, larger, smaller = 'BDC1', bdc1_count, bdc2_count
    else:
        dominant_class, larger, smaller = 'BDC2', bdc2_count, bdc1_count

    # Pseudo count of one avoids division by zero
    score = (larger + 1)/(smaller + 1)

    # Below the threshold there is no imbalance in the configurations
    if score < ibcs_threshold:
        return score, 'BDC0'
    return score, dominant_class

The following code iterates over all baited digests. For each baited digest, a list of `NE` and `EN` interactions is retrieved and the function above is used to determine the frequencies of configurations. From these frequencies, we calculate a score for each baited digest. In addition, we assign each baited digest to one of the two classes BDC1 and BDC2, or to BDC0 if there is no imbalance. We create a BED file for each of the two classes BDC1 and BDC2 that can be loaded into UCSC's genome browser. In the browser, baited digests of class `BDC1` are shown in blue and baited digests of class `BDC2` are shown in green. Digest ends, which are predominantly involved in re-ligations, are highlighted with thick ends.

In [6]:
# Classify all baited digests as BDC0, BDC1 or BDC2, write the BDC1 and BDC2 digests
# to one BED file each and report the class counts per chromosome and in total.

# If true, details are reported for each baited digest of class BDC1 or BDC2
verbose = False

# Threshold for the imbalanced configuration score
# (digests with a score below the threshold are classified as BDC0)
ibcs_threshold = 20

# Interactions taken into account
i_cat = 'ALL'

# Directory for output, created if it does not exist yet so that opening the BED files does not fail
OUT_DIR = 'bdc_lists'
os.makedirs(OUT_DIR, exist_ok=True)

# Length of the thick region that is drawn at one end of a digest (BED columns 7 and 8)
thick_end_len = 100

# Variables for counting baited digests
bd_num_total = 0
ibc_num_total_0 = 0
ibc_num_total_1 = 0
ibc_num_total_2 = 0

# The coordinates of all baited digests that have a score of at least the threshold are written to BED files.
# The files are opened in a with statement so that they are closed even if an error occurs in the loop.
with open(OUT_DIR + '/' + OUT_PREFIX + '_bdc1.bed', 'w') as fh_bdc1, \
        open(OUT_DIR + '/' + OUT_PREFIX + '_bdc2.bed', 'w') as fh_bdc2:

    # Track lines
    fh_bdc1.write('track name="' + OUT_PREFIX + '_bdc1" description="' + OUT_PREFIX + ' BDC1" itemRgb="On"' + '\n')
    fh_bdc2.write('track name="' + OUT_PREFIX + '_bdc2" description="' + OUT_PREFIX + ' BDC2" itemRgb="On"' + '\n')

    # Iterate over all chromosomes
    for chrom in baited_digest_set._baited_digest_dict.keys():

        print('Chromosome: ' + chrom)

        # Number of baited digests on this chromosome
        bd_num = 0

        # Variables for counting baited digests with and without imbalances in the configurations
        ibc_num_0 = 0
        ibc_num_1 = 0
        ibc_num_2 = 0

        # Iterate over all baited digests on this chromosome
        for baited_digest_key, baited_digest in baited_digest_set._baited_digest_dict[chrom].items():

            # Prepare list of NE and EN interactions that belong to this baited digest
            interaction_list = baited_digest.interactions[i_cat]['NE'] + baited_digest.interactions[i_cat]['EN']

            # Get frequencies of interactions
            HTC_TAG_FREQ_DICT = get_htc_freq_dicts(interaction_list)

            # Calculate score and assign to a class
            ibc_score, bd_class = get_ibc_score_and_class(
                HTC_TAG_FREQ_DICT[i_cat]['NE'],
                HTC_TAG_FREQ_DICT[i_cat]['EN'],
                ibcs_threshold)

            # If the score is at least the threshold, write coordinates of baited digest to one of the two BED files
            if bd_class == 'BDC1' or bd_class == 'BDC2':

                # Get coordinates from key (tab-separated chromosome, start and end)
                bd_chrom, sta, end = baited_digest_key.split('\t')

                # Get relevant counts for output
                sum_0313 = HTC_TAG_FREQ_DICT[i_cat]['NE']['03'] + HTC_TAG_FREQ_DICT[i_cat]['EN']['13']
                sum_1202 = HTC_TAG_FREQ_DICT[i_cat]['NE']['12'] + HTC_TAG_FREQ_DICT[i_cat]['EN']['02']
                sum_total = len(interaction_list)

                # Format score for output
                ibc_score_formatted = "{:.2f}".format(ibc_score)

                # Name field: class, score and the three counts
                name = bd_class + '|' + ibc_score_formatted + ':' + str(sum_1202) + ':' + str(sum_0313) + ':' + str(sum_total)

                # Write coordinates and additional information to corresponding BED file
                if bd_class == 'BDC1':
                    ibc_num_1 += 1
                    ibc_num_total_1 += 1
                    # Thick region at the start of the digest, blue
                    bed_fields = [bd_chrom, sta, end, name, ibc_score_formatted, '.',
                                  sta, str(int(sta) + thick_end_len), '0,0,100']
                    fh_bdc1.write('\t'.join(bed_fields) + '\n')
                else:
                    ibc_num_2 += 1
                    ibc_num_total_2 += 1
                    # Thick region at the end of the digest, green
                    bed_fields = [bd_chrom, sta, end, name, ibc_score_formatted, '.',
                                  str(int(end) - thick_end_len), end, '0,100,0']
                    fh_bdc2.write('\t'.join(bed_fields) + '\n')

                # Output details about each individual baited digest
                if verbose:
                    print('-------------------------')
                    print(baited_digest_key)
                    print('sum_1202: ' + str(sum_1202))
                    print('sum_0313: ' + str(sum_0313))
                    print('sum_total: ' + str(sum_total))
                    print('bd_class: ' + bd_class)
                    print('ibc_score_formatted: ' + ibc_score_formatted)
                    print()
                    for i_conf in ['0X', '1X', '2X', '3X', '01', '02', '03', '12', '13', '23']:
                        for e_cat in ['NE','EN']:
                            print(i_cat + '-' + e_cat + '-' + i_conf + ': ' + str(HTC_TAG_FREQ_DICT[i_cat][e_cat][i_conf]))
                        print()
            else:
                # BDC0: No imbalances in the configurations
                ibc_num_0 += 1
                ibc_num_total_0 += 1

            # Number of baited digests on this chromosome
            bd_num += 1
            bd_num_total += 1

        # Report counts for this chromosome
        print('\tNumber of baited digests: ' + "{:,}".format(bd_num))
        print('\t\tBDC0: ' + "{:,}".format(ibc_num_0))
        print('\t\tBDC1: ' + "{:,}".format(ibc_num_1))
        print('\t\tBDC2: ' + "{:,}".format(ibc_num_2))

# Report counts for all chromosomes
print()
print('Total number of baited digests: ' + "{:,}".format(bd_num_total))
print('\tBDC0: ' + "{:,}".format(ibc_num_total_0))
print('\tBDC1: ' + "{:,}".format(ibc_num_total_1))
print('\tBDC2: ' + "{:,}".format(ibc_num_total_2))

Chromosome: chr2
	Number of baited digests: 1,598
		BDC0: 592
		BDC1: 446
		BDC2: 560
Chromosome: chr6
	Number of baited digests: 1,211
		BDC0: 457
		BDC1: 342
		BDC2: 412
Chromosome: chr9
	Number of baited digests: 856
		BDC0: 308
		BDC1: 258
		BDC2: 290
Chromosome: chr10
	Number of baited digests: 928
		BDC0: 331
		BDC1: 272
		BDC2: 325
Chromosome: chr12
	Number of baited digests: 1,196
		BDC0: 433
		BDC1: 335
		BDC2: 428
Chromosome: chr7
	Number of baited digests: 1,008
		BDC0: 375
		BDC1: 276
		BDC2: 357
Chromosome: chr3
	Number of baited digests: 1,330
		BDC0: 518
		BDC1: 344
		BDC2: 468
Chromosome: chrX
	Number of baited digests: 844
		BDC0: 345
		BDC1: 224
		BDC2: 275
Chromosome: chr4
	Number of baited digests: 940
		BDC0: 347
		BDC1: 255
		BDC2: 338
Chromosome: chr1
	Number of baited digests: 2,238
		BDC0: 781
		BDC1: 654
		BDC2: 803
Chromosome: chr18
	Number of baited digests: 351
		BDC0: 126
		BDC1: 97
		BDC2: 128
Chromosome: chr5
	Number of baited digests: 1,111
		BDC0: 459
