In [1]:
import sys
import os
import pandas
import copy
from math import log10
sys.path.append("..")
from diachr import DiachromaticInteractionSet
from diachr import BaitedDigest
from diachr import BaitedDigestSet

# Select baited digests with imbalances in the configurations

We identified two types of baited digests with imbalances in the configurations of unbalanced interactions. For the one type, interactions with the configurations `02` and `12` predominate and, for the other type, interactions with the configurations `13` and `03` predominate. We developed a score that rewards such imbalances and thus can be used to define corresponding baited digests at a chosen threshold. In this notebook, two BED files are created, one for each of the two baited digest types. These files are intended to integrate signals from other sources, such as chromatin accessibility, across multiple baited digests.

## Input file

In [2]:
# Parameters that select the input file and name the output files
CELL_TYPE_SHORT = 'MAC_M0'
PROTOCOL = 'CHC'  # One of: 'CHC', 'HC_POOLED', 'HC'
MIN_I_DIST = '20000'
RPC_RULE = 'ht'
FDR = '05'

# Derive the path of the interaction file and the prefix of all output files from the parameters.
# The branches are mutually exclusive; an unsupported protocol fails here with a clear message
# instead of leaving INTERACTION_FILE and OUT_PREFIX undefined for the cells below.
if PROTOCOL == 'CHC':
    # The CHC input path hard-codes 20000, so MIN_I_DIST (used in OUT_PREFIX) is pinned to the same value
    MIN_I_DIST = '20000'
    INTERACTION_FILE = '../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/CHC/JAV_' + CELL_TYPE_SHORT + '_RALT_20000_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_CHC_RALT_' + MIN_I_DIST + '_' + RPC_RULE.lower()
elif PROTOCOL == 'HC_POOLED':
    # The pooled Hi-C file has a fixed path that does not depend on cell type, FDR or distance
    INTERACTION_FILE = '../DICer_interactions/HT/FDR005/HC/20000/JAV_ALL_HC_RALL_ALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + 'POOLED_HC_FDR005' + '_HC_RALT_20000_' + RPC_RULE.lower()
elif PROTOCOL == 'HC':
    INTERACTION_FILE = '../DICer_interactions/' + RPC_RULE.upper() + '/FDR0' + FDR + '/HC/' + MIN_I_DIST + '/JAV_' + CELL_TYPE_SHORT + '_HC_RALT_' + MIN_I_DIST + '_' + RPC_RULE + '_fdr0.' + FDR + '_evaluated_and_categorized_interactions.tsv.gz'
    OUT_PREFIX = 'JAV_' + CELL_TYPE_SHORT + '_HC_RALT_'  + MIN_I_DIST + '_' + RPC_RULE.lower()
else:
    raise ValueError("Unsupported PROTOCOL '" + PROTOCOL + "'. Expected 'CHC', 'HC_POOLED' or 'HC'.")

## Create  ``BaitedDigestSet``

In [3]:
# Create DiachromaticInteractionSet.
# The rule is taken from RPC_RULE (configuration cell) so that it always matches the rule
# encoded in the path of INTERACTION_FILE instead of being hard-coded a second time.
d11_interaction_set = DiachromaticInteractionSet(rpc_rule = RPC_RULE)
d11_interaction_set.parse_file(
    i_file = INTERACTION_FILE,
    verbose = True)

# Create BaitedDigestSet from the parsed interactions and print the ingest report
baited_digest_set = BaitedDigestSet()
read_interactions_info_dict = baited_digest_set.ingest_interaction_set(d11_interaction_set, verbose=True)
print(baited_digest_set.get_ingest_interaction_set_info_report())

[INFO] Parsing Diachromatic interaction file ...
	[INFO] ../DICer_interactions/HT/FDR005/CHC/JAV_MAC_M0_RALT_20000_ht_fdr0.05_evaluated_and_categorized_interactions.tsv.gz
	[INFO] Parsed 1,000,000 interaction lines ...
	[INFO] Parsed 2,000,000 interaction lines ...
	[INFO] Parsed 3,000,000 interaction lines ...
	[INFO] Parsed 4,000,000 interaction lines ...
	[INFO] Parsed 5,000,000 interaction lines ...
	[INFO] Parsed 6,000,000 interaction lines ...
	[INFO] Parsed 7,000,000 interaction lines ...
	[INFO] Parsed 8,000,000 interaction lines ...
	[INFO] Parsed 9,000,000 interaction lines ...
	[INFO] Set size: 9,648,210
[INFO] ... done.
[INFO] Reading interactions and group them according to chromosomes and baited digests ...
	[INFO] Read 1,000,000 interactions ...
	[INFO] Read 2,000,000 interactions ...
	[INFO] Read 3,000,000 interactions ...
	[INFO] Read 4,000,000 interactions ...
	[INFO] Read 5,000,000 interactions ...
	[INFO] Read 6,000,000 interactions ...
	[INFO] Read 7,000,000 intera

## Get frequencies of interactions

Get frequencies of interactions separately for each interaction category, enrichment status and configuration. The following function determines these frequencies for a given list of interactions. Therefore, it can be used to determine frequencies at individual baited digests.

In [4]:
def get_ht_freq_dicts(interaction_list):
    """
    Count interactions by interaction category, enrichment status and configuration.

    :param interaction_list: Iterable of interaction objects. Each object must provide the methods
        ``get_category()`` and ``get_ht_tag()`` and the attribute ``enrichment_status_tag_pair``.
    :return: Tuple ``(HT_TAG_FREQ_DICT, HT_TAG_DENS_DICT)``. Both are nested dictionaries that are
        indexed as ``[interaction category][enrichment status][configuration]``. The first contains
        absolute counts, the second the counts divided by the total of the respective
        category/status combination (``0.0`` everywhere if that total is zero). In both
        dimensions, the additional key ``'ALL'`` aggregates over all categories or states.
    :raises KeyError: If an interaction reports a category, enrichment status or configuration
        that is not among the keys defined below.
    """

    # Keys of the three nesting levels (the order determines the key order of the returned dictionaries)
    i_cats = ('DIX', 'DI', 'UIR', 'UI', 'ALL')
    e_cats = ('NN', 'EE', 'NE', 'EN', 'ALL')
    ht_tags = ('0X', '1X', '2X', '3X', '01', '02', '03', '12', '13', '23')

    # Every leaf dictionary is created separately, so that counters are never shared
    HT_TAG_FREQ_DICT = {
        i_key: {e_key: {ht_tag: 0 for ht_tag in ht_tags} for e_key in e_cats}
        for i_key in i_cats
    }

    # Get absolute frequencies
    for d11_inter in interaction_list:
        i_cat = d11_inter.get_category()
        e_cat = d11_inter.enrichment_status_tag_pair
        ht_tag = d11_inter.get_ht_tag()
        # Count the interaction for its own category/status and for the 'ALL' aggregates
        for i_key in (i_cat, 'ALL'):
            for e_key in (e_cat, 'ALL'):
                HT_TAG_FREQ_DICT[i_key][e_key][ht_tag] += 1

    # Fill second dictionary with relative frequencies
    HT_TAG_DENS_DICT = {}
    for i_key in i_cats:
        HT_TAG_DENS_DICT[i_key] = {}
        for e_key in e_cats:
            counts = HT_TAG_FREQ_DICT[i_key][e_key]
            i_total = sum(counts.values())
            if 0 < i_total:
                HT_TAG_DENS_DICT[i_key][e_key] = {ht_tag: counts[ht_tag]/i_total for ht_tag in ht_tags}
            else:
                # No interactions for this combination: avoid division by zero
                HT_TAG_DENS_DICT[i_key][e_key] = {ht_tag: 0.0 for ht_tag in ht_tags}

    return HT_TAG_FREQ_DICT, HT_TAG_DENS_DICT

## Select imbalanced  baited digests of type 0 and 1

We distinguish two types of baited digests:

1. **Type 0:** Baited digests at which interactions predominate that have either enrichment status `NE` and configuration `02`, or enrichment status `EN` and configuration `12`.

2. **Type 1:** Baited digests at which interactions predominate that have either enrichment status `NE` and configuration `13`, or enrichment status `EN` and configuration `03`.

The following code iterates over all baited digests. For each baited digest, the function from above is used to determine the frequencies of the associated interactions. Then we calculate the two sums of interactions with enrichment states and configurations that correspond to type `0` and `1`. We use the quotient of these two sums as the score, where the smaller sum is always divided by the larger sum and a pseudocount of 1 is added to both sums so that the quotient is defined even if a sum is zero. If the score is smaller than the chosen threshold, we assign the baited digest to one of the two types depending on which of the two sums is larger. Two BED files are generated, each containing baited digests assigned either to type `0` or `1`.

In [30]:
# If true, details are reported for each baited digest
verbose = False

# Threshold for score
threshold = 0.005

# Interactions taken into account
i_cat = 'DI'

# Variables for counting baited digests with imbalances in the configurations
ibd_num_total = 0
ibd_num_total_0 = 0
ibd_num_total_1 = 0

# Pairs of (enrichment status, configuration) in the order in which they are reported in verbose mode.
# Each inner list is printed as one group followed by an empty line.
verbose_report_groups = [
    [('NE', '02'), ('EN', '12')],
    [('NE', '13'), ('EN', '03')],
    [('NE', '12'), ('EN', '13')],
    [('NE', '03'), ('EN', '02')],
    [('NE', '0X'), ('EN', '0X')],
    [('NE', '1X'), ('EN', '1X')],
    [('NE', '2X'), ('EN', '2X')],
    [('NE', '3X'), ('EN', '3X')],
    [('NE', '01'), ('EN', '01')],
    [('NE', '23'), ('EN', '23')]
]

# The coordinates of all baited digests that have a score lower than the threshold are written to BED files.
# The context manager guarantees that both files are closed even if an error occurs in the loop.
with open(OUT_PREFIX + '_ibd_type_0.bed', 'w') as fh_0, \
     open(OUT_PREFIX + '_ibd_type_1.bed', 'w') as fh_1:

    # Track header lines
    fh_0.write('track name="' + OUT_PREFIX + '_ibd_type_0" description="' + OUT_PREFIX + ' IBD type 0" itemRgb="On"' + '\n')
    fh_1.write('track name="' + OUT_PREFIX + '_ibd_type_1" description="' + OUT_PREFIX + ' IBD type 1" itemRgb="On"' + '\n')

    # Iterate over all chromosomes
    # NOTE(review): this reads the private attribute '_baited_digest_dict' of BaitedDigestSet;
    # switch to a public accessor if the class offers one.
    for chrom, baited_digests_on_chrom in baited_digest_set._baited_digest_dict.items():

        print('Chromosome: ' + chrom)

        # Variables for counting baited digests with imbalances in the configurations on this chromosome
        ibd_num = 0
        ibd_num_0 = 0
        ibd_num_1 = 0

        # Iterate over all baited digests on this chromosome
        for baited_digest_key, baited_digest in baited_digests_on_chrom.items():

            # Prepare list of interactions that belong to this baited digest
            interaction_list = baited_digest.interactions[i_cat]['NE'] + baited_digest.interactions[i_cat]['EN']

            # Get frequencies of interactions
            HT_TAG_FREQ_DICT, HT_TAG_DENS_DICT = get_ht_freq_dicts(interaction_list)
            ne_counts = HT_TAG_FREQ_DICT[i_cat]['NE']
            en_counts = HT_TAG_FREQ_DICT[i_cat]['EN']

            # Get sum of interactions that are associated with baited digest type 0
            sum_0212 = ne_counts['02'] + en_counts['12']

            # Get sum of interactions that are associated with baited digest type 1
            sum_1303 = ne_counts['13'] + en_counts['03']

            # Get sum of all interactions at this baited digest (all configurations, NE and EN)
            sum_total = sum(ne_counts.values()) + sum(en_counts.values())

            # Calculate imbalanced baited digest score.
            # The smaller sum is divided by the larger one; the pseudocount of 1 avoids a division by zero.
            # In case of a tie, the score is 1 and the type is set to 1.
            if sum_1303 < sum_0212:
                ibd_score = (sum_1303 + 1)/(sum_0212 + 1)
                ibd_type = 0
            else:
                ibd_score = (sum_0212 + 1)/(sum_1303 + 1)
                ibd_type = 1

            # Only baited digests with a score lower than the threshold are written to one of the two BED files
            if not ibd_score < threshold:
                continue

            # Get coordinates from key
            digest_chrom, sta, end = baited_digest_key.split('\t')

            # Format score for output (negative decadic logarithm with two decimal places)
            ibd_score_formatted = "{:.2f}".format(-log10(ibd_score))

            # Write coordinates and additional information to corresponding BED file
            sums_formatted = str(sum_0212) + ':' + str(sum_1303) + ':' + str(sum_total)
            if ibd_type == 0:
                ibd_num_0 += 1
                ibd_num_total_0 += 1
                name = '0212|' + sums_formatted
                # Thick region: first 100 bases of the digest
                bed_fields = [digest_chrom, sta, end, name, ibd_score_formatted, '.', sta, str(int(sta)+100), '0,0,100']
                fh_0.write('\t'.join(bed_fields) + '\n')
            else:
                ibd_num_1 += 1
                ibd_num_total_1 += 1
                name = '1303|' + sums_formatted
                # Thick region: last 100 bases of the digest
                bed_fields = [digest_chrom, sta, end, name, ibd_score_formatted, '.', str(int(end)-100), end, '0,100,0']
                fh_1.write('\t'.join(bed_fields) + '\n')

            # Increment count variables
            ibd_num += 1
            ibd_num_total += 1

            # Output details about each individual baited digest
            if verbose:
                print('-------------------------')
                print(baited_digest_key)
                print('sum_0212: ' + str(sum_0212))
                print('sum_1303: ' + str(sum_1303))
                print('sum_total: ' + str(sum_total))
                print('ibd_score: ' + ibd_score_formatted)
                print()
                for report_group in verbose_report_groups:
                    for e_cat, ht_tag in report_group:
                        print(i_cat + '-' + e_cat + '-' + ht_tag + ': ' + str(HT_TAG_FREQ_DICT[i_cat][e_cat][ht_tag]))
                    print()

        print('\tNumber of imbalanced baited digest: ' + str(ibd_num))
        print('\tNumber of imbalanced baited digest type 0: ' + str(ibd_num_0))
        print('\tNumber of imbalanced baited digest type 1: ' + str(ibd_num_1))

print()
print('Total number of imbalanced baited digests: ' + str(ibd_num_total))
print('Total number of imbalanced baited digest type 0: ' + str(ibd_num_total_0))
print('Total number of imbalanced baited digest type 1: ' + str(ibd_num_total_1))

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':


Chromosome: chr2
	Number of imbalanced baited digest: 236
	Number of imbalanced baited digest type 0: 107
	Number of imbalanced baited digest type 1: 129
Chromosome: chr6
	Number of imbalanced baited digest: 209
	Number of imbalanced baited digest type 0: 100
	Number of imbalanced baited digest type 1: 109
Chromosome: chr9
	Number of imbalanced baited digest: 109
	Number of imbalanced baited digest type 0: 47
	Number of imbalanced baited digest type 1: 62
Chromosome: chr10
	Number of imbalanced baited digest: 119
	Number of imbalanced baited digest type 0: 50
	Number of imbalanced baited digest type 1: 69
Chromosome: chr12
	Number of imbalanced baited digest: 122
	Number of imbalanced baited digest type 0: 54
	Number of imbalanced baited digest type 1: 68
Chromosome: chr7
	Number of imbalanced baited digest: 134
	Number of imbalanced baited digest type 0: 56
	Number of imbalanced baited digest type 1: 78
Chromosome: chr3
	Number of imbalanced baited digest: 178
	Number of imbalanced ba