In [1]:
import sys
import os
import numpy as np
sys.path.append("..")
from diachr import TadBoundarySet
from diachr import DiachromaticInteractionSet
from scipy import stats

In [2]:
# Sanity check of TadBoundarySet on a single hand-picked coordinate.
# Only the merged boundary centers are loaded here; the alternative
# '..._MK_hg38.bed' file is kept as a comment rather than being assigned
# and then immediately overwritten.
#tad_boundaries = '../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_MK_hg38.bed'
tad_boundaries = '../additional_files/javierre_2016/tad_regions_hg38/merged_tad_boundary_centers.bed'

tbs = TadBoundarySet(tad_boundaries)

# Query coordinate shared by both lookups below
test_chrom, test_pos = 'chr1', 10600000

# Nearest boundary for the query coordinate ...
nearest_tad_pos = tbs.get_nearest_tad_boundary(test_chrom, test_pos)
print(nearest_tad_pos)
# ... and the distance to it
nearest_tad_dist = tbs.get_distance_to_nearest_tad_boundary(test_chrom, test_pos)
print(nearest_tad_dist)

10652443
52443


In [3]:
# Analysis configuration: cell type, analysis tag and the file names derived from them
CELL_TYPE_SHORT = 'NCD8'
ANALYSIS = 'HT_RMRO_FDR001'
RPC_RULE = "ht"
OUT_PREFIX = f'JAV_{CELL_TYPE_SHORT}_RALT_20000_{ANALYSIS.lower()}'
INTERACTION_FILE = (
    f'../DICer_interactions/{ANALYSIS.upper()}/CHC/'
    f'{OUT_PREFIX}_evaluated_and_categorized_interactions.tsv.gz'
)

# TAD boundary file whose name carries the selected cell type
# (the merged boundary centers are the commented-out alternative)
#tad_boundaries = '../additional_files/javierre_2016/tad_regions_hg38/merged_tad_boundary_centers.bed'
tad_boundaries = f'../additional_files/javierre_2016/tad_regions_hg38/hglft_genome_TADs_{CELL_TYPE_SHORT}_hg38.bed'
tbs = TadBoundarySet(tad_boundaries)

# Create a DiachromaticInteractionSet and read the interaction file into it
d11_interaction_set = DiachromaticInteractionSet(rpc_rule=RPC_RULE)
d11_interaction_set.parse_file(i_file=INTERACTION_FILE, verbose=True)

# Select the reference interactions; the return value is kept in report_dict
report_dict = d11_interaction_set.select_reference_interactions_2x(verbose=True)

[INFO] Parsing Diachromatic interaction file ...
	[INFO] ../DICer_interactions/HT_RMRO_FDR001/CHC/JAV_NCD8_RALT_20000_ht_rmro_fdr001_evaluated_and_categorized_interactions.tsv.gz
	[INFO] Parsed 1,000,000 interaction lines ...
	[INFO] Parsed 2,000,000 interaction lines ...
	[INFO] Parsed 3,000,000 interaction lines ...
	[INFO] Set size: 3,387,059
[INFO] ... done.
[INFO] Select reference interactions ...
	[INFO] Treating NE and EN as one category ...
	[INFO] First pass: Count directed interactions for different read pair counts ...
	[INFO] Second pass: Select undirected reference interactions for different read pair counts ...
	[INFO] Third pass: Mark directed interactions for which there is no reference ...
[INFO] ... done.


In [4]:
# Distances to the nearest TAD boundary, keyed first by interaction category
# and then by enrichment status tag pair. 'ALL' pools every interaction
# category; 'NEEN' pools the 'NE' and 'EN' interactions.
tad_dist_lists = {
    i_key: {e_key: [] for e_key in ('NE', 'EN', 'NEEN')}
    for i_key in ('DIX', 'DI', 'UIR', 'UI', 'ALL')
}

for d11_inter in d11_interaction_set._inter_dict.values():
    tag_pair = d11_inter.enrichment_status_tag_pair

    # Choose the coordinate to measure from: fromA for 'NE', toB for 'EN'.
    # Interactions with any other tag pair are not collected.
    if tag_pair == 'NE':
        coord = d11_inter.fromA
    elif tag_pair == 'EN':
        coord = d11_inter.toB
    else:
        continue

    # NOTE(review): chrA is used for both cases, including the toB coordinate.
    # Presumably equivalent for cis interactions -- confirm whether the
    # chromosome of digest B should be used for 'EN'.
    # The distance is kept in its own local so that the global
    # `nearest_tad_dist` from the sanity-check cell is not overwritten.
    tad_dist = tbs.get_distance_to_nearest_tad_boundary(d11_inter.chrA, coord)

    # Record the distance under its own category and tag pair as well as
    # under the pooled 'ALL' and 'NEEN' keys.
    category = d11_inter.get_category()
    for i_key in (category, 'ALL'):
        for e_key in (tag_pair, 'NEEN'):
            tad_dist_lists[i_key][e_key].append(tad_dist)


In [5]:
# For each enrichment status, print the quartiles and sample size of the
# TAD-boundary distances per interaction category, followed by two-sample
# Kolmogorov-Smirnov tests between neighbouring categories.
print(OUT_PREFIX)
print()
for e_cat in ('NE', 'EN', 'NEEN'):
    print()
    print('--------\n' + e_cat)
    print()
    for i_cat in ('DIX', 'DI', 'UIR', 'UI', 'ALL'):
        dists = tad_dist_lists[i_cat][e_cat]
        quartiles = np.quantile(dists, [0.25, 0.50, 0.75])
        print(f'{i_cat}:\n\tQuantiles: {quartiles!s}\n\tn={len(dists)!s}')

    print()
    for cat_1, cat_2 in (('DIX', 'DI'), ('DI', 'UIR'), ('UIR', 'UI')):
        ks_result = stats.ks_2samp(tad_dist_lists[cat_1][e_cat], tad_dist_lists[cat_2][e_cat])
        print(f'{cat_1} vs. {cat_2}: {ks_result!s}')

JAV_NCD8_RALT_20000_ht_rmro_fdr001


--------
NE

DIX:
	Quantiles: [ 22659.  54721. 104210.]
	n=100777
DI:
	Quantiles: [ 21765.  53357. 105507.]
	n=274333
UIR:
	Quantiles: [ 22440.   55012.5 109339. ]
	n=278430
UI:
	Quantiles: [ 23937.  58296. 114488.]
	n=916588
ALL:
	Quantiles: [ 23185.  56582. 111346.]
	n=1570128

DIX vs. DI: KstestResult(statistic=0.011107402510087239, pvalue=2.509541973846486e-08)
DI vs. UIR: KstestResult(statistic=0.013394618281074133, pvalue=5.782088276849481e-22)
UIR vs. UI: KstestResult(statistic=0.020355992792967403, pvalue=2.664384758952997e-77)

--------
EN

DIX:
	Quantiles: [ 23777.  57040. 107825.]
	n=103045
DI:
	Quantiles: [ 23241.    56238.   110068.25]
	n=280382
UIR:
	Quantiles: [ 23675.  57639. 112468.]
	n=276285
UI:
	Quantiles: [ 24230.  58775. 114813.]
	n=917434
ALL:
	Quantiles: [ 23917.  58016. 113062.]
	n=1577146

DIX vs. DI: KstestResult(statistic=0.014714150097877132, pvalue=1.336029641470703e-14)
DI vs. UIR: KstestResult(statistic=0.009979034860