In [1]:
import src
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loop Annotation Files:
Under the main Series Record (GSE63525), there are files named *_HiCCUPS_looplist.txt.gz or *_HiCCUPS_looplist_with_motifs.txt.gz, where * represents each of the cell types analyzed in this study (GM12878, HMEC, HUVEC, HeLa, IMR90, K562, KBM7, NHEK, CH12-LX), as well as biological replicate annotations for GM12878 (GM12878_primary and GM12878_replicate). These files contain Juicebox-loadable (www.aidenlab.org/juicebox) loop annotations returned by our loop calling algorithm, HiCCUPS (see Fig. 3, the Experimental Procedures, and Section VI.a.5 of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014). These files contain a header line, followed by a line for every loop. The files named *_HiCCUPS_looplist.txt.gz contain 20 fields per line in the following format:

```chromosome1    x1    x2    chromosome2    y1    y2    color    observed    expected_bottom_left    expected_donut    expected_horizontal    expected_vertical    fdr_bottom_left    fdr_donut    fdr_horizontal    fdr_vertical    number_collapsed    centroid1    centroid2    radius```

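Explanations of each field are as follows (note that the header line of the file loaded below abbreviates some of these names, e.g. `chr1`, `chr2`, `o` and `e_donut` for chromosome1, chromosome2, observed and expected_donut):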
- chromosome1, chromosome2 = the chromosome that the loop is located on (listed once for each of the two loci)
- x1,x2 = the coordinates of the upstream locus corresponding to the peak pixel (see the Experimental Procedures and VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014 for a definition of the peak pixel)
- y1,y2 = the coordinates of the downstream locus corresponding to the peak pixel (see the Experimental Procedures and VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014 for a definition of the peak pixel)
- color = the color that the feature will be rendered as if loaded in Juicebox (www.aidenlab.org/juicebox)
- observed = the raw observed counts at the peak pixel (see the Experimental Procedures and VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014 for a definition of the peak pixel)
- expected_[bottom_left, donut, horizontal, vertical] = the expected counts calculated using the [bottom_left, donut, horizontal, vertical] filter (see Figure 3 and section VI.a.5.i of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)
- fdr_[bottom_left, donut, horizontal, vertical] = the q-value of the loop calculated using the [bottom_left, donut, horizontal, vertical] filter (see VI.a.5.ii of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)
- number_collapsed = the number of pixels that were clustered together as part of the loop call (see section VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)
- centroid1 = the upstream coordinate of the centroid of the cluster of pixels corresponding to the loop (see section VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)
- centroid2 = the downstream coordinate of the centroid of the cluster of pixels corresponding to the loop (see section VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)
- radius = the Euclidean distance from the centroid of the cluster of pixels to the farthest pixel in the cluster of pixels (see section VI.a.5.iv of the Extended Experimental Procedures of Rao, Huntley, et al., Cell 2014)



In [2]:
# Build a tidy table of HiCCUPS loops for GM12878 (primary + replicate) and
# give every distinct loop anchor (chrom, start, end) a stable integer id.
#
# Result: `hiccup`, one row per loop, with columns
#   sourceChrom, sourceStart, sourceEnd, targetChrom, targetStart, targetEnd,
#   observed, expected_donut, fdr_donut, enrichment_score, loop_id,
#   sourceId, targetId
# It is also written to src.processed_data_path / "GSE63525_HiCCUPS_loops.tsv".

# Raw header name -> name used in the rest of the analysis (order is preserved).
column_names = {
    'chr1': 'sourceChrom', 'x1': 'sourceStart', 'x2': 'sourceEnd',
    'chr2': 'targetChrom', 'y1': 'targetStart', 'y2': 'targetEnd',
    'o': 'observed', 'e_donut': 'expected_donut', 'fdr_donut': 'fdr_donut',
}
source_cols = ['sourceChrom', 'sourceStart', 'sourceEnd']
target_cols = ['targetChrom', 'targetStart', 'targetEnd']

# `read_csv(sep="\t")` is the exact equivalent of `read_table`, which was
# deprecated in pandas 0.24 (reinstated in 0.25) and warns on that version.
# Chromosome labels are read as strings so that the "chr" prefix below never
# depends on how the parser inferred the column type.
hiccup_raw = pd.read_csv(
    src.external_data_path / "GSE63525_GM12878_primary+replicate_HiCCUPS_looplist.txt",
    sep="\t",
    dtype={'chr1': str, 'chr2': str},
)

# One chained expression -> one new frame; the raw table is left untouched so
# the cell can be re-run (or inspected) without hidden state.
loops = (
    hiccup_raw[list(column_names)]
    .rename(columns=column_names)
    .assign(
        # observed / donut-expected counts at the peak pixel
        enrichment_score=lambda df: df['observed'] / df['expected_donut'],
        sourceChrom=lambda df: "chr" + df['sourceChrom'].astype(str),
        targetChrom=lambda df: "chr" + df['targetChrom'].astype(str),
    )
    # NOTE: chromosome names sort lexicographically (chr1, chr10, ..., chr2).
    .sort_values(source_cols + target_cols)
    .reset_index(drop=True)
)
loops['loop_id'] = np.arange(len(loops), dtype=int)

# Collect both ends of every loop under the shared `src.coords` column names,
# de-duplicate them and number the distinct anchors in sorted order.
source_anchors = loops[source_cols].rename(columns=dict(zip(source_cols, src.coords)))
target_anchors = loops[target_cols].rename(columns=dict(zip(target_cols, src.coords)))
anchors = (
    pd.concat((source_anchors, target_anchors), axis=0)
    .sort_values(src.coords)
    .drop_duplicates()
    .reset_index(drop=True)
)
anchors['anchor_id'] = np.arange(len(anchors), dtype=int)

# Attach the anchor id of each end. `how="left"` + `validate` make a key
# mismatch or a duplicated anchor fail loudly instead of silently dropping or
# duplicating loops.
hiccup = loops
for side_cols, id_name in ((source_cols, 'sourceId'), (target_cols, 'targetId')):
    hiccup = (
        hiccup
        .merge(anchors, how="left", left_on=side_cols, right_on=src.coords, validate="many_to_one")
        .drop(src.coords, axis=1)
        .rename(columns={'anchor_id': id_name})
    )

# Invariants: every loop survives the merges exactly once and has both ids.
assert len(hiccup) == len(loops), "anchor merge changed the number of loops"
assert hiccup['loop_id'].is_unique, "anchor merge duplicated loops"
assert hiccup[['sourceId', 'targetId']].notna().all().all(), "loop end without an anchor id"

hiccup.to_csv(src.processed_data_path / "GSE63525_HiCCUPS_loops.tsv", sep="\t", index=False, header=True)

# Legacy aliases for the names this cell used to define.
# NOTE(review): remove once confirmed that no later cell reads x1 / x2 / x.
x1, x2, x = source_anchors, target_anchors, anchors

hiccup.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,sourceChrom,sourceStart,sourceEnd,targetChrom,targetStart,targetEnd,observed,expected_donut,fdr_donut,enrichment_score,loop_id,sourceId,targetId
0,chr1,1050000,1060000,chr1,1180000,1190000,241,118.802,9.583457000000001e-17,2.028585,0,0,1
1,chr1,1585000,1590000,chr1,1645000,1650000,80,27.6775,1.413254e-09,2.890434,1,2,3
2,chr1,1710000,1715000,chr1,1835000,1840000,154,53.7355,1.298644e-18,2.865889,2,4,5
3,chr1,2120000,2130000,chr1,2310000,2320000,506,143.407,3.235315e-100,3.528419,3,6,8
4,chr1,2130000,2135000,chr1,2515000,2520000,52,20.7272,0.001554971,2.508781,4,7,12
