In [1]:
'''
The directory layout is defined here, as relative path are used sometimes.

$TFNET_ROOT is set to the root of TFNET dir
 /-- scripts       such as label_region, prepare_data.py, run_deeplift.py, run_modisco.py, run_pipeline.py etc
 /-- genome        hg19.fa hg19.chrom.sizes hg19.tsv
 /-- ENCODE_data   peak files downloaded from ENCODE project
 /-- results       results
      |
      /-- nandi----/-- multi_tf --/-- multi_tf_nb_18_07_09/ We are here!
      |            /-- ZNF143
      |
      /-- nautilus-/-- CTCF
      |            /-- ZFX
      |
      /-- templates/-- config
'''
import os
from os.path import basename

ROOT_DIR   = os.getenv('TFNET_ROOT', "../../") 
scriptDir  = ROOT_DIR + "/scripts/"
dataDir    = ROOT_DIR + "/ENCODE_data/"
genomeDir  = ROOT_DIR + "/genome/"
resultsDir = "./"
logDir     = resultsDir + "log/"

In [2]:
import logging
logging.basicConfig(
        format='%(asctime)s %(levelname)-5s %(message)s',
        level=logging.DEBUG,
        datefmt='%Y-%m-%d %H:%M:%S')

In [3]:

def process_task_list(task_list, add_bg):
    header = "id"
    tmp_empty_file = "/dev/null"

    positives = []
    ambiguous = []

    # [tf, cell type, experiment, rep state] in task_list
    for tf, cell, exp, rep in task_list:
        positive = dataDir + cell + "-" + tf + "-human-" + exp + "-optimal_idr.narrowPeak.gz"
        positives.append(positive)
        
        header += "\t" + exp
        
        merged = tmp_empty_file
        # merged = ""
        if rep == 1: # bit0 == 1, has rep1
            rep1 = cell + "-" + tf + "-human-" + exp + "-rep1.narrowPeak.gz"
            merged = dataDir+rep1
        if rep == 2: # bit1 == 1, has rep2
            rep2 = cell + "-" + tf + "-human-" + exp + "-rep2.narrowPeak.gz"
            merged = dataDir+rep2
        if rep == 3: # bit1,bit2 == 1, has both rep1 and rep2
            rep1 = cell + "-" + tf + "-human-" + exp + "-rep1.narrowPeak.gz"
            rep2 = cell + "-" + tf + "-human-" + exp + "-rep2.narrowPeak.gz"
            tmp_merged = tmpDir + "_tmp_" + cell + "-" + tf + "-human-" + exp + "-merged.narrowPeak.gz"
            merged = tmp_merged
            os.system("pigz -d -c " + dataDir+rep1 + " " + dataDir+rep2 + \
                      " | cut -f 1-3 | bedtools sort | bedtools merge | pigz -c > " + merged)
        ambiguous.append(merged)

    if positives != []:
        positives_str = " --positives " + ','.join(positives)
    else:
        positives_str = ""

    if ambiguous != []:
        ambiguous_str = " --ambiguous " + ','.join(ambiguous)
    else:
        ambiguous_str = ""
    
    background_str = " --background " + genomeDir + "hg19.tsv "

    # call tfdragonn labelregions
    #
    # labels_multitask_gz = "tflabel.intervals_file.tsv.gz"
    # cmd = "tfdragonn labelregions " + positives_str + ambiguous_str + \
    #       " --genome hg19 --prefix tflabel" #       + " --background background "

    labels_multitask_gz = "label.intervals_file.tsv.gz"
    cmd = scriptDir + "label_regions " + positives_str + ambiguous_str + \
          " --genome hg19 --prefix label " + " --stride 20"
    if add_bg:
         cmd = cmd + background_str
    logging.debug(cmd)
    os.system(cmd)

    labels_multitask    = labels_multitask_gz[:-3]

    os.system("pigz -d -c " + labels_multitask_gz +  " > " + labels_multitask)

    tmp_labels_wo_title = tmpDir + "_tmp_labels_without_title.txt"

    # use sed to change each line from
    # chr<\t>start<\t>end<\t>label1<\t>label2 ...
    # to 
    # chr:start-end<\t>label1<\t>label2 ...
    os.system("cat " + labels_multitask + " | sed 's/\t/:/; s/\t/-/' > " + tmp_labels_wo_title)

    os.system("bedtools getfasta -fi " + genomeDir + "hg19.fa -bed " + labels_multitask + " -fo inputs.fa")

    #make the final inputs labels files from the shuffled lines (tfdragonn shuffles already)
    os.system("echo " + header + " > labels.txt")
    os.system("cat " + tmp_labels_wo_title + " >> labels.txt")

    logging.info("split and make hdf5")
    os.system("mkdir -p splits")

    #make the splits
    valid_chrom = "chr2" # chr2 is used for validation
    test_chrom  = "chr1" # chr1 is used for testing

    os.system("cat labels.txt | grep " + valid_chrom + ": | pigz -c > splits/valid.txt.gz")
    os.system("cat labels.txt | grep " + test_chrom  + ": | pigz -c > splits/test.txt.gz")
    cmd = "cat labels.txt | grep -v \"" + test_chrom  + ":\|"+ valid_chrom + ":\|^id" + "\" | pigz -c > splits/train.txt.gz"
    os.system(cmd)

    os.system("pigz -f labels.txt")
    #os.system("pigz -f inputs.fa")

    os.system("make_hdf5 --yaml_configs make_hdf5_yaml/* --output_dir .")

    logging.info("prepare_data done")



In [4]:
# guarantee to clean up tmp dir
import contextlib
import tempfile
import shutil
@contextlib.contextmanager
def make_temp_directory():
    temp_dir = tempfile.mkdtemp(dir = ".", prefix = "_tmp_")
    try:
        yield temp_dir
    finally:
        shutil.rmtree(temp_dir)

if __name__ == '__main__':

    import sys

    if len(sys.argv) > 2:
        print("Syntax: ", sys.argv[0] , " [--no-bg]")
        quit()

    if len(sys.argv) == 2 and sys.argv[1] == "--no-bg":
        add_bg = False
    else:
        add_bg = True

    task_list = [['CTCF',  'GM12878','ENCSR000AKB',3], # CTCF
                 ['SIX5',  'GM12878','ENCSR000BJE',3], # SIX5
                 ['ZNF143','GM12878','ENCSR000DZL',3]  # ZNF143
                ]

    with make_temp_directory() as temp_dir:
        global tmpDir
        tmpDir = temp_dir + "/"
        process_task_list(task_list, add_bg)

('Syntax: ', '/home/ktian/anaconda3/envs/tfenv/lib/python2.7/site-packages/ipykernel_launcher.py', ' [--no-bg]')


2018-07-10 10:16:04 DEBUG /home/ktian/kundajelab/tfnet/scripts/label_regions  --positives /home/ktian/kundajelab/tfnet/ENCODE_data/GM12878-CTCF-human-ENCSR000AKB-optimal_idr.narrowPeak.gz,/home/ktian/kundajelab/tfnet/ENCODE_data/GM12878-SIX5-human-ENCSR000BJE-optimal_idr.narrowPeak.gz,/home/ktian/kundajelab/tfnet/ENCODE_data/GM12878-ZNF143-human-ENCSR000DZL-optimal_idr.narrowPeak.gz --ambiguous ./_tmp_qfMGWB/_tmp_GM12878-CTCF-human-ENCSR000AKB-merged.narrowPeak.gz,./_tmp_qfMGWB/_tmp_GM12878-SIX5-human-ENCSR000BJE-merged.narrowPeak.gz,./_tmp_qfMGWB/_tmp_GM12878-ZNF143-human-ENCSR000DZL-merged.narrowPeak.gz --genome hg19 --prefix label  --stride 20 --background /home/ktian/kundajelab/tfnet/genome/hg19.tsv 
2018-07-10 10:17:36 INFO  split and make hdf5
2018-07-10 10:17:38 INFO  prepare_data done


In [3]:
import pandas as pd
df = pd.read_csv("labels.txt.gz", sep='\t', index_col=0, header=0, compression='gzip')
print(df.shape)

print(df.head(5))

#df.sum(axis=0)

melted_data = pd.melt(df, value_vars=['ENCSR000AKB', 'ENCSR000BJE','ENCSR000DZL'], 
                      var_name='Task', value_name='count')
print(melted_data.groupby(by=['Task', 'count'])['count'].count())


(3535075, 3)
                         ENCSR000AKB  ENCSR000BJE  ENCSR000DZL
id                                                            
chr14:92178000-92179000            0            0            0
chrY:52392000-52393000             0            0            0
chr16:14824000-14825000            0            0            0
chr14:63851000-63852000            0            0            0
chr13:69038000-69039000            0            0            0
Task         count
ENCSR000AKB  -1        157857
              0       3019148
              1        358070
ENCSR000BJE  -1        230535
              0       3275670
              1         28870
ENCSR000DZL  -1        164586
              0       3073438
              1        297051
Name: count, dtype: int64
