In [1]:
import os
import copy
import tempfile
import json
from subprocess import call
from diConstants import (HG19_ALL_CHROMS, MM9_ALL_CHROMS,
    HG19_TRAIN_CHROMS, MM9_TRAIN_CHROMS,
    VALID_CHROMS, TEST_CHROMS) 

import models
import modelTemplates

Using Theano backend.


## All about training data (2018-07-10)
#### What is hg19?
* These are the (names/versions) of human genome references as used by UCSC browser
* hg19란 Human Genome version.19를 의미한다

#### GM12878_5+1marks-K4me3_all (for Training)
* 학습에 사용되는 데이터셋의 이름이다
* GM12878은 줄(line) 번호가 아니라 세포주(cell line)의 식별자이다. 즉, GM12878이라는 이름의 세포주(림프아구 세포주, lymphoblastoid cell line)에서 얻은 데이터임을 나타낸다
* 5+1marks는 데이터 내에 구성된 histone mark의 수가 5개임을 나타낸다. 나머지 1개는 아직 확실히 확인하지 못했으나, 아래 input marks 목록에 5개 mark와 함께 들어 있는 'INPUT' 트랙을 의미하는 것으로 보임 (확인 필요)
* K4me3_all

#### GM18526_5+1marks-K4me3_all (for Testing)
* 테스트에 사용되는 데이터셋의 이름이다

#### dataset
* 클래스 x
* 학습용 데이터가 저장된 폴더의 이름이다. 아래 2가지 폴더로 데이터가 구분되어 있다
* processed : metadata, npz파일로 구성되어 있다. metadata는 JSON포맷
* base : npz파일로 구성되어 있다

#### Peak Fraction?
* peak_fraction is the fraction of examples that should be centered on a peak that exists in the full data.
* For example, if peak_fraction = 0.5, then half of the examples will have a peak at the center of the sequence, and the other half will not.

#### GM_MARKS?
* factor_for_peaks determines which factor is used for determining whether a given location is counted as a 'peak' or not, 
* since it could be a peak in one factor but not another.
* It should be a string, like 'H3K27AC'. 
* If it is None (the singleton, not a string), then a location is counted as having a peak so long as there's a peak in any factor.

## Dataset class
#### Reads in (X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY) from a previously created .npz file, 
* where X is the input (subsampled) and Y is the output (full) and 
* peakPValueX, peakPValueY contain the -log10 pvalues for the called peaks (bin by bin)
* peakBinaryX, peakBinaryY contain the binarized peak signal for the called peaks (bin by bin)

#### X is of shape num_examples x seq_length x len(input_marks)
* (sequence length -> ex. 1001)
* (input marks -> ex. [u'H3K27AC', u'H3K27ME3', u'H3K36ME3', u'H3K4ME1', u'H3K4ME3', u'INPUT'])
* Note: 'INPUT' is contained only in X; the peak arrays (peakPValueX, peakBinaryX) do not include an INPUT track.
* input_marks is a list of marks that will be used as input to the model.

#### Y is of shape num_examples x seq_length x len(output_marks).
* output_marks is a list of marks that will be used as output from the model. 
* It can be of length 1-6, depending on whether we're training separate models or one single model
* and on whether we're doing classification or regression.

#### peakPValueX is of similar shape to X, except that it does not contain an INPUT track
* it is of shape num_examples x seq_length x (len(input_marks) - ('INPUT' in input_marks)).

#### peakPValueY is of similar shape to Y, except that it does not contain an INPUT track
* so it is of shape num_examples x seq_length x (len(output_marks) - ('INPUT' in output_marks)).

#### If the .npz file doesn't exist
* it will create it by calling extract_seq_dataset.

In [2]:
test_cell_line = 'GM18526'
subsample_target_string = '0.5e6'
GM_MARKS = ['H3K27AC']
for predict_binary_output in [True, False]: # original : TRUE , FALSE
    print 
    print 'Predict Binary Output', predict_binary_output
    for output_mark in GM_MARKS:
        print('Output Mark', output_mark)
        model_params = modelTemplates.make_model_params(
                                #### Prediction model(keras ver) specification ####
                                model_library='keras',
                                model_class='SeqToPoint',
                                model_type='cnn',
                                model_specific_params={
                                    'num_filters': 6,
                                    'filter_length': 51
                                },
                                compile_params={            
                                    'optimizer': 'adagrad'
                                },
            
                                #### Dataset specification ####
                                dataset_params={
                                    'train_dataset_name': 'GM12878_5+1marks-K4me3_all',
                                    'test_dataset_name': '%s_5+1marks-K4me3_all' % test_cell_line, #(1)
                                    'num_train_examples': 1000,
                                    'seq_length': 1001,
                                    'peak_fraction': 0.5,                            
                                    'train_X_subsample_target_string': subsample_target_string, #(2)
                                    'num_bins_to_test': None,
                                    'train_chroms': HG19_ALL_CHROMS, # GM-12878 ch1 ~ ch22
                                    'test_chroms': HG19_ALL_CHROMS, # CM-18526 ch1 ~ ch22
                                    'only_chr1': True
                                },
                                output_marks=[output_mark],
                                train_params={
                                    'nb_epoch': 1,
                                    'batch_size': 100
                                },
                                predict_binary_output=predict_binary_output,
                                zero_out_non_bins=True,
                                generate_bigWig=True)
        
        print model_params
        train_dataset = model_params['dataset_params']['train_dataset']
        test_datasets = model_params['dataset_params']['test_datasets']
        input_marks = model_params['input_marks']
        output_marks = model_params['output_marks']
        dataset_params = model_params['dataset_params']
        test_dataset = test_datasets[0]
        print
        
        print train_dataset
        print test_dataset
        print
        print 'Input Marks:',model_params['input_marks']
        print
        print 'Output Marks:',model_params['output_marks']
        print 
        print 'Sequence Length:',dataset_params['seq_length']
        print 
        
        X, Y, peakPValueX, peakPValueY, peakBinaryX, peakBinaryY = train_dataset.load_seq_dataset(     
            seq_length=dataset_params['seq_length'],
            input_marks=input_marks,
            output_marks=output_marks)
        
        print 'X shape: ', X.shape
        print X
        print 'peakPValueX shape: ', peakPValueX.shape
        print peakPValueX
        print 'peakBinaryX shape: ', peakBinaryX.shape
        print peakBinaryX
        print
        print 'Y shape: ', Y.shape
        print Y
        print 'peakPValueY shape: ', peakPValueY.shape
        print peakPValueY
        print 'peakBinaryY shape: ', peakBinaryX.shape
        print peakBinaryY


Predict Binary Output True
('Output Mark', 'H3K27AC')
{'compile_params': {'loss': 'binary_crossentropy', 'optimizer': 'adagrad'}, 'random_seed': 0, 'output_marks': ['H3K27AC'], 'predict_binary_output': True, 'input_marks': [u'H3K27AC', u'H3K27ME3', u'H3K36ME3', u'H3K4ME1', u'H3K4ME3', u'INPUT'], 'filter_length': 51, 'train_params': {'nb_epoch': 1, 'validation_split': 0.2, 'batch_size': 100}, 'scale_input': '01', 'generate_bigWig': True, 'zero_out_non_bins': True, 'model_library': 'keras', 'num_filters': 6, 'model_type': 'cnn', 'model_class': 'SeqToPoint', 'dataset_params': {'test_datasets': [<dataset.Dataset object at 0x7f975f5ee1d0>], 'train_dataset': <dataset.Dataset object at 0x7f975f5ee2d0>, 'only_chr1': True, 'num_bins_to_test': None, 'seq_length': 1001}}

<dataset.Dataset object at 0x7f975f5ee2d0>
<dataset.Dataset object at 0x7f975f5ee1d0>

Input Marks: [u'H3K27AC', u'H3K27ME3', u'H3K36ME3', u'H3K4ME1', u'H3K4ME3', u'INPUT']

Output Marks: ['H3K27AC']

Sequence Length: 1001

X s

X shape:  (1000, 1001, 6)
[[[3.4478207 0.        0.        4.2473273 0.        0.       ]
  [3.4478207 0.        0.        4.1345906 0.        0.       ]
  [3.4539044 0.        0.        4.124165  0.        0.       ]
  ...
  [0.        0.        0.        0.        0.        3.4791098]
  [0.        0.        0.        0.        0.        3.4791098]
  [0.        0.        0.        0.        0.        3.4791098]]

 [[0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       ]
  ...
  [0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       ]]

 [[0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       ]
  [0.        0.        0.        0.        0.        0.       