In [1]:
from __future__ import print_function

# Genomics example

The simulated data was as follows:
- 1/4 sequences with 1-3 instances of a GATA_disc1 motif embedded (see http://compbio.mit.edu/encode-motifs/ for the PWM); these were labelled 1,0,0
- 1/4 sequences with 1-3 instances of a TAL1_known1 motif embedded; these were labelled 0,1,0
- 1/4 sequences with BOTH 1-3 instances of a GATA_disc1 motif AND 1-3 instances of a TAL1_known1 motif; these were labelled 1,1,1
- 1/4 sequences with no motif

## Obtain data and keras model

We will download the genomic data and the trained Keras model.

### Download the data and model

In [2]:
# Run the helper script that fetches the files used below: sequences.simdata.gz,
# test.txt.gz and the record_5_model_PQzyq model JSON / weights.
# NOTE(review): the recorded output suggests files that already exist are not
# fetched again - confirm in the script.
!./grab_model_and_data.sh

File sequences.simdata.gz exists already
File record_5_model_PQzyq_modelJson.json exists already
File record_5_model_PQzyq_modelWeights.h5 exists already
File test.txt.gz exists already


### Read in and one-hot encode the data

The simdna package is needed for reading the data; install it if it isn't already installed.

In [3]:
try:
    import simdna
except ImportError, e:
    print("installing simdna package")
    !pip install -e "git://github.com/kundajelab/simdna.git@0.4.0#egg=simdna"
    print("\n******************************************************************************")
    print("RESTART THE JUPYTER KERNEL TO PICK UP ON THE INSTALLATION!!!")
    print("******************************************************************************")

Read in the data

In [4]:
import simdna.synthetic as synthetic
#NOTE(review): `reload` is a builtin on Python 2 only; these two calls look like
#development leftovers for picking up local edits to simdna.
reload(synthetic)
reload(synthetic.core)
import gzip
data_filename = "sequences.simdata.gz"

#read in the ids of the sequences in the testing set (one id per line);
#the `with` block closes the gzip handle, which was previously left open
with gzip.open("test.txt.gz","rb") as test_ids_fh:
    ids_to_load = [x.rstrip("\n") for x in test_ids_fh]
#load only the simulated sequences whose ids are in the testing set
data = synthetic.read_simdata_file(data_filename, ids_to_load=ids_to_load)

One-hot encode the data

In [5]:
import numpy as np

#this model was trained on data one-hot encoded as a 2d image, with the row-axis being the axis
#for one-hot encoding.
def one_hot_encode_along_row_axis(sequence):
    """One-hot encode `sequence` as an int8 array of shape (1, 4, len(sequence)).

    The layout follows theano dim ordering with the row axis (size 4) used
    as the one-hot axis and the column axis indexing sequence position.
    """
    encoded = np.zeros((1,4,len(sequence)), dtype=np.int8)
    #encoded[0] is a (4, len(sequence)) view, so filling it in place fills `encoded`
    seq_to_one_hot_fill_in_array(
        zeros_array=encoded[0],
        sequence=sequence,
        one_hot_axis=0)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Write the one-hot encoding of `sequence` into `zeros_array`, in place.

    zeros_array: 2d array expected to hold zeros on entry; it is mutated.
        With one_hot_axis=0 it is indexed as [base, position], so shape[1]
        must equal len(sequence); with one_hot_axis=1 it is indexed as
        [position, base], so shape[0] must equal len(sequence).
    sequence: iterable of characters. A/C/G/T (either case) select index
        0/1/2/3 along the one-hot axis; N/n leaves that position untouched.
    one_hot_axis: 0 or 1.

    Raises RuntimeError for any other character. The checks on one_hot_axis
    and on the array shape are plain asserts.
    """
    assert one_hot_axis==0 or one_hot_axis==1
    #the axis that is NOT the one-hot axis has to span the whole sequence
    position_axis = 1 if one_hot_axis==0 else 0
    assert zeros_array.shape[position_axis] == len(sequence)
    base_to_idx = {"A": 0, "a": 0,
                   "C": 1, "c": 1,
                   "G": 2, "g": 2,
                   "T": 3, "t": 3}
    for (position, base) in enumerate(sequence):
        if base in base_to_idx:
            hot_idx = base_to_idx[base]
        elif (base=="N" or base=="n"):
            continue #leave that pos as all 0's
        else:
            raise RuntimeError("Unsupported character: "+str(base))
        if (one_hot_axis==0):
            zeros_array[hot_idx,position] = 1
        elif (one_hot_axis==1):
            zeros_array[position,hot_idx] = 1
            
#Stack the per-sequence encodings; each one has shape (1, 4, len(seq)), so the
#result is (num_sequences, 1, 4, seq_len) provided all sequences have the same
#length (assumption - TODO confirm for other datasets).
onehot_data = np.array([one_hot_encode_along_row_axis(seq) for seq in data.sequences])

### Load the keras model

In [6]:
import deeplift
import deeplift.conversion.keras_conversion as kc

#load the keras model
#the weights (.h5) and architecture (.json) files are the ones fetched by
#grab_model_and_data.sh
keras_model_weights = "record_5_model_PQzyq_modelWeights.h5"
keras_model_json = "record_5_model_PQzyq_modelJson.json"

#keras_model is reused below both for conversion to a deeplift model and for
#the prediction sanity check
keras_model = kc.load_keras_model(weights=keras_model_weights,
                                  json=keras_model_json)

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GT 750M (CNMeM is disabled, cuDNN 5005)
Using Theano backend.


## Prepare the deeplift models

### Model conversion

Convert the keras models to deeplift models capable of computing importance scores using DeepLIFT (with 3 different variants: rescale on the conv layers and revealcancel on the fully-connected layers (the genomics default), rescale on all layers, and revealcancel on all layers), gradients and guided backprop

In [7]:
from deeplift.blobs import NonlinearMxtsMode
from collections import OrderedDict

#Maps a method name to the deeplift model converted with the matching
#nonlinearity mode. Only one (name, mode) pair is listed here; the loop form
#makes it easy to add further modes.
method_to_model = OrderedDict()
for method_name, nonlinear_mxts_mode in [
    #The genomics default = rescale on conv layers, revealcancel on fully-connected
    ('rescale_conv_revealcancel_fc', NonlinearMxtsMode.DeepLIFT_GenomicsDefault)]:
    method_to_model[method_name] = kc.convert_sequential_model(
        model=keras_model,
        nonlinear_mxts_mode=nonlinear_mxts_mode)

nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer 1 the preceding linear layer is 0 of type Conv2D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
For layer 3 the preceding linear layer is 2 of type Conv2D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
For layer 7 the preceding linear layer is 6 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer 10 the preceding linear layer is 9 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel


### Sanity checks

To ensure that the conversion happened correctly, check that the models give identical predictions

In [8]:
#make sure predictions are the same as the original model
from deeplift.util import compile_func
model_to_test = method_to_model['rescale_conv_revealcancel_fc']
#function mapping the activations of the first layer (the input) to the
#activations of the last layer (the predictions) of the converted model
deeplift_prediction_func = compile_func([model_to_test.get_layers()[0].get_activation_vars()],
                                         model_to_test.get_layers()[-1].get_activation_vars())
original_model_predictions = keras_model.predict(onehot_data, batch_size=200)
converted_model_predictions = deeplift.util.run_function_in_batches(
                                input_data_list=[onehot_data],
                                func=deeplift_prediction_func,
                                batch_size=200,
                                progress_update=None)
#Compare on the ABSOLUTE difference: taking the max of the signed difference
#would let an arbitrarily large negative discrepancy pass the check.
max_abs_difference = np.max(np.abs(np.array(converted_model_predictions)
                                   -np.array(original_model_predictions)))
print("maximum difference in predictions:",max_abs_difference)
assert max_abs_difference < 10**-5
predictions = converted_model_predictions

maximum difference in predictions: 0.0


## Compute importance scores

### Compile various scoring functions

Using the deeplift models, we obtain the functions capable of computing the importance scores.

In [9]:
print("Compiling scoring functions")
#One compiled function per converted model, keyed by method name.
#NOTE(review): despite the name, this stores the result of
#get_target_multipliers_func (the same call used for multipliers_func below),
#and method_to_scoring_func is not referenced again in the visible cells -
#confirm whether get_target_contribs_func was intended here.
method_to_scoring_func = OrderedDict()
for method,model in method_to_model.items():
    print("Compiling scoring function for: "+method)
    method_to_scoring_func[method] = model.get_target_multipliers_func(find_scores_layer_idx=0,
                                                                    target_layer_idx=-2)

#Both functions below are compiled for layer 0 (the input) with respect to
#target layer -2 (the second-to-last layer of the converted model).
#deeplift_func is requested via get_target_contribs_func ...
deeplift_func = method_to_model['rescale_conv_revealcancel_fc'].get_target_contribs_func(find_scores_layer_idx=0,
                                                                                               target_layer_idx=-2)
#... and multipliers_func via get_target_multipliers_func.
multipliers_func = method_to_model['rescale_conv_revealcancel_fc'].get_target_multipliers_func(find_scores_layer_idx=0,
                                                                                               target_layer_idx=-2)

Compiling scoring functions
Compiling scoring function for: rescale_conv_revealcancel_fc


### Call scoring functions on the data

In the cell below, a reference representing 40\% GC content is used

## Using multiple shuffled references

As an alternative to using a flat reference based on GC content (which can sometimes produce artefacts), we propose averaging the scores produced using multiple references which are produced by shuffling the original sequence. We find in practice that this can give more robust results. Note that in general, the optimal choice of reference is an area of active research.

In [10]:
#NOTE(review): `reload` is a builtin on Python 2 only; this call looks like a
#development leftover for picking up local edits to deeplift.util.
reload(deeplift.util)
from deeplift.util import get_shuffle_seq_ref_function
#from deeplift.util import randomly_shuffle_seq
from deeplift.dinuc_shuffle import dinuc_shuffle #function to do a dinucleotide shuffle

#Wrap the multipliers function so that it is evaluated against shuffled
#versions of each input sequence rather than a single fixed reference.
#NOTE(review): no RNG seed is set in the visible cells; if dinuc_shuffle draws
#from a global RNG, the saved arrays will differ between runs - confirm.
rescale_conv_revealcancel_fc_multipliers_many_refs_func = get_shuffle_seq_ref_function(
    #score_computation_function is the original function to compute scores
    score_computation_function=multipliers_func,
    #shuffle_func is the function that shuffles the sequence
    #technically, given the background of this simulation, randomly_shuffle_seq
    #makes more sense. However, on real data, a dinuc shuffle is advisable due to
    #the strong bias against CG dinucleotides
    shuffle_func=dinuc_shuffle,
    #one_hot_func turns a list of sequence strings into the same
    #(num_sequences, 1, 4, seq_len) layout as onehot_data
    one_hot_func=lambda x: np.array([one_hot_encode_along_row_axis(seq) for seq in x]))

#Same wrapping, but around the contribution-score function.
rescale_conv_revealcancel_fc_many_refs_func = get_shuffle_seq_ref_function(
    score_computation_function=deeplift_func,
    shuffle_func=dinuc_shuffle,
    one_hot_func=lambda x: np.array([one_hot_encode_along_row_axis(seq) for seq in x]))

num_refs_per_seq=10 #number of references to generate per sequence
#Multipliers for task 0; np.squeeze drops the singleton axes of the result.
multipliers_to_save = np.squeeze(rescale_conv_revealcancel_fc_multipliers_many_refs_func(
                    task_idx=0,
                    input_data_sequences=data.sequences,
                    num_refs_per_seq=num_refs_per_seq,
                    batch_size=200,
                    progress_update=1000,
                ))
#Contribution scores for task 0, projected onto the observed bases:
# - np.sum(..., axis=2) adds up the scores along axis 2 (presumably the size-4
#   one-hot axis, assuming the scores share onehot_data's layout - TODO confirm),
#   giving one total per position;
# - [:,:,None,:] re-inserts that axis with length 1 so the totals broadcast;
# - multiplying by onehot_data keeps each total only at the base that is
#   actually present in the sequence (all other entries become 0);
# - np.squeeze then drops the singleton axes.
deeplift_scores_to_save = np.squeeze(np.sum(rescale_conv_revealcancel_fc_many_refs_func(
            task_idx=0,
            input_data_sequences=data.sequences,
            num_refs_per_seq=num_refs_per_seq,
            batch_size=200,
            progress_update=1000,
        ),axis=2)[:,:,None,:]*onehot_data)

1000 reference seqs generated
2000 reference seqs generated
3000 reference seqs generated
4000 reference seqs generated
5000 reference seqs generated
6000 reference seqs generated
7000 reference seqs generated
8000 reference seqs generated
One hot encoding sequences...
One hot encoding done...
Done 0
Done 1000
Done 2000
Done 3000
Done 4000
Done 5000
Done 6000
Done 7000
1000 reference seqs generated
2000 reference seqs generated
3000 reference seqs generated
4000 reference seqs generated
5000 reference seqs generated
6000 reference seqs generated
7000 reference seqs generated
8000 reference seqs generated
One hot encoding sequences...
One hot encoding done...
Done 0
Done 1000
Done 2000
Done 3000
Done 4000
Done 5000
Done 6000
Done 7000


In [11]:
#Move the one-hot axis last: (sequence, base, position) -> (sequence, position, base)
multipliers_position_major = multipliers_to_save.transpose(0,2,1)
scores_position_major = deeplift_scores_to_save.transpose(0,2,1)
print(multipliers_position_major.shape)
print(scores_position_major.shape)

#Keep only the sequences whose task-0 label is set, then write each matrix out
task0_label_column = data.labels[:,0]

task0_positive_multipliers = np.compress(condition=task0_label_column,
                                         a=multipliers_position_major,
                                         axis=0)
np.save("talgata_task0_positives_multipliers.npy", task0_positive_multipliers)

task0_positive_scores = np.compress(condition=task0_label_column,
                                    a=scores_position_major,
                                    axis=0)
np.save("talgata_task0_positives_scores.npy", task0_positive_scores)

(800, 200, 4)
(800, 200, 4)
