# Analysis of active TFs by chromatin state

1. Load TF PWMs (HOMER should be a good enough set)
2. Load train sequences
3. Load deepLIFT scores
4. Compute PWM-deepLIFT and PWM-sequence convolution matrices using Avanti's GPU accelerated function:
https://github.com/kundajelab/modisco_private/blob/master/test/util/test_correlation.py
5. Divide the two matrices to "normalize" the PWM-deepLIFT convolution matrix. That is, the matrix entries now represent how much the motif was used by the model in making a prediction.
6. Visualize average motif usage by chromatin state.
7. Create a motif-motif correlation matrix to see which motifs were predictive together. Cluster this matrix to reproduce known TF interactions.

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import glob
import time

motifpwms = {}

homer_path = '../../data/motifs/homer/*.motif'
t0 = time.time()
for homer_file in glob.glob(homer_path):
    with open(homer_file) as f:
        header = f.readline().strip().split('\t')
        name = header[1]
        threshold = float(header[2])
        weights = np.array(pd.read_csv(f, sep = '\t'))
        motifpwms[name] = weights
print("Took %.3f sec to load HOMER motifs" % (time.time() - t0))

print("Number of PWMs: %d" % len(motifpwms))

In [None]:
import h5py

train_data_path = '../hdf5files/sharpr_znormed_jul23/train_data.hdf5'
data = h5py.File(train_data_path)

data_X = np.array(data['X/sequence'])
data_Y = np.array(data['Y/output'])

In [None]:
from collections import OrderedDict

modelName = "record_13_model_bgGhy_" 

task_to_deeplift_contribs = OrderedDict()
t0 = time.time()
for task_idx in [2, 5, 8, 11]:
    path = '../deeplift_scores/%s/contribs_reshaped_task%d.tab' % (modelName[:-1], task_idx)
    task_to_deeplift_contribs[task_idx] = pd.read_csv(path, 
                                                      sep = '\t',
                                                      header = None).values
    task_to_deeplift_contribs[task_idx] = task_to_deeplift_contribs[task_idx][:, 1:]
    print task_to_deeplift_contribs[task_idx].shape
    task_to_deeplift_contribs[task_idx] = np.reshape(task_to_deeplift_contribs[task_idx], (-1, 145, 4))
    print("Reading in deepLIFT scores for task %d took %.3f sec" % (task_idx, time.time() - t0))
    t0 = time.time()

In [2]:
import sys
import os
import numpy as np
import modisco
import modisco.util
import time

regions_to_scan = np.array([[
    [[0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.5, 0.4, 0.0, 0.0, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.2, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.0, 0.0, 0.0]]
],[
    [[0.0, 0.0, 0.0, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0],
     [0.0, 0.0, 0.2, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.0, 0.4, 0.5, 0.0, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0]]
]])
filters = np.array([[
    [1.0, 0.0, 1.0, 0.0],
    [2.0, 3.0, 1.0, 0.0],
    [0.0, 0.0, 1.0, 5.0],
    [0.0, 0.0, 0.0, 0.0]
],[
    [0.0, 0.0, 0.0, 0.0],
    [5.0, 1.0, 0.0, 0.0],
    [0.0, 1.0, 3.0, 2.0],
    [0.0, 1.0, 0.0, 1.0]
]])

scanning_results = np.array(modisco.util.scan_regions_with_filters(
    filters=filters,
    regions_to_scan=regions_to_scan))

print(scanning_results)
#fwd scan: [0.5, 1.1, 1.9, 3.7, 1.0, 0.0]
#rev scan: [0.2, 0.3, 0.6, 3.3, 2.9, 0.2]
correct_answer = np.array([[[
                [0.5, 1.1, 1.9, 3.7, 2.9, 0.2],
                [0,   0,   0,   0,   1,   1]
            ],[
                [0.5, 1.1, 1.9, 3.7, 2.9, 0.2],
                [1,   1,   1,   1,   0,   0]
            ],
          ],[
                [[0.2, 2.9, 3.7, 1.9, 1.1, 0.5],
                 [0,   0,   1,   1,   1,   1]],
                [[0.2, 2.9, 3.7, 1.9, 1.1, 0.5],
                 [1,   1,   0,   0,   0,   0]]
          ]])
np.testing.assert_allclose(scanning_results, correct_answer)

Done 0


RuntimeError: error getting worksize: CUDNN_STATUS_BAD_PARAM
Apply node that caused the error: GpuDnnConv{algo='small', inplace=True}(GpuContiguous.0, GpuArrayConstant{[[[[ 0.  0.  0.  0.]
   [ 5.  1.  0.  0.]
   [ 0.  1.  3.  2.]
   [ 0.  1.  0.  1.]]]


 [[[ 1.  0.  1.  0.]
   [ 2.  3.  1.  0.]
   [ 0.  0.  1.  5.]
   [ 0.  0.  0.  0.]]]]}, GpuAllocEmpty{dtype='float32', context_name=None}.0, GpuDnnConvDesc{border_mode='valid', subsample=(1, 1), conv_mode='conv', precision='float64'}.0, Constant{1.0}, Constant{0.0})
Toposort index: 23
Inputs types: [GpuArrayType<None>(float32, (False, False, False, False)), GpuArrayType<None>(float64, (False, True, False, False)), GpuArrayType<None>(float32, (False, False, False, False)), <theano.gof.type.CDataType object at 0x7fd88bd37650>, Scalar(float32), Scalar(float32)]
Inputs shapes: [(2, 1, 4, 9), (2, 1, 4, 4), (2, 2, 1, 6), 'No shapes', (), ()]
Inputs strides: [(144, 144, 36, 4), (128, 128, 32, 8), (48, 24, 24, 4), 'No strides', (), ()]
Inputs values: ['not shown', 'not shown', 'not shown', <capsule object NULL at 0x7fd88bc96e40>, 1.0, 0.0]
Outputs clients: [[GpuJoin(TensorConstant{2}, GpuDnnConv{algo='small', inplace=True}.0, GpuDnnConv{algo='small', inplace=True}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

# Playing around with TF PWM-sequence convolutions

Familiarizing myself with the PWM loading code, playing with scipy's convolve function to make sure we can first just find enriched HOMER motifs.

In [None]:
import pandas as pd
import numpy as np
import glob
import time

motifpwms = {}

homer_path = '../../data/motifs/homer/*.motif'
t0 = time.time()
for homer_file in glob.glob(homer_path):
    with open(homer_file) as f:
        header = f.readline().strip().split('\t')
        name = header[1]
        threshold = float(header[2])
        weights = np.array(pd.read_csv(f, sep = '\t'))
        motifpwms[name] = weights
print("Took %.3f sec to load HOMER motifs" % (time.time() - t0))
    
# encode_motifs = '../../data/motifs/encode/motifs.txt'
# t0 = time.time()
# with open(encode_motifs) as fp:
#     line = fp.readline().strip()
#     while True:
#         if line == '':
#             break
#         header = line
#         weights = []
#         while True:
#             line = fp.readline()
#             if line == '' or line[0] == '>':
#                 break
#             weights.append(map(float, line.split()[1:]))
#         motifpwms[header] = np.array(weights)
# print("Took %.3f sec to load ENCODE motifs" % (time.time() - t0))

In [None]:
import h5py

train_data_path = '../hdf5files/sharpr_znormed_jul23/train_data.hdf5'
data = h5py.File(train_data_path)

data_X = np.array(data['X/sequence'])
data_Y = np.array(data['Y/output'])

In [None]:
from scipy.signal import convolve

motifs = motifpwms.keys()
pwms = np.array([motifpwms[motif].astype(np.float32) for motif in motifs])

conv_matrix = np.ndarray((len(data_X), len(motifs)))
background_conv_matrix = np.ndarray((len(data_X), len(motifs)))

idxs = np.argsort(data_Y[:, 2])[::-1][:500]
bckgrnd_idxs = np.argsort(np.abs(data_Y[:, 2]))[:500]

t0 = time.time()
t1 = time.time()
progress_update = 100
# for (i, seq) in enumerate(data_X):
for (i, idx) in enumerate(np.concatenate((idxs,bckgrnd_idxs))):
#     if i % 31 != 0:
#         continue
    if i > 3000:
        break
    if i % progress_update == 0 and i > 0:
        print("Seqs %d to %d took %.3f sec" % (i - progress_update, i, time.time() - t1))
        t1 = time.time()
    for (j, pwm) in enumerate(pwms):
        if idx in idxs:
            max_conv = np.max(convolve(data_X[idx], pwms[j], mode = 'valid').ravel())
            conv_matrix[idx][j] = max_conv
        elif idx in bckgrnd_idxs:
            max_conv = np.max(convolve(data_X[idx], pwms[j], mode = 'valid').ravel())
            background_conv_matrix[idx][j] = max_conv
        
print("All convolutions took %.3f sec" % (time.time() - t0))

In [None]:
# print np.sum(conv_matrix, axis = 0)
# print np.sum(background_conv_matrix, axis = 0)
motifScores = np.sum(conv_matrix, axis = 0) / np.sum(background_conv_matrix, axis = 0)
sorted_idxs = np.argsort(motifScores)[::-1]
sortedMotifs = np.array(motifs)[sorted_idxs]
print motifScores[sorted_idxs[:10]]
print sortedMotifs[:10]