In [1]:
%matplotlib inline

from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.layers import Reshape
from keras.layers.core import Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import UpSampling2D
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Flatten
from keras.optimizers import SGD
from keras import regularizers
from keras.callbacks import ModelCheckpoint

from sklearn.preprocessing import binarize
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score

import numpy as np
import scipy as sp
import scipy.io as sio

import h5py

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

from deepsea import *

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is disabled, cuDNN 5005)


In [52]:
# Load the training split. `loaddata` is not defined in this notebook --
# presumably it is provided by `from deepsea import *` (TODO confirm).
# train_Y is the label matrix that is later sliced column-wise via `data`.
train_X, train_Y = loaddata("data/deepsea/", "train")

In [2]:
# Load the test split with the same helper as the other splits.
# NOTE(review): neither test_X nor test_Y is referenced in the visible cells;
# presumably `data`/`name` are switched to this split on a later re-run.
test_X, test_Y = loaddata("data/deepsea/", "test")

In [3]:
# Load the validation split with the same helper as the other splits.
# NOTE(review): neither valid_X nor valid_Y is referenced in the visible cells;
# presumably `data`/`name` are switched to this split on a later re-run.
valid_X, valid_Y = loaddata("data/deepsea/", "valid")

# Preparing training data for deeperSEA

Extracting common histone mark signals (marks below, plus DNase, which the code appends to the mark list) in 6 cell types.   
Also extracting the "CTCF" signal in the same 6 cell types to use as a prediction target.

**Marks**: 'H3K4me3', 'H3K4me2', 'H3K4me1', 'H2AZ', 'H3K36me3', 'H4K20me1',
       'H3K27ac', 'H3K27me3', 'H3K9me3', 'H3K79me2', 'H3K9ac'  
**Celltypes**: 'H1-hESC', 'NH-A', 'NHEK', 'K562', 'NHDF-Ad', 'NHLF'

In [4]:
# Build a lookup from the 2nd and 3rd tab-separated fields of each feature
# row to that row's position in `raw`. Below it is keyed as
# (cell type, mark/TF name) and its values are used as column indices into
# the label matrix -- assumes row order in features.txt matches the label
# column order (TODO confirm).
with open("data/deepsea/features.txt") as features_file:
    # Drop the first and the last line of the file.
    # NOTE(review): the first line is presumably a header; confirm that the
    # last line is really meant to be excluded as well.
    raw = features_file.readlines()[1:-1]

indexmap = {tuple(line.split("\t")[1:3]): row for row, line in enumerate(raw)}

In [5]:
# Names of the input features: the saved mark list with "DNase" appended as
# one extra entry. Each name is later paired with a cell type to look up a
# column in `indexmap`.
marks = np.append(np.load("data/deepersea/marks.npy"), "DNase")

In [6]:
# Cell-type names; iterated below and used as the first element of the
# (cell type, mark) keys of `indexmap`.
celltypes = np.load("data/deepersea/celltypes.npy")

In [7]:
# NOTE(review): `targets` is loaded here but not referenced in the cells
# visible in this section (the target used below is the hard-coded "CTCF").
targets = np.load("data/deepersea/targets.npy")

In [53]:
# Select the split to process: `data` is the label matrix every following
# cell reads from, and `name` is the prefix used in the output file names.
# Both must be changed together -- presumably to test_Y/"test" and
# valid_Y/"valid" -- to regenerate the files for another split.
data = train_Y
name = "train"

In [54]:
# For every cell type, pull the CTCF label column out of the selected split.
# Ys[k] is the column for celltypes[k]. The per-cell-type mark indices are
# not needed at this stage, so they are no longer computed here.
tf = "CTCF"
Ys = [data[:, indexmap[(c, tf)]] for c in celltypes]

In [55]:
# For each cell type, the row positions whose CTCF label equals 1.
# Vectorised with NumPy instead of a Python-level loop over every row;
# .tolist() keeps the result a list of plain Python ints, as before.
# Assumes each entry of Ys is a 1-D array-like of labels.
indexes = [np.flatnonzero(np.asarray(m) == 1).tolist() for m in Ys]

In [56]:
# Merge the per-cell-type index lists into one ascending list of unique row
# positions: a row counts as positive if it was selected for any cell type.
positives = sorted(set(indexes[0]).union(*indexes[1:]))

In [57]:
# Negatives are all row positions of `data` that are not in `positives`,
# collected in ascending order and then shuffled so that a prefix of the
# list is a random sample.
positive_set = set(positives)  # O(1) membership test per row
negatives = [row for row in range(data.shape[0]) if row not in positive_set]

# Shuffle with a dedicated, seeded generator so the sampled negatives (and
# therefore the saved files) are reproducible across kernel restarts, and
# the global NumPy random state is left untouched.
NEGATIVE_SHUFFLE_SEED = 0
np.random.RandomState(NEGATIVE_SHUFFLE_SEED).shuffle(negatives)

In [58]:
# Keep every positive row plus a prefix of the shuffled negatives whose
# length is 30% of the number of positives, then restore ascending order.
n_negatives_kept = int(0.3*len(positives))
sample_inds = positives + negatives[:n_negatives_kept]
sample_inds.sort()

In [59]:
# Build the deeperSEA dataset by stacking, cell type after cell type, the
# sampled rows of the selected split:
#   X1s -- the columns of all `marks` for that cell type (inputs)
#   Y1s -- the CTCF column for that cell type (target)
tf = "CTCF"

x_parts = []
y_parts = []
for c in celltypes:
    indexes = [indexmap[(c, m)] for m in marks]
    y_parts.append(data[sample_inds, indexmap[(c, tf)]])
    # Select the mark columns first, then the sampled rows (same order of
    # indexing as before).
    x_parts.append(data[:, indexes][sample_inds, :])

# Concatenate once at the end instead of re-copying the growing arrays on
# every iteration. With no cell types both results stay None, as before.
Y1s = np.concatenate(y_parts, axis=0) if y_parts else None
X1s = np.concatenate(x_parts, axis=0) if x_parts else None

In [60]:
# Sanity check: sum of the stacked target values vs. their total count.
# Presumably the labels are 0/1, so the first number is the positive count.
# NOTE(review): the builtin sum() walks the array element by element.
sum(Y1s), len(Y1s)

(651234, 3558498)

In [61]:
# Persist the stacked input features as data/deepersea/<name>x
# (np.save appends the ".npy" extension).
np.save("data/deepersea/"+name+"x", X1s)

In [62]:
# Persist the stacked CTCF targets as data/deepersea/<name>y
# (np.save appends the ".npy" extension).
np.save("data/deepersea/"+name+"y", Y1s)

In [63]:
# Persist the sorted row positions that were sampled from the split, so the
# same rows can be located again later (np.save appends ".npy").
np.save("data/deepersea/sample_index_"+name, sample_inds)