In [1]:
from dragonn import models
from dragonn.plot import add_letters_to_axis

from sklearn.model_selection import train_test_split

from collections import OrderedDict
from pprint import pprint
from warnings import warn

import numpy as np
import matplotlib.pyplot as plt

import math

%matplotlib inline

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5005)
  "downsample module has been moved to the theano.tensor.signal.pool module.")


In [2]:
# Map from sequence key to its base string, in file order.
key_to_seq = OrderedDict()
seq_len = 145   # not referenced by the code visible in this notebook
reg_len = 295   # width of the zero-padded window used by one_hot_encode_reg
skip_len = 5    # multiplied by a key's tile position to get its window offset

# Both design files share one format (whitespace-separated "key sequence"
# per line), so they are read by a single loop instead of two copies of it.
sequence_files = [
    "../../data/Scaleup_counts_sequences/ScaleUpDesign1.sequences.txt",
    "../../data/Scaleup_counts_sequences/ScaleUpDesign2.sequences.txt",
]

for sequence_file in sequence_files:
    with open(sequence_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise break the
                # two-value unpack below.
                continue

            key, seq = line.split()

            # TODO: Figure out if this is an OK thing to do. 'N' basically means the
            # sequencing software couldn't figure out what the base was...?
            seq = seq.replace("N", "A")

            # Keys must be unique across both design files.
            assert key not in key_to_seq, "duplicate sequence key: {}".format(key)
            key_to_seq[key] = seq
  

In [3]:
# data[(cell_type, promoter)][key]           -> replicate-averaged value
# sample_weights[(cell_type, promoter)][key] -> replicate-agreement weight
data = {}
sample_weights = {}
cell_types =  ["HepG2", "K562"]
promoters = ["SV40P", "minP"]
design_names = ["ScaleUpDesign1", "ScaleUpDesign2"]

for cell_type in cell_types:
    for promoter in promoters:
        experiment_key = (cell_type, promoter)
        values_by_key = data[experiment_key] = {}
        weights_by_key = sample_weights[experiment_key] = {}

        for design_name in design_names:

            # Replicate 1: keep the value of every row whose third column is "1".
            with open("../../data/Scaleup_normalized/{}_{}_{}_mRNA_Rep1.normalized".format(cell_type, design_name, promoter)) as f:
                for line in f:
                    parts = line.strip().split()
                    key, val = parts[0], float(parts[1])
                    if parts[2] == "1":
                        values_by_key[key] = val

            # Replicate 2: for rows flagged "1" that replicate 1 also kept,
            # store a weight and replace the value with the replicate mean.
            # NOTE(review): a key kept from replicate 1 but not matched here
            # keeps its replicate-1-only value and gets no sample weight.
            with open("../../data/Scaleup_normalized/{}_{}_{}_mRNA_Rep2.normalized".format(cell_type, design_name, promoter)) as f:
                for line in f:
                    parts = line.strip().split()
                    key, val = parts[0], float(parts[1])
                    if parts[2] != "1" or key not in values_by_key:
                        continue

                    rep1_val = values_by_key[key]
                    # |cosine| of the angle between (rep1, rep2) and (1, 1):
                    # 1 when the replicates are equal, 0 when they are opposite.
                    # NOTE(review): divides by zero if both replicates are 0.
                    dot_prod = (val + rep1_val)
                    norm = math.sqrt(2)*math.sqrt(val**2 + rep1_val**2)
                    weights_by_key[key] = abs(dot_prod/norm)
                    values_by_key[key] = dot_prod / 2.0
    
# Row order of the one-hot encoding: row k corresponds to bases[k].
bases = ['A', 'T', 'C', 'G']

def one_hot_encode_seq(seq):
    """One-hot encode a base string.

    Returns a float array of shape (len(bases), len(seq)) in which column i
    has a single 1, in the row given by the position of seq[i] in `bases`.
    Raises ValueError if seq contains a character that is not in `bases`.
    """
    encoded = np.zeros((len(bases), len(seq)))

    # Look up every row index first, then set all the ones in one assignment.
    rows = np.array([bases.index(base) for base in seq], dtype=int)
    encoded[rows, np.arange(len(seq))] = 1

    return encoded

def one_hot_encode_reg(reg):
    """One-hot encode a tile at its offset inside a window of reg_len columns.

    `reg` is indexed as reg[0] = key and reg[1] = base string. The fourth
    '_'-separated field of the key is parsed as an integer tile position, and
    the tile is written starting at column tile_pos * skip_len. Columns the
    tile does not cover stay all-zero.

    Returns a float array of shape (len(bases), reg_len).
    """
    key, seq = reg[0], reg[1]
    tile_pos = int(key.split('_')[3])
    offset = tile_pos * skip_len

    encoded = np.zeros((len(bases), reg_len))
    for i, base in enumerate(seq):
        encoded[bases.index(base), offset + i] = 1

    return encoded

def seqs_to_encoded_matrix(seqs):
    """One-hot encode equal-length sequences into a 4-D array.

    Returns an array of shape (len(seqs), 1, len(bases), len(seqs[0])); entry
    [i, 0, b, j] is 1 when seqs[i][j] == bases[b]. This is the layout the
    notebook feeds to the Dragonn model.

    Raises ValueError if `seqs` is empty or the sequences differ in length.
    """
    if len(seqs) == 0:
        raise ValueError("seqs must contain at least one sequence")

    # Build a real list: np.concatenate needs a sequence of arrays, and map()
    # only returns one on Python 2.
    encoded = [one_hot_encode_seq(seq) for seq in seqs]

    # Stacking the (bases, length) blocks row-wise and reshaping keeps each
    # sequence's block contiguous, so block i becomes result[i, 0].
    result = np.concatenate(encoded).reshape(
        len(seqs), 1, len(bases), len(seqs[0])
    )

    # Check we actually did the encoding right: exactly one base per position.
    # One vectorised pass instead of a Python loop over every (sample, position).
    assert (result.sum(axis=2) == 1).all()

    return result

def regs_to_encoded_matrix(regs):
    """Encode (key, sequence) pairs into a 4-D array of padded windows.

    Each pair is encoded by one_hot_encode_reg. Returns an array of shape
    (len(regs), 1, len(bases), reg_len), the layout the notebook feeds to the
    Dragonn model. Columns outside a tile are all-zero, so no
    one-base-per-column check is made here.

    Raises ValueError if `regs` is empty.
    """
    # Build a real list: np.concatenate needs a sequence of arrays, and map()
    # only returns one on Python 2.
    encoded = [one_hot_encode_reg(reg) for reg in regs]

    return np.concatenate(encoded).reshape(
        len(regs), 1, len(bases), reg_len
    )

# Keys usable for training: present in every experiment's value table AND in
# every experiment's sample-weight table. The weight tables are included
# because the dataset-building cell looks each valid key up in sample_weights,
# which would raise KeyError for a key that only has a value.
key_sets = [set(per_key) for per_key in data.values()]
key_sets += [set(per_key) for per_key in sample_weights.values()]

# Sorted so the row order (and with it the seeded train/validation split) is
# the same on every run; plain set iteration order is not guaranteed.
valid_keys = sorted(set.intersection(*key_sets))


In [4]:
data_dir = 'data/'
rebuild = True
# Writing the encoded matrices to disk is opt-in (they are large). Run once
# with save_arrays = True before switching to rebuild = False, otherwise the
# load branch below has no files to read.
save_arrays = False

if rebuild:
    # Only the rebuild path needs the scaler.
    from sklearn.preprocessing import MinMaxScaler

    # X_t: each tile encoded on its own.
    # X_r: each tile placed at its offset inside a reg_len-wide window.
    X_t = seqs_to_encoded_matrix([key_to_seq[key] for key in valid_keys])
    X_r = regs_to_encoded_matrix([(key, key_to_seq[key]) for key in valid_keys])

    scaler = MinMaxScaler(feature_range=(-1,1))

    experiment_labels = []
    weights = []
    for experiment_key, key_to_normalized in data.items():

        filtered_normalized = np.array([key_to_normalized[key] for key in valid_keys]).reshape(-1, 1)
        filtered_weights = np.array([sample_weights[experiment_key][key] for key in valid_keys]).reshape(-1, 1)

        # Each experiment (task) is rescaled to [-1, 1] independently; the
        # scaler is re-fit on every iteration.
        scaled = scaler.fit_transform(filtered_normalized)

        experiment_labels.append(scaled)
        weights.append(filtered_weights)

    # One label column per experiment; one weight per sample, averaged over
    # the experiments.
    y = np.hstack(experiment_labels)
    weights = np.hstack(weights).mean(axis=1).reshape(-1,1)

    # Materialise the keys as a list (same order as the columns of y) so
    # np.save receives a plain sequence on Python 3 as well.
    tasks = list(data.keys())

    if save_arrays:
        np.save(data_dir + 'X_t.npy', X_t)
        np.save(data_dir + 'X_r.npy', X_r)
        np.save(data_dir + 'y.npy', y)
        np.save(data_dir + 'weights.npy', weights)
    np.save(data_dir + 'tasks.npy', tasks)

else:

    X_t = np.load(data_dir + 'X_t.npy')
    X_r = np.load(data_dir + 'X_r.npy')
    y = np.load(data_dir + 'y.npy')
    # The split below needs the weights too, so they are cached with the rest.
    weights = np.load(data_dir + 'weights.npy')
    tasks = np.load(data_dir + 'tasks.npy')

# Train on the padded-window encoding. Set after the branch so X is defined
# whether the arrays were rebuilt or loaded.
X = X_r

X_train, X_valid, y_train, y_valid, weights_train, weights_valid = train_test_split(
    X, y, weights, test_size=0.2, random_state=42
)

In [5]:
# Sanity check: the weights and inputs of the training split should have the
# same number of rows.
for split_array in (weights_train, X_train):
    print(split_array.shape)

(221927, 1)
(221927, 1, 4, 295)


In [6]:
import os.path

fn = "model"
arch_path = fn + ".arch.json"
weights_path = fn + ".weights.h5"

# Reuse a saved model only when both its architecture and weights files exist;
# otherwise build a fresh one sized from the training data.
have_saved_model = os.path.isfile(arch_path) and os.path.isfile(weights_path)

if not have_saved_model:
    model = models.SequenceDNN_Regression(
        seq_length=X_train.shape[3],   # last axis of X: window width
        num_filters=[100, 100],
        conv_width=[15, 15],
        pool_width=40,
        num_tasks=y_train.shape[1],    # one output per experiment column of y
        dropout=0.1
    )
else:
    model = models.SequenceDNN_Regression.load(arch_path, weights_path)

In [7]:
# Fit the model on the training split, passing the validation split as the
# third positional argument and the per-sample weights as keyword arguments.
# NOTE(review): the recorded output of this cell is
#   TypeError: train() got an unexpected keyword argument 'train_sample_weight'
# so the installed train() does not accept these keywords. Either train()
# needs sample-weight support or the weights must be dropped from this call.
# TODO decide: print_perf reads model.train_metrics / model.valid_metrics,
# which are presumably filled in by train().
model.train(X_train, y_train, (X_valid, y_valid), 
            train_sample_weight=weights_train, valid_sample_weight=weights_valid)

TypeError: train() got an unexpected keyword argument 'train_sample_weight'

In [None]:
diagram_path = fn + '.png'
arch_path = fn + '.arch.json'

# Write a diagram of the network, then persist the model.
# NOTE(review): the model-setup cell only reloads when fn + ".arch.json" and
# fn + ".weights.h5" both exist. Confirm that save(), given this path, writes
# files with exactly those names.
model.plot_architecture(diagram_path)
models.SequenceDNN_Regression.save(model, arch_path)

In [None]:
import matplotlib.gridspec as gridspec

def print_perf(model, metric):
    """Plot per-task and mean training/validation curves for one metric.

    Reads model.train_metrics and model.valid_metrics: each is iterated per
    epoch and indexed by `metric`. The values are assumed to hold one number
    per task, so the stacked array is (epochs, tasks) -- TODO confirm against
    the model class.

    Draws one panel per task plus a final panel of the across-task mean, marks
    the epoch with the lowest mean validation value, saves the figure to
    "losses.png" and shows it.
    """
    train_losses, valid_losses = [
        np.array([epoch_metrics[metric] for epoch_metrics in metrics])
        for metrics in (model.train_metrics, model.valid_metrics)
    ]

    # Pretty sure early stopping works by taking the mean of losses, might want to double check
    mean_train_losses = train_losses.mean(axis=1)
    mean_valid_losses = valid_losses.mean(axis=1)
    min_loss_indx = int(np.argmin(mean_valid_losses))

    # Two columns, and enough rows for every task panel plus the mean panel.
    # Never fewer than the original 3 rows, so the layout for up to five tasks
    # is unchanged; a fixed 3x2 grid ran out of cells beyond that.
    num_tasks = train_losses.shape[1]
    num_rows = max(3, (num_tasks + 2) // 2)
    gs = gridspec.GridSpec(num_rows, 2)
    f = plt.figure(figsize=(15, 10.0 * num_rows / 3))

    for i in range(num_tasks):
        y_max = max(max(train_losses[:,i]), max(valid_losses[:,i])) * 1.1

        ax = f.add_subplot(gs[i])

        ax.plot(range(len(train_losses[:,i])), train_losses[:,i], label='Training',lw=2)
        ax.plot(range(len(train_losses[:,i])), valid_losses[:,i], label='Validation', lw=2)

        ax.plot([min_loss_indx, min_loss_indx], [0, y_max], 'k--', label='Early Stop')
        if i == 0:
            # Legend and y-label only on the first panel to limit clutter.
            ax.legend(loc="best")
            ax.set_ylabel(metric)
        ax.set_ylim((0,y_max))
        ax.set_title("Task {}".format(i))

    y_max = max(max(mean_train_losses), max(mean_valid_losses)) * 1.1

    # The mean panel goes in the grid cell right after the last task.
    ax = f.add_subplot(gs[num_tasks])
    ax.plot(range(len(mean_train_losses)), mean_train_losses, label='Training',lw=2)
    ax.plot(range(len(mean_valid_losses)), mean_valid_losses, label='Validation', lw=2)

    ax.plot([min_loss_indx, min_loss_indx], [0, y_max], 'k--', label='Early Stop')
    ax.set_ylim((0,y_max))
    ax.set_xlabel("Epoch")
    ax.set_title("Mean losses")

    plt.savefig("losses.png")
    plt.show()

In [None]:
# Metric name used to index each epoch's metrics inside print_perf.
metric = "Mean Squared Error"
print_perf(model=model, metric=metric)