In [21]:
import sys
import os
import itertools
import keras
from keras.layers import Input, Dense, Reshape, Flatten
from keras import layers, initializers
from keras.models import Model, load_model
import keras.backend as K
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats
from scipy.stats import norm
from scipy.optimize import minimize
from keras.utils.generic_utils import get_custom_objects
import json
#import tensorflow_probability as tfp

#tfd = tfp.distributions

def subselect_list(li, ixs) :
    """Return the elements of `li` at the positions listed in `ixs`, in that order."""
    picked = []
    for pos in range(len(ixs)) :
        picked.append(li[ixs[pos]])
    return picked

class IdentityEncoder :
    """One-hot encoder/decoder between symbol sequences and (position, channel) matrices.

    `channel_map` maps each symbol to its channel index. Symbols that are not
    in the map are skipped when encoding, leaving an all-zero row.
    """
    
    def __init__(self, seq_len, channel_map) :
        """Store the encoding length and build the inverse (channel index -> symbol) map."""
        self.seq_len = seq_len
        self.n_channels = len(channel_map)
        self.encode_map = channel_map
        self.decode_map = {
            channel_ix: symbol for symbol, channel_ix in self.encode_map.items()
        }
    
    def encode(self, seq) :
        """Return a new (seq_len, n_channels) one-hot matrix for `seq`.

        Rows past len(seq) stay zero. A `seq` longer than seq_len raises
        IndexError from the underlying array write.
        """
        encoding = np.zeros((self.seq_len, self.n_channels))
        self.encode_inplace(seq, encoding)
        return encoding
    
    def encode_inplace(self, seq, encoding) :
        """Set the one-hot entries for `seq` in the caller-provided `encoding` matrix.

        Existing entries are not cleared; only the matching (position, channel)
        cells are set to 1.
        """
        for pos, symbol in enumerate(seq) :
            if symbol in self.encode_map :
                encoding[pos, self.encode_map[symbol]] = 1.
    
    def encode_inplace_sparse(self, seq, encoding_mat, row_index) :
        """Sparse encoding is not supported by this encoder."""
        raise NotImplementedError()
    
    def decode(self, encoding) :
        """Return the sequence of per-row argmax symbols of `encoding`.

        np.argmax returns 0 for an all-zero row, so such rows decode to the
        symbol assigned to channel 0.
        """
        return ''.join(
            self.decode_map[np.argmax(encoding[pos, :])]
            for pos in range(0, encoding.shape[0])
        )
    
    def decode_sparse(self, encoding_mat, row_index) :
        """Sparse decoding is not supported by this encoder."""
        raise NotImplementedError()

from keras.backend.tensorflow_backend import set_session

def contain_tf_gpu_mem_usage() :
    """Install a Keras session whose TensorFlow GPU options have allow_growth enabled."""
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    set_session(tf.Session(config=gpu_config))

# Install the allow_growth session as the Keras session when this cell runs.
contain_tf_gpu_mem_usage()


In [9]:
import itertools
from keras.layers import Input, Dense, Reshape, Flatten
from keras import layers, initializers
from keras.models import Model, load_model
from seqtools import SequenceTools as ST
from util import AA, AA_IDX
from util import build_vae
from keras.utils.generic_utils import get_custom_objects
from util import one_hot_encode_aa, partition_data, get_balaji_predictions, get_samples, get_argmax
from util import convert_idx_array_to_aas, build_pred_vae_model, get_experimental_X_y
from util import get_gfp_X_y_aa
from losses import neg_log_likelihood

from gfp_gp import SequenceGP
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

def build_model(M):
    """Build a dense network over inputs of shape (M, 20).

    The input is flattened, passed through a 50-unit ELU layer and projected
    to 2 linear outputs. The returned Keras Model is not compiled.
    """
    seq_input = Input(shape=(M, 20,))
    flat = Flatten()(seq_input)
    hidden = Dense(50, activation='elu')(flat)
    head = Dense(2)(hidden)
    return Model(inputs=seq_input, outputs=head)


In [10]:
#Specify problem-specific parameters

# Index of the experimental setting; selects num_models and RANDOM_STATE below.
it = 1

TRAIN_SIZE = 5000
# Training-set size in thousands ("5k" for 5000), used in the weight-file suffixes.
train_size_str = "%ik" % (TRAIN_SIZE/1000)
num_models = [1, 5, 20][it]
RANDOM_STATE = it + 1

X_train, y_train, gt_train  = get_experimental_X_y(random_state=RANDOM_STATE, train_size=TRAIN_SIZE)

# Size of the second axis of X_train; passed to build_model when the oracles are rebuilt.
L = X_train.shape[1]

# Suffixes that select which saved VAE / oracle weight files are loaded.
vae_suffix = '_%s_%i' % (train_size_str, RANDOM_STATE)
oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models, RANDOM_STATE)

# NOTE: this rebinds AA, shadowing the AA imported from util in the previous cell.
AA = ['a', 'r', 'n', 'd', 'c', 'q', 'e', 'g', 'h', 'i', 'l', 'k', 'm', 'f', 'p', 's', 't', 'w', 'y', 'v']
# Upper-case residue letter -> channel index (position in AA).
residue_map = {key.upper() : val for val, key in enumerate(AA)}
# NOTE(review): the length 237 is hard-coded here rather than taken from L — confirm they agree.
seq_encoder = IdentityEncoder(237, residue_map)


In [25]:

def killoran_opt(vae_decoder, oracles,
                 steps=20000, store_every1=5, store_every2=100, store_swap_iter=100, epsilon1=10**-5, epsilon2=1, noise_std=10**-5, save_path=None,
                 LD=100, verbose=False, adam=False):
    """Optimize one latent vector by gradient steps through a decoder and an oracle ensemble.

    A latent variable of shape (1, LD) is decoded with `vae_decoder`; the
    decoded tensor is copied into a separate variable that is fed to every
    oracle. The gradient of the negated mean oracle output ([0, 0] element of
    each oracle's output) is back-propagated through the decoder to the latent
    variable, and the optimizer descends along it.

    Parameters
    ----------
    vae_decoder : callable taking a list with the latent tensor and returning the decoded tensor.
    oracles : sequence of callables taking a list with the decoded tensor.
    steps : number of optimization steps.
    store_every1, store_every2 : write interval (in steps) used while
        t <= store_swap_iter and afterwards, respectively.
    store_swap_iter : step after which store_every2 replaces store_every1.
    epsilon1 : weight of the prior term (only used when adam=False).
    epsilon2 : weight of the oracle term when adam=False; Adam learning rate when adam=True.
    noise_std : standard deviation of the Gaussian noise added to every step.
    save_path : path prefix; when given, the argmax-decoded sequence of step t is
        appended to save_path + "_iter_<t>.txt". Nothing is written when None.
    LD : latent dimensionality.
    verbose : accepted for interface compatibility; currently unused.
    adam : use Adam on (oracle gradient + noise) instead of plain gradient
        descent on the weighted prior + oracle step.

    Returns
    -------
    None. Results are only written to disk (when save_path is set).
    """
    
    G = vae_decoder
    f = oracles
    
    sess = K.get_session()
    zt = K.tf.Variable(np.random.normal(size=[1, LD]), dtype='float32')
    
    # Holds a copy of the decoded output so oracle gradients can be taken w.r.t. it.
    # NOTE(review): the (237, 20) shape is hard-coded and must match the decoder output.
    pred_input = K.tf.Variable(np.zeros((1, 237, 20)), dtype='float32')
    
    gen_output = G([zt])
    
    predictions = K.tf.reduce_mean([f[i]([pred_input])[0, 0] for i in range(len(f))])
    update_pred_input = K.tf.assign(pred_input, gen_output)
    # Gradients are of the NEGATED objective because apply_gradients performs descent.
    dfdx = K.tf.gradients(ys=-predictions, xs=pred_input)[0]
    dfdz = K.tf.gradients(gen_output, zt, grad_ys=dfdx)[0]
    
    noise = K.tf.random_normal(shape=[1, LD], stddev=noise_std)
    # Force float32 so an integer epsilon (e.g. the default epsilon2=1) can scale float32 gradients.
    eps1 = K.tf.Variable(float(epsilon1), trainable=False, dtype='float32')
    eps2 = K.tf.Variable(float(epsilon2), trainable=False, dtype='float32')
    if adam:
        optimizer = K.tf.train.AdamOptimizer(learning_rate=epsilon2)
        step = dfdz + noise
    else:
        optimizer = K.tf.train.GradientDescentOptimizer(learning_rate=1)
        # Negative standard-normal log-density of zt (up to a constant); negated for
        # the same reason as dfdx, so descending on it pulls zt towards the prior mode.
        neg_log_pz = 0.5 * K.tf.reduce_sum(K.tf.square(zt))
        dpz = K.tf.gradients(neg_log_pz, zt)[0]
        step = eps1 * dpz + eps2 * dfdz + noise
    
    design_op = optimizer.apply_gradients([(step, zt)])
    # Initialize only this optimizer's own state; matching variables by name would
    # also reset unrelated variables whose names happen to contain 'beta'.
    sess.run(K.tf.variables_initializer(optimizer.variables()))
    sess.run([pred_input.initializer, zt.initializer, eps1.initializer, eps2.initializer])
    
    # Prime pred_input with the decode of the actual starting latent so the first
    # oracle gradient is evaluated at the sequence that zt really encodes.
    sess.run(update_pred_input)
    
    if save_path is not None :
        out_dir = os.path.dirname(save_path)
        if out_dir and not os.path.isdir(out_dir) :
            os.makedirs(out_dir)
    
    # Channel index -> upper-case residue letter.
    residues = [
        aa.upper() for aa in
        ['a', 'r', 'n', 'd', 'c', 'q', 'e', 'g', 'h', 'i', 'l', 'k', 'm', 'f', 'p', 's', 't', 'w', 'y', 'v']
    ]
    
    store_every = store_every1
    
    for t in range(steps):
        if t % 1000 == 0 :
            print("Running step " + str(t) + "...")
        
        if t > store_swap_iter :
            store_every = store_every2
        
        xt0, _ = sess.run([gen_output, design_op], {eps1: epsilon1, eps2: epsilon2})
        # Refresh the oracle input with the decode of the updated latent.
        sess.run(update_pred_input)
        
        if save_path is not None and t % store_every == 0 :
            xt_seq = ''.join(residues[ix] for ix in np.argmax(xt0[0], axis=-1))
            with open(save_path + "_iter_" + str(t) + ".txt", "a+") as out_file :
                out_file.write(xt_seq + "\n")


In [26]:

def run_killoran(n_traj=5, steps=20000, vae_prefix_str="", vae_suffix=vae_suffix, oracle_suffix=oracle_suffix):
    """Run `n_traj` independent killoran_opt trajectories with freshly loaded models.

    For every trajectory a new session is installed as the Keras session, the
    oracle ensemble and the VAE are rebuilt and their weights loaded from
    "models/", and killoran_opt is run with Adam for `steps` steps. All
    trajectories append to the same per-iteration files under "killoran/".

    Parameters
    ----------
    n_traj : number of trajectories.
    steps : optimization steps per trajectory.
    vae_prefix_str : string inserted into the output file prefix.
    vae_suffix, oracle_suffix : suffixes selecting which weight files are loaded.
    """
    
    prev_sess = None
    for traj_ix in range(n_traj):
        # Printed as a progress label only; no random generator is seeded from it.
        print(traj_ix + 1)
        
        sess = tf.Session(graph=tf.get_default_graph())
        K.set_session(sess)
        # Release the previous trajectory's session now that Keras no longer uses it;
        # the last session is left open so Keras stays usable after this function returns.
        if prev_sess is not None :
            prev_sess.close()
        prev_sess = sess
        
        #Load models
        oracles = [build_model(L) for _ in range(num_models)]
        for model_ix, oracle in enumerate(oracles) :
            oracle.load_weights("models/oracle_%i%s.h5" % (model_ix, oracle_suffix))
        
        vae_0 = build_vae(latent_dim=20, n_tokens=20, seq_length=237, enc1_units=50)

        vae_0.encoder_.load_weights("models/vae_0_encoder_weights%s.h5" % vae_suffix)
        vae_0.decoder_.load_weights("models/vae_0_decoder_weights%s.h5"% vae_suffix)
        vae_0.vae_.load_weights("models/vae_0_vae_weights%s.h5"% vae_suffix)

        #Load decoder model
        vae_0.decoder_.trainable = False
        vae_0.decoder_.compile(loss='mse', optimizer=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999))
        decoder = vae_0.decoder_

        # LD must equal the latent_dim the VAE was built with above.
        killoran_opt(decoder, oracles, steps=steps, epsilon1=0., epsilon2=0.1,  
                                     noise_std=1e-6, store_every1=5, store_every2=100, store_swap_iter=100,
                                     LD=20, verbose=False, adam=True,
                                     save_path='killoran/killoran_weak_balaji_vae' + vae_prefix_str + '_gfp_seqs'
                            )


In [29]:

# Run 10 trajectories of 5000 steps each; sequences are appended to files under killoran/.
run_killoran(n_traj=10, steps=5000, vae_prefix_str="")


1
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
2
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
3
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
4
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
5
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
6
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
7
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
8
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
9
Running step 0...
Running step 1000...
Running step 2000...
Running step 3000...
Running step 4000...
10
Running step 0...
Running step 1000...
Running step 2000...
R