In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '/Users/tareen/Desktop/Research_Projects/2020_mavenn_github/mavenn_local')

import mavenn
import logomaker
import seaborn as sns
import re

%matplotlib inline

In [2]:
# Fetch the 'sortseq_full-wt' example dataset through mavenn
data_df = mavenn.load_example_dataset('sortseq_full-wt')

# Boolean mask marking the rows whose 'set' column equals 'test'
ix_test = data_df['set'].eq('test')

# Held-out test rows, re-indexed from zero
test_df = data_df.loc[ix_test].reset_index(drop=True)
print(f'test N: {len(test_df):,}')

# Everything that is not test data stays in data_df, re-indexed from zero
data_df = data_df.loc[~ix_test].reset_index(drop=True)
print(f'training + validation N: {len(data_df):,}')
data_df.head()

test N: 10,269
training + validation N: 40,249


Unnamed: 0,set,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9,x
0,validation,2,0,0,0,0,0,0,0,0,0,AATTGATGTCCGGTAGCTCACTCATTAGGCAGCCAAGGTTTTAGAC...
1,training,0,0,0,1,0,0,0,0,0,0,AATTCATGTGAGTTATGTCTCTCATTAGGCACCCCAGGCTTGAGAT...
2,training,1,0,0,0,0,0,0,0,0,0,AATTAATGTGAGTTAGCTCACTCATTGGACACCCCAGGCTTTACAC...
3,training,0,1,0,0,0,0,0,0,0,0,ACTTAATAAAAGTCAGCTCACTCATTAATCACCCCACGCTCTACAT...
4,training,0,0,0,0,0,3,0,0,0,0,AATTAATGTGAGTTATCTAACTCATTAGGCACCCCAGGCTTTACAC...


In [3]:
# Compute sequence length (L) and number of bins (Y).
# The bin columns are taken to be every column except the first and the last.
y_cols = data_df.columns[1:-1]
L, Y = len(data_df['x'][0]), len(y_cols)
print(f'L={L}, Y={Y}')

L=75, Y=10


In [4]:
from mavenn.src.layers.gpmap import CustomGPMapLayer


# Tensorflow imports
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Layer, Dense

class ThermodynamicLayer(CustomGPMapLayer):
    """Thermodynamic model of transcriptional regulation at the E. coli
    lac promoter, which contains binding sites for CRP and RNAP.

    The layer maps a flattened one-hot encoded sequence to a transcription
    rate ``t``. Each protein gets an additive energy (a chemical-potential
    term plus a position-specific weight matrix applied to its binding
    site), and the two proteins are coupled through a single interaction
    energy. ``t`` is ``tsat`` times the Boltzmann-weighted fraction of
    promoter states in which RNAP is bound.
    """

    def __init__(self,
                 L_crp,
                 L_rnap,
                 C,
                 regularizer,
                 *args,
                 crp_start=1,
                 rnap_start=34,
                 **kwargs):
        """Construct layer instance.

        Parameters
        ----------
        L_crp: (int)
            Length, in sequence positions, of the CRP binding site.
        L_rnap: (int)
            Length, in sequence positions, of the RNAP binding site.
        C: (int)
            Number of one-hot channels per sequence position.
        regularizer: (float)
            Strength of the L2 penalty applied to every weight of the layer.
        crp_start: (int, keyword-only)
            Zero-based sequence position at which the CRP site begins.
            The default (1) together with L_crp=26 and C=4 selects
            flattened columns 4:108.
        rnap_start: (int, keyword-only)
            Zero-based sequence position at which the RNAP site begins.
            The default (34) together with L_rnap=41 and C=4 selects
            flattened columns 136:300.
        *args, **kwargs:
            Passed through unchanged to the superclass constructor.
        """
        self.L_crp = L_crp
        self.L_rnap = L_rnap
        self.C = C
        self.crp_start = crp_start
        self.rnap_start = rnap_start
        self.regularizer = tf.keras.regularizers.L2(regularizer)
        super().__init__(*args, **kwargs)

    def build(self, input_shape):
        """Create the trainable weights, then call the superclass build.

        NOTE: the weights are created in a fixed order (mu_crp, mu_rnap,
        theta_crp_lc, theta_rnap_lc, interaction, tsat). Code that indexes
        ``get_weights()`` by position relies on this order, so do not
        reorder the ``add_weight`` calls.
        """

        # Bias / chemical potential for CRP
        self.mu_crp = self.add_weight(name='mu_crp',
                                      shape=(1,),
                                      initializer=Constant(0.),
                                      trainable=True,
                                      regularizer=self.regularizer)

        # Bias / chemical potential for RNAP
        self.mu_rnap = self.add_weight(name='mu_rnap',
                                       shape=(1,),
                                       initializer=Constant(0.),
                                       trainable=True,
                                       regularizer=self.regularizer)

        # CRP weight matrix; random normal init scaled by 1/sqrt(L_crp)
        theta_crp_lc_shape = (1, self.L_crp, self.C)
        theta_crp_lc_init = (np.random.randn(*theta_crp_lc_shape)
                             / np.sqrt(self.L_crp))
        self.theta_crp_lc = self.add_weight(
            name='theta_crp_lc',
            shape=theta_crp_lc_shape,
            initializer=Constant(theta_crp_lc_init),
            trainable=True,
            regularizer=self.regularizer)

        # RNAP weight matrix; random normal init scaled by 1/sqrt(L_rnap)
        theta_rnap_lc_shape = (1, self.L_rnap, self.C)
        theta_rnap_lc_init = (np.random.randn(*theta_rnap_lc_shape)
                              / np.sqrt(self.L_rnap))
        self.theta_rnap_lc = self.add_weight(
            name='theta_rnap_lc',
            shape=theta_rnap_lc_shape,
            initializer=Constant(theta_rnap_lc_init),
            trainable=True,
            regularizer=self.regularizer)

        # CRP-RNAP interaction energy.
        # TODO(review): original author was unsure whether this term
        # should be regularized.
        self.interaction = self.add_weight(name='interaction',
                                           shape=(1,),
                                           initializer=Constant(0.),
                                           trainable=True,
                                           regularizer=self.regularizer)

        # Saturation transcription rate. Initialized to 1 rather than 0:
        # the output is tsat times an occupancy, so tsat == 0 makes the
        # layer output identically zero and zeroes the gradient of every
        # other weight at the first update.
        # TODO(review): original author was unsure whether this term
        # should be regularized.
        self.tsat = self.add_weight(name='tsat',
                                    shape=(1,),
                                    initializer=Constant(1.),
                                    trainable=True,
                                    regularizer=self.regularizer)

        # Call superclass build
        super().build(input_shape)

    def call(self, x_lc):
        """Process layer input and return output.

        x_lc: (tensor)
            Input tensor that represents one-hot encoded sequence values.
            It is sliced along its second axis and each slice is reshaped
            to (-1, L_site, C), i.e. the second axis is treated as the
            flattened (position, channel) encoding.

        Returns
        -------
        t: (tensor)
            Transcription rate, one value per input row (shape (-1, 1)).
        """

        # Column ranges of the two binding sites in the flattened encoding
        crp_lo = self.crp_start * self.C
        crp_hi = crp_lo + self.L_crp * self.C
        rnap_lo = self.rnap_start * self.C
        rnap_hi = rnap_lo + self.L_rnap * self.C

        # Extract each binding site and restore its (position, channel) axes
        x_crp_lc = tf.reshape(x_lc[:, crp_lo:crp_hi],
                              [-1, self.L_crp, self.C])
        x_rnap_lc = tf.reshape(x_lc[:, rnap_lo:rnap_hi],
                               [-1, self.L_rnap, self.C])

        # Energy of CRP binding: chemical potential + additive matrix score
        phi_crp = self.mu_crp + tf.reshape(
            K.sum(self.theta_crp_lc * x_crp_lc, axis=[1, 2]),
            shape=[-1, 1])

        # Energy of RNAP binding: chemical potential + additive matrix score
        phi_rnap = self.mu_rnap + tf.reshape(
            K.sum(self.theta_rnap_lc * x_rnap_lc, axis=[1, 2]),
            shape=[-1, 1])

        # Boltzmann weights of the three non-empty promoter states
        # (the empty state has weight 1)
        w_crp = K.exp(-phi_crp)
        w_rnap = K.exp(-phi_rnap)
        w_both = K.exp(-phi_crp - phi_rnap - self.interaction)

        # Rate of transcription: tsat times the probability that RNAP is
        # bound. Only the RNAP-bound states (RNAP alone, CRP + RNAP) belong
        # in the numerator; the CRP-only state contributes to the partition
        # function but produces no transcript.
        t = self.tsat * (w_rnap + w_both) / (1 + w_crp + w_rnap + w_both)

        # return rate of transcription
        return t


In [None]:
# Keyword arguments for the custom G-P map; the keys match the constructor
# arguments of ThermodynamicLayer (L_crp, L_rnap, C, regularizer).
gpmap_kwargs = dict(L_crp=26, L_rnap=41, C=4, regularizer=0.005)

# Build an MPA regression model on the DNA alphabet that uses
# ThermodynamicLayer as its custom G-P map.
model = mavenn.Model(
    L=L,
    Y=Y,
    alphabet='dna',
    regression_type='MPA',
    gpmap_type='custom',
    gpmap_kwargs=gpmap_kwargs,
    custom_gpmap=ThermodynamicLayer,
)

In [None]:
# Hand the sequences and bin counts to the model; rows whose 'set' column
# equals 'validation' are flagged as validation data.
is_validation = data_df['set'] == 'validation'
model.set_data(
    x=data_df['x'],
    y=data_df[y_cols],
    validation_flags=is_validation,
    shuffle=True,
)

# Optimisation settings, collected in one place so they are easy to tune
fit_settings = dict(
    learning_rate=.0005,
    epochs=2000,
    batch_size=200,
    early_stopping=True,
    early_stopping_patience=25,
    linear_initialization=True,
)

# Fit model to data
model.fit(**fit_settings)


N = 40,249 observations set as training data.
Using 24.7% for validation.
Data shuffled.
Time to set data: 0.608 sec.
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000


Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
Epoch 73/2000
Epoch 74/2000
Epoch 75/2000
Epoch 76/2000
Epoch 77/2000
Epoch 78/2000
Epoch 79/2000
Epoch 80/2000
Epoch 81/2000
Epoch 82/2000
Epoch 83/2000
Epoch 84/2000
Epoch 85/2000
Epoch 86/2000
Epoch 87/2000
Epoch 88/2000
Epoch 89/2000
Epoch 90/2000
Epoch 91/2000
Epoch 92/2000
Epoch 93/2000
Epoch 94/2000
Epoch 95/2000
Epoch 96/2000
Epoch 97/2000
Epoch 98/2000
Epoch 99/2000
Epoch 100/2000
Epoch 101/2000
Epoch 102/2000
Epoch 103/2000
Epoch 104/2000
Epoch 105/2000
Epoch 106/2000
Epoch 107/2000
Epoch 108/2000
Epoch 109/2000
Epoch 110/2000
Epoch 111/2000
Epoch 112/2000
Epoch 113/2000
Epoch 114/2000


Epoch 115/2000
Epoch 116/2000
Epoch 117/2000
Epoch 118/2000
Epoch 119/2000
Epoch 120/2000
Epoch 121/2000
Epoch 122/2000
Epoch 123/2000
Epoch 124/2000
Epoch 125/2000
Epoch 126/2000
Epoch 127/2000
Epoch 128/2000
Epoch 129/2000

In [None]:
# Evaluate the fitted model on the held-out test set
print('On test data:')
x_test = test_df['x'].values
y_test = test_df[y_cols].values

# Compute likelihood information
I_var, dI_var = model.I_variational(x=x_test, y=y_test)
print(f'I_var_test: {I_var:.3f} +- {dI_var:.3f} bits')

# Compute predictive information
I_pred, dI_pred = model.I_predictive(x=x_test, y=y_test)
print(f'I_pred_test: {I_pred:.3f} +- {dI_pred:.3f} bits')

# Show training history: per-epoch I_var on the training and validation sets
I_var_hist = model.history['I_var']
val_I_var_hist = model.history['val_I_var']

fig, ax = plt.subplots(1, 1, figsize=[4, 4])
ax.plot(I_var_hist, label='I_var_train')
ax.plot(val_I_var_hist, label='I_var_val')
ax.legend()
ax.set_xlabel('epochs')
ax.set_ylabel('bits')
# Trailing semicolon keeps the notebook from echoing the Text object repr
ax.set_title('training history');

In [None]:
# Fetch the weights of the network's third layer once and index into them.
# NOTE(review): assumes layers[2] is the ThermodynamicLayer and that its
# weights come back in creation order (mu_crp, mu_rnap, theta_crp_lc,
# theta_rnap_lc, interaction, tsat) - confirm.
gpmap_weights = model.get_nn().layers[2].get_weights()

# Drop the leading length-1 axis of each weight matrix
crp_weights = gpmap_weights[2][0]
rnap_weights = gpmap_weights[3][0]

# Rescale the interaction weight by 1/1.62 before reporting it
# (presumably a k_B*T -> kcal/mol conversion; verify the temperature assumed)
interaction_term = gpmap_weights[4] / 1.62
print(f'interaction term = {interaction_term[0]:.3f} k_cal/mol')

In [None]:
# Wrap both weight matrices in DataFrames whose columns are labelled with
# the characters of model.alphabet.
crp_df, rnap_df = (
    pd.DataFrame(weights, columns=model.alphabet)
    for weights in (crp_weights, rnap_weights)
)

In [None]:
# Create grid in phi space
phi_lim = [-4, 4]
phi_grid = np.linspace(phi_lim[0], phi_lim[1], 1000)

# Create array of allowable y values
Y = model.model.Y    # Y = number of bins
y_lim = [-.5, Y-.5]
y_all = range(Y)

# Compute matrix of p(y|phi) values
measurement_process = model.p_of_y_given_phi(y_all, phi_grid)

# Create figure with three panels
fig, axs = plt.subplots(1, 3, figsize=[18, 4])

# Left and middle panels: logos of the negated CRP and RNAP weight tables
logomaker.Logo(-crp_df, ax=axs[0], center_values=True)
logomaker.Logo(-rnap_df, ax=axs[1], center_values=True)

# Right panel: draw measurement process as heatmap
ax = axs[2]
im = ax.imshow(measurement_process,
               cmap='Greens',
               extent=phi_lim+y_lim,
               vmin=0,
               origin='lower',
               interpolation='nearest',
               aspect="auto")
ax.set_yticks(y_all)
ax.set_ylabel('bin number (y)')
# Raw strings: '\p' is not a valid escape sequence in a normal string literal
ax.set_xlabel(r'latent phenotype ($\phi$)')
ax.set_title('measurement process')
# Attach the colorbar explicitly to the heatmap axes instead of relying on
# pyplot's notion of the current axes
cb = fig.colorbar(im, ax=ax)
cb.set_label(r'probability  $p(y|\phi)$', rotation=-90, va="bottom")

# Fix up plot
fig.tight_layout(w_pad=3)
# bbox_inches (not 'bbox_index') is the savefig keyword that trims the
# saved figure to its contents
fig.savefig('thermodynmic_custom_gpmap_mpa_visualization_sort_seq.png',
            bbox_inches='tight', dpi=300)