In [1]:
# Data-handling, plotting and numerical dependencies.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
# NOTE(review): machine-specific absolute path. It is placed at the front of
# sys.path so that the `import mavenn` below can resolve to this local
# checkout. On any other machine this directory presumably does not exist;
# confirm whether an installed mavenn should be used instead, or make the
# location configurable.
sys.path.insert(0, '/Users/tareen/Desktop/Research_Projects/2020_mavenn_github/mavenn_local')
import mavenn
# Print where the mavenn package was actually imported from.
print(mavenn.__path__)

['/Users/tareen/Desktop/Research_Projects/2020_mavenn_github/mavenn_local/mavenn']


In [None]:
# Load the 'amyloid' example dataset bundled with mavenn.
data_df = mavenn.load_example_dataset('amyloid')
# Show dataset size
print(f'Number of amino acid variants: {len(data_df):,d}.')
print(data_df.head())
WT_seq = 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'

# Rows that take part in fitting. 'validation' rows must be handed to
# set_data() together with the 'training' rows, because validation_flags
# (below) has to line up one-to-one with x. Building the flags from the full
# data_df while passing only 'training' rows as x gives mismatched lengths and
# leaves the model without any validation example.
i_fit = data_df['set'].isin(['training', 'validation'])

# Fitting examples (training + validation). The index is reset so that
# positional and label-based access agree on the filtered frame.
ABeta_train_df = data_df[i_fit].reset_index(drop=True)
# Held-out examples: every row that is not used during fitting. Previously
# this was "not training", which leaked validation rows into the test set.
ABeta_test_df = data_df[~i_fit].reset_index(drop=True)

x_train, y_train, dy_train = ABeta_train_df['x'], ABeta_train_df['y'], ABeta_train_df['dy']
x_test, y_test, dy_test = ABeta_test_df['x'], ABeta_test_df['y'], ABeta_test_df['dy']

# Flags marking which of the fitting rows are validation rows; same length
# and order as x_train.
i_validation = ABeta_train_df['set'] == 'validation'

# Show dataset sizes
print(f'Training set size  : {int((~i_validation).sum()):6,d} observations')
print(f'Validation set size: {int(i_validation.sum()):6,d} observations')
print(f'Test set size      : {len(x_test):6,d} observations')

# Sequence length, taken from the first fitting sequence. .iloc is positional,
# so it cannot raise KeyError when label 0 was filtered out of the frame.
L = len(x_train.iloc[0])

# Define model and set training data
model = mavenn.Model(regression_type='GE',
                     L=L,
                     alphabet='protein*',
                     gpmap_type='additive',
                     ge_nonlinearity_hidden_nodes=20,
                     ge_noise_model_type='Empirical',
                     ge_nonlinearity_monotonic=True)

model.set_data(x=x_train,
               y=y_train,
               dy=dy_train,
               validation_flags=i_validation,
               shuffle=True)

# Fit model to data
history = model.fit(learning_rate=1e-4,
                    epochs=500,
                    batch_size=64,
                    early_stopping=True,
                    early_stopping_patience=25,
                    linear_initialization=True)

In [None]:
###### 1. Compute Variational and Predictive Information ######

# Both estimators are unpacked as (value, uncertainty) and reported in bits.
print('On test data:')
I_var, dI_var = model.I_variational(x=x_test, y=y_test)
print(f'I_var_test: {I_var:.3f} +- {dI_var:.3f} bits')

# Compute predictive information
I_pred, dI_pred = model.I_predictive(x=x_test, y=y_test)
print(f'I_pred_test: {I_pred:.3f} +- {dI_pred:.3f} bits')

# Get the history of I_var for train and validation sets
I_var_hist = model.history['I_var']
val_I_var_hist = model.history['val_I_var']

# 2x2 figure; this cell fills the top-left panel and the following cells
# fill the remaining three panels through `axs`.
fig, axs = plt.subplots(2, 2, figsize=[10, 10/1.6])

# Plot the history of I_var for training and
# validation sets as functions of epochs
ax = axs[0, 0]
ax.plot(I_var_hist, label=r'I_var_train')
ax.plot(val_I_var_hist, label=r'I_var_val')
# Test-set estimates are single numbers, so they are drawn as horizontal
# reference lines across all epochs.
ax.axhline(I_var, color='C2', linestyle=':',
           label=r'I_var_test')
ax.axhline(I_pred, color='C3', linestyle=':',
           label=r'I_pred_test')
ax.set_xlabel('epochs')
ax.set_ylabel('bits')
# Title spelling fixed (was 'hisotry').
ax.set_title('(a) training history')
ax.legend()

In [None]:
###### 2. Loss functions as a function of epochs ######

# Top-right panel: per-epoch training and validation loss read from
# model.history, drawn in that order.
ax = axs[0,1]
for history_key, curve_label in (('loss', 'Training loss'),
                                 ('val_loss', 'Validation loss')):
    ax.plot(model.history[history_key], label=curve_label)
ax.set(xlabel='epochs', ylabel='Loss')
ax.legend()
ax.set_title('(b) history of loss function')

In [None]:
###### 3. Predict and plot latent phenotype values ######
######           (phi) on test data                ######

# Bottom-left panel of the figure.
ax = axs[1,0]
phi_test = model.x_to_phi(x_test)

## Set phi lims and create grid in phi space
# Pad the observed phi range by 0.5 on each side so the curve extends
# slightly beyond the data.
phi_lim = [np.min(phi_test)-.5, np.max(phi_test)+.5]
phi_grid = np.linspace(phi_lim[0], phi_lim[1], 1000)

# Compute yhat at each phi gridpoint
yhat_grid = model.phi_to_yhat(phi_grid)

# Compute 90% CI for each yhat (5th and 95th quantiles)
q = [0.05, 0.95]
yqs_grid = model.yhat_to_yq(yhat_grid, q=q)

# Illustrate measurement process with GE curve.
# Raw strings keep the LaTeX backslashes literal; '\h' and '\p' in a normal
# string are invalid escape sequences and trigger a warning.
ax.scatter(phi_test, y_test, color='C0', s=5, alpha=.2,
           rasterized=True, label='test data')
ax.plot(phi_grid, yhat_grid, linewidth=2, color='C1',
        label=r'$\hat{y} = g(\phi)$')
# Columns 0 and 1 of yqs_grid are the lower and upper quantiles in q.
ax.fill_between(phi_grid, yqs_grid[:, 0],
                yqs_grid[:, 1], alpha=0.3, color='C2',
                edgecolor='red', lw=2, linestyle='--',
                label='90% CI')
ax.set_xlim(phi_lim)
ax.set_xlabel(r'latent phenotype ($\phi$)')
ax.set_ylabel('Nucleation score($y$)')
ax.set_title('(c) measurement process')
ax.legend()


In [None]:
###### 4. Model Performance ######

# Bottom-right panel of the figure.
ax = axs[1,1]
yhat_test = model.x_to_yhat(x_test)
# Compute R^2 between yhat and y_test (squared Pearson correlation)
Rsq = np.corrcoef(yhat_test.ravel(), y_test)[0, 1]**2
# Hard-coded extent of the y = yhat reference diagonal drawn below.
xlim = [-5.5,3]
# Plot y_test vs. yhat_test
ax.scatter(yhat_test, y_test, color='C0', s=5, alpha=.2,
           rasterized=True, label='test data')
ax.plot(xlim, xlim, 'r--')
# Raw string keeps the LaTeX backslash literal ('\h' is an invalid escape).
ax.set_xlabel(r'model prediction ($\hat{y}$)')
ax.set_ylabel('measurement ($y$)')
# Fixed three decimals, matching the other metrics printed in this notebook.
ax.set_title(f'(d) performance ($R^2$={Rsq:.3f})')
ax.legend()

# Operate on `fig` explicitly: the figure was created in an earlier cell, and
# with the inline backend it is no longer pyplot's current figure, so
# plt.tight_layout() would act on a new, empty figure instead.
fig.tight_layout()
#fig.savefig('Abeta_GE_fitting.pdf')
plt.show()
# Last expression of the cell: renders the completed four-panel figure in a
# notebook even when plt.show() has nothing left to display.
fig