In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import time
%matplotlib inline

# Insert mavenn at beginning of path: position 0 of sys.path is searched first,
# so the copy reachable from path_to_mavenn_local is found before any other
# mavenn on the path. This must run before the `import mavenn` below.
import sys
path_to_mavenn_local = '../../../../'
sys.path.insert(0, path_to_mavenn_local)

# Load mavenn and check path: the printed __path__ shows which copy was imported
import mavenn
print(mavenn.__path__)

# MAVE-NN utilities
from mavenn.src.dev import mutations_to_dataset

['../../../../mavenn']


In [2]:
# Wild-type protein sequence, determined from the wt coding DNA sequence in Snapgene:
# 'AATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACT'
# The literal is wrapped into adjacent chunks (50 residues per line, 51 on the
# last) purely for readability; Python joins them into one string.
wt_seq = (
    'NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCY'
    'GVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFT'
    'GCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPC'
    'NGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKST'
)
# Display the sequence length as the cell output
len(wt_seq)

201

In [3]:
# Load data_df from the gzip-compressed CSV one directory up
# (this cell reads the file; it does not write anything)
file_name = '../ace2rbd_data.csv.gz'
data_df = pd.read_csv(file_name, compression='gzip')

In [4]:
# Partition the rows of data_df into training and test sets.
# The 'training_set' column is used as a row mask: rows where it is set go to
# training_df, and rows selected by the inverted mask go to test_df.
# Each subset is copied so it is independent of data_df.
ix = data_df['training_set']
training_df = data_df.loc[ix].copy()
test_df = data_df.loc[~ix].copy()

In [5]:
# Subsample training data: keep each row independently with probability 0.1.
# The mask is drawn from a locally seeded generator. Previously it came from
# the unseeded global NumPy state (mavenn.set_seed(0) only runs in the next
# cell), so the retained rows -- and therefore the fitted model -- changed on
# every fresh run of the notebook.
# NOTE: this cell overwrites training_df; re-running it without first
# re-running the split cell subsamples the already-subsampled frame again.
ix = np.random.default_rng(0).random(len(training_df)) < .1
training_df = training_df[ix]

In [6]:
# Seed mavenn before the model is constructed and trained
mavenn.set_seed(0)

# Model configuration: additive G-P map over the protein alphabet, trained
# with regression_type='GE' and a Cauchy noise model whose
# heteroskedasticity order is 2. Inputs are the 'x' and 'y' columns of
# training_df.
model_kwargs = dict(
    x=training_df['x'].values,
    y=training_df['y'].values,
    theta_regularization=.1,
    eta_regularization=.1,
    alphabet='protein',
    gpmap_type='additive',
    regression_type='GE',
    ge_noise_model_type='Cauchy',
    ge_heteroskedasticity_order=2,
)
model = mavenn.Model(**model_kwargs)

# Optimizer settings for inference.
# NOTE(review): with epochs=1 the early_stopping_patience of 20 presumably
# never comes into play -- this looks like a quick smoke-test setting; confirm
# before relying on the fitted parameters.
fit_kwargs = dict(
    optimizer='Adam',
    epochs=1,
    early_stopping=True,
    early_stopping_patience=20,
    learning_rate=.0005,
    batch_size=50,
)

# Run inference and report the wall-clock time it took
start_time = time.time()
history = model.fit(**fit_kwargs)
training_time = time.time() - start_time
print(f'training time: {training_time:.1f} seconds')

training time: 3.1 seconds


In [7]:
# Save model under the file stem 'ace2rbd_model_v2' (relative to the current
# working directory), then list the directory so the written files can be
# checked by eye in the cell output.
model.save('ace2rbd_model_v2')
!ls

ace2rbd.dna                        analyze_ace2rbd.ipynb
ace2rbd_data.csv.gz                analyze_ace2rbd_v2.ipynb
ace2rbd_model.csv                  analyze_ace2rbd_v3.ipynb
ace2rbd_model.h5                   make_ace2rbd_dataset.ipynb
ace2rbd_model_v2.csv               tmp.pickle
ace2rbd_model_v2.dill              tmp.txt
ace2rbd_model_v2.h5                urn_mavedb_00000044-a-2_scores.csv
ace2rbd_model_v2.pickle


In [8]:
# Reload the model from the same stem that was passed to model.save() above,
# binding it to a new name so the saved copy can be inspected separately
new_model = mavenn.load('ace2rbd_model_v2')

In [9]:
# Display the reloaded object's repr to confirm that mavenn.load returned a model
new_model

<mavenn.src.model.Model at 0x1402621d0>