This notebook provides code to test all models with validation datasets (either held out test sets or external validation datasets).

In [1]:
# import statements 
import sys
sys.path.insert(1, './main_classes/')

from CAML_wrapper import run_bioseqml
from CAML_seqprop_helpers import *
from CAML_integrated_design_helpers import *
from CAML_generic_deepswarm import convert_deepswarm_input, print_summary
from CAML_transfer_learning_helpers import transform_classification_target, transform_regression_target, fit_final_deepswarm_model
from CAML_generic_autokeras import convert_autokeras_input
from CAML_generic_tpot import convert_tpot_input,reformat_data_traintest

import scipy.stats as sp
from keras.initializers import glorot_uniform
from keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split
import autokeras
import torch
import pickle


Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# Example 1: Transfer Learning on a DeepSwarm Model 

In [2]:
# Load DeepSwarm Model and freeze all except last two layers (randomly chose this - feel free to customize)
final_model_path = './final_exemplars/rbs_fullset/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
# get sequences with help from https://stackoverflow.com/questions/53183865/unknown-initializer-glorotuniform-when-loading-keras-model
with CustomObjectScope({'GlorotUniform': glorot_uniform(), 'BatchNormalizationV1': BatchNormalization()}): # , 'BatchNormalizationV1': BatchNormalization()
    model = tf.keras.models.load_model(final_model_path + final_model_name)
print(model.summary())
print('model is originally trainable: ' + str(model.trainable))
print('number of layers in the model: ' + str(len(model.layers)))

# set all layers except last two dense ones to be fixed
for layer_idx, layer in enumerate(model.layers):
    if layer_idx > len(model.layers) - 3:
        print(str(layer_idx) + ': ' + str(layer) + ', keeping trainable = ' + str(layer.trainable))
    else:
        layer.trainable = False
        print(str(layer_idx) + ': ' + str(layer) + ', setting trainable to ' + str(layer.trainable))


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1646684106.2548301 (InputLay (None, 17, 4, 1)          0         
_________________________________________________________________
1646684106.256098 (Conv2D)   (None, 17, 4, 64)         3200      
_________________________________________________________________
1646684106.276694 (Flatten)  (None, 4352)              0         
_________________________________________________________________
1646684106.282732 (Dense)    (None, 30)                130590    
_________________________________________________________________
1646684106.296522 (Dense)    (None, 2)                 62        
Total params: 133,852
Trainable params: 133,852
Non-trainable params: 0
_________________________________________________________________
None
model is or

In [3]:
# Transform the test set RBS data to fine-tune this model
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'
task = 'binary_classification'

numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# transform output (target) into bins for classification
transformed_output, transform_obj = transform_classification_target(df_data_output, multiclass = 'multiclass' in task)

# now, we have completed the pre-processing needed to feed our data into deepswarm
# deepswarm input: numerical_data_input
# deepswarm output: transformed_output
X = numerical_data_input
y = to_categorical(transformed_output)

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [4]:
finetune_model_name = 'fine_tune_deepswarm_deploy_model.h5'
    
print('Fitting final model now...')
num_epochs = 30 # can choose how many epochs you want
deploy_model = fit_final_deepswarm_model(model, task, num_epochs,  X, y)
        
# Save the final deploy trained model
deploy_model.save(final_model_path + finetune_model_name)
print_summary(deploy_model, final_model_path + 'fine_tune_model_topology.txt')
print(deploy_model.summary())

Fitting final model now...
Train on 24888 samples, validate on 2766 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1646684106.2548301 (InputLay (None, 17, 4, 1)          0         
_________________________________________________________________
1646684106.256098 (Conv2D)   (None, 17, 4, 64)         3200      
_________________________________________________________________
1646684106.276694 (Flatten)  (None, 4352)              0         
_________________________________________________________________
1646684106.282732 (Dense)    (None, 30)                130590    
_________________________________________________________________
1646684106.296522 (Dense)    (None, 2)                 62        
Total params: 133,852
Trainable params: 130,652
Non-trainable params: 3,200
_________________________________________________________________
None


# Example 2: Transfer Learning on an AutoKeras Model 

In [5]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_autokeras_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
transformed_output, transform_obj = transform_classification_target(df_data_output, multiclass = 'multiclass' in task)

# now, we have completed the pre-processing needed to feed our data into autokeras
# autokeras input: oh_data_input
# autokeras output: transformed_output
X = oh_data_input
y = transformed_output # don't convert to categorical for autokeras


Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [6]:
final_model_path = './final_exemplars/rbs_fullset/models/autokeras/binary_classification/'
final_model_name = 'optimized_autokeras_pipeline_classification.h5'

In [7]:
train_size = 0.85
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X, np.array(y).astype(float),train_size=train_size, test_size = 1-train_size)

clf = autokeras.utils.pickle_from_file(final_model_path+final_model_name)

evaluation = clf.evaluate(np.array(X_test_new), np.array(y_test_new))
print('Evaluation after no retraining: ', evaluation)

# retrain = False indicates that the weights should be reused and then retrained
# retrain = True indicates that the weights should be reinitialized from scratch
clf.fit(np.array(X_train_new),np.array(y_train_new), retrain=False)
evaluation = clf.evaluate(np.array(X_test_new), np.array(y_test_new))
print('Evaluation after some retraining: ', evaluation)

# can save and reload at will
autokeras.utils.pickle_to_file(clf, final_model_path + 'fine_tune_autokeras_pipeline_classification.h5')
test = autokeras.utils.pickle_from_file(final_model_path+'fine_tune_autokeras_pipeline_classification.h5')

# showing retrain = True wipes the old weights and ends up with a worse model
clf.fit(np.array(X_train_new),np.array(y_train_new), retrain=True)
evaluation = clf.evaluate(np.array(X_test_new), np.array(y_test_new))
print('Evaluation after training weights from scratch: ', evaluation)

Evaluation after no retraining:  0.9238370691732948
Evaluation after some retraining:  0.9226319595083152
Evaluation after training weights from scratch:  0.8920221740178357


# Part 3: Transfer Learning on TPOT Model

In [8]:
# read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_test_peptides.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'
task = 'regression'

numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_tpot_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
transformed_output, transform_obj = transform_regression_target(df_data_output)

X = numerical_data_input
y = transformed_output # don't convert to categorical for tpot
training_features, training_target = reformat_data_traintest(X, y)
train_size = 0.85
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(training_features, training_target, train_size=train_size, test_size = 1-train_size)

Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [9]:
# give inputs for paths
final_model_path = './final_exemplars/peptides/outputs/tpot/regression/'
final_model_name = 'final_model_tpot_regression.pkl'
output_folder = final_model_path

with open(final_model_path+final_model_name, 'rb') as file:  
    model = pickle.load(file)

In [10]:
# partial_fit transfer learning is only possible for models that support it - most do not
# see reference list of those models here: https://scikit-learn.org/0.15/modules/scaling_strategies.html#incremental-learning
try:
    model.partial_fit(X_train_new,y_train_new)
except:
    print("No partial_fit could be applied. Trying warm_start instead.")
    print("")
try:
    # Can check out the original model parameters - should see warm_start = False
    # print(model.get_params())
    preds = model.predict(X_test_new)
    
    print('Original model on new test data: ', sp.pearsonr(y_test_new, preds))
    
    print('Keys that must be manually changed in the model to allow fine-tuning on new data: ')
    for key in list(model.get_params().keys()):
        if 'warm_start' in key or 'n_estimator' in key:
            print('\t' + key)
        model.set_params(stackingestimator__estimator__warm_start = True)
        model.set_params(extratreesregressor__warm_start = True)
        model.set_params(stackingestimator__estimator__n_estimators = 1 + model.get_params()['stackingestimator__estimator__n_estimators'])
        model.set_params(extratreesregressor__n_estimators = 1 + model.get_params()['extratreesregressor__n_estimators'])
    
    # Can check out the new model parameters - should see warm_start = True and n_estimators higher 
    # n_estimators must be increased because you need to allow new estimators to be created)
    # see reference here for more information: https://stackoverflow.com/questions/42757892/how-to-use-warm-start
    # print(model.get_params())
    model.fit(X_train_new,y_train_new)
    preds = model.predict(X_test_new)    
    print('Fine-tuned model on new test data: ', sp.pearsonr(y_test_new, preds))
except Exception as e:
    print(e)
    print("No warm_start could be applied. Model is not compatible with transfer learning.")


No partial_fit could be applied. Trying warm_start instead.

Original model on new test data:  (0.9166148084159028, 3.488342859219614e-29)
Keys that must be manually changed in the model to allow fine-tuning on new data: 
	stackingestimator__estimator__n_estimators
	stackingestimator__estimator__warm_start
	extratreesregressor__n_estimators
	extratreesregressor__warm_start
Fine-tuned model on new test data:  (0.9147476942924223, 7.255469715120219e-29)
