Please note that to run different datasets, you should restart the kernel and run the import statement and general parameter definition blocks again. This is due to graph operations needing to be reset between every experiment.

In [1]:
import sys
sys.path.insert(1, 'main_classes/')

from BioSeqAutoML_wrapper import run_bioseqml
import shutil
import os

# make the directory where these results will live - skip this if the directory already exists
#os.mkdir('./final_exemplars/')

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# General Parameters (can be changed)

In [2]:
# ### input arguments 
#     task : str, one of 'binary_classification', 'multiclass_classification', 'regression'
#     data_folder : str representing folder where data is stored
#     data_file : str representing file name where data is stored
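#     sequence_type : str, either 'nucleic_acid', 'peptide', or 'glycan' (NOTE: the peptide/protein examples in this notebook pass 'protein' - confirm which value the wrapper accepts)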
#     model_folder : str representing folder where models are to be stored
#     output_folder : str representing folder where output is to be stored
#     automl_search_techniques : str representing which AutoML search technique should be performed, one of 'all', 'deepswarm', 'autokeras', 'tpot'
#     do_backup : bool representing if a backup should be performed
#     max_runtime_minutes : int representing max runtime for model search in minutes
#     num_folds : int representing num folds
#     verbosity : int representing 0=not verbose, 1=verbose
#     do_auto_bin : bool representing if target values should be automatically binned
#     bin_threshold : float representing threshold for positive and negative classes
#     do_transform : bool representing if target values should be transformed
#     input_col : str representing input column name where sequences can be located
#     target_col : str representing target column name where target values can be located
#     pad_seqs : str indicating pad_seqs method, either 'max', 'min', 'average'
#     augment_data : str, either 'none', 'complement', 'reverse_complement', or 'both_complements'
#     dataset_robustness : bool indicating if data ablation study should be performed
#     num_final_epochs : int representing number of final epochs to train final deepswarm model
#     yaml_params : dict of extra deepswarm parameters, with keys 'max_depth' (int), 'ant_count' (int), 'epochs' (int)
#     num_generations : int representing number of generations of tpot search
#     population_size : int representing population size of tpot search
#     run_interpretation : bool indicating if interpretation module should be executed
#     interpret_params : dict of extra interpretation parameters, with keys 'sample_number_class_activation_maps' (int), 'class_activation_grad_modifier' (str), 'class_activation_layer_index' (int);
#         'sample_number_saliency_maps' (int), 'saliency_map_grad_modifier' (str), 'saliency_map_layer_index' (int), 'sample_number_mutagenesis' (int)
#     run_design : bool indicating if design module should be executed
#     design_params : dict of extra design parameters, with keys 'k' (int), 'substitution_type' (str), 'target_y' (float), 'class_of_interest' (int), 'constraint_file_path' (str);
#         'de_novo_num_seqs_to_test' (int), 'storm_num_seqs_to_test' (int), 'num_of_optimization_rounds' (int)
# ###

In [3]:
# Search settings shared by the experiments below (set here when they differ from the defaults).
do_backup = False
num_folds = 3  # recommend 3 - 5 folds for robustness

# Optional modules: interpretation and sequence design.
run_interpretation = True
run_design = True

# Extra settings handed to the design module through `design_params`.
design_params = dict(
    k=3,
    substitution_type='random',
    target_y=1,
    class_of_interest=0,  # must be zero if task is regression
    constraint_file_path='',
    de_novo_num_seqs_to_test=100,
    storm_num_seqs_to_test=5,
    num_of_optimization_rounds=5,
)

# RBS - Hollerer et al.

In [None]:
# Optional housekeeping, left disabled: clear old results / create the results directory.
# NOTE(review): the rmtree line targets 'rbs/' while this cell writes to 'rbs_fullset/' - confirm before enabling.
#shutil.rmtree('./final_exemplars/rbs/', ignore_errors = True)
#os.mkdir('./final_exemplars/rbs_fullset/')

# Task and sequence alphabet for this experiment.
task = 'regression' # binary_classification, multiclass_classification, regression
sequence_type = 'nucleic_acid'

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_mediumtrain.csv'
input_col = 'seq'
target_col = 'out'

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/rbs_fullset/models/'
output_folder = './final_exemplars/rbs_fullset/outputs/'

# num_folds, do_backup, run_interpretation, run_design and design_params come from
# the general-parameter cell, which must have been run first.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)
            

# Synthetic Nucleic Acids Dataset

In [None]:
# Optional housekeeping, left disabled: wipe results from a previous run.
#shutil.rmtree('./final_exemplars/synthetic_nucleic_acids/', ignore_errors = True)

# Create the results directory. makedirs(..., exist_ok=True) also creates the parent
# './final_exemplars/' when it is missing and does not raise when the cell is re-run,
# whereas os.mkdir fails in both situations.
os.makedirs('./final_exemplars/synthetic_nucleic_acids/', exist_ok=True)

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'large_synthetic.csv'
input_col = 'seq'
target_col = 'positive_score'
sequence_type = 'nucleic_acid'

task = 'regression' # binary_classification, multiclass_classification, regression

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/synthetic_nucleic_acids/models/'
output_folder = './final_exemplars/synthetic_nucleic_acids/outputs/'

# num_folds, do_backup, run_interpretation, run_design and design_params come from
# the general-parameter cell, which must have been run first.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)


In [4]:
# Small smoke test: TPOT-only search on the small synthetic dataset with the
# optional add-ons switched off.
# NOTE: these assignments overwrite the notebook-level run_interpretation / run_design
# values, so re-run the general-parameter cell before running another dataset.
dataset_robustness = False
run_interpretation = False
run_design = False

# Optional housekeeping, left disabled. Both lines refer to this cell's own results
# directory so that enabling them cannot delete the full synthetic run's results.
#shutil.rmtree('./final_exemplars/small_synthetic_nucleic_acids/', ignore_errors = True)
#os.mkdir('./final_exemplars/small_synthetic_nucleic_acids/')

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'small_synthetic.csv'
input_col = 'seq'
target_col = 'positive_score'
sequence_type = 'nucleic_acid'

task = 'regression' # binary_classification, multiclass_classification, regression

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/small_synthetic_nucleic_acids/models/'
output_folder = './final_exemplars/small_synthetic_nucleic_acids/outputs/'

# dataset_robustness is forwarded explicitly so the value set above takes effect
# instead of being silently ignored in favour of the wrapper's default.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    automl_search_techniques='tpot',
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)


Verbosity set to 0. For more display items, set verbosity to 1.
#################################################################################################
#######################               RUNNING REGRESSION                  #######################
#################################################################################################

#################################################################################################
##############################            RUNNING TPOT                ###########################
#################################################################################################
Conducting architecture search now...
Testing scrambled control now...
Fitting final model now...
BioSeq-AutoML has concluded.


# Peptides

In [None]:
# Optional housekeeping, left disabled: clear old results / create the results directory.
#shutil.rmtree('./final_exemplars/peptides/', ignore_errors = True)
#os.mkdir('./final_exemplars/peptides/')

# Task and sequence alphabet for this experiment.
# NOTE(review): the parameter list near the top of this notebook documents 'peptide'
# as the sequence_type value, while this cell passes 'protein' - confirm.
task = 'regression' # binary_classification, multiclass_classification, regression
sequence_type = 'protein'

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'peptides.csv'
input_col = 'seq'
target_col = 'target'

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/peptides/models/'
output_folder = './final_exemplars/peptides/outputs/'

# num_folds, do_backup, run_interpretation, run_design and design_params come from
# the general-parameter cell, which must have been run first.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)


# Toeholds

In [None]:
# Optional housekeeping, left disabled: clear old results / create the results directory.
#shutil.rmtree('./final_exemplars/toeholds/', ignore_errors = True)
#os.mkdir('./final_exemplars/toeholds/')

# Task and sequence alphabet for this experiment.
task = 'regression' # binary_classification, multiclass_classification, regression
sequence_type = 'nucleic_acid'

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'toeholds.csv'
input_col = 'seq'
target_col = 'ON'

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/toeholds/models/'
output_folder = './final_exemplars/toeholds/outputs/'

# Design is switched on for this dataset and replaces the notebook-level design_params:
# the substitution type is 'constrained_random' and a constraint file is supplied.
constraint_file_path = 'clean_data/toehold_constraints.xlsx'
run_design = True
design_params = dict(
    k=3,
    substitution_type='constrained_random',
    target_y=1,
    class_of_interest=0,  # must be zero if task is regression
    constraint_file_path=constraint_file_path,
    de_novo_num_seqs_to_test=100,
    storm_num_seqs_to_test=5,
    num_of_optimization_rounds=5,
)

# num_folds, do_backup and run_interpretation come from the general-parameter cell.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)


# Longer Protein Sequences - FLIP Experiment

In [None]:
# Optional housekeeping, left disabled: wipe results from a previous run.
#shutil.rmtree('./final_exemplars/flip_longer_protein/', ignore_errors = True)

# Create the results directory. makedirs(..., exist_ok=True) also creates the parent
# './final_exemplars/' when it is missing and does not raise when the cell is re-run,
# whereas os.mkdir fails in both situations.
os.makedirs('./final_exemplars/flip_longer_protein/', exist_ok=True)

# specify parameters for the actual search if different from defaults
num_folds = 3 # recommend 3 - 5 folds for robustness
do_backup = False # make a backup of your outputs and models

# optional add-ons (all switched off for this experiment)
dataset_robustness = False # try with different fractions of your dataset, may add time
run_interpretation = False
run_design = False

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'flip_protein_train.csv'
input_col = 'sequence'
target_col = 'target'
sequence_type = 'protein'
task = 'regression' # binary_classification, multiclass_classification, regression

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/flip_longer_protein/models/'
output_folder = './final_exemplars/flip_longer_protein/outputs/'

# No design_params are passed here; run_design is False for this experiment.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)

# Longer Protein Sequences - avGFP Experiment

In [4]:
# Optional housekeeping, left disabled: wipe results from a previous run.
#shutil.rmtree('./final_exemplars/avgfp/', ignore_errors = True)

# Create the results directory. makedirs(..., exist_ok=True) also creates the parent
# './final_exemplars/' when it is missing and does not raise when the cell is re-run,
# whereas os.mkdir fails in both situations.
os.makedirs('./final_exemplars/avgfp/', exist_ok=True)

# specify parameters for the actual search if different from defaults
num_folds = 3 # recommend 3 - 5 folds for robustness
do_backup = False # make a backup of your outputs and models

# optional add-ons (all switched off for this experiment)
dataset_robustness = False # try with different fractions of your dataset, may add time
run_interpretation = False
run_design = False

# Input table and the columns holding the sequences and the target values.
data_folder = './clean_data/clean/'
data_file = 'avgfp.csv'
input_col = 'sequence'
target_col = 'target'
sequence_type = 'protein'
task = 'regression' # binary_classification, multiclass_classification, regression

# Destinations for the fitted models and the generated outputs.
model_folder = './final_exemplars/avgfp/models/'
output_folder = './final_exemplars/avgfp/outputs/'

# No design_params are passed here; run_design is False for this experiment.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)

Verbosity set to 0. For more display items, set verbosity to 1.
#################################################################################################
#######################               RUNNING REGRESSION                  #######################
#################################################################################################

#################################################################################################
##############################            RUNNING DEEPSWARM           ###########################
#################################################################################################
Conducting architecture search now...
0
Testing scrambled control now...
Fitting final model now...
#################################################################################################
##############################            RUNNING AUTOKERAS           ###########################
###################################################