Please note that to run different datasets, you should restart the kernel and run the import statement and general parameter definition blocks again. This is due to graph operations needing to be reset between every experiment.

In [None]:
import os
import shutil
import sys

# Put the local wrapper directory on the module search path before importing from it.
sys.path.insert(1, './main_classes/')
from CAML_wrapper import run_bioseqml

# Make the directory where these results can live.
# exist_ok=True makes this cell safe to re-run after a kernel restart:
# an already-existing directory is left untouched instead of raising
# FileExistsError.
os.makedirs('./final_exemplars/', exist_ok=True)

# General Parameters (can be changed)

In [None]:
# Parameters for the actual search (example values)

# time in minutes to give to each implemented AutoML algorithm
max_runtime_minutes = 60
# 3 - 5 folds are recommended for robustness
num_folds = 3
verbosity = 0
# set to True to make a backup of your outputs and models
do_backup = False

# Deepswarm execution
num_final_epochs = 50
yaml_params = {
    'ant_count': 4,
    'max_depth': 3,
    'epochs': 5,
}

# TPOT execution (no need to specify either of these)
num_generations = population_size = 50

# Required only for binary_classification.
# When do_auto_bin is True the binary threshold is set to None so that it is
# generated automatically; otherwise the fixed value 0.75 is used.
do_auto_bin = True
bin_threshold = None if do_auto_bin else 0.75
    
# Optional add-ons

# sequence padding strategy; options include 'max', 'min', 'average'
# ('max' pads seqs to max length)
pad_seqs = 'max'
# available for nucleic acids: none, complement, rev_complement, both_complements
augment_data = 'none'
# try with different fractions of your dataset (may add time)
dataset_robustness = True

# Interpretation options: toggle plus the settings dictionary that is
# handed to run_bioseqml as interpret_params.
run_interpretation = True
interpret_params = {
    'sample_number_class_activation_maps': 100,
    'class_activation_grad_modifier': 'absolute',
    'class_activation_layer_index': -2,
    'sample_number_saliency_maps': 100,
    'saliency_map_grad_modifier': 'absolute',
    'saliency_map_layer_index': -1,
    'sample_number_mutagenesis': 100,
}

# Design options: toggle plus the settings dictionary that is handed to
# run_bioseqml as design_params.
run_design = True
design_params = {
    'k': 3,
    'substitution_type': 'random',
    'target_y': 1,
    # must be zero if task is regression
    'class_of_interest': 3,
    'constraint_file_path': '',
    # changed for peptides, down from 100
    'de_novo_num_seqs_to_test': 10,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 5,
}

# RBS - Hollerer et al.

In [None]:
# Create the results directory for this dataset. makedirs with exist_ok=True
# also creates the parent './final_exemplars/' if it is missing and does not
# raise when the cell is re-run.
os.makedirs('./final_exemplars/rbs/', exist_ok=True)

# Input data location and column names
data_folder = './clean_data/multiclass/'
data_file = 'rbs_medium_multiclass.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'nucleic_acid'

# one of: binary_classification, multiclass_classification, regression
task = 'multiclass_classification'

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = './final_exemplars/rbs/models/'
output_folder = './final_exemplars/rbs/outputs/'

# Positional arguments are passed in the order run_bioseqml is called with
# throughout this notebook; do not reorder them.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)
            

# Peptides

In [None]:
# Create the results directory for this dataset. makedirs with exist_ok=True
# also creates the parent './final_exemplars/' if it is missing and does not
# raise when the cell is re-run.
os.makedirs('./final_exemplars/peptides/', exist_ok=True)

# Input data location and column names
data_folder = './clean_data/multiclass/'
data_file = 'peptides_multiclass.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'protein'

# one of: binary_classification, multiclass_classification, regression
task = 'multiclass_classification'

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = './final_exemplars/peptides/models/'
output_folder = './final_exemplars/peptides/outputs/'

# Positional arguments are passed in the order run_bioseqml is called with
# throughout this notebook; do not reorder them.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# Glycan domain multi-class classification

In [None]:
# I preferred to do design & interpretation after the fact because I was
# impatient for models to run.
# Dataset robustness was not super applicable because only ~1300 samples here.
# NOTE: these assignments (and pad_seqs below) overwrite the general
# parameters and stay in effect for any later cell run in the same kernel.
dataset_robustness = False
run_interpretation = False
run_design = False

# Create the results directory for this dataset. makedirs with exist_ok=True
# also creates the parent './final_exemplars/' if it is missing and does not
# raise when the cell is re-run.
os.makedirs('./final_exemplars/glycan_domain_average/', exist_ok=True)

# Input data location and column names
data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'
# overrides the general 'max' padding setting for this dataset
pad_seqs = 'average'

# one of: binary_classification, multiclass_classification, regression
task = 'multiclass_classification'

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = './final_exemplars/glycan_domain_average/models/'
output_folder = './final_exemplars/glycan_domain_average/outputs/'

# Positional arguments are passed in the order run_bioseqml is called with
# throughout this notebook; do not reorder them.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# Toeholds

# Synthetic Nucleic Acids Dataset

In [None]:
# Create the results directory for this dataset. makedirs with exist_ok=True
# also creates the parent './final_exemplars/' if it is missing and does not
# raise when the cell is re-run.
os.makedirs('./final_exemplars/toeholds/', exist_ok=True)

# Input data location and column names
data_folder = './clean_data/multiclass/'
data_file = 'toeholds_multiclass.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'nucleic_acid'

# Constraint file referenced by design_params below
constraint_file_path = 'clean_data/toehold_constraints.xlsx'

# one of: binary_classification, multiclass_classification, regression
task = 'multiclass_classification'

# Design settings specific to this dataset: these replace the general
# design_params (constrained substitutions, a constraint file, and 100
# de novo sequences to test).
run_design = True
design_params = {
    'k': 3,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression
    'class_of_interest': 3,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 5,
}

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = './final_exemplars/toeholds/models/'
output_folder = './final_exemplars/toeholds/outputs/'

# Positional arguments are passed in the order run_bioseqml is called with
# throughout this notebook; do not reorder them.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


In [None]:
# Create the results directory for this dataset. makedirs with exist_ok=True
# also creates the parent './final_exemplars/' if it is missing and does not
# raise when the cell is re-run.
os.makedirs('./final_exemplars/synthetic_nucleic_acids/', exist_ok=True)

# Input data location and column names
data_folder = './clean_data/multiclass/'
data_file = 'large_synthetic_multiclass.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'nucleic_acid'

# one of: binary_classification, multiclass_classification, regression
task = 'multiclass_classification'

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = './final_exemplars/synthetic_nucleic_acids/models/'
output_folder = './final_exemplars/synthetic_nucleic_acids/outputs/'

# Positional arguments are passed in the order run_bioseqml is called with
# throughout this notebook; do not reorder them.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)
