Please note that to run a different dataset, or an experiment with different parameters, you should restart the kernel and re-run the import cell and the general parameter definition cell. This is necessary because graph operations need to be reset between experiments.

In [None]:
import sys
sys.path.insert(1, './main_classes/')
from CAML_wrapper import run_bioseqml

import shutil
import os

# Make the directory where these results can live.
# makedirs(exist_ok=True) creates any missing parent folder and is a no-op when
# the directory already exists, so this cell is safe to re-run after a kernel restart.
os.makedirs('./final_exemplars/S4_augmentation_tests/', exist_ok=True)

# General Parameters (can be changed)

In [None]:
# Parameters for the actual search (example values); the experiment cells
# in this notebook pass these to run_bioseqml.
max_runtime_minutes = 30  # time in minutes to give to each implemented AutoML algorithm
num_folds = 3  # recommend 3 - 5 folds for robustness
verbosity = 0
do_backup = False  # make a backup of your outputs and models

# Required only for binary_classification.
# If do_auto_bin is True the binary threshold is set to None so that it is
# generated automatically; otherwise the fixed threshold 0.75 is used.
do_auto_bin = True
bin_threshold = None if do_auto_bin else 0.75

# Optional add-ons
pad_seqs = 'max'  # pads seqs to max length; options include 'max', 'min', 'average'
# Augmentation is available for nucleic acids. The values used by the cells in
# this notebook are: 'none', 'complement', 'reverse_complement', 'both_complements'.
augment_data = 'none'
dataset_robustness = False  # try with different fractions of your dataset, may add time

# Interpretation options
run_interpretation = False
interpret_params = None

# Design options
run_design = False
design_params = None

# Remaining search settings, passed positionally to run_bioseqml by each cell
num_final_epochs = 50
yaml_params = {'ant_count': 4, 'max_depth': 3, 'epochs': 5}
num_generations = 50
population_size = 50

# ---------------------
# RBS
# ---------------------

# RBS - Small No Aug

In [None]:
# Experiment: RBS small training set, no augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/rbs_small_no_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_smalltrain.csv'  # just 2000 for the purposes of this experiment
input_col = 'seq'
target_col = 'out'
sequence_type = 'nucleic_acid'

augment_data = 'none'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# RBS - Small Comp Aug

In [None]:
# Experiment: RBS small training set, complement augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/rbs_small_comp_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_smalltrain.csv'  # just 2000
input_col = 'seq'
target_col = 'out'
sequence_type = 'nucleic_acid'

augment_data = 'complement'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# RBS - Small Rev Comp Aug

In [None]:
# Experiment: RBS small training set, reverse-complement augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/rbs_small_rev_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_smalltrain.csv'  # just 2000
input_col = 'seq'
target_col = 'out'
sequence_type = 'nucleic_acid'

augment_data = 'reverse_complement'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# RBS - Small Both Aug

In [None]:
# Experiment: RBS small training set, both-complements augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/rbs_small_both_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_smalltrain.csv'  # just 2000
input_col = 'seq'
target_col = 'out'
sequence_type = 'nucleic_acid'

augment_data = 'both_complements'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# ---------------------
# TOEHOLDS
# ---------------------

# Toeholds - Small No Aug

In [None]:
# Experiment: toeholds small dataset, no augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/toeholds_small_no_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'toeholds_small.csv'  # just 2000 for the purposes of this experiment
input_col = 'seq'
target_col = 'ON'
sequence_type = 'nucleic_acid'

# NOTE: defined here but not passed to the run_bioseqml call below.
constraint_file_path = 'clean_data/toehold_constraints.xlsx'

augment_data = 'none'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# Toeholds - Small Comp Aug

In [None]:
# Experiment: toeholds small dataset, complement augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/toeholds_small_comp_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'toeholds_small.csv'  # just 2000
input_col = 'seq'
target_col = 'ON'
sequence_type = 'nucleic_acid'

augment_data = 'complement'
# NOTE: defined here but not passed to the run_bioseqml call below.
constraint_file_path = 'clean_data/toehold_constraints.xlsx'

task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# Toeholds - Small Rev Comp Aug

In [None]:
# Experiment: toeholds small dataset, reverse-complement augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/toeholds_small_rev_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'toeholds_small.csv'  # just 2000
input_col = 'seq'
target_col = 'ON'
sequence_type = 'nucleic_acid'

augment_data = 'reverse_complement'
# NOTE: defined here but not passed to the run_bioseqml call below.
constraint_file_path = 'clean_data/toehold_constraints.xlsx'

task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)


# Toeholds - Small Both Aug

In [None]:
# Experiment: toeholds small dataset, both-complements augmentation.
# exist_ok=True lets this cell be re-run without raising FileExistsError,
# and makedirs also creates any missing parent folder.
experiment_dir = './final_exemplars/S4_augmentation_tests/toeholds_small_both_aug/'
os.makedirs(experiment_dir, exist_ok=True)

# Input data
data_folder = './clean_data/clean/'
data_file = 'toeholds_small.csv'  # just 2000
input_col = 'seq'
target_col = 'ON'
sequence_type = 'nucleic_acid'

# NOTE: defined here but not passed to the run_bioseqml call below.
constraint_file_path = 'clean_data/toehold_constraints.xlsx'

augment_data = 'both_complements'
task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Target folders for saving models and results.
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

# Positional argument order is unchanged from the original single-line call.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params, num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)
