Please note: to run a different dataset, restart the kernel and re-run the import cell first. This is necessary because graph operations need to be reset between experiments.

In [None]:
import sys
sys.path.insert(1, '../main_classes/')

from wrapper import run_bioautomated
import shutil
import os

# make directory where these results can live - no need to run if already have a directory
#os.mkdir('../exemplars/')

# General Parameters (can be changed)

In [None]:
# ### input arguments 
#     task : str, one of 'binary_classification', 'multiclass_classification', 'regression'
#     data_folder : str representing folder where data is stored
#     data_file : str representing file name where data is stored
#     sequence_type : str, either 'nucleic_acid', 'peptide', or 'glycan'
#     model_folder : str representing folder where models are to be stored
#     output_folder : str representing folder where output is to be stored
#     automl_search_techniques : str representing which AutoML search technique should be performed, one of 'all', 'deepswarm', 'autokeras', 'tpot'
#     do_backup : bool representing if a backup should be performed
#     max_runtime_minutes : int representing max runtime for model search in minutes
#     num_folds : int representing num folds
#     verbosity : int representing 0=not verbose, 1=verbose
#     do_auto_bin : bool representing if target values should be automatically binned
#     bin_threshold : float representing threshold for positive and negative classes
#     do_transform : bool representing if target values should be transformed
#     input_col : str representing input column name where sequences can be located
#     target_col : str representing target column name where target values can be located
#     pad_seqs : str indicating pad_seqs method, either 'max', 'min', 'average'
#     augment_data : str, either 'none', 'complement', 'reverse_complement', or 'both_complements'
#     dataset_robustness : bool indicating if data ablation study should be performed
#     num_final_epochs : int representing number of final epochs to train final deepswarm model
#     yaml_params : dict of extra deepswarm parameters, with keys 'max_depth' (int), 'ant_count' (int), 'epochs' (int)
#     num_generations : int representing number of generations of tpot search
#     population_size : int representing population size of tpot search
#     run_interpretation : bool indicating if interpretation module should be executed
#     interpret_params : dict of extra interpretation parameters, with keys 'sample_number_class_activation_maps' (int), 'class_activation_grad_modifier' (str), 'class_activation_layer_index' (int);
#         'sample_number_saliency_maps' (int), 'saliency_map_grad_modifier' (str), 'saliency_map_layer_index' (int), 'sample_number_mutagenesis' (int)
#     run_design : bool indicating if design module should be executed
#     design_params : dict of extra design parameters, with keys 'k' (int), 'substitution_type' (str), 'target_y' (float), 'class_of_interest' (int), 'constraint_file_path' (str);
#         'de_novo_num_seqs_to_test' (int), 'storm_num_seqs_to_test' (int), 'num_of_optimization_rounds' (int)
# ###

# Synthetic Nucleic Acids Dataset

In [None]:
# Exemplar run: small synthetic nucleic-acid dataset, binary classification.

# optional add-ons
dataset_robustness = False # try with different fractions of your dataset, may add time
run_interpretation = False
run_design = False

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/small_synthetic_nucleic_acids/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'small_synthetic.csv'
input_col = 'seq'
target_col = 'positive_score'
sequence_type = 'nucleic_acid'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)


In [None]:
# Exemplar run: large synthetic nucleic-acid dataset, binary classification,
# with the robustness, interpretation and design add-ons switched on.

# optional add-ons
dataset_robustness = True # try with different fractions of your dataset, may add time
run_interpretation = True
run_design = True

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/large_synthetic_nucleic_acids/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'large_synthetic.csv'
input_col = 'seq'
target_col = 'positive_score'
sequence_type = 'nucleic_acid'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)


# RBS - Hollerer et al.

In [None]:
# Exemplar run: RBS dataset ('hollerer_rbs_train.csv'), binary classification,
# with the robustness, interpretation and design add-ons switched on.

# optional add-ons
dataset_robustness = True # try with different fractions of your dataset, may add time
run_interpretation = True
run_design = True

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/rbs/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'hollerer_rbs_train.csv'
input_col = 'seq'
target_col = 'out'
sequence_type = 'nucleic_acid'
task = 'binary_classification' # binary_classification, multiclass_classification, regression

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)


# Peptides

In [None]:
# Exemplar run: peptide dataset, binary classification,
# with the robustness, interpretation and design add-ons switched on.

# optional add-ons
dataset_robustness = True # try with different fractions of your dataset, may add time
run_interpretation = True
run_design = True

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/peptides/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'classification_train_peptides.csv'
input_col = 'seq'
target_col = 'target'
# NOTE(review): the parameter notes in this file list 'peptide' (not 'protein')
# as the sequence_type option - confirm which value run_bioautomated expects.
sequence_type = 'protein'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)


# Glycans - immunogenic

In [None]:
# Exemplar run: immunogenic glycans dataset, binary classification,
# with only the dataset robustness add-on switched on.

# optional add-ons
dataset_robustness = True # try with different fractions of your dataset, may add time
run_interpretation = False
run_design = False

# glycan specific
max_runtime_minutes = 180 # max runtime for the model search, in minutes
pad_seqs = 'max' # pads seqs to max length

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/glycans_immunogenic/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
)


# Toeholds

In [None]:
# Exemplar run: toeholds dataset, binary classification, with the robustness,
# interpretation and design add-ons switched on and custom design parameters.

# optional add-ons
dataset_robustness = True # try with different fractions of your dataset, may add time
run_interpretation = True
run_design = True

# Models and outputs for this experiment are stored under one directory.
experiment_dir = '../exemplars/toeholds/'

# Uncomment to discard the results of a previous run before starting:
#shutil.rmtree(experiment_dir, ignore_errors = True)

# makedirs (rather than mkdir) also creates '../exemplars/' when it is missing,
# and exist_ok=True lets this cell be re-run without raising FileExistsError.
# Existing contents of the directory are left in place.
os.makedirs(experiment_dir, exist_ok=True)

# where the data lives and which columns hold the sequences / target values
data_folder = '../clean_data/clean/'
data_file = 'toeholds.csv'
input_col = 'seq'
target_col = 'ON'
sequence_type = 'nucleic_acid'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# extra parameters for the design module (used because run_design is True)
constraint_file_path = '../clean_data/toehold_constraints.xlsx'
design_params = {
    'k' : 3,
    'substitution_type' : 'constrained_random',
    'target_y' : 1,
    'class_of_interest' : 1, # must be zero if task is regression
    'constraint_file_path' : constraint_file_path,
    'de_novo_num_seqs_to_test' : 100,
    'storm_num_seqs_to_test' : 5,
    'num_of_optimization_rounds' : 5,
}

model_folder = experiment_dir + 'models/'
output_folder = experiment_dir + 'outputs/'

run_bioautomated(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    input_col=input_col,
    target_col=target_col,
    dataset_robustness=dataset_robustness,
    run_interpretation=run_interpretation,
    run_design=run_design,
    design_params=design_params,
)
