Please note: before running a different dataset, restart the kernel and re-run the import cell and the general parameter definition cell. This is necessary because graph operations need to be reset between experiments.

In [None]:
import os
import shutil
import sys  # needed for sys.path below; previously used without being imported

# Put the project's helper classes on the import path before importing the wrapper.
sys.path.insert(1, './main_classes/')
from CAML_wrapper import run_bioseqml

# Make the directory where these results can live.
# makedirs(exist_ok=True) creates any missing parent folders and is a no-op when
# the directory already exists, so this cell is safe to re-run after a kernel restart.
os.makedirs('./final_exemplars/S5_length_tests/', exist_ok=True)

# General Parameters (can be changed)

In [None]:
# Shared settings for every experiment in this notebook. Each run cell below
# forwards these values to run_bioseqml; edit them here to change all runs.

# specify parameters for the actual search (example)
max_runtime_minutes = 180  # time in minutes to give to each implemented AutoML algorithm
num_folds = 3  # recommend 3 - 5 folds for robustness
verbosity = 0
do_backup = False  # make a backup of your outputs and models

# Deepswarm execution
num_final_epochs = 50
yaml_params = {'ant_count': 4, 'max_depth': 3, 'epochs': 5}

# TPOT execution
num_generations = 50  # or do hyperparameter optimization in an automated way
population_size = 50  # or do hyperparameter optimization in an automated way

# required only for binary_classification
# If do_auto_bin is True the binary threshold is set to None so that it is
# generated automatically; otherwise the fixed threshold 0.75 is used.
do_auto_bin = True
bin_threshold = None if do_auto_bin else 0.75

# Optional Add-ons
pad_seqs = 'max'  # pads seqs to max length; options include 'max', 'min', 'average'
augment_data = 'none'  # available for nucleic acids: none, complement, rev_complement, both_complements
dataset_robustness = False  # try with different fractions of your dataset, may add time

# interpretation options
run_interpretation = False
interpret_params = {}

# design options
run_design = False
design_params = {}


# ---------------------
# Glycans-  IMMUNOGENIC
# ---------------------

# Glycans - Max Pad

In [None]:
# Immunogenic glycans, binary classification, sequences padded to max length.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_max/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'max'  # pads seqs to max length

data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)

# Glycans - Min Pad

In [None]:
# Immunogenic glycans, binary classification, pad_seqs set to 'min'.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_min/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'min'  # pads seqs to min length

data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)

# Glycans - Avg Pad

In [None]:
# Immunogenic glycans, binary classification, pad_seqs set to 'average'.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_avg/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'average'  # pads seqs to average length

data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)

# ---------------------
# Glycans-  DOMAIN CLASS
# ---------------------

# Glycans - Max Pad

In [None]:
# Domain-class glycans, multiclass classification, sequences padded to max length.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/domain_glycans_max/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'max'  # pads seqs to max length

data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)

# Glycans - Min Pad

In [None]:
# Domain-class glycans, multiclass classification, pad_seqs set to 'min'.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/domain_glycans_min/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'min'  # pads seqs to min length

data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)

# Glycans - Avg Pad

In [None]:
# Domain-class glycans, multiclass classification, pad_seqs set to 'average'.
# Everything this run writes lives under a single run directory.
run_dir = './final_exemplars/S5_length_tests/domain_glycans_avg/'
# exist_ok=True lets the cell be re-run; missing parent folders are created too.
os.makedirs(run_dir, exist_ok=True)

pad_seqs = 'average'  # pads seqs to average length

data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification'  # binary_classification, multiclass_classification, regression

# Specify target folders for saving models and results
# Generic here - will add the tags specifying classification/regression
# as well as specific for the AutoML tool being used (i.e. /tpot/)
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# Positional arguments are kept in the exact order run_bioseqml was originally called with.
run_bioseqml(
    task, data_folder, data_file, sequence_type,
    model_folder, output_folder,
    max_runtime_minutes, num_folds, verbosity, do_backup,
    do_auto_bin, bin_threshold,
    input_col, target_col,
    pad_seqs, augment_data, dataset_robustness,
    num_final_epochs, yaml_params,
    num_generations, population_size,
    run_interpretation=run_interpretation,
    interpret_params=interpret_params,
    run_design=run_design,
    design_params=design_params,
)