This notebook reproduces results in the length tests portion of the manuscript.

In [None]:
import sys
# Put the local 'main_classes/' directory on the module search path (at index 1)
# before importing the wrapper - presumably BioSeqAutoML_wrapper lives there.
sys.path.insert(1, 'main_classes/')

from BioSeqAutoML_wrapper import run_bioseqml
import shutil
import os

# One-time setup: create the top-level directory where these results can live.
# Leave this commented out if './final_exemplars/' already exists, since
# os.mkdir raises FileExistsError when the directory is already present.
#os.mkdir('./final_exemplars/')

# General Parameters (can be changed)

In [1]:
# ### input arguments 
#     task : str, one of 'binary_classification', 'multiclass_classification', 'regression'
#     data_folder : str representing folder where data is stored
#     data_file : str representing file name where data is stored
#     sequence_type : str, either 'nucleic_acid', 'peptide', or 'glycan'
#     model_folder : str representing folder where models are to be stored
#     output_folder : str representing folder where output is to be stored
#     automl_search_techniques : str representing which AutoML search technique should be performed, one of 'all', 'deepswarm', 'autokeras', 'tpot'
#     do_backup : bool representing if a backup should be performed
#     max_runtime_minutes : int representing max runtime for model search in minutes
#     num_folds : int representing num folds
#     verbosity : int representing 0=not verbose, 1=verbose
#     do_auto_bin : bool representing if target values should be automatically binned
#     bin_threshold : float representing threshold for positive and negative classes
#     do_transform : bool representing if target values should be transformed
#     input_col : str representing input column name where sequences can be located
#     target_col : str representing target column name where target values can be located
#     pad_seqs : str indicating pad_seqs method, either 'max', 'min', 'average'
#     augment_data : str, either 'none', 'complement', 'reverse_complement', or 'both_complements'
#     dataset_robustness : bool indicating if data ablation study should be performed
#     num_final_epochs : int representing number of final epochs to train final deepswarm model
#     yaml_params : dict of extra deepswarm parameters, with keys 'max_depth' (int), 'ant_count' (int), 'epochs' (int)
#     num_generations : int representing number of generations of tpot search
#     population_size : int representing population size of tpot search
#     run_interpretation : bool indicating if interpretation module should be executed
#     interpret_params : dict of extra interpretation parameters, with keys 'sample_number_class_activation_maps' (int), 'class_activation_grad_modifier' (str), 'class_activation_layer_index' (int);
#         'sample_number_saliency_maps' (int), 'saliency_map_grad_modifier' (str), 'saliency_map_layer_index' (int), 'sample_number_mutagenesis' (int)
#     run_design : bool indicating if design module should be executed
#     design_params : dict of extra design parameters, with keys 'k' (int), 'substitution_type' (str), 'target_y' (float), 'class_of_interest' (int), 'constraint_file_path' (str);
#         'de_novo_num_seqs_to_test' (int), 'storm_num_seqs_to_test' (int), 'num_of_optimization_rounds' (int)
# ###

In [None]:
# Search parameters shared by every run in this notebook.
# Specify them here if they should differ from the wrapper defaults.
max_runtime_minutes = 180 # time in minutes to give to each implemented AutoML algorithm
num_folds = 3 # recommend 3 - 5 folds for robustness

# Every run_bioseqml call in this notebook passes do_backup=do_backup, so the
# name has to exist before any run cell executes; without this assignment the
# first run fails with NameError.
# NOTE(review): False (no backup) is an assumption - confirm it matches the
# setting used for the manuscript runs.
do_backup = False

# DeepSwarm search settings (original note: "for domain max, reduced num").
yaml_params = {'ant_count': 2, 'max_depth': 3, 'epochs': 5}

# ---------------------
# Glycans - IMMUNOGENIC
# ---------------------

# Glycans - Max Pad

In [None]:
# Immunogenic glycans, binary classification, sequences padded with the 'max' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/immunogenic_glycans_max/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_max/'
os.makedirs(run_dir)

pad_seqs = 'max' # pads seqs to max length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)


# Glycans - Min Pad

In [None]:
# Immunogenic glycans, binary classification, sequences padded with the 'min' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/immunogenic_glycans_min/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_min/'
os.makedirs(run_dir)

pad_seqs = 'min' # pads seqs to min length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)


# Glycans - Avg Pad

In [None]:
# Immunogenic glycans, binary classification, sequences padded with the 'average' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/immunogenic_glycans_avg/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/immunogenic_glycans_avg/'
os.makedirs(run_dir)

pad_seqs = 'average' # pads seqs to avg length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/clean/'
data_file = 'immunogenic_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)


# ---------------------
# Glycans - DOMAIN CLASS
# ---------------------

# Glycans - Max Pad

In [None]:
# Domain glycans, multiclass classification, sequences padded with the 'max' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/domain_glycans_max/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/domain_glycans_max/'
os.makedirs(run_dir)

pad_seqs = 'max' # pads seqs to max length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)


# Glycans - Min Pad

In [None]:
# Domain glycans, multiclass classification, sequences padded with the 'min' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/domain_glycans_min/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/domain_glycans_min/'
os.makedirs(run_dir)

pad_seqs = 'min' # pads seqs to min length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)


# Glycans - Avg Pad

In [None]:
# Domain glycans, multiclass classification, sequences padded with the 'average' method.
#
# All artifacts of this run live under run_dir. os.makedirs is used instead of
# os.mkdir so that the parent folders ('./final_exemplars/' and 'S5_length_tests/')
# are created as well when they do not exist yet; os.mkdir would raise
# FileNotFoundError in that case. An already-existing run directory still raises
# FileExistsError, so earlier results are not silently mixed with new ones -
# uncomment the rmtree line below to wipe the directory and start over.
#shutil.rmtree('./final_exemplars/S5_length_tests/domain_glycans_avg/', ignore_errors = True)
run_dir = './final_exemplars/S5_length_tests/domain_glycans_avg/'
os.makedirs(run_dir)

pad_seqs = 'average' # pads seqs to avg length

# Input data: file location and the columns holding sequences / target values.
data_folder = './clean_data/multiclass/'
data_file = 'domain_glycans.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'glycan'

task = 'multiclass_classification' # binary_classification, multiclass_classification, regression

# Models and outputs are kept in sibling sub-folders of the run directory.
model_folder = run_dir + 'models/'
output_folder = run_dir + 'outputs/'

# num_folds, do_backup, max_runtime_minutes and yaml_params are the notebook-wide
# settings; they must already be defined when this cell runs.
run_bioseqml(
    task,
    data_folder,
    data_file,
    sequence_type,
    model_folder,
    output_folder,
    num_folds=num_folds,
    do_backup=do_backup,
    input_col=input_col,
    target_col=target_col,
    pad_seqs=pad_seqs,
    max_runtime_minutes=max_runtime_minutes,
    yaml_params=yaml_params,
)
