# NOTE: this code is normally obscured within BioSeq-AutoML
### For maximum flexibility, and so users can continue to explore the codebase, we have pulled out the relevant functionality for all interpretation module functions.

In [None]:
# import statements 
import sys
sys.path.insert(1, './main_classes/')
from BioSeqAutoML_generic_deepswarm import convert_deepswarm_input
from BioSeqAutoML_interpret_helpers import plot_rawseqlogos, get_one_bp_mismatches, get_new_mismatch_seqs
from BioSeqAutoML_generic_automl_classes import AutoMLBackend
from BioSeqAutoML_generic_automl_classes import process_glycans, checkValidity, fill, makeComplement
from BioSeqAutoML_constraints_for_design_helpers import *
from BioSeqAutoML_interpret_helpers import plot_mutagenesis, plot_rawseqlogos, plot_activation_maps, plot_saliency_maps, plot_seqlogos
from BioSeqAutoML_seqprop_helpers import *
from BioSeqAutoML_integrated_design_helpers import *

# Toeholds - binary classification example

In [None]:
# Toehold binary-classification design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'toeholds.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # keep only the first 5000 rows
# a bare `.head()` in the middle of a cell renders nothing, so show it explicitly
display(data_df.head(3))
constraint_file_path = './clean_data/toehold_constraints.xlsx'

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/toeholds/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 1,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Toeholds - regression example

In [None]:
# Toehold regression design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'toeholds.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # keep only the first 5000 rows
# a bare `.head()` in the middle of a cell renders nothing, so show it explicitly
display(data_df.head(3))
constraint_file_path = './clean_data/toehold_constraints.xlsx'

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/toeholds/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 0,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# RBS - classification example

In [None]:
# RBS binary-classification design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_train.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
# a bare `.head()` in the middle of a cell renders nothing, so show it explicitly
display(data_df.head(3))
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/rbs/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    'class_of_interest': 1,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# RBS - regression example

In [None]:
# RBS regression design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_train.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/rbs/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 0,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Peptides - classification example

In [None]:
# Peptide binary-classification design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_train_peptides.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
# a bare `.head()` in the middle of a cell renders nothing, so show it explicitly
display(data_df.head(3))
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# give inputs for paths
final_model_path = './exemplars/peptides/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    'class_of_interest': 1,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Peptides - regression example

In [None]:
# Peptide regression design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
# NOTE(review): the file name says "classification" although this cell points
# at the regression model below -- confirm this is the intended dataset
file_name = 'classification_train_peptides.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# give inputs for paths
final_model_path = './exemplars/peptides/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 0,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Synthetic nucleic acids - classification example

In [None]:
# Synthetic nucleic-acid binary-classification design run: load the data,
# convert it with convert_deepswarm_input, then hand everything to
# integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'large_synthetic.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
# a bare `.head()` in the middle of a cell renders nothing, so show it explicitly
display(data_df.head(3))
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['positive_score']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/large_synthetic_nucleic_acids/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    'class_of_interest': 1,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Synthetic nucleic acids - regression example

In [None]:
# Synthetic nucleic-acid regression design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'large_synthetic.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
data_df = data_df.iloc[0:5000, :]  # subset first 5000 so processing is easier
constraint_file_path = ''

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['positive_score']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './exemplars/large_synthetic_nucleic_acids/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'

# give inputs for design options
design_params = {
    'k': 1,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 0,
    'constraint_file_path': constraint_file_path,
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

print(numericalbool)
# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Immunogenic glycans

In [None]:
# Immunogenic-glycan binary-classification design run: load the data, convert
# it with convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'immunogenic_glycans.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
display(data_df.head(3))

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'glycan'

# give inputs for paths
final_model_path = './exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'  # named here so the call below matches the other example cells

# give inputs for design options
design_params = {
    'k': 3,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 1,
    'constraint_file_path': '',
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 3,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)


# Glycans multi-class domain

In [None]:
# Glycan multi-class (domain) design run: load the data, convert it with
# convert_deepswarm_input, then hand everything to integrated_design.

# read in data file
data_dir = './clean_data/multiclass/'
file_name = 'domain_glycans.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
display(data_df.head(3))

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = 'average'
augment_data = 'none'
sequence_type = 'glycan'

# give inputs for paths
final_model_path = './exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = output_folder + 'design/'
plot_name = '_design.png'  # named here so the call below matches the other example cells

# give inputs for design options
design_params = {
    'k': 3,
    'substitution_type': 'constrained_random',
    'target_y': 1,
    # must be zero if task is regression, recommend to be 3 if task is
    # multi-class classification with 4 classes
    'class_of_interest': 3,
    'constraint_file_path': '',
    'de_novo_num_seqs_to_test': 100,
    'storm_num_seqs_to_test': 5,
    'num_of_optimization_rounds': 5,
}

# format data inputs appropriately for autoML platform
(numerical_data_input, oh_data_input, df_data_output,
 scrambled_numerical_data_input, scrambled_oh_data_input, alph) = convert_deepswarm_input(
    df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# handle numerical data inputs: cast every output value to float; on the first
# value that cannot be cast, flag the outputs as non-numerical and stop
# (`numerical` then holds only the values converted up to that point)
numerical = []
numericalbool = True
for x in df_data_output.values:
    try:
        numerical.append(float(x))
    except (TypeError, ValueError):
        # float() reports an unconvertible value with these two errors;
        # anything else is a genuine failure and should surface
        numericalbool = False
        break

# now do design (reuse the path / name / model-type variables defined above
# rather than repeating their literal values)
integrated_design(numerical_data_input, oh_data_input, alph, numerical, numericalbool,
                  final_model_path, final_model_name, plot_path, plot_name, sequence_type,
                  model_type=model_type, design_params=design_params)