# NOTE: this code is normally obscured within BioSeq-AutoML
### For maximum flexibility, and so that users can continue to explore the codebase, we have pulled out the relevant functionality for all interpretation module functions.

In [1]:
# import statements 
import sys
sys.path.insert(1, './main_classes/')
from BioSeqAutoML_generic_deepswarm import convert_deepswarm_input
from BioSeqAutoML_interpret_helpers import plot_rawseqlogos, get_one_bp_mismatches, get_new_mismatch_seqs
from BioSeqAutoML_generic_automl_classes import AutoMLBackend
from BioSeqAutoML_generic_automl_classes import process_glycans, checkValidity, fill, makeComplement
from BioSeqAutoML_constraints_for_design_helpers import *
from BioSeqAutoML_interpret_helpers import plot_ft_importance, plot_mutagenesis, plot_rawseqlogos, plot_activation_maps, plot_saliency_maps, plot_seqlogos


Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# Immunogenic glycans

In [2]:
# ---------------------------------------------------------------------------
# Immunogenic glycans: configuration, data loading and input encoding
# ---------------------------------------------------------------------------
# NOTE(review): `pd` and `display` are never imported by name in this notebook;
# presumably they arrive through the star import / the IPython kernel -- confirm.

# trained DeepSwarm model to interpret, and the folder the plots are written to
model_type = 'deepswarm'
final_model_name = 'deepswarm_deploy_model.h5'
final_model_path = './exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# settings handed to convert_deepswarm_input below
input_col_name = 'seq'
sequence_type = 'glycan'
pad_seqs = 'max'
augment_data = 'none'

# options dictionary handed to the saliency / activation-map / mutagenesis plotters
interpret_params = {
    'sample_number_class_activation_maps': 100,
    'class_activation_grad_modifier': 'absolute',
    'class_activation_layer_index': -2,
    'sample_number_saliency_maps': 100,
    'saliency_map_grad_modifier': 'absolute',
    'saliency_map_layer_index': -1,
    'sample_number_mutagenesis': 100,
}

# load the CSV and preview its first three rows
data_dir = './clean_data/clean/'
file_name = 'immunogenic_glycans.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
display(data_df.head(3))

# model input is the sequence column, model output is the 'target' column
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']

# format data inputs appropriately for the autoML platform
# (note that df_data_output is replaced by the value returned from the converter)
(
    numerical_data_input,
    oh_data_input,
    df_data_output,
    scrambled_numerical_data_input,
    scrambled_oh_data_input,
    alph,
) = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# Check whether the targets can be read as numbers.
# `numerical` collects each target converted with float(); at the first target
# that float() rejects, `numericalbool` is cleared and the scan stops, so
# `numerical` then only holds the values converted up to that point.
numerical = []
numericalbool = True
for target_value in df_data_output.values:
    try:
        numerical.append(float(target_value))
    except Exception:
        numericalbool = False
        break

Unnamed: 0,seq,target
0,Rha(a1-2)Rha(a1-3)Rha(b1-4)Rha,1.0
1,FucNAc(a1-3)QuiNAcNBut(b1-4)FucNAc,1.0
2,[Col(a1-2)]Gal(b1-3)[Col(a1-4)]GlcNAc(b1-4)Gal...,1.0


Confirmed: All sequence characters are in alphabet
Padding all sequences to a length of 47
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [4]:
# now do the interpretation plots
#
# Each stage below declares the model it works on *before* plotting. The
# variables final_model_path / final_model_name / plot_path / model_type are
# overwritten with the AutoKeras and then TPOT settings further down, so
# without the explicit DeepSwarm block here a second run of this cell (e.g.
# after a failure part-way through) would hand the TPOT settings to the
# DeepSwarm saliency, activation and mutagenesis plots.

# ---- DeepSwarm --------------------------------------------------------------
model_type = 'deepswarm'
final_model_path = './exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# saliency maps
print("Generating saliency maps...")
plot_name = '_saliency.png'
plot_saliency_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# class activation maps
print("Generating class activation maps...")
plot_name = '_activation.png'
plot_activation_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# in silico mutagenesis
print("Generating in silico mutagenesis plots...")
print("DeepSwarm")
plot_name = '_mutagenesis.png'  # reused by the AutoKeras and TPOT mutagenesis calls below
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- AutoKeras --------------------------------------------------------------
# The model is read from the 'models' tree while the plot goes to the 'outputs' tree.
# NOTE(review): the model path is rooted at './exemplars/' but the output folder
# at './final_exemplars/' -- confirm that the mixed roots are intended.
print("AutoKeras")
final_model_path = './exemplars/glycans_immunogenic/models/autokeras/binary_classification/'
final_model_name = 'optimized_autokeras_pipeline_classification.h5'
output_folder = './final_exemplars/glycans_immunogenic/outputs/autokeras/binary_classification/'
plot_path = output_folder + 'interpretation/'
model_type = 'autokeras'
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- TPOT -------------------------------------------------------------------
print("TPOT")
final_model_path = './final_exemplars/glycans_immunogenic/outputs/tpot/binary_classification/'
final_model_name = 'final_model_tpot_classification.pkl'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'
model_type = 'tpot'
# feature importances are written to the same interpretation folder as the mutagenesis plot
plot_ft_importance(oh_data_input, final_model_path, final_model_name, plot_path, '_feature_importances.png')
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

Generating saliency maps...
Saliency map saved to ./exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_saliency.png
Generating class activation maps...
Activation map saved to ./exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_activation.png
Generating in silico mutagenesis plots...
DeepSwarm
In silico mutagenesis plot saved to ./exemplars/glycans_immunogenic/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_mutagenesis.png
AutoKeras
In silico mutagenesis plot saved to ./final_exemplars/glycans_immunogenic/outputs/autokeras/binary_classification/interpretation/optimized_autokeras_pipeline_classification_mutagenesis.png
TPOT


NameError: name 'plot_ft_importance' is not defined

In [7]:

# Stand-alone TPOT interpretation for the immunogenic-glycan model.
# All settings the two plotting calls need are declared here, including
# plot_name: it was previously inherited from whichever cell ran last, where it
# may still be '_saliency.png' or '_activation.png' if that cell stopped early.
print("TPOT")
final_model_path = './final_exemplars/glycans_immunogenic/outputs/tpot/binary_classification/'
final_model_name = 'final_model_tpot_classification.pkl'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'
plot_name = '_mutagenesis.png'
model_type = 'tpot'
# feature importances are written to the same interpretation folder as the mutagenesis plot
plot_ft_importance(oh_data_input, final_model_path, final_model_name, plot_path, '_feature_importances.png')
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

TPOT
No feature importances can be computed from this model.
In silico mutagenesis plot saved to ./final_exemplars/glycans_immunogenic/outputs/tpot/binary_classification/interpretation/final_model_tpot_classification.pkl_mutagenesis.png


# Glycans - multi-class domain classification

In [2]:
# ---------------------------------------------------------------------------
# Glycan domain (multi-class): configuration, data loading and input encoding
# ---------------------------------------------------------------------------
# NOTE(review): `pd` and `display` are never imported by name in this notebook;
# presumably they arrive through the star import / the IPython kernel -- confirm.

# trained DeepSwarm model to interpret, and the folder the plots are written to
model_type = 'deepswarm'
final_model_name = 'deepswarm_deploy_model.h5'
final_model_path = './exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# settings handed to convert_deepswarm_input below
input_col_name = 'seq'
sequence_type = 'glycan'
pad_seqs = 'average'
augment_data = 'none'

# options dictionary handed to the saliency / activation-map / mutagenesis plotters
interpret_params = {
    'sample_number_class_activation_maps': 100,
    'class_activation_grad_modifier': 'absolute',
    'class_activation_layer_index': -2,
    'sample_number_saliency_maps': 100,
    'saliency_map_grad_modifier': 'absolute',
    'saliency_map_layer_index': -1,
    'sample_number_mutagenesis': 100,
}

# load the CSV and preview its first three rows
data_dir = './clean_data/multiclass/'
file_name = 'domain_glycans.csv'
data_df = pd.read_csv(data_dir + file_name, sep=',')
display(data_df.head(3))

# model input is the sequence column, model output is the 'target' column
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']

# format data inputs appropriately for the autoML platform
# (note that df_data_output is replaced by the value returned from the converter)
(
    numerical_data_input,
    oh_data_input,
    df_data_output,
    scrambled_numerical_data_input,
    scrambled_oh_data_input,
    alph,
) = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# Check whether the targets can be read as numbers.
# `numerical` collects each target converted with float(); at the first target
# that float() rejects, `numericalbool` is cleared and the scan stops, so
# `numerical` then only holds the values converted up to that point.
numerical = []
numericalbool = True
for target_value in df_data_output.values:
    try:
        numerical.append(float(target_value))
    except Exception:
        numericalbool = False
        break

Unnamed: 0,seq,target
0,Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc...,Eukarya
1,GlcNAc(b1-2)Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man...,Eukarya
2,GlcNAc(b1-2)Man(a1-6)[Man(a1-3)][Xyl(b1-2)]Man...,Eukarya


Example of bad letter Man: ['Man', 'a1-3', 'Man', 'a1-6', 'Xyl', 'b1-2', 'Man', 'b1-4', 'GlcNAc', 'b1-4', 'Fuc', 'a1-3', 'GlcNAc']
Truncating all sequences to a length of 11
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [3]:
# now do the interpretation plots
#
# Each stage below declares the model it works on *before* plotting. The
# variables final_model_path / final_model_name / plot_path / model_type are
# overwritten with the AutoKeras and then TPOT settings further down, so
# without the explicit DeepSwarm block here a second run of this cell would
# hand the TPOT settings to the DeepSwarm saliency, activation and
# mutagenesis plots.

# ---- DeepSwarm --------------------------------------------------------------
model_type = 'deepswarm'
final_model_path = './exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# saliency maps
print("Generating saliency maps...")
plot_name = '_saliency.png'
plot_saliency_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# class activation maps
print("Generating class activation maps...")
plot_name = '_activation.png'
plot_activation_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# in silico mutagenesis
print("Generating in silico mutagenesis plots...")
print("DeepSwarm")
plot_name = '_mutagenesis.png'  # reused by the AutoKeras and TPOT mutagenesis calls below
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- AutoKeras --------------------------------------------------------------
# The model is read from the 'models' tree while the plot goes to the 'outputs' tree.
print("AutoKeras")
final_model_path = './exemplars/glycan_domain_average/models/autokeras/multiclass_classification/'
final_model_name = 'optimized_autokeras_pipeline_classification.h5'
output_folder = './exemplars/glycan_domain_average/outputs/autokeras/multiclass_classification/'
plot_path = output_folder + 'interpretation/'
model_type = 'autokeras'
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- TPOT -------------------------------------------------------------------
print("TPOT")
final_model_path = './exemplars/glycan_domain_average/outputs/tpot/multiclass_classification/'
final_model_name = 'final_model_tpot_classification.pkl'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'
model_type = 'tpot'
# feature importances are written to the same interpretation folder as the mutagenesis plot
plot_ft_importance(oh_data_input, final_model_path, final_model_name, plot_path, '_feature_importances.png')
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

Generating saliency maps...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Saliency map saved to ./exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/interpretation/deepswarm_deploy_model_saliency.png
Generating class activation maps...
Activation map saved to ./exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/interpretation/deepswarm_deploy_model_activation.png
Generating in silico mutagenesis plots...
DeepSwarm
In silico mutagenesis plot saved to ./exemplars/glycan_domain_average/outputs/deepswarm/multiclass_classification/interpretation/deepswarm_deploy_model_mutagenesis.png
AutoKeras
In silico mutagenesis plot saved to ./exemplars/glycan_domain_average/outputs/autokeras/multiclass_classification/interpretation/optimized_autokeras_pipeline_classification_mutagenesis.png
TPOT
No feature importances can be computed from this model.
In silico mutagenesis plot sav

# Toeholds - binary classification example

In [2]:
# ---------------------------------------------------------------------------
# Toeholds (binary classification): configuration, data loading and encoding
# ---------------------------------------------------------------------------
# NOTE(review): `pd` and `display` are never imported by name in this notebook;
# presumably they arrive through the star import / the IPython kernel -- confirm.

# read in data file
data_dir = './clean_data/clean/'
file_name = 'toeholds.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
# keep only the first 5000 rows of the file
data_df = data_df.iloc[0:5000,:]
# A bare `data_df.head(3)` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is rendered), so the preview has to go
# through display(), as in the other data-loading cells of this notebook.
display(data_df.head(3))

# give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# give inputs for paths
final_model_path = './final_exemplars/toeholds/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# give inputs for interpretation options
# (this dictionary is handed to the saliency / activation-map / mutagenesis plotters)
interpret_params = {
    'sample_number_class_activation_maps': 100,
    'class_activation_grad_modifier': 'absolute',
    'class_activation_layer_index': -2,
    'sample_number_saliency_maps': 100,
    'saliency_map_grad_modifier': 'absolute',
    'saliency_map_layer_index': -1,
    'sample_number_mutagenesis': 100,
}

# format data inputs appropriately for autoML platform
# (note that df_data_output is replaced by the value returned from the converter)
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)

# Check whether the targets can be read as numbers.
# `numerical` collects each target converted with float(); at the first target
# that float() rejects, `numericalbool` is cleared and the scan stops, so
# `numerical` then only holds the values converted up to that point.
numerical = []
numericalbool = True
for target_value in df_data_output.values:
    try:
        numerical.append(float(target_value))
    except Exception:
        numericalbool = False
        break


Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [3]:
# now do the interpretation plots
#
# Each stage below declares the model it works on *before* plotting. The
# variables final_model_path / final_model_name / plot_path / model_type are
# overwritten with the AutoKeras and then TPOT settings further down, so
# without the explicit DeepSwarm block here a second run of this cell would
# hand the TPOT settings to the DeepSwarm saliency, activation and
# mutagenesis plots.

# ---- DeepSwarm --------------------------------------------------------------
model_type = 'deepswarm'
final_model_path = './final_exemplars/toeholds/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'

# saliency maps
print("Generating saliency maps...")
plot_name = '_saliency.png'
plot_saliency_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# class activation maps
print("Generating class activation maps...")
plot_name = '_activation.png'
plot_activation_maps(numerical_data_input, oh_data_input, alph, final_model_path, final_model_name, plot_path, plot_name, sequence_type, interpret_params)

# in silico mutagenesis
print("Generating in silico mutagenesis plots...")
print("DeepSwarm")
plot_name = '_mutagenesis.png'  # reused by the AutoKeras and TPOT mutagenesis calls below
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- AutoKeras --------------------------------------------------------------
# The model is read from the 'models' tree while the plot goes to the 'outputs' tree.
print("AutoKeras")
final_model_path = './final_exemplars/toeholds/models/autokeras/binary_classification/'
final_model_name = 'optimized_autokeras_pipeline_classification.h5'
output_folder = './final_exemplars/toeholds/outputs/autokeras/binary_classification/'
plot_path = output_folder + 'interpretation/'
model_type = 'autokeras'
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

# ---- TPOT -------------------------------------------------------------------
print("TPOT")
final_model_path = './final_exemplars/toeholds/outputs/tpot/binary_classification/'
final_model_name = 'final_model_tpot_classification.pkl'
output_folder = final_model_path
plot_path = final_model_path + 'interpretation/'
model_type = 'tpot'
# feature importances are written to the same interpretation folder as the mutagenesis plot
plot_ft_importance(oh_data_input, final_model_path, final_model_name, plot_path, '_feature_importances.png')
plot_mutagenesis(numerical_data_input, oh_data_input, alph, numerical, numericalbool, final_model_path, final_model_name, plot_path, plot_name, sequence_type, model_type, interpret_params)

Generating saliency maps...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Saliency map saved to ./final_exemplars/toeholds/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_saliency.png
Generating class activation maps...
Activation map saved to ./final_exemplars/toeholds/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_activation.png
Generating in silico mutagenesis plots...
DeepSwarm
In silico mutagenesis plot saved to ./final_exemplars/toeholds/outputs/deepswarm/binary_classification/interpretation/deepswarm_deploy_model_mutagenesis.png
AutoKeras
In silico mutagenesis plot saved to ./final_exemplars/toeholds/outputs/autokeras/binary_classification/interpretation/optimized_autokeras_pipeline_classification_mutagenesis.png
TPOT
In silico mutagenesis plot saved to ./final_exemplars/toeholds/outputs/tpot/binary_classification/interpretation/final_model_tpot_