This notebook provides code to test all models with validation datasets (either held out test sets or external validation datasets).

In [1]:
# import statements 
import sys
sys.path.insert(1, './main_classes/')

from CAML_wrapper import run_bioseqml
from CAML_seqprop_helpers import *
from CAML_integrated_design_helpers import *
from CAML_generic_deepswarm import convert_deepswarm_input
from CAML_generic_autokeras import convert_autokeras_input

import scipy.stats as sp

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# RBS - binary classification example
# Test set is test set from the paper

In [2]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './final_exemplars/rbs_fullset/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]

# Use same cut off as was used in the initial training (do_auto_bin = True, bin_threshold = None)
medium_train = pd.read_csv(data_dir + 'hollerer_rbs_mediumtrain.csv',sep=',')
medium_cutoff_for_positive = np.median(medium_train['out'].values)
print('Cut off for positive: ' + str(medium_cutoff_for_positive))
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['out'] = targs
targs = [1 if t > medium_cutoff_for_positive else 0 for t in targs]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['out'], data_df['predicted'])
print(sp.linregress(data_df['out'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['out'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['out'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Cut off for positive: 0.15019412977510438
Number of labeled binary positives: 12747
Number of total test set: 27654
Lin regress
LinregressResult(slope=1.3864976164749958, intercept=-0.03040007416916457, rvalue=0.8855011631091553, pvalue=0.0, stderr=0.00437502618911789)
R2
0.7841123098676668
Pearson R
(0.8855011696011187, 0.0)
Spearman R
SpearmanrResult(correlation=0.841209123879085, pvalue=0.0)
auROC
0.9351482788908502


# RBS - regression model example
# Test set is test set from the paper

In [3]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './final_exemplars/rbs_fullset/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in just single col of preds
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# Use same cut off as was used in the initial training (do_auto_bin = True, bin_threshold = None)
medium_train = pd.read_csv(data_dir + 'hollerer_rbs_mediumtrain.csv',sep=',')
medium_cutoff_for_positive = np.median(medium_train['out'].values)
print('Cut off for positive: ' + str(medium_cutoff_for_positive))
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['out'] = targs
targs = [1 if t > medium_cutoff_for_positive else 0 for t in targs]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['out'], data_df['predicted'])
print(sp.linregress(data_df['out'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['out'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['out'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Cut off for positive: 0.15019412977510438
Number of labeled binary positives: 12747
Number of total test set: 27654
Lin regress
LinregressResult(slope=1.5572871300858278, intercept=-0.18586200028130317, rvalue=0.9037767746663077, pvalue=0.0, stderr=0.004434985568105475)
R2
0.8168124584262338
Pearson R
(0.9037767878480586, 0.0)
Spearman R
SpearmanrResult(correlation=0.8449002594975955, pvalue=0.0)
auROC
0.9289958665248559


# Peptides - binary classification
# External validation with held-out classification test set

In [4]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_test_peptides.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './final_exemplars/peptides/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 207
Number of total test set: 471
Lin regress
LinregressResult(slope=0.8717547644788819, intercept=-0.2699977864729496, rvalue=0.546579540753808, pvalue=4.826547887761016e-38, stderr=0.06167245311411808)
R2
0.2987491943706437
Pearson R
(0.5465795329611515, 4.826561704760958e-38)
Spearman R
SpearmanrResult(correlation=0.623677725954115, pvalue=3.9878460998564286e-52)
auROC
0.8303689064558629


# Peptides - regression model example
# External validation with held-out classification test set

In [5]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_test_peptides.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './final_exemplars/peptides/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 207
Number of total test set: 471
Lin regress
LinregressResult(slope=2.0889198101841475, intercept=-0.810672602474305, rvalue=0.6424325707252971, pvalue=3.5584611616640834e-56, stderr=0.11506163647344805)
R2
0.4127196079287139
Pearson R
(0.6424325756394311, 3.5584521632508258e-56)
Spearman R
SpearmanrResult(correlation=0.6354017143462788, pvalue=1.2652739850410366e-54)
auROC
0.84667325428195


# Toeholds - binary classification example
# Test set is additional toeholds from Valeri, Collins, Ramesh et al.

In [6]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'green_sequences_toehold_test_set.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './final_exemplars/toeholds/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]

data_df['target'] = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order

# Use same cut off as was used in the initial training (do_auto_bin = True, bin_threshold = None)
print('Number of labeled binary positives: ' + str(sum(data_df['target'])))
print('Number of total test set: ' + str(len(data_df['target'])))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 42
Number of total test set: 168
Lin regress
LinregressResult(slope=0.30525034445360233, intercept=0.35442169065814483, rvalue=0.3236949612846848, pvalue=1.868540689491163e-05, stderr=0.06925186104525981)
R2
0.10477842796109357
Pearson R
(0.32369496423219457, 1.8685403434725678e-05)
Spearman R
SpearmanrResult(correlation=0.33674065943243764, pvalue=8.077523099017082e-06)
auROC
0.7244897959183674


In [7]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'pardee_sequences_toehold_test_set.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df['seq'] = [s[0:59] for s in data_df['seq']]
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['rank']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './final_exemplars/toeholds/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]
data_df['rank'] = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order

# do r2
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['rank'], data_df['predicted'])
print(sp.linregress(data_df['rank'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['rank'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['rank'], data_df['predicted']))


Example of bad letter U: UCUUCAGCCUCCAUGUGUCAUUCUUCUCACUCUCAAGUUAUAGUUAUGAACAGAGGAGA
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Lin regress
LinregressResult(slope=1.3776955267016914e-07, intercept=-2.9492084007535176e-09, rvalue=0.1478419694647521, pvalue=0.49056295458611643, stderr=1.964921882075778e-07)
R2
0.02185724793521669
Pearson R
(0.14784196984158113, 0.49056295346826406)
Spearman R
SpearmanrResult(correlation=0.25217391304347825, pvalue=0.2345275709216195)
