This notebook provides code to test all trained models on validation datasets (either held-out test sets or external validation datasets).

In [1]:
# import statements
import sys
# Put ./main_classes/ on the module search path (position 1, i.e. right after the
# first entry). Presumably this is where transfer_learning_helpers lives, in which
# case this line has to run before the local import below -- confirm.
sys.path.insert(1, './main_classes/')

# Project-local helper called by every evaluation cell in this notebook.
from transfer_learning_helpers import read_in_format_data_and_pred
# sp supplies linregress / pearsonr / spearmanr for the manually computed statistics.
import scipy.stats as sp
import pandas as pd
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


Using TensorFlow backend.


# RBS - binary classification example
# Test set is test set from the paper

In [2]:
# --- Model under test ---
task = 'binary_classification'
model_type = 'deepswarm'
class_of_interest = 1  # 1 is the usual choice for binary classification
model_folder = './exemplars/rbs/models/'
output_folder = './exemplars/rbs/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_test.csv'
input_col = 'seq'
target_col = 'out'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'nucleic_acid'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-off ---
# Re-use the SAME cut-off that was applied to the original TRAIN set:
# the median of the training target column.
train_set = pd.read_csv(data_folder + 'hollerer_rbs_train.csv')
cutoff = np.median(train_set[target_col].values)
# NOTE(review): the train median is also used to threshold the predictions of this
# binary_classification model, whereas the peptides binary-classification cell
# thresholds predictions at 0.5 -- confirm this difference is intended.

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff,
    cutoff_pred = cutoff
);

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Number of labeled binary positives with cutoff of 0.12491: 13836
Number of total test set: 27654

Computing statistics now...
R2:  0.72785  with a p-val of  0.0
Pearson R:  0.85314  with a p-val of  0.0
Spearman R:  0.8184  with a p-val of  0.0
auROC:  0.91454
MCC:  0.65858


# RBS - regression model example
# Test set is test set from the paper

In [3]:
# --- Model under test ---
task = 'regression'
model_type = 'autokeras'
class_of_interest = 0  # 0 for regression
model_folder = './exemplars/rbs/models/'
output_folder = './exemplars/rbs/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'hollerer_rbs_test.csv'
input_col = 'seq'
target_col = 'out'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'nucleic_acid'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-off ---
# Re-use the SAME cut-off that was applied to the original TRAIN set:
# the median of the training target column. It is passed for both the
# true values and the predictions.
train_set = pd.read_csv(data_folder + 'hollerer_rbs_train.csv')
cutoff = np.median(train_set[target_col].values)

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff,
    cutoff_pred = cutoff
);

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives with cutoff of 0.12491: 13836
Number of total test set: 27654

Computing statistics now...
R2:  0.86654  with a p-val of  0.0
Pearson R:  0.93088  with a p-val of  0.0
Spearman R:  0.87941  with a p-val of  0.0
auROC:  0.93908
MCC:  0.72465


# Peptides - binary classification
# External validation with held-out classification test set

In [4]:
# --- Model under test ---
task = 'binary_classification'
model_type = 'deepswarm'
class_of_interest = 1  # 1 is the usual choice for binary classification
model_folder = './exemplars/peptides/models/'
output_folder = './exemplars/peptides/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'classification_test_peptides.csv'
input_col = 'seq'
target_col = 'target'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'protein'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-offs ---
# True values: 1 was chosen because the supplement states
# "In the regression datasets, the R2-to-R3 enrichment was used as a label".
cutoff_true = 1
# Predicted values: 0.5, since the predicted ys max out at 1.
cutoff_pred = 0.5

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff_true,
    cutoff_pred = cutoff_pred
);

Example of bad letter X: JJHKPQAKSYXPYRILDYJJ
Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Number of labeled binary positives with cutoff of 1: 207
Number of total test set: 471

Computing statistics now...
R2:  0.31839  with a p-val of  0.0
Pearson R:  0.56426  with a p-val of  0.0
Spearman R:  0.66286  with a p-val of  0.0
auROC:  0.86909
MCC:  0.54779


# Peptides - regression model example
# External validation with held-out classification test set

In [5]:
# --- Model under test ---
task = 'regression'
model_type = 'tpot'
class_of_interest = 0  # 0 for regression
model_folder = './exemplars/peptides/models/'
output_folder = './exemplars/peptides/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'classification_test_peptides.csv'
input_col = 'seq'
target_col = 'target'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'protein'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-offs ---
# 1 was chosen because the supplement states
# "In the regression datasets, the R2-to-R3 enrichment was used as a label".
# The same value is applied to the true and the predicted values.
cutoff_true = 1
cutoff_pred = 1

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff_true,
    cutoff_pred = cutoff_pred
);

Example of bad letter X: JJHKPQAKSYXPYRILDYJJ
Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives with cutoff of 1: 207
Number of total test set: 471

Computing statistics now...
R2:  0.43467  with a p-val of  0.0
Pearson R:  0.6593  with a p-val of  0.0
Spearman R:  0.67809  with a p-val of  0.0
auROC:  0.86764
MCC:  0.52267


# FLIP proteins - regression model example with AutoKeras
# External validation with held-out regression test set

In [6]:
# --- Model under test ---
task = 'regression'
model_type = 'autokeras'
class_of_interest = 0  # 0 for regression
model_folder = './exemplars/flip_longer_protein/models/'
output_folder = './exemplars/flip_longer_protein/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'flip_protein_test.csv'
input_col = 'sequence'
target_col = 'target'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'protein'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-offs ---
# The same value is applied to the true and the predicted values.
# NOTE(review): no rationale for the value 1 is recorded in this cell -- confirm
# it is the intended threshold for this dataset.
cutoff_true = 1
cutoff_pred = 1

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff_true,
    cutoff_pred = cutoff_pred
);

Confirmed: All sequence characters are in alphabet
Padding all sequences to a length of 749
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives with cutoff of 1: 3580
Number of total test set: 16517

Computing statistics now...
R2:  0.87665  with a p-val of  0.0
Pearson R:  0.9363  with a p-val of  0.0
Spearman R:  0.93038  with a p-val of  0.0
auROC:  0.95711
MCC:  0.23709


# Toeholds - regression example
# Test set is additional toeholds from Valeri, Collins, Ramesh et al.

In [7]:
# --- Model under test ---
task = 'regression'
model_type = 'autokeras'
class_of_interest = 0  # 0 for regression
model_folder = './exemplars/toeholds/models/'
output_folder = './exemplars/toeholds/outputs/'

# --- Evaluation data ---
data_folder = './clean_data/clean/'
data_file = 'green_sequences_toehold_test_set.csv'
input_col = 'seq'
target_col = 'target'

# --- Sequence handling options forwarded to the helper ---
sequence_type = 'nucleic_acid'
pad_seqs = 'max'
augment_data = 'none'

# --- Binarization cut-offs ---
# The same value is applied to the true and the predicted values.
cutoff_true = 0.5
cutoff_pred = 0.5

# Trailing semicolon keeps the notebook from echoing the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    cutoff_true = cutoff_true,
    cutoff_pred = cutoff_pred
);

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives with cutoff of 0.5: 42
Number of total test set: 168

Computing statistics now...
R2:  0.10803  with a p-val of  1e-05
Pearson R:  0.32868  with a p-val of  1e-05
Spearman R:  0.33702  with a p-val of  1e-05
auROC:  0.72468
MCC:  0.28776


In [8]:
# Toeholds - regression example on the Pardee toehold sequences.
# Only rank labels are available for this set, so the statistics are computed
# manually below instead of by the helper.

# --- Step 1: fix dataset so it is amenable to the previously trained model ---
pardee = pd.read_csv('./clean_data/clean/pardee_sequences_toehold_test_set.csv')

# Keep characters 18..76 of every sequence (a 59-character window); the window
# is based on Pardee et al.
seq_start, seq_end = 18, 77
pardee['seq'] = [s[seq_start:seq_end] for s in pardee['seq']]

# Reverse the ranking. The offset is the number of rows in the file (24 for the
# shipped dataset) instead of a hard-coded constant, so the cell keeps working if
# the file changes. R2, Pearson R and Spearman R are unaffected by the additive
# offset; only the sign flip matters for them.
n_ranked = len(pardee)
pardee['rank'] = [n_ranked - x for x in pardee['rank']]

# Written to a separate "clean_" file, so the raw input is never overwritten and
# the cell can be re-run safely.
pardee.to_csv('./clean_data/clean/clean_pardee_sequences_toehold_test_set.csv', index = False)

# --- Step 2: evaluation inputs ---
# Model under test
task = 'regression'
model_type = 'autokeras'
class_of_interest = 0 # 0 for regression
model_folder = './exemplars/toeholds/models/'
output_folder = './exemplars/toeholds/outputs/'

# Evaluation data (the file written in step 1)
data_folder = './clean_data/clean/'
data_file = 'clean_pardee_sequences_toehold_test_set.csv'
input_col = 'seq'
target_col = 'rank'

# Sequence handling options forwarded to the helper
sequence_type = 'nucleic_acid'
pad_seqs = 'max'
augment_data = 'none'

# --- Step 3: predict, then compute the statistics manually ---
# have to do this one manually since we only have rank data for these toeholds;
# stats = False makes the helper hand back the predictions and true values.
y_pred, y_true = read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col,
    pad_seqs, augment_data, sequence_type,
    model_type, model_folder, output_folder,
    class_of_interest = class_of_interest,
    stats = False
)

print('Number of total test set: ' + str(len(y_true)))
print('\nComputing statistics now...')
# R2 is the squared correlation coefficient of a linear fit of y_pred on y_true.
slope, intercept, r_val, p_val, std_error = sp.linregress(y_true, y_pred)
print('R2: ', np.round(r_val ** 2, 5), ' with a p-val of ', np.round(p_val, 5))
pear = sp.pearsonr(y_true, y_pred)
print('Pearson R: ', np.round(pear[0], 5) , ' with a p-val of ', np.round(pear[1], 5))
spear = sp.spearmanr(y_true, y_pred)
print('Spearman R: ', np.round(spear[0], 5) , ' with a p-val of ', np.round(spear[1], 5))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of total test set: 24

Computing statistics now...
R2:  0.097  with a p-val of  0.13848
Pearson R:  0.31145  with a p-val of  0.13848
Spearman R:  0.30696  with a p-val of  0.14457
