Please note: to run a different dataset, restart the kernel and re-run the import cell (and the settings cell directly below it, which defines the flags used by the training cells) before anything else. This is necessary because the graph operations need to be reset between experiments.

In [2]:
import sys
sys.path.insert(1, '../main_classes/')

from wrapper import run_bioautomated
import shutil
import os
import pandas as pd
import numpy as np
from transfer_learning_helpers import read_in_format_data_and_pred
import scipy.stats as sp

# make directory where these results can live - no need to run if already have a directory
#os.mkdir('../exemplars/')

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Global switches for the experiments below; every flag starts out False.
# run_interpretation and run_design are forwarded to run_bioautomated by the training cells.
# dataset_robustness is defined here but is not passed to any call visible in this notebook.
dataset_robustness = run_interpretation = run_design = False

# Synthetic Nucleic Acids Dataset

In [3]:
# Build the 90/10 train/test files used for the DNABERT comparison.

synth = pd.read_csv('../clean_data/clean/large_synthetic.csv')

# Label a sequence 1 when its positive_score is strictly above the median of the
# whole file and 0 otherwise, then keep only the sequence and its label.
currmed = np.median(synth['positive_score'])
print(currmed)
above_median = synth['positive_score'] > currmed
synth = synth.assign(target=above_median.astype('int64'))[['seq', 'target']]

# Positional split with no shuffling: rows 0-89,999 train, rows 90,000-99,999 test.
train90 = synth.iloc[:90000]
test10 = synth.iloc[90000:100000]
print(len(train90), len(test10), sep='\n')
display(test10)

# Persist both parts next to the source file, without the index column.
train90.to_csv('../clean_data/clean/large_synthetic_90perc_dataset.csv', index=False)
test10.to_csv('../clean_data/clean/large_synthetic_10perc_dataset.csv', index=False)

50.0
90000
10000


Unnamed: 0,seq,target
90000,GCAATATGTACCTGTGTCGC,1
90001,GGTACCCCTATAGCGATCCG,1
90002,CTGACAACCTATCCCACAGG,0
90003,GGCCTACGTGATCGATAGAT,1
90004,TCGCTATTCTTGCACAAGGA,0
...,...,...
99995,CTGTTAGGGATAAATATATG,0
99996,GAGCCGCGACGACCATCCAT,1
99997,TAGTTACTCATTCCCTCGTG,0
99998,AAGCTCATCAGCATTAAATC,0


In [4]:
# Train on the 90% synthetic split (large_synthetic_90perc_dataset.csv).
#
# Create the experiment directory up front. os.makedirs with exist_ok=True also
# creates the parent '../exemplars/' when it is missing and does nothing when the
# directory already exists, so this line never has to be commented in or out by hand.
os.makedirs('../exemplars/90perc_synthetic_nucleic_acids/', exist_ok=True)

# Input file and the columns holding the sequence and its binary label
data_folder = '../clean_data/clean/'
data_file = 'large_synthetic_90perc_dataset.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'nucleic_acid'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# Folders handed to run_bioautomated for models and outputs
model_folder = '../exemplars/90perc_synthetic_nucleic_acids/models/'
output_folder = '../exemplars/90perc_synthetic_nucleic_acids/outputs/'

# run_interpretation / run_design come from the flags cell near the top of the notebook.
run_bioautomated(task, data_folder, data_file, sequence_type, model_folder, output_folder, input_col=input_col, target_col=target_col, run_interpretation=run_interpretation, run_design=run_design)


Verbosity set to 0. For more display items, set verbosity to 1.
#################################################################################################
#######################               RUNNING BINARY CLASSIFICATION            ##################
#################################################################################################

#################################################################################################
##############################            RUNNING DEEPSWARM           ###########################
#################################################################################################
Conducting architecture search now...
Testing scrambled control now...
Fitting final model now...
#################################################################################################
##############################            RUNNING AUTOKERAS           ###########################
#####################################################

# Toeholds

In [3]:
# Build the 90/10 train/test files for the toehold DNABERT comparison.

toehold = pd.read_csv('../clean_data/clean/toeholds.csv')

# Label a sequence 1 when its ON value is strictly above the median of the whole
# file and 0 otherwise, then keep only the sequence and its label.
currmed = np.median(toehold['ON'])
print(currmed)
toehold = toehold.assign(target=(toehold['ON'] > currmed).astype('int64'))[['seq', 'target']]

# Derive the split point from the data instead of hard-coding it: the first 90% of
# rows (rounded down) train, the remainder test. Integer arithmetic avoids float
# rounding; for the 91,534-row file this gives 82,380, the value used previously.
n_train = len(toehold) * 9 // 10
train90 = toehold.iloc[:n_train]
test10 = toehold.iloc[n_train:]
print(len(train90))
print(len(test10))

# NOTE(review): the split is positional and unshuffled, and the rows displayed for
# test10 all begin with 'TT', which suggests the file is sorted by sequence. If so,
# the test set is not a random hold-out - confirm this is intended.
display(test10)

train90.to_csv('../clean_data/clean/toehold_90perc_dataset.csv', index=False)
test10.to_csv('../clean_data/clean/toehold_10perc_dataset.csv', index=False)

0.44570123700000003
82380
9154


Unnamed: 0,seq,target
82380,TTAAAGCGACGGTAAATGCATTTGAAAAAAAACAGAGGAGATTTTT...,1
82381,TTAAAGCTATTGTTGGAAACTAAAACTATCAACAGAGGAGAGATAG...,1
82382,TTAAAGCTGAGCTGCAGATTTTCAAATCGGAACAGAGGAGACCGAT...,0
82383,TTAAAGGAATTTACTTCGTTCTTGACCTTAAACAGAGGAGATAAGG...,1
82384,TTAAAGGATGGTGATTCAAGTATGCTAGGTAACAGAGGAGAACCTA...,0
...,...,...
91529,TTTTTTTTTAATATTTTCACAAATATCGTTAACAGAGGAGAAACGA...,1
91530,TTTTTTTTTCTTGATTTATCAACTTCTTTTAACAGAGGAGAAAAAG...,0
91531,TTTTTTTTTGTCATAGCTTTCCTTTTTAAAAACAGAGGAGATTTAA...,1
91532,TTTTTTTTTTTATAATTTTTAGTGATTTTGAACAGAGGAGACAAAA...,0


In [5]:
# Train on the 90% toehold split (toehold_90perc_dataset.csv).
#
# os.mkdir raised FileExistsError whenever this cell was re-run and FileNotFoundError
# when '../exemplars/' did not exist yet. os.makedirs with exist_ok=True creates any
# missing parent and is a no-op when the directory is already there.
os.makedirs('../exemplars/90perc_toeholds/', exist_ok=True)

# Input file and the columns holding the sequence and its binary label
data_folder = '../clean_data/clean/'
data_file = 'toehold_90perc_dataset.csv'
input_col = 'seq'
target_col = 'target'
sequence_type = 'nucleic_acid'

task = 'binary_classification' # binary_classification, multiclass_classification, regression

# Folders handed to run_bioautomated for models and outputs
model_folder = '../exemplars/90perc_toeholds/models/'
output_folder = '../exemplars/90perc_toeholds/outputs/'

# run_interpretation / run_design come from the flags cell near the top of the notebook.
run_bioautomated(task, data_folder, data_file, sequence_type, model_folder, output_folder, input_col=input_col, target_col=target_col, run_interpretation=run_interpretation, run_design=run_design)


Verbosity set to 0. For more display items, set verbosity to 1.
#################################################################################################
#######################               RUNNING BINARY CLASSIFICATION            ##################
#################################################################################################

#################################################################################################
##############################            RUNNING DEEPSWARM           ###########################
#################################################################################################
Conducting architecture search now...
Testing scrambled control now...
Fitting final model now...
#################################################################################################
##############################            RUNNING AUTOKERAS           ###########################
#####################################################

# Test synthetic NAs

In [4]:
# Evaluate on the held-out 10% synthetic file using the 'deepswarm' model type and
# the model/output folders of the 90% synthetic run.

# Test file location
data_folder, data_file = '../clean_data/clean/', 'large_synthetic_10perc_dataset.csv'

# Column names and sequence-handling options for data generation
input_col, target_col = 'seq', 'target'
sequence_type = 'nucleic_acid'
pad_seqs, augment_data = 'max', 'none'

# Task, model type and folders passed to the helper
task, model_type = 'binary_classification', 'deepswarm'
model_folder = '../exemplars/90perc_synthetic_nucleic_acids/models/'
output_folder = '../exemplars/90perc_synthetic_nucleic_acids/outputs/'
class_of_interest = 1 # 1 for binary classification typically!

# Cut-off used for binarizing; the same value is supplied for both the true labels
# (cutoff_true) and the predictions (cutoff_pred). The intent is to reuse the
# cut-off that was applied to the original TRAIN set.
cutoff = 0.5

# Trailing semicolon suppresses the notebook echo of the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col, pad_seqs, augment_data,
    sequence_type, model_type, model_folder, output_folder,
    class_of_interest=class_of_interest, cutoff_true=cutoff, cutoff_pred=cutoff,
);

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Number of labeled binary positives with cutoff of 0.5: 4569
Number of total test set: 10000

Computing statistics now...
R2:  1.0  with a p-val of  0.0
Pearson R:  1.0  with a p-val of  0.0
Spearman R:  0.90733  with a p-val of  0.0
auROC:  1.0
MCC:  1.0


# Test toeholds

In [5]:
# Evaluate on the held-out 10% toehold file using the 'deepswarm' model type and
# the model/output folders of the 90% toehold run.

# Test file location
data_folder, data_file = '../clean_data/clean/', 'toehold_10perc_dataset.csv'

# Column names and sequence-handling options for data generation
input_col, target_col = 'seq', 'target'
sequence_type = 'nucleic_acid'
pad_seqs, augment_data = 'max', 'none'

# Task, model type and folders passed to the helper
task, model_type = 'binary_classification', 'deepswarm'
model_folder = '../exemplars/90perc_toeholds/models/'
output_folder = '../exemplars/90perc_toeholds/outputs/'
class_of_interest = 1 # 1 for binary classification typically!

# Cut-off used for binarizing; the same value is supplied for both the true labels
# (cutoff_true) and the predictions (cutoff_pred). The intent is to reuse the
# cut-off that was applied to the original TRAIN set.
# NOTE(review): 0.446 is the rounded median of the raw ON values printed by the
# split cell (0.4457...), but the 'target' column saved in the 10% file is already
# 0/1. Confirm that this value is also the intended threshold for the predictions.
cutoff = 0.446

# Trailing semicolon suppresses the notebook echo of the return value.
read_in_format_data_and_pred(
    task, data_folder, data_file, input_col, target_col, pad_seqs, augment_data,
    sequence_type, model_type, model_folder, output_folder,
    class_of_interest=class_of_interest, cutoff_true=cutoff, cutoff_pred=cutoff,
);

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives with cutoff of 0.446: 4753
Number of total test set: 9154

Computing statistics now...
R2:  0.46394  with a p-val of  0.0
Pearson R:  0.68113  with a p-val of  0.0
Spearman R:  0.68124  with a p-val of  0.0
auROC:  0.89361
MCC:  0.6478
