This notebook provides code to test all models with validation datasets (either held out test sets or external validation datasets).

In [1]:
# import statements 
import sys
sys.path.insert(1, './main_classes/')

from BioSeqAutoML_wrapper import run_bioseqml
from BioSeqAutoML_seqprop_helpers import *
from BioSeqAutoML_integrated_design_helpers import *
from BioSeqAutoML_generic_deepswarm import convert_deepswarm_input
from BioSeqAutoML_generic_autokeras import convert_autokeras_input
from BioSeqAutoML_generic_tpot import convert_tpot_input

import scipy.stats as sp

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# RBS - binary classification example
# Test set is test set from the paper

In [2]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './exemplars/rbs/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]

# Use same cut off as was used in the initial training  (do_auto_bin = True, bin_threshold = None)
medium_train = pd.read_csv(data_dir + 'hollerer_rbs_train.csv',sep=',')
medium_cutoff_for_positive = np.median(medium_train['out'].values)
print('Cut off for positive: ' + str(medium_cutoff_for_positive))
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['out'] = targs
targs = [1 if t > medium_cutoff_for_positive else 0 for t in targs]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['out'], data_df['predicted'])
print(sp.linregress(data_df['out'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['out'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['out'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))
print('MCC')
pred_targs = [1 if t > medium_cutoff_for_positive else 0 for t in data_df['predicted']]
print(sklearn.metrics.matthews_corrcoef(data_df['binary target'], pred_targs))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Cut off for positive: 0.12490951287039195
Number of labeled binary positives: 13836
Number of total test set: 27654
Lin regress
LinregressResult(slope=1.387637244174339, intercept=-0.04024776385449347, rvalue=0.8531425650975318, pvalue=0.0, stderr=0.005102622599818262)
R2
0.7278522363811963
Pearson R
(0.8531425652518687, 0.0)
Spearman R
SpearmanrResult(correlation=0.8184005573811536, pvalue=0.0)
auROC
0.9145419853461121
MCC
0.6585810558526338


# RBS - regression model example
# Test set is test set from the paper

In [3]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'hollerer_rbs_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['out']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './exemplars/rbs/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'
model_type = 'autokeras'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_autokeras_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in just single col of preds
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# Use same cut off as was used in the initial training (do_auto_bin = True, bin_threshold = None)
medium_train = pd.read_csv(data_dir + 'hollerer_rbs_train.csv',sep=',')
medium_cutoff_for_positive = np.median(medium_train['out'].values)
print('Cut off for positive: ' + str(medium_cutoff_for_positive))
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['out'] = targs
targs = [1 if t > medium_cutoff_for_positive else 0 for t in targs]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['out'], data_df['predicted'])
print(sp.linregress(data_df['out'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['out'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['out'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))
print('MCC')
pred_targs = [1 if t > medium_cutoff_for_positive else 0 for t in data_df['predicted']]
print(sklearn.metrics.matthews_corrcoef(data_df['binary target'], pred_targs))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Cut off for positive: 0.12490951287039195
Number of labeled binary positives: 13836
Number of total test set: 27654
Lin regress
LinregressResult(slope=2.0922500711111742, intercept=-0.2212792129850918, rvalue=0.9308813757923073, pvalue=0.0, stderr=0.004937775210886783)
R2
0.866540135796979
Pearson R
(0.9308813690838763, 0.0)
Spearman R
SpearmanrResult(correlation=0.8794077954663353, pvalue=0.0)
auROC
0.9390802267958662
MCC
0.7246482453631695


# Peptides - binary classification
# External validation with held-out classification test set

In [4]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_test_peptides.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './exemplars/peptides/outputs/deepswarm/binary_classification/'
final_model_name = 'deepswarm_deploy_model.h5'
model_type = 'deepswarm'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,1]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))
pred_targs = [1 if t > 1 else 0 for t in data_df['predicted']]
print('MCC')
print(sklearn.metrics.matthews_corrcoef(data_df['binary target'], pred_targs))

Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Number of labeled binary positives: 207
Number of total test set: 471
Lin regress
LinregressResult(slope=0.8354442716623416, intercept=-0.23136764550359468, rvalue=0.5642612603682072, pvalue=5.980752251999268e-41, stderr=0.05644410109327849)
R2
0.31839076995231774
Pearson R
(0.5642612939312804, 5.980673965935668e-41)
Spearman R
SpearmanrResult(correlation=0.6628628063277253, pvalue=6.382983469359719e-61)
auROC
0.8690894451764016
MCC
0.0


# Peptides - regression model example
# External validation with held-out classification test set

In [5]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'classification_test_peptides.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './exemplars/peptides/outputs/tpot/regression/'
final_model_name = 'final_model_tpot_regression.pkl'
model_type = 'tpot'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_tpot_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))
pred_targs = [1 if t > 1 else 0 for t in data_df['predicted']]
print('MCC')
print(sklearn.metrics.matthews_corrcoef(data_df['binary target'], pred_targs))

Example of bad letter J: JJHKPQAKSYLAYRILDYJJ
Replacing J with substitution : L, I
Setting all substitutions to 1 in one-hot encoded representation...
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 207
Number of total test set: 471
Lin regress
LinregressResult(slope=1.625969075166658, intercept=-0.3391775925576763, rvalue=0.6592983133659704, pvalue=4.569905259756776e-60, stderr=0.08562355846687711)
R2
0.4346742660072134
Pearson R
(0.6592983133659704, 4.5699052597562033e-60)
Spearman R
SpearmanrResult(correlation=0.6780932604231608, pvalue=1.0371252594738499e-64)
auROC
0.8676438296003514
MCC
0.52267373802454


# FLIP proteins - regression model example with DeepSwarm
# External validation with held-out regression test set

In [2]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'flip_protein_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'sequence'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './exemplars/flip_longer_protein/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'
model_type = 'autokeras'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_autokeras_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Padding all sequences to a length of 749
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 3580
Number of total test set: 16517
Lin regress
LinregressResult(slope=0.16171363780610856, intercept=0.39060079771755807, rvalue=0.936296016689693, pvalue=0.0, stderr=0.0004720232084365707)
R2
0.8766502308689859
Pearson R
(0.9362960394988724, 0.0)
Spearman R
SpearmanrResult(correlation=0.9303786201300126, pvalue=0.0)
auROC
0.9571143763740309


In [7]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'flip_protein_test.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'sequence'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'protein'

# Give inputs for paths
final_model_path = './exemplars/flip_longer_protein/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'
model_type = 'autokeras'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_autokeras_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

# 1 chosen as cut-off because in the supplement they said: 
# "In the regression datasets, the R2-to-R3 enrichment was used as a label" 
targs = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['target'] = targs
targs = [1 if t > 1 else 0 for t in targs ]
data_df['binary target'] = targs
print('Number of labeled binary positives: ' + str(sum(targs)))
print('Number of total test set: ' + str(len(targs)))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['binary target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Padding all sequences to a length of 749
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 3580
Number of total test set: 16517
Lin regress
LinregressResult(slope=0.16171363780649425, intercept=0.39060080516888784, rvalue=0.9362960166878931, pvalue=0.0, stderr=0.0004720232084450529)
R2
0.8766502308656153
Pearson R
(0.9362960393084019, 0.0)
Spearman R
SpearmanrResult(correlation=0.9303786201300126, pvalue=0.0)
auROC
0.9571143763740309


# Toeholds - regression example
# Test set is additional toeholds from Valeri, Collins, Ramesh et al.

In [15]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'green_sequences_toehold_test_set.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df.head(3)

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['target']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './exemplars/toeholds/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'
model_type = 'autokeras'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]

data_df['target'] = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order

# Use same cut off as was used in the initial training (do_auto_bin = True, bin_threshold = None)
print('Number of labeled binary positives: ' + str(sum(data_df['target'])))
print('Number of total test set: ' + str(len(data_df['target'])))

# do r2
# do auROC
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['target'], data_df['predicted'])
print(sp.linregress(data_df['target'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['target'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['target'], data_df['predicted']))
print('auROC')
print(sklearn.metrics.roc_auc_score(data_df['target'], data_df['predicted']))

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Number of labeled binary positives: 42
Number of total test set: 168
Lin regress
LinregressResult(slope=0.34759877823174906, intercept=0.1952581139843143, rvalue=0.32868358446233814, pvalue=1.3620258019370843e-05, stderr=0.07752122037527745)
R2
0.10803289869501097
Pearson R
(0.32868359966679295, 1.3620244778538249e-05)
Spearman R
SpearmanrResult(correlation=0.33702411116596664, pvalue=7.928280346578557e-06)
auROC
0.7246787603930461


In [16]:
# Read in data file
data_dir = './clean_data/clean/'
file_name = 'pardee_sequences_toehold_test_set.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
data_df['seq'] = [s[18:77] for s in data_df['seq']] # based on Pardee et al.
display(data_df.head(3))

# Give inputs for data generation
input_col_name = 'seq'
df_data_input = data_df[input_col_name]
df_data_output = data_df['rank']
pad_seqs = False
augment_data = 'none'
sequence_type = 'nucleic_acid'

# Give inputs for paths
final_model_path = './exemplars/toeholds/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'
model_type = 'autokeras'
val_path = final_model_path + 'external_validation/'
#os.mkdir(val_path)

# Format data inputs appropriately for autoML platform
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_deepswarm_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type)
preds = AutoMLBackend.generic_predict(oh_data_input, numerical_data_input, model_type, final_model_path, final_model_name)

# We are interested in class 1
preddf = pd.DataFrame(preds)
data_df['predicted'] = preddf.iloc[:,0]
data_df['rank'] = list(df_data_output.iloc[:,0]) # use output that was scrambled in same order
data_df['rank'] = [24 - x for x in list(data_df['rank'])]

# do r2
print('Lin regress')
slope, intercept, r_val, p_val, std_error = sp.linregress(data_df['rank'], data_df['predicted'])
print(sp.linregress(data_df['rank'], data_df['predicted']))
print('R2')
print(r_val ** 2)
print('Pearson R')
print(sp.pearsonr(data_df['rank'], data_df['predicted']))
print('Spearman R')
print(sp.spearmanr(data_df['rank'], data_df['predicted']))


Unnamed: 0,seq,rank
0,CAUUCUUCUCACUCUCAAGUUAUAGUUAUGAACAGAGGAGACAUAA...,6
1,UUCUCUUUUUCCCAUCAUGUUAUAGUUAUGAACAGAGGAGACAUAA...,14
2,AGUUUCAUGUCCUGUGUCGUUAUAGUUAUGAACAGAGGAGACAUAA...,8


Example of bad letter C: CAUUCUUCUCACUCUCAAGUUAUAGUUAUGAACAGAGGAGACAUAACAUGAACUUGAGA
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.
Lin regress
LinregressResult(slope=0.0073320089474968285, intercept=0.6576945766806602, rvalue=0.31145396722112567, pvalue=0.1384758830509397, stderr=0.004769367571460242)
R2
0.09700357369777803
Pearson R
(0.3114539546371699, 0.138475899834939)
Spearman R
SpearmanrResult(correlation=0.3069565217391304, pvalue=0.1445678681051738)
