In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Part 1: Clean toeholds

In [8]:
# Binarize the toehold 'ON' values at the dataset median and export, under
# clean/bioautoml/: a FASTA of every sequence (headers are running indices),
# a CSV of the 0/1 labels in the same order, and one FASTA per class.
toehold = pd.read_csv('clean/toeholds.csv')
median = np.median(list(toehold['ON']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of toehold['ON'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_toeholds"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(toehold['seq'], toehold['ON'])):
        file.write(">" + str(index) + "\n" + seq + "\n")
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

In [9]:
# Export a reduced toehold classification set under clean/bioautoml/: only the
# leading rows of the table are written (see the cut-off below), together with
# a CSV of their 0/1 labels and one FASTA per class.
toehold = pd.read_csv('clean/toeholds.csv')
# NOTE(review): the threshold is the median of the FULL 'ON' column, not of the
# subset written below, so the two classes need not be balanced - confirm.
median = np.median(list(toehold['ON']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of toehold['ON'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_small_toeholds"
index = 0
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for seq, on in zip(toehold['seq'], toehold['ON']):
        # NOTE(review): `> 1000` keeps indices 0..1000, i.e. 1001 records.
        # Left unchanged to keep the exported files identical; confirm whether
        # exactly 1000 was intended.
        if index > 1000:
            break
        file.write(">" + str(index) + "\n" + seq + "\n")
        index = index + 1
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

# Part 2: Peptides

In [10]:
# Export the peptide classification training set under clean/bioautoml/:
# a FASTA of every sequence (headers are running indices), a CSV holding the
# 'target' column values in the same order, and one FASTA per class.
peptide = pd.read_csv('clean/classification_train_peptides.csv')
outpath = "clean/bioautoml/classification_train_peptides"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(peptide['seq'], peptide['target'])):
        # target < 0.5 is treated as negative, everything else as positive.
        if on < 0.5:
            negseqs.append(seq)
        else:
            posseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        # The CSV keeps the target value as read, not a re-thresholded label.
        vals.append(on)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

In [2]:
# BIOAUTOML cannot handle Js in peptide data! replace with L for all J
# Same export as the plain peptide set (full FASTA, CSV of 'target' values,
# one FASTA per class), but written to the *_NO_J_* paths with every "J" in a
# sequence substituted by "L".
peptide = pd.read_csv('clean/classification_train_peptides.csv')
outpath = "clean/bioautoml/classification_train_NO_J_peptides"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(peptide['seq'], peptide['target'])):
        # Substitute once here; the per-class lists then already hold the
        # J-free sequence, so no second replace is needed when writing them.
        seq = seq.replace("J", "L")
        # target < 0.5 is treated as negative, everything else as positive.
        if on < 0.5:
            negseqs.append(seq)
        else:
            posseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        # The CSV keeps the target value as read, not a re-thresholded label.
        vals.append(on)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

# Part 3: RBS

In [11]:
# and do binary classification version manually based on median (same as BioSeq-AutoML)
# Exports, under clean/bioautoml/: a FASTA of every RBS training sequence
# (headers are running indices), a CSV of the 0/1 labels in the same order,
# and one FASTA per class.
rbs = pd.read_csv('clean/hollerer_rbs_train.csv')
median = np.median(list(rbs['out']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of rbs['out'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_hollerer_rbs_train"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(rbs['seq'], rbs['out'])):
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        # Exactly one label per sequence: the label was previously appended
        # twice per row, which doubled the CSV length and broke the row-to-
        # FASTA-index correspondence.
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

# and mediumtrain
# Median-thresholded binary export of the RBS "mediumtrain" table under
# clean/bioautoml/: a FASTA of every sequence (headers are running indices),
# a CSV of the 0/1 labels in the same order, and one FASTA per class.
rbs = pd.read_csv('clean/hollerer_rbs_mediumtrain.csv')
median = np.median(list(rbs['out']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of rbs['out'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_hollerer_rbs_mediumtrain"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(rbs['seq'], rbs['out'])):
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        # Exactly one label per sequence: the label was previously appended
        # twice per row, which doubled the CSV length and broke the row-to-
        # FASTA-index correspondence.
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

# Part 4: Synthetic Control

In [2]:
# and do binary classification version manually based on median (same as BioSeq-AutoML)
# Exports, under clean/bioautoml/: a FASTA of every small-synthetic sequence
# (headers are running indices), a CSV of the 0/1 labels in the same order,
# and one FASTA per class.
synth = pd.read_csv('clean/small_synthetic.csv')
median = np.median(list(synth['positive_score']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of synth['positive_score'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_small_synthetic"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(synth['seq'], synth['positive_score'])):
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")

# large one too
# Median-thresholded binary export of the large synthetic table under
# clean/bioautoml/: a FASTA of every sequence (headers are running indices),
# a CSV of the 0/1 labels in the same order, and one FASTA per class.
synth = pd.read_csv('clean/large_synthetic.csv')
median = np.median(list(synth['positive_score']))
# np.median returns NaN for an empty column or one containing NaN; every
# comparison against NaN is False, so no row could be labelled. Fail loudly
# instead of hitting an undefined (or stale, from an earlier cell) `binary`.
if np.isnan(median):
    raise ValueError("median of synth['positive_score'] is NaN (empty column or missing values)")
outpath = "clean/bioautoml/classification_large_synthetic"
vals = []
posseqs = []
negseqs = []
# `with` flushes and closes the file even if a row raises part-way through.
with open(outpath + '.fasta', 'w') as file:
    for index, (seq, on) in enumerate(zip(synth['seq'], synth['positive_score'])):
        # Values equal to the median fall in the negative class.
        if on > median:
            binary = 1
            posseqs.append(seq)
        else:
            binary = 0
            negseqs.append(seq)
        file.write(">" + str(index) + "\n" + seq + "\n")
        vals.append(binary)
df = pd.DataFrame({'value': vals})
df.to_csv(outpath + '.csv')

# and make specific fastas for the negative and positive seqs
# (headers restart at 0 within each class file)
for suffix, seqs in (('_negseqs.fasta', negseqs), ('_posseqs.fasta', posseqs)):
    with open(outpath + suffix, 'w') as file:
        for index, seq in enumerate(seqs):
            file.write(">" + str(index) + "\n" + seq + "\n")