In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Helper function

In [1]:
def _write_fasta_records(path, sequences):
    """Write ``sequences`` to ``path`` as FASTA records headed by their 0-based position."""
    # The context manager closes the handle even if a write fails part-way.
    with open(path, 'w') as handle:
        for position, sequence in enumerate(sequences):
            handle.write(">" + str(position) + "\n" + sequence + "\n")


def write_to_fasta_and_csv(outpath, df, seq_col, sco_col, median, small_break = None, mod_peptides = False):
    """Export sequences as FASTA files plus a CSV of binary labels.

    Four files sharing the ``outpath`` prefix are written:

    * ``<outpath>.fasta``         - every exported sequence; each header is the
      sequence's 0-based position.
    * ``<outpath>.csv``           - a single ``value`` column with the 0/1 label
      of each sequence, in the same order as the FASTA records (the DataFrame
      index is written as the first column).
    * ``<outpath>_negseqs.fasta`` - sequences labeled 0, renumbered from 0.
    * ``<outpath>_posseqs.fasta`` - sequences labeled 1, renumbered from 0.

    Parameters
    ----------
    outpath : str
        Path prefix for the output files. Its parent directory is created if
        it does not exist yet.
    df : pandas.DataFrame
        Table holding the sequences and their scores.
    seq_col : str
        Name of the column in ``df`` that holds the sequence strings.
    sco_col : str
        Name of the column in ``df`` that holds the scores.
    median : number
        Threshold: a score strictly greater than it is labeled 1, a score less
        than or equal to it is labeled 0.
    small_break : int or None, optional
        When given, iteration stops as soon as the 0-based row position exceeds
        this value, so at most ``small_break + 1`` rows are exported.
    mod_peptides : bool, optional
        When True, every "J" in a sequence is replaced by "L" before export.

    Raises
    ------
    TypeError
        If a sequence is not a string (e.g. a missing value).
    ValueError
        If a score cannot be ordered against ``median`` (e.g. the score or the
        threshold is NaN), so no label can be assigned.

    All rows are validated before anything is written, so a failure of either
    kind leaves no partially written output files behind.
    """
    import os  # local import: not among the notebook's top-level imports

    sequences = []
    labels = []
    for index, (seq, score) in enumerate(zip(df[seq_col], df[sco_col])):
        if small_break is not None and index > small_break:
            break
        if not isinstance(seq, str):
            raise TypeError(
                "sequence at row {} of column {!r} is not a string: {!r}".format(index, seq_col, seq)
            )
        if mod_peptides:
            seq = seq.replace("J", "L")
        if score > median:
            label = 1
        elif score <= median:
            label = 0
        else:
            # Neither comparison holds (NaN score or NaN threshold): refuse to
            # guess instead of reusing the previous row's label.
            raise ValueError(
                "score {!r} at row {} of column {!r} cannot be compared with threshold {!r}".format(
                    score, index, sco_col, median
                )
            )
        sequences.append(seq)
        labels.append(label)

    parent_dir = os.path.dirname(outpath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    _write_fasta_records(outpath + '.fasta', sequences)
    pd.DataFrame({'value': labels}).to_csv(outpath + '.csv')

    # and make specific fastas for the negative and positive seqs
    # (sequences already carry the J -> L replacement when it was requested)
    negseqs = [seq for seq, label in zip(sequences, labels) if label == 0]
    posseqs = [seq for seq, label in zip(sequences, labels) if label == 1]
    _write_fasta_records(outpath + '_negseqs.fasta', negseqs)
    _write_fasta_records(outpath + '_posseqs.fasta', posseqs)

# Part 1: Clean toeholds

In [8]:
# Toeholds, full set: threshold the ON scores at their median and export
toehold = pd.read_csv('clean/toeholds.csv')
median = np.median(toehold['ON'].tolist())
outpath = "clean/bioautoml/classification_toeholds"
write_to_fasta_and_csv(outpath, toehold, seq_col='seq', sco_col='ON', median=median)

In [9]:
# Toeholds, small subset: same median threshold on ON, but the export stops early via small_break
toehold = pd.read_csv('clean/toeholds.csv')
median = np.median(toehold['ON'].tolist())
outpath = "clean/bioautoml/classification_small_toeholds"
write_to_fasta_and_csv(
    outpath,
    toehold,
    seq_col='seq',
    sco_col='ON',
    median=median,
    small_break=1000,
)

# Part 2: Peptides

In [10]:
# Peptides: export with a fixed 0.5 threshold on the 'target' column
peptide = pd.read_csv('clean/classification_train_peptides.csv')
outpath = "clean/bioautoml/classification_train_peptides"
write_to_fasta_and_csv(outpath, peptide, seq_col='seq', sco_col='target', median=0.5)

In [2]:
# BioAutoML cannot handle the letter J in peptide data, so this export
# replaces every J with L (mod_peptides=True); threshold stays at 0.5 on 'target'.
peptide = pd.read_csv('clean/classification_train_peptides.csv')
outpath = "clean/bioautoml/classification_train_NO_J_peptides"
write_to_fasta_and_csv(
    outpath,
    peptide,
    seq_col='seq',
    sco_col='target',
    median=0.5,
    mod_peptides=True,
)

# Part 3: RBS

In [11]:
# RBS train split: binary classification built manually by thresholding 'out'
# at its median (same as BioSeq-AutoML)
rbs = pd.read_csv('clean/hollerer_rbs_train.csv')
median = np.median(rbs['out'].tolist())
outpath = "clean/bioautoml/classification_hollerer_rbs_train"
write_to_fasta_and_csv(outpath, rbs, seq_col='seq', sco_col='out', median=median)

# RBS mediumtrain split: same procedure, with the median recomputed on this split
rbs = pd.read_csv('clean/hollerer_rbs_mediumtrain.csv')
median = np.median(rbs['out'].tolist())
outpath = "clean/bioautoml/classification_hollerer_rbs_mediumtrain"
write_to_fasta_and_csv(outpath, rbs, seq_col='seq', sco_col='out', median=median)

# Part 4: Synthetic Control

In [2]:
# Small synthetic control: binary classification built manually by thresholding
# 'positive_score' at its median (same as BioSeq-AutoML)
synth = pd.read_csv('clean/small_synthetic.csv')
median = np.median(synth['positive_score'].tolist())
outpath = "clean/bioautoml/classification_small_synthetic"
write_to_fasta_and_csv(outpath, synth, seq_col='seq', sco_col='positive_score', median=median)

# Large synthetic control: same procedure, with the median recomputed on this set
synth = pd.read_csv('clean/large_synthetic.csv')
median = np.median(synth['positive_score'].tolist())
outpath = "clean/bioautoml/classification_large_synthetic"
write_to_fasta_and_csv(outpath, synth, seq_col='seq', sco_col='positive_score', median=median)