In [1]:
import pandas as pd
import numpy as np

In [2]:
def convert_to_four_classes(df, col):
    """Label every row of ``df`` with the quartile bin of its ``col`` value.

    The 25th, 50th and 75th percentiles of ``df[col]`` (missing values
    excluded) are used as right-inclusive cut points:

        value <= q25          -> 'a'
        q25 < value <= q50    -> 'b'
        q50 < value <= q75    -> 'c'
        value > q75           -> 'd'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: a ``'target'`` column is
        added, or overwritten if it already exists (this includes the case
        ``col == 'target'``, where the numeric values are replaced by labels).
    col : str
        Name of the numeric column to bin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``'target'`` column set. Rows whose
        ``col`` value is missing get a missing label instead of a class.
    """
    # Snapshot the values first so that col == 'target' still works when the
    # column is overwritten below.
    values = df[col].to_numpy(dtype=float, na_value=np.nan)

    # Start with "no label" everywhere; only rows with a real value get a class.
    labels = np.full(values.shape, None, dtype=object)
    has_value = ~np.isnan(values)

    if has_value.any():
        observed = values[has_value]
        cuts = np.quantile(observed, [0.25, 0.5, 0.75])
        class_names = np.array(['a', 'b', 'c', 'd'], dtype=object)
        # side='left' returns i with cuts[i-1] < v <= cuts[i], i.e. bins that
        # are closed on the right, matching the "<=" comparisons above.
        labels[has_value] = class_names[np.searchsorted(cuts, observed, side='left')]

    df['target'] = labels
    return df

# Part 1: Synthetic datasets

In [11]:
# Large synthetic set: label each row by quartile of 'positive_score',
# write the labelled frame out, then display it.
bigsynth = convert_to_four_classes(
    pd.read_csv('clean/large_synthetic.csv'),
    'positive_score',
)
bigsynth.to_csv('multiclass/large_synthetic_multiclass.csv', index=False)
bigsynth

Unnamed: 0,seq,positive_score,mixed_score,target
0,ACCACTTACTTCGTAATTCG,45,35,a
1,CAAATTGGCTTCCACCTCGC,50,35,b
2,GCCTTATAAGACGGTAAGGC,50,20,b
3,CCGCACGTAACGCTATACGC,51,31,c
4,CGTTTCCTGAAAGACGGGTT,52,22,c
...,...,...,...,...
99995,CTGTTAGGGATAAATATATG,44,19,a
99996,GAGCCGCGACGACCATCCAT,53,28,c
99997,TAGTTACTCATTCCCTCGTG,49,34,b
99998,AAGCTCATCAGCATTAAATC,41,31,a


In [12]:
# Small synthetic set: label each row by quartile of 'positive_score',
# write the labelled frame out, then display it.
smallsynth = convert_to_four_classes(
    pd.read_csv('clean/small_synthetic.csv'),
    'positive_score',
)
smallsynth.to_csv('multiclass/small_synthetic_multiclass.csv', index=False)
smallsynth

Unnamed: 0,seq,positive_score,mixed_score,target
0,GTATGGCGACCCGAGGTTCG,58,18,d
1,GCAGACGAGAATGCAGCTCC,52,22,c
2,ATGCTAATAAAAATGACAAG,37,22,a
3,CAGCCTATCATCACGGAAAG,47,27,a
4,TGTGTAGCTAGGGATGGACA,53,13,c
...,...,...,...,...
995,GAAGCACCGCACATCTACAG,48,28,b
996,TGCTCGGCTTAAGTATTTCT,49,29,b
997,GCTAAAGTTAACTCCAGCCA,45,30,a
998,GAAGGTGCAAACTCTACGGT,50,20,b


# Part 2: Nucleic Acids

In [13]:
# Toehold set: label each row by quartile of the 'ON' column,
# write the labelled frame out, then display it.
toeholds = convert_to_four_classes(
    pd.read_csv('clean/toeholds.csv'),
    'ON',
)
toeholds.to_csv('multiclass/toeholds_multiclass.csv', index=False)
toeholds

Unnamed: 0,seq,ON,OFF,target
0,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...,0.068295,0.000000,a
1,AAAAAAAATAACGTAGGACTACTACTTGGAAACAGAGGAGATCCAA...,0.000000,0.038847,a
2,AAAAAAAATGGAAAACAGTTACTAATATGTAACAGAGGAGAACATA...,0.080666,0.123289,a
3,AAAAAAACATGAGCTTTGCTTTTTTCAAGTAACAGAGGAGAACTTG...,0.933884,0.514158,d
4,AAAAAAACCAACACAGCTCCAGGAACATTAAACAGAGGAGATAATG...,1.000000,0.089756,d
...,...,...,...,...
91529,TTTTTTTTTAATATTTTCACAAATATCGTTAACAGAGGAGAAACGA...,1.000000,0.942347,d
91530,TTTTTTTTTCTTGATTTATCAACTTCTTTTAACAGAGGAGAAAAAG...,0.333333,0.014546,b
91531,TTTTTTTTTGTCATAGCTTTCCTTTTTAAAAACAGAGGAGATTTAA...,0.856589,0.776737,d
91532,TTTTTTTTTTTATAATTTTTAGTGATTTTGAACAGAGGAGACAAAA...,0.063088,0.001406,a


In [14]:
# RBS medium training set: label each row by quartile of the 'out' column,
# write the labelled frame out, then display it.
rbs = convert_to_four_classes(
    pd.read_csv('clean/hollerer_rbs_mediumtrain.csv'),
    'out',
)
rbs.to_csv('multiclass/rbs_medium_multiclass.csv', index=False)
rbs

Unnamed: 0,seq,out,target
0,AGATGTGGAGGGAGCGA,0.764483,d
1,AAAAGCGTGGAGATTTT,0.696517,d
2,CAAAGCGTGGAGATTCT,0.549834,c
3,AAAAGCGAGGAGTTACT,0.856013,d
4,CTAAGCGTGGAGATACT,0.749364,d
...,...,...,...
49995,ACGGTAGGCAAAACACA,0.356187,c
49996,ACTAGAAGCTACAGAGC,0.013144,a
49997,ACTATACCGCACTACAC,0.025287,a
49998,ACTCGTCAGCAAAAGAA,0.072713,b


In [3]:
# RBS full training set: label each row by quartile of the 'out' column,
# write the labelled frame out, then display it.
# Note: this rebinds the name `rbs`, which the medium-train cell also assigns.
rbs = convert_to_four_classes(
    pd.read_csv('clean/hollerer_rbs_train.csv'),
    'out',
)
rbs.to_csv('multiclass/rbs_full_multiclass.csv', index=False)
rbs

Unnamed: 0,seq,out,target
0,AGATGTGGAGGGAGCGA,0.764483,d
1,AAAAGCGTGGAGATTTT,0.696517,d
2,CAAAGCGTGGAGATTCT,0.549834,d
3,AAAAGCGAGGAGTTACT,0.856013,d
4,CTAAGCGTGGAGATACT,0.749364,d
...,...,...,...
275844,TTTTTTACCAACCTAGA,0.010468,a
275845,TTTTTTACCATAGCTAC,0.046439,b
275846,TTTTTTCAGTTTTAAGT,0.190806,c
275847,TTTTTTCTTTAGACTTA,0.000000,a


# Part 3: Peptides

In [16]:
# Peptide set: the column being binned is itself named 'target', so its
# values are replaced by the quartile labels before the frame is written
# out and displayed.
peptides = convert_to_four_classes(
    pd.read_csv('clean/peptides.csv'),
    'target',
)
peptides.to_csv('multiclass/peptides_multiclass.csv', index=False)
peptides

Unnamed: 0,seq,target
0,JJJJAAAAYDYWFDYJJJJJ,a
1,JAAADSYDYYAGGYDFDVJJ,c
2,JAAADSYEYYAGGYDFDVJJ,b
3,JJJAAAIDHSSSYLDYJJJJ,b
4,JJJAAAPASSDDYFDYJJJJ,c
...,...,...
67764,JJJYYYSYDYWGWFDYJJJJ,b
67765,JJJYYYVWPHWYYWFDYJJJ,c
67766,JJJJJYYYWHWGFDYJJJJJ,c
67767,JJJYYYYISVQYVFDYJJJJ,a
