In [1]:
import pandas as pd
import numpy as np

In [2]:
def convert_to_four_classes(df, col):
    """Label each row of ``df`` with the quartile class of its ``col`` value.

    The 25th, 50th and 75th percentiles of ``df[col]`` are used as cut
    points (upper bound inclusive):

    * ``'a'``: value <= Q1
    * ``'b'``: Q1 < value <= Q2
    * ``'c'``: Q2 < value <= Q3
    * ``'d'``: value > Q3

    When many values are tied the cut points can coincide, in which case
    some classes may be empty and the classes are not equally sized.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to bin. It is modified in place: a
        ``'target'`` column is added, or overwritten if it already exists
        (including when ``col == 'target'``).
    col : str
        Name of the numeric column to bin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'target'`` labels.

    Raises
    ------
    ValueError
        If ``df[col]`` contains missing values. They cannot be ordered
        against the quartiles, so no class can be assigned to them.
    """
    column = df[col]

    # Fail early with a clear message: a NaN would turn every quantile into
    # NaN and leave rows without any label.
    if column.isna().any():
        raise ValueError(
            f"column {col!r} contains missing values; "
            "drop or fill them before converting to classes"
        )

    values = column.to_numpy(dtype=float)
    # dtype=object keeps the labels as plain Python strings.
    labels = np.array(['a', 'b', 'c', 'd'], dtype=object)

    if values.size:
        cuts = np.quantile(values, [0.25, 0.5, 0.75])
        # side='left' yields i with cuts[i-1] < value <= cuts[i], i.e. the
        # upper bound of every class is inclusive.
        class_idx = np.searchsorted(cuts, values, side='left')
    else:
        # Nothing to bin; still create an (empty) 'target' column.
        class_idx = np.empty(0, dtype=np.intp)

    df['target'] = labels[class_idx]
    return df

# Part 1: Synthetic datasets

In [3]:
# Label the large synthetic set by quartile of `positive_score`,
# save the labelled table, then preview it.
bigsynth = convert_to_four_classes(
    pd.read_csv('clean/large_synthetic.csv'),
    'positive_score',
)
bigsynth.to_csv('multiclass/large_synthetic_multiclass.csv', index=False)
bigsynth

Unnamed: 0,seq,positive_score,mixed_score,target
0,CAATCGTATTGCCTAAGCTAAACAAAAGATACACTGGATTATTATT...,110,60,a
1,GTCAACCTAAGTAGTCTAGAGATTCTATCGTACTATACCCCCAACC...,115,85,a
2,CTCCCTGCTTGTCGGACTCGATACATGCATATGCTCTATGACAGTT...,124,74,b
3,AGACCAGCCTTGGGGCAGCACCGAAGAGGCTTACTTCTAGAAGCAA...,129,54,c
4,CTCGGGGCTGAGAAGGTACGAGTACAGGACAGCGTCCCTACGTATA...,133,48,d
...,...,...,...,...
99995,AACTAACCTCCAGACCTAGCTAGCAAGGTTCACCAGTTATGGCAGC...,121,66,b
99996,TGTTAGTATCTTCTTCGATTTCGGTGTGTGCCATCGACACTCTTCT...,125,80,b
99997,GGAATTGCGCCAGTACAATTGGGGGTGTGTGGCCCTCACTCAGCAC...,133,53,d
99998,AACCTACGTTAGTTTTAGTGGATAGTGTCACACACAAGTGGTTCCA...,120,60,a


In [4]:
# Label the small synthetic set by quartile of `positive_score`,
# save the labelled table, then preview it.
smallsynth = convert_to_four_classes(
    pd.read_csv('clean/small_synthetic.csv'),
    'positive_score',
)
smallsynth.to_csv('multiclass/small_synthetic_multiclass.csv', index=False)
smallsynth

Unnamed: 0,seq,positive_score,mixed_score,target
0,GCGACCTGGAAGCACCCGGCATTTGGTCGAAAAATGCCGCAACGGA...,126,56,c
1,GAAAGCGGTAGGGGGTTGTAAGATGGGACGGTTCGTGTGCTCCCGA...,139,34,d
2,TTTGAAGTAAAGACAGCAAGCATTGGTAGACATCATATTCGTAGAG...,111,56,a
3,CACGCCTTGCCCGAGTCTCTTGACTTTACGACGACCTATAATTCCT...,122,82,b
4,ATCTATATCGTGCTCAATGCGCCAAGGAACTAGACTATTACCAGAA...,113,63,a
...,...,...,...,...
995,GTCGGTTGTAAGACGCGTGGGAAACGTGAATAATGGGTAGCACGTT...,132,37,d
996,CGCAGTTATAATTATGGTGCGTCTAGGATCAGCACACCATCAAATG...,120,60,b
997,GTCAGCCGTCTGCAGGATTGTCCATATAGGACAATAACGCCAGTAC...,122,62,b
998,GTATGTTAACGAATGTCGTGATCTACCGATCTGACAAGTCGTTCGG...,126,61,c


# Part 2: Nucleic Acids

In [5]:
# Label the toehold set by quartile of the `ON` column,
# save the labelled table, then preview it.
toeholds = convert_to_four_classes(
    pd.read_csv('clean/toeholds.csv'),
    'ON',
)
toeholds.to_csv('multiclass/toeholds_multiclass.csv', index=False)
toeholds

Unnamed: 0,seq,ON,OFF,target
0,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...,0.068295,0.000000,a
1,AAAAAAAATAACGTAGGACTACTACTTGGAAACAGAGGAGATCCAA...,0.000000,0.038847,a
2,AAAAAAAATGGAAAACAGTTACTAATATGTAACAGAGGAGAACATA...,0.080666,0.123289,a
3,AAAAAAACATGAGCTTTGCTTTTTTCAAGTAACAGAGGAGAACTTG...,0.933884,0.514158,d
4,AAAAAAACCAACACAGCTCCAGGAACATTAAACAGAGGAGATAATG...,1.000000,0.089756,d
...,...,...,...,...
91529,TTTTTTTTTAATATTTTCACAAATATCGTTAACAGAGGAGAAACGA...,1.000000,0.942347,d
91530,TTTTTTTTTCTTGATTTATCAACTTCTTTTAACAGAGGAGAAAAAG...,0.333333,0.014546,b
91531,TTTTTTTTTGTCATAGCTTTCCTTTTTAAAAACAGAGGAGATTTAA...,0.856589,0.776737,d
91532,TTTTTTTTTTTATAATTTTTAGTGATTTTGAACAGAGGAGACAAAA...,0.063088,0.001406,a


In [6]:
# Label the promoter set by quartile of its `target` column.
# NOTE: the class labels are written back to `target`, so the original
# values of that column are replaced in the saved file.
promoters = convert_to_four_classes(
    pd.read_csv('clean/promoters.csv'),
    'target',
)
promoters.to_csv('multiclass/promoters_multiclass.csv', index=False)
promoters

Unnamed: 0,seq,target
0,TGCATTTTTTTCACAAGAGCACTTGAAGGGCGCCTATGACAAGGGA...,d
1,TGCATTTTTTTCACACATATACTTGGGTGACTTAGATATTTGCATG...,a
2,TGCATTTTTTTCACACATCTGGATTGTCTGGTGTGCTGGTATCTTC...,d
3,TGCATTTTTTTCACACCACCGTGGGGATTCGCAGCTATGTGCATAA...,a
4,TGCATTTTTTTCACACCATGGATTTAAGAATTAATCACCGGACAAC...,c
...,...,...
9977,TGCATTTTTTTCACTCTTTCACGTGGGGCCTGCGGGGTATCGGTGA...,d
9978,TGCATTTTTTTCACTGATGTGGTGCGCGTAATTTCTTTGTTGTGTT...,c
9979,TGCATTTTTTTCACTTCCAGTAATATGCGAAAGGGTGATGTGAACT...,a
9980,TGCATTTTTTTCACTTCGCACTCCACTTCTCGGTTTCTGGTATTAT...,b


# Part 3: Peptides

In [7]:
# Label the peptide set by quartile of its `target` column.
# NOTE: the class labels are written back to `target`, so the original
# values of that column are replaced in the saved file.
peptides = convert_to_four_classes(
    pd.read_csv('clean/peptides.csv'),
    'target',
)
peptides.to_csv('multiclass/peptides_multiclass.csv', index=False)
peptides

Unnamed: 0,seq,target
0,JJJJAAAAYDYWFDYJJJJJ,a
1,JAAADSYDYYAGGYDFDVJJ,c
2,JAAADSYEYYAGGYDFDVJJ,b
3,JJJAAAIDHSSSYLDYJJJJ,b
4,JJJAAAPASSDDYFDYJJJJ,c
...,...,...
67764,JJJYYYSYDYWGWFDYJJJJ,b
67765,JJJYYYVWPHWYYWFDYJJJ,c
67766,JJJJJYYYWHWGFDYJJJJJ,c
67767,JJJYYYYISVQYVFDYJJJJ,a
