In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Part 1: Clean toeholds

From https://github.com/midas-wyss/engineered-riboregulator-ML 

In [2]:
# Read the raw toehold table from disk and preview its first rows.
data_dir = 'raw/'
file_name = 'newQC_toehold_data.csv'
data_df = pd.read_csv(f'{data_dir}{file_name}', sep=',')
print(data_df.head(n=5))

                                              off_id  \
0  AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...   
1  AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...   
2  AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...   
3  AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...   
4  AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...   

                                               on_id source_sequence  \
0  AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...        smallpox   
1  TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...        smallpox   
2  CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...        smallpox   
3  TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...        smallpox   
4  ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...        smallpox   

          sequence_id               pre_seq              promoter  \
0  smallpox_tile_2626  CTCTGGGCTAACTGTCGCGC  TAATACGACTCACTATAGGG   
1  smallpox_tile_2625  CTCTGGGCTAACTGTCGCGC  TAATACGACTCACTATAGGG   
2  smallpox_tile_4951  

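Filter the data using the quality-control (QC) criteria defined in Angenent-Mari et al.: drop rows with a missing ON or OFF value, and keep only rows whose `on_qc` and `off_qc` are both at least 1.1.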

In [4]:
# QC filter: a row is kept only if both measurements are present (not NaN)
# and both QC scores are at least 1.1.
keep_rows = (
    ~np.isnan(data_df['on_value'])
    & ~np.isnan(data_df['off_value'])
    & (data_df['off_qc'] >= 1.1)
    & (data_df['on_qc'] >= 1.1)
)

# reset_index() renumbers the surviving rows from 0 (the previous labels are kept
# in an 'index' column), which is what makes the label lookup toehold_seqs[0] valid.
new_data_df = data_df[keep_rows].reset_index()

toehold_seqs = new_data_df['switch_sequence']
seq_len = len(toehold_seqs[0])  # length of the first remaining switch sequence

print('Number of remaining sequences: ', len(new_data_df))

Number of remaining sequences:  91534


In [5]:
# Keep only the switch sequence and its three measurements, under short column
# names; the 'onoff_value' column is exported as 'target'.
clean_df = new_data_df[['switch_sequence', 'on_value', 'off_value', 'onoff_value']].rename(
    columns={
        'switch_sequence': 'seq',
        'on_value': 'ON',
        'off_value': 'OFF',
        'onoff_value': 'target',
    }
)
clean_df.to_csv('clean/toeholds.csv', index=False)
clean_df


Unnamed: 0,seq,ON,OFF,target
0,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...,0.068295,0.000000,0.068295
1,AAAAAAAATAACGTAGGACTACTACTTGGAAACAGAGGAGATCCAA...,0.000000,0.038847,-0.038847
2,AAAAAAAATGGAAAACAGTTACTAATATGTAACAGAGGAGAACATA...,0.080666,0.123289,-0.042624
3,AAAAAAACATGAGCTTTGCTTTTTTCAAGTAACAGAGGAGAACTTG...,0.933884,0.514158,0.419726
4,AAAAAAACCAACACAGCTCCAGGAACATTAAACAGAGGAGATAATG...,1.000000,0.089756,0.910244
...,...,...,...,...
91529,TTTTTTTTTAATATTTTCACAAATATCGTTAACAGAGGAGAAACGA...,1.000000,0.942347,0.057653
91530,TTTTTTTTTCTTGATTTATCAACTTCTTTTAACAGAGGAGAAAAAG...,0.333333,0.014546,0.318787
91531,TTTTTTTTTGTCATAGCTTTCCTTTTTAAAAACAGAGGAGATTTAA...,0.856589,0.776737,0.079852
91532,TTTTTTTTTTTATAATTTTTAGTGATTTTGAACAGAGGAGACAAAA...,0.063088,0.001406,0.061682


In [14]:
# Build the Green2014 test set (data from Valeri, Collins, Ramesh et al.).
green_seqs = pd.read_csv('raw/Valeri, Collins, Ramesh et al/Green2014_clean.csv')
# Binary label: 1 when the toehold is rated 'Good', 0 for every other rating.
green_seqs['target'] = (green_seqs['Toehold Rating'] == 'Good').astype('int64')
green_seqs = green_seqs[['Toehold sequence', 'target']].rename(
    columns={'Toehold sequence': 'seq'}
)
green_seqs.to_csv('clean/green_sequences_toehold_test_set.csv', index=False)
green_seqs

Unnamed: 0,seq,target
0,AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...,0
1,ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...,0
2,CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...,0
3,TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...,0
4,TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...,0
...,...,...
163,ATAATGATGATGAGTATGTTGAAGGTGTAAGACAGAGGAGATTACA...,1
164,AATTTGGAAGTAGAGTAGTAGATAGTTATGAACAGAGGAGACATAA...,1
165,ATTGATTTGTATTCGTGTTATGTGTCGTTGGACAGAGGAGACAACG...,0
166,AATGATTTGAAACGATGAACTACCTACTTGAACAGAGGAGACAAGT...,0


In [15]:
# Build the Pardee2016 test set (data from Valeri, Collins, Ramesh et al.):
# each sensor sequence is exported together with its rank.
# NOTE(review): the sequences displayed for this file use U, whereas the other
# toehold files written by this notebook use T -- confirm downstream code expects that.
pardee_seqs = pd.read_excel('raw/Valeri, Collins, Ramesh et al/Pardee2016_clean.xlsx')
pardee_seqs = pardee_seqs[['Sensor sequence', 'Rank']].rename(
    columns={'Sensor sequence': 'seq', 'Rank': 'rank'}
)
pardee_seqs.to_csv('clean/pardee_sequences_toehold_test_set.csv', index=False)
pardee_seqs

Unnamed: 0,seq,rank
0,UCUUCAGCCUCCAUGUGUCAUUCUUCUCACUCUCAAGUUAUAGUUA...,6
1,AAAUUCCCCUUGUUUCUUUUCUCUUUUUCCCAUCAUGUUAUAGUUA...,14
2,UUUCGCUCUAUUCUCAUCAGUUUCAUGUCCUGUGUCGUUAUAGUUA...,8
3,GCUCCCCUUCUACUGAUCUCCACAUGAUGUUUUCCAGUUAUAGUUA...,5
4,AACUUCUUUAUUAUUUCCAUAGCCUCUUUUUUCCCCGUUAUAGUUA...,23
5,AGGGUGGCUUCGGCUCUUGGUGAAUUGGGCGUUAUCGUUAUAGUUA...,19
6,CUUCCACUGCAGUCUUCCACUCUUUUUCCUCUUCAAGUUAUAGUUA...,2
7,CUGGGAUCAAGUACAUGUAGUGCGCCACGAGCAAAAGUUAUAGUUA...,1
8,ACUAUUAGGGUCAGGGGUGUUAAUUGUGAGUAGCAAGUUAUAGUUA...,22
9,CAUCGCCAUUAAGGAGUAGUUGUUGUAUGAGGUGGUGUUAUAGUUA...,24


# Part 2: Peptides

From https://github.com/gifford-lab/antibody-2019

In [4]:
# Regression dataset: pair every peptide sequence with its regression target.
# Both files are read without a header row; the sequence is taken from the second
# column of the x file and the target from the first column of the y file.
seq = pd.read_csv(data_dir + 'peptide/regression_all_data_x.tsv', sep='\t', header=None)
target = pd.read_csv(data_dir + 'peptide/regression_all_data_y.txt', sep='\t', header=None)
# Rows are paired by position. Building the frame from two arrays (instead of zip(),
# which silently truncates to the shorter input) makes pandas raise a ValueError
# when the x and y files do not have the same number of rows.
peptide = pd.DataFrame({
    'seq': seq.iloc[:, 1].to_numpy(),
    'target': target.iloc[:, 0].to_numpy(),
})
peptide.to_csv('clean/regression_all_peptides.csv', index=False)
peptide

Unnamed: 0,seq,target
0,JJJJAAAAYDYWFDYJJJJJ,-1.200356
1,JAAADSYDYYAGGYDFDVJJ,-0.294225
2,JAAADSYEYYAGGYDFDVJJ,-0.939320
3,JJJAAAIDHSSSYLDYJJJJ,-1.007436
4,JJJAAAPASSDDYFDYJJJJ,-0.595255
...,...,...
67764,JJJYYYSYDYWGWFDYJJJJ,-0.771346
67765,JJJYYYVWPHWYYWFDYJJJ,-0.470316
67766,JJJJJYYYWHWGFDYJJJJJ,-0.746523
67767,JJJYYYYISVQYVFDYJJJJ,-1.572742


In [7]:
# Classification training set: pair every peptide sequence with its class label.
# Both files are read without a header row; the sequence is taken from the second
# column of the x file and the label from the first column of the y file.
seq = pd.read_csv(data_dir + 'peptide/classification_train_x.tsv', sep='\t', header=None)
target = pd.read_csv(data_dir + 'peptide/classification_train_y.txt', sep='\t', header=None)
# Rows are paired by position. Building the frame from two arrays (instead of zip(),
# which silently truncates to the shorter input) makes pandas raise a ValueError
# when the x and y files do not have the same number of rows.
peptide = pd.DataFrame({
    'seq': seq.iloc[:, 1].to_numpy(),
    'target': target.iloc[:, 0].to_numpy(),
})
peptide.to_csv('clean/classification_train_peptides.csv', index=False)
peptide

Unnamed: 0,seq,target
0,JJAADSLWDVFDDVFDYJJJ,1
1,JJJAAFERQYYYSYFDYJJJ,1
2,JJJAAGFKAYYYSFDYJJJJ,1
3,JJJAAHDSTDQSSVFDYJJJ,1
4,JJAAHEKGQDYYQGQFDYJJ,1
...,...,...
63395,JJJYYYVWPHWYYWFDYJJJ,0
63396,JJJJJYYYWHWGFDYJJJJJ,0
63397,JJJYYYYISVQYVFDYJJJJ,0
63398,JJYYYYRKSWQYHHWFDYJJ,0


In [10]:
# Classification test set: pair every peptide sequence with its target value.
# Both files are read without a header row; the sequence is taken from the second
# column of the x file and the target from the first column of the y file.
# NOTE(review): the targets displayed for this file are continuous values rather
# than 0/1 labels like the training set -- confirm this is the intended test target.
seq = pd.read_csv(data_dir + 'peptide/classification_test_x.tsv', sep='\t', header=None)
target = pd.read_csv(data_dir + 'peptide/classification_test_y.txt', sep='\t', header=None)
# Rows are paired by position. Building the frame from two arrays (instead of zip(),
# which silently truncates to the shorter input) makes pandas raise a ValueError
# when the x and y files do not have the same number of rows.
peptide = pd.DataFrame({
    'seq': seq.iloc[:, 1].to_numpy(),
    'target': target.iloc[:, 0].to_numpy(),
})
peptide.to_csv('clean/classification_test_peptides.csv', index=False)
peptide

Unnamed: 0,seq,target
0,JJHKPQAKSYLAYRILDYJJ,2.071263
1,JJHKPQAKSYLSYRILDYJJ,1.959289
2,JJHKPQAKSYLTYRILDYJJ,1.953657
3,JJHKPQAKSYVPYRILDYJJ,1.856950
4,JJHKPQANSYLPYRILDYJJ,1.852442
...,...,...
466,JJGYGGLSWDQDKGQLDYJJ,0.615814
467,JJJJJDVVQRYFFDYJJJJJ,0.611789
468,JJJJIKEGEGVPDFDYJJJJ,0.611789
469,JJGGPFKVSSFYRYHLDYJJ,0.608865


# Part 3: Glycans
#### Courtesy of Daniel Bojar, thanks Daniel!


In [3]:
# Table S2 of the original glycans paper was downloaded from
# https://www.cell.com/cell-host-microbe/fulltext/S1931-3128(20)30562-X#relatedArticles
# Only the glycans labelled 0 or 1 for immunogenicity are used from it.
# Table S1 of the same paper was also downloaded; it is used for multiclass classification.
# (Presumably these are the mmc3.xlsx and mmc2.xlsx files read in the next cells -- confirm.)

In [11]:
tables2 = pd.read_excel(data_dir + 'mmc3.xlsx')

print(f'full table length: {len(tables2)}')
# Only rows with an explicit immunogenicity label of 1 or 0 are kept;
# every other row of the table is left out of the exported dataset.
immunogen1 = tables2.loc[tables2['immunogenicity'] == 1]
print(f'immunogenic 1: {len(immunogen1)}')
immunogen0 = tables2.loc[tables2['immunogenicity'] == 0]
print(f'immunogenic 0: {len(immunogen0)}')
# Label-1 rows first, then label-0 rows; the original row labels are kept.
immunogenfull = pd.concat([immunogen1, immunogen0])
print(f'full length of immunogencity dataset: {len(immunogenfull)}')
immunogenfull = immunogenfull[['glycan', 'immunogenicity']].rename(
    columns={'glycan': 'seq', 'immunogenicity': 'target'}
)
immunogenfull.to_csv('clean/immunogenic_glycans.csv', index=False)
immunogenfull

full table length: 19299
immunogenic 1: 636
immunogenic 0: 684
full length of immunogencity dataset: 1320


Unnamed: 0,seq,target
34,Rha(a1-2)Rha(a1-3)Rha(b1-4)Rha,1.0
45,FucNAc(a1-3)QuiNAcNBut(b1-4)FucNAc,1.0
65,[Col(a1-2)]Gal(b1-3)[Col(a1-4)]GlcNAc(b1-4)Gal...,1.0
76,GalNAc(a1-4)ManNAcA(b1-6)GalNAc,1.0
77,QuiNAlaAc(b1-4)GalNAcA(a1-4)GalOAc(a1-2)QuiNAlaAc,1.0
...,...,...
19223,Gal(b1-4)GlcNAc(b1-2)Man(a1-6)[GlcNAc(b1-2)Man...,0.0
19242,Fuc(a1-2)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Gal(b1...,0.0
19279,Fuc(a1-2)[GalNAc(a1-3)]Gal(b1-4)GlcNAc(b1-2)Ma...,0.0
19286,NeuNAc(a2-6)Gal(b1-4)GlcNAc(b1-2)Man(a1-3)[Glc...,0.0


In [17]:
tables1 = pd.read_excel(data_dir + 'mmc2.xlsx')

print(f'full table length: {len(tables1)}')
# Report how many glycans fall into each domain.
for domain_name, n_glycans in tables1.groupby('domain').size().items():
    print(domain_name, n_glycans)

# In this table the glycan string is stored in the column named 'target'; it is
# exported as 'seq', and the 'domain' label becomes the new 'target'.
# NOTE(review): this file is written under 'multiclass/' while the other outputs of
# this notebook go to 'clean/' -- confirm that is intended.
tables1 = tables1[['target', 'domain']].rename(
    columns={'target': 'seq', 'domain': 'target'}
)
tables1.to_csv('multiclass/domain_glycans.csv', index=False)
tables1

full table length: 12674
Archaea 34
Bacteria 5856
Eukarya 6635
Virus 149


Unnamed: 0,seq,target
0,Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man(b1-4)GlcNAc...,Eukarya
1,GlcNAc(b1-2)Man(a1-3)[Man(a1-6)][Xyl(b1-2)]Man...,Eukarya
2,GlcNAc(b1-2)Man(a1-6)[Man(a1-3)][Xyl(b1-2)]Man...,Eukarya
3,Fuc(a1-6)[Gal(b1-4)]GlcNAc(b1-2)Man(a1-6)[GlcN...,Eukarya
4,GlcNAc(b1-2)Man(a1-3)[GlcNAc(b1-2)Man(a1-6)][X...,Eukarya
...,...,...
12669,QuiNAc(b1-7)LDManHep(a1-6)Glc(a1-2)Glc(a1-3)Glc,Bacteria
12670,Rha(a1-3)Rha(a1-4)GalNAcA(a1-3)GlcNAc(b1-2)Rha,Bacteria
12671,LDManHep(a1-6)Glc(a1-2)Glc,Bacteria
12672,Gal(a1-3)FucNAc(a1-3)GlcNAc(b1-3)Rib-ol,Bacteria
