In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Part 1: Clean toeholds

From https://github.com/midas-wyss/engineered-riboregulator-ML 

In [3]:
# Load in data
data_dir = 'raw/'
file_name = 'newQC_toehold_data.csv'
data_df = pd.read_csv(data_dir + file_name,sep=',')
print(data_df.head(5))

                                              off_id  \
0  AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...   
1  AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...   
2  AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...   
3  AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...   
4  AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...   

                                               on_id source_sequence  \
0  AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...        smallpox   
1  TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...        smallpox   
2  CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...        smallpox   
3  TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...        smallpox   
4  ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...        smallpox   

          sequence_id               pre_seq              promoter  \
0  smallpox_tile_2626  CTCTGGGCTAACTGTCGCGC  TAATACGACTCACTATAGGG   
1  smallpox_tile_2625  CTCTGGGCTAACTGTCGCGC  TAATACGACTCACTATAGGG   
2  smallpox_tile_4951  

Filter the data based on QCs, as defined in Angenent-Mari et al.

In [4]:
new_data_df = data_df[~np.isnan(data_df['on_value'])]
new_data_df = new_data_df[~np.isnan(new_data_df['off_value'])]
new_data_df = new_data_df[new_data_df['off_qc'] >= 1.1]
new_data_df = new_data_df[new_data_df['on_qc'] >= 1.1].reset_index()

toehold_seqs = new_data_df['switch_sequence']
seq_len = len(toehold_seqs[0])

print('Number of remaining sequences: ', len(new_data_df))

Number of remaining sequences:  91534


In [5]:
clean_df = new_data_df[['switch_sequence', 'on_value', 'off_value', 'onoff_value']]
clean_df.columns = ['seq', 'ON', 'OFF', 'target']
clean_df.to_csv('clean/toeholds.csv', index = False)
clean_df


Unnamed: 0,seq,ON,OFF,target
0,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...,0.068295,0.000000,0.068295
1,AAAAAAAATAACGTAGGACTACTACTTGGAAACAGAGGAGATCCAA...,0.000000,0.038847,-0.038847
2,AAAAAAAATGGAAAACAGTTACTAATATGTAACAGAGGAGAACATA...,0.080666,0.123289,-0.042624
3,AAAAAAACATGAGCTTTGCTTTTTTCAAGTAACAGAGGAGAACTTG...,0.933884,0.514158,0.419726
4,AAAAAAACCAACACAGCTCCAGGAACATTAAACAGAGGAGATAATG...,1.000000,0.089756,0.910244
...,...,...,...,...
91529,TTTTTTTTTAATATTTTCACAAATATCGTTAACAGAGGAGAAACGA...,1.000000,0.942347,0.057653
91530,TTTTTTTTTCTTGATTTATCAACTTCTTTTAACAGAGGAGAAAAAG...,0.333333,0.014546,0.318787
91531,TTTTTTTTTGTCATAGCTTTCCTTTTTAAAAACAGAGGAGATTTAA...,0.856589,0.776737,0.079852
91532,TTTTTTTTTTTATAATTTTTAGTGATTTTGAACAGAGGAGACAAAA...,0.063088,0.001406,0.061682


# Part 2: Clean promoters

From https://www.nature.com/articles/s41587-019-0315-8 

In [6]:
prom = pd.read_csv(data_dir + 'GSE104878_20160503_average_promoter_ELs_per_seq_atLeast100Counts.txt', sep = '\t', header = None)
prom.columns = ['seq', 'target']
prom.to_csv('clean/promoters.csv', index = False)
prom

Unnamed: 0,seq,target
0,TGCATTTTTTTCACAAGAGCACTTGAAGGGCGCCTATGACAAGGGA...,13.708592
1,TGCATTTTTTTCACACATATACTTGGGTGACTTAGATATTTGCATG...,2.553335
2,TGCATTTTTTTCACACATCTGGATTGTCTGGTGTGCTGGTATCTTC...,13.369969
3,TGCATTTTTTTCACACCACCGTGGGGATTCGCAGCTATGTGCATAA...,3.328683
4,TGCATTTTTTTCACACCATGGATTTAAGAATTAATCACCGGACAAC...,10.466688
...,...,...
9977,TGCATTTTTTTCACTCTTTCACGTGGGGCCTGCGGGGTATCGGTGA...,14.962475
9978,TGCATTTTTTTCACTGATGTGGTGCGCGTAATTTCTTTGTTGTGTT...,11.999880
9979,TGCATTTTTTTCACTTCCAGTAATATGCGAAAGGGTGATGTGAACT...,4.541188
9980,TGCATTTTTTTCACTTCGCACTCCACTTCTCGGTTTCTGGTATTAT...,7.456650


# Part 3: Peptides

From https://github.com/gifford-lab/antibody-2019

In [7]:
seq = pd.read_csv(data_dir + 'sequence.tsv', sep = '\t', header = None)
target = pd.read_csv(data_dir + 'target.txt', sep = '\t', header = None)
peptide = pd.DataFrame(zip(seq.iloc[:,1],target.iloc[:,0]))
peptide.columns = ['seq', 'target']
peptide.to_csv('clean/peptides.csv', index = False)
peptide

Unnamed: 0,seq,target
0,JJJJAAAAYDYWFDYJJJJJ,-1.200356
1,JAAADSYDYYAGGYDFDVJJ,-0.294225
2,JAAADSYEYYAGGYDFDVJJ,-0.939320
3,JJJAAAIDHSSSYLDYJJJJ,-1.007436
4,JJJAAAPASSDDYFDYJJJJ,-0.595255
...,...,...
67764,JJJYYYSYDYWGWFDYJJJJ,-0.771346
67765,JJJYYYVWPHWYYWFDYJJJ,-0.470316
67766,JJJJJYYYWHWGFDYJJJJJ,-0.746523
67767,JJJYYYYISVQYVFDYJJJJ,-1.572742


# Part 4: Glycans

From https://github.com/midas-wyss/sweettalk

In [8]:
all_glycs = pd.read_csv(data_dir + 'combined_database_long.csv')
immuno_glycs = pd.read_csv(data_dir + 'immunogenic_glycans_clean.csv')

print(len(all_glycs))
print(len(immuno_glycs))

immunoglycan_seqs = list(immuno_glycs['seq'])

for im in immunoglycan_seqs:
    if im not in list(all_glycs['target']):
        print(im) # okay so there are two that are not in there
        # will ask daniel
        
# need to dereplicate 
all_glycs = all_glycs.drop_duplicates(subset = 'target', keep = 'first')

21296
685
GalNAc(b1-4)[NeuNAc(a2-3)]Gal(b1-3)[NeuNAc(a2-8)]GlcNAc(b1-3)Gal
Glc(a1-4)Gal(b1-4)[Gal(b1-7)]LDManHep(a1-5)KdoOP)


In [9]:
all_glycs['immunogenic'] = [0] * len(all_glycs)
for im in immunoglycan_seqs:
    all_glycs.loc[all_glycs['target'] == im, 'immunogenic'] = 1

all_glycs

Unnamed: 0,target,immunogenic
0,Glc(a1-3)Glc(a1-3)Man(a1-2)Man(a1-2)Man(a1-3)[...,0
1,Gal(a1-3)Galf(a1-3)Galf(a1-3)Man(a1-3)Man(a1-4...,0
2,Gal(a1-2)Man(b1-4)Rha,0
3,Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)Man(a1-3)[Gal(...,0
4,GlcNAc(b1-3)Gal(b1-3)[NeuNAc(a2-6)]GalNAc,0
...,...,...
21291,Galf(a1-3)RhaOAc(a1-4)GlcNAc(a1-2)Rha(a1-3)Glc...,1
21292,Man(a1-2)Man(a1-2)[Glc(a1-4)]Man(b1-3)FucNAc(a...,1
21293,Rha(a1-3)Ribf(b1-4)Rha(a1-3)Ribf(b1-4)Rha(a1-3...,1
21294,Man(b1-3)ManNAc(a1-3)Rha(b1-4)GlcNAc(a1-2)Man,1


Check dereplication

In [10]:
print(sum(all_glycs['immunogenic']))

uniques = []
for seq in list(all_glycs['target']):
    if seq not in uniques:
        uniques.append(seq)
    else:
        print('duplicate?: ' + seq)      
# good- none anymore

# four before dereplicating- just one now
example = 'Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-3)Gal'
all_glycs[all_glycs['target'] == example]

667


Unnamed: 0,target,immunogenic
909,Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-3)Gal,1


In [11]:
all_glycs.columns = ['seq', 'target']
all_glycs.to_csv('clean/glycans.csv', index = False)
all_glycs

Unnamed: 0,seq,target
0,Glc(a1-3)Glc(a1-3)Man(a1-2)Man(a1-2)Man(a1-3)[...,0
1,Gal(a1-3)Galf(a1-3)Galf(a1-3)Man(a1-3)Man(a1-4...,0
2,Gal(a1-2)Man(b1-4)Rha,0
3,Gal(b1-4)[Fuc(a1-3)]GlcNAc(b1-2)Man(a1-3)[Gal(...,0
4,GlcNAc(b1-3)Gal(b1-3)[NeuNAc(a2-6)]GalNAc,0
...,...,...
21291,Galf(a1-3)RhaOAc(a1-4)GlcNAc(a1-2)Rha(a1-3)Glc...,1
21292,Man(a1-2)Man(a1-2)[Glc(a1-4)]Man(b1-3)FucNAc(a...,1
21293,Rha(a1-3)Ribf(b1-4)Rha(a1-3)Ribf(b1-4)Rha(a1-3...,1
21294,Man(b1-3)ManNAc(a1-3)Rha(b1-4)GlcNAc(a1-2)Man,1


# Part 4B: New, updated glycans
#### Courtesy of Daniel Bojar, thanks Daniel!


In [12]:
v2 = pd.read_csv(data_dir + 'v2_glycobase.csv')
v2['immunogenicity'] = v2['immunogenicity'].fillna(0)
v2

Unnamed: 0,glycan_id,glycan,species,immunogenicity,inferred_origin,link,genus,family,order,class,phylum,kingdom,domain
0,1,Glc(a1-1)[L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-X...,,0.0,Bacteria,,,,,,,,
1,2,GlcNAc(b1-2)[Gal(b1-3)[NeuNAc(a2-6)]GlcNAc(b1-...,,0.0,Chordata,,,,,,,,
2,3,Man(b1-2)Man(b1-2)D-4dLyxHexOMe,['Candida_albicans'],0.0,,,['Candida'],['Saccharomycetaceae'],['Saccharomycetales'],['Saccharomycetes'],['Ascomycota'],['Fungi'],['Eukarya']
3,4,Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1...,,0.0,Eukarya,,,,,,,,
4,5,GalA(a1-4)GalA(a1-4)GalA(a1-2)[GalOMe(a1-4)Gal...,,0.0,Angiosperms,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19294,19295,NeuNAc(a2-3)Gal(b1-4)GlcNAc(a1-2)Man(a1-3)[Gal...,,0.0,Eukarya,,,,,,,,
19295,19296,GalNAc(b1-4)[NeuNAc(a2-3)]Gal(b1-4)GlcNAc(b1-6...,['Macaca_radiata'],0.0,,O,['Macaca'],['Cercopithecidae'],['Primates'],['Mammalia'],['Chordata'],['Animalia'],['Eukarya']
19296,19297,D-3dLyxHepUlosaric(a2-6)Glc(b1-4)D-3dLyxHepUlo...,['Acinetobacter_haemolyticus'],0.0,,,['Acinetobacter'],['Moraxellaceae'],['Pseudomonadales'],['Gammaproteobacteria'],['Proteobacteria'],['Bacteria'],['Bacteria']
19297,19298,Gal(b1-6)Gal(b1-6)Gal(b1-4)Fruf,,0.0,Angiosperms,,,,,,,,


In [13]:
uniques = []
for seq in list(v2['glycan']):
    if seq not in uniques:
        uniques.append(seq)
    else:
        print('duplicate?: ' + seq)  
# super no duplicates!

In [14]:
# make ints
v2['value'] = [int(i) for i in v2['immunogenicity']]
v2 = v2[['glycan', 'value']]
v2.columns = ['seq', 'target']
v2.to_csv('clean/glycans.csv', index = False)
v2

Unnamed: 0,seq,target
0,Glc(a1-1)[L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-X...,0
1,GlcNAc(b1-2)[Gal(b1-3)[NeuNAc(a2-6)]GlcNAc(b1-...,0
2,Man(b1-2)Man(b1-2)D-4dLyxHexOMe,0
3,Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1...,0
4,GalA(a1-4)GalA(a1-4)GalA(a1-2)[GalOMe(a1-4)Gal...,0
...,...,...
19294,NeuNAc(a2-3)Gal(b1-4)GlcNAc(a1-2)Man(a1-3)[Gal...,0
19295,GalNAc(b1-4)[NeuNAc(a2-3)]Gal(b1-4)GlcNAc(b1-6...,0
19296,D-3dLyxHepUlosaric(a2-6)Glc(b1-4)D-3dLyxHepUlo...,0
19297,Gal(b1-6)Gal(b1-6)Gal(b1-4)Fruf,0


In [15]:
v2small = v2.iloc[0:1000,:]
v2small.to_csv('clean/small_glycans.csv', index = False)
v2small

v2tiny = v2.iloc[0:10,:]
v2tiny.iloc[0,1] = 1 # manual make fake data to test
v2tiny.to_csv('clean/tiny_glycans.csv', index = False)
v2tiny

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,seq,target
0,Glc(a1-1)[L-Xyl(b1-3)L-Xyl(b1-3)L-Xyl(b1-3)L-X...,1
1,GlcNAc(b1-2)[Gal(b1-3)[NeuNAc(a2-6)]GlcNAc(b1-...,0
2,Man(b1-2)Man(b1-2)D-4dLyxHexOMe,0
3,Man(a1-3)[Man(a1-3)[Man(a1-6)]Man(a1-6)]Man(b1...,0
4,GalA(a1-4)GalA(a1-4)GalA(a1-2)[GalOMe(a1-4)Gal...,0
5,Gal(a1-3)Gal(b1-4)GlcNAc(b1-6)Gal(b1-4)Glc,0
6,GlcA(b1-2)[Rha(a1-3)]Man,0
7,Man(b1-3)[Man(b1-4)]Man(b1-4)GlcNAc(b1-4)GlcNAc,0
8,GlcOAcNOOle(b1-4)GlcNAc(b1-4)GlcNAc(b1-4)[FucO...,0
9,Gal(b1-3)[D-Fuc(a1-3)GlcNAc(b1-6)]Gal(b1-3)[Gl...,0
