In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Part 0: Scramble function

In [2]:
def scramble(seq, rng=None):
    """Return the elements of ``seq`` joined into one string in random order.

    Parameters
    ----------
    seq : iterable of str
        A string (shuffled character by character) or any iterable of string
        tokens (shuffled token by token).
    rng : object with a ``shuffle`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)`` or a
        ``np.random.RandomState``. When omitted, NumPy's global random state
        is used via ``np.random.shuffle``, exactly as before; call
        ``np.random.seed`` beforehand to make that path repeatable.

    Returns
    -------
    str
        The input elements, permuted and concatenated with no separator.
    """
    # Work on a fresh list so the caller's sequence is never reordered.
    items = list(seq)
    shuffler = np.random if rng is None else rng
    shuffler.shuffle(items)
    return ''.join(items)

def scramble_column(df, column='seq'):
    """Shuffle every sequence in one column of ``df`` and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the sequences. It is modified in place: the chosen
        column is overwritten with the shuffled strings.
    column : str, default 'seq'
        Name of the column whose values are scrambled.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so callers can reassign it.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # Each value is shuffled independently by scramble(); all other columns
    # are left untouched.
    df[column] = [scramble(seq) for seq in list(df[column])]
    return df

# Part 1: Scramble toeholds

Dataset source: https://github.com/midas-wyss/engineered-riboregulator-ML

In [3]:
# Toeholds: read the cleaned table, shuffle each sequence, save the scrambled copy
clean_df = scramble_column(pd.read_csv('clean/toeholds.csv'))
clean_df.to_csv('scramble/toeholds.csv')
clean_df

Unnamed: 0,seq,ON,OFF,target
0,GTATAGAATAATAAATTACGAACGACCCTAGAAAATAAAATGTTGG...,0.068295,0.000000,0.068295
1,CAAAAATGAGCTGAAGCATGCAAAACACAGGGGACTTACGTAAGAG...,0.000000,0.038847,-0.038847
2,AATGTATGATGGTTACAGTAGAAGAAGGAAACACCAATACAATAGA...,0.080666,0.123289,-0.042624
3,GCGTCAAATGTTTATGCAAAAAAAGGCTGTTTCCTAAACAAAAGAA...,0.933884,0.514158,0.419726
4,TATCACGAATACGGCAAAGAAAGCGATAAAGAGTAGTAGCTACGCG...,1.000000,0.089756,0.910244
...,...,...,...,...
91529,GATTCTATGATTTCAATATTGATTATAAATGAACAATTGAAAAGAG...,1.000000,0.942347,0.057653
91530,TTAATTCAATTACTCAGTATTGACTCATATATTATATGTTGAATGA...,0.333333,0.014546,0.318787
91531,TTCTAGATTTAAAATATTTTATGGTCAGAGGAATACTGTACTTTGA...,0.856589,0.776737,0.079852
91532,GAAATATTATAAAATGTTAACAAATATTATACTGTTTTATGTTAGT...,0.063088,0.001406,0.061682


# Part 2: Scramble promoters

Dataset source: https://www.nature.com/articles/s41587-019-0315-8

In [4]:
# Promoters: read the cleaned table, shuffle each sequence, save the scrambled copy
prom = scramble_column(pd.read_csv('clean/promoters.csv'))
prom.to_csv('scramble/promoters.csv')
prom

Unnamed: 0,seq,target
0,AGTAGATAGATAGGGACCAGAAGTAGTCTTGGTGCCTAATCGTGTC...,13.708592
1,CAACCTCATCCGTGTTTCAAGTCAAACTCGAATGTCTTAAATATTA...,2.553335
2,GTGCACCTTCCTTTGGTGAGGCTTGCTTGAGTTTAACTCATTCGGA...,13.369969
3,GGCGTATACCACTTGGCGTTCATGTTGTTCCTGCTGCCGGCGCGTG...,3.328683
4,TCGGATTTACTAGTTTAAAAAATTAAGACTGCTGGTCACTAGCATA...,10.466688
...,...,...
9977,CTGGTTACGGGGGTGATTCGGGGTGGGGCATGGTCGGGGTGCTTTT...,14.962475
9978,TTATGTTTGTACGACTTTTTTTTAATAGTTTATTAGAGTCCATTGG...,11.999880
9979,GCTACCGATTAATGAGGCTGGATAATCAAGGAGAAGCAGATAAATG...,4.541188
9980,TCGCTCAGACTTCCGGGTTTATGGTTCCTATGACGTCACTTTTACA...,7.456650


# Part 3: Scramble peptides

Dataset source: https://github.com/gifford-lab/antibody-2019

In [5]:
# Peptides: read the cleaned table, shuffle each sequence, save the scrambled copy
peptide = scramble_column(pd.read_csv('clean/peptides.csv'))
peptide.to_csv('scramble/peptides.csv')
peptide

Unnamed: 0,seq,target
0,AJJAADJDJFAJWJJYYJJY,-1.200356
1,DGDYSDJYGVDAAJFYAJYA,-0.294225
2,GDDJEDVYJAAYFGYJAYSA,-0.939320
3,DJJSIAHJYLJJAJAJSYDS,-1.007436
4,AYAAJPJJJDDJSDSAJFYJ,-0.595255
...,...,...
67764,SFYJJWDYYJJJYJGYYWDJ,-0.771346
67765,YYYPVHYJJWWWYDYJFJJJ,-0.470316
67766,JYFGHWWJYJJJJJDYJYJJ,-0.746523
67767,JYJYYVFJJVYJJDYJSIQY,-1.572742


# Part 4: Scramble glycans

Dataset source: https://github.com/midas-wyss/sweettalk

In [6]:
# Glycans need a dedicated scramble (scramble_glyc_column, defined below in
# this cell) that relies on a helper from outside this notebook.
import sys
# Put the project's src directory on the module search path.
# NOTE(review): presumably this is where datacleaner_updated lives — confirm.
sys.path.insert(1, '../CL_MixAutoML_NAR/src/')
# NOTE(review): the star import hides which names are pulled in (and could
# shadow names defined earlier); process_glycans is presumably supplied
# here — confirm and consider importing it explicitly.
from datacleaner_updated import *
# NOTE(review): nothing shown in this notebook is imported from this path
# afterwards — confirm it is still needed.
sys.path.insert(1, '../../clean_data/')

def scramble_glyc_column(df, column='seq'):
    """Shuffle every glycan in one column of ``df`` and return ``df``.

    The column values are first passed, as one list, through
    ``process_glycans``; each processed entry is then shuffled with
    ``scramble`` and joined back into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the glycans. It is modified in place: the chosen column
        is overwritten with the shuffled strings.
    column : str, default 'seq'
        Name of the column whose values are scrambled.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so callers can reassign it.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # NOTE(review): process_glycans is not defined in this notebook; it
    # presumably comes from the datacleaner_updated star import and splits
    # each glycan into glycoletter tokens, so the shuffle works on tokens
    # rather than single characters — confirm.
    tokenized = process_glycans(list(df[column]))
    df[column] = [scramble(tokens) for tokens in tokenized]
    return df

In [10]:
# Look at the glycoletter library: load the pickled object and print its length.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source. glycolib is not used by any other cell shown here.
glycolib = pd.read_pickle('../CL_MixAutoML_NAR/src/glycoletter_lib.pkl')
print(len(glycolib))

1027


In [7]:
# Glycans: read the cleaned table, apply the glycan-specific scramble, save the result
glyc = scramble_glyc_column(pd.read_csv('clean/glycans.csv'))
glyc.to_csv('scramble/glycans.csv')
glyc

Unnamed: 0,seq,target
0,b1-4L-Xyla1-3b1-3b1-3b1-3b1-3L-AcoL-Xyla1-1b1-...,0
1,b1-3ManGlcNAcb1-2NeuNAcGlcNAcGalb1-4a1-6b1-4Fu...,0
2,D-4dLyxHexOMeManb1-2Manb1-2,0
3,a1-6b1-4b1-4ManManManManGlca1-6a1-3GlcNAcMana1-3,0
4,RhaGalOMea1-4GalOMea1-2a1-4a1-4a1-4GalAGalAGalA,0
...,...,...
19294,b1-4Galaldehyde-QuiNAcb1-4ManGlcNAcb1-4a1-3a1-...,0
19295,GalNAcGlcNAcGalNAcGala1-2b1-4Fucb1-3Gala2-3Neu...,0
19296,a2-6a1-5a1-6D-3dLyxHepUlosaricGlca2-6GlcOPNGlc...,0
19297,b1-6FrufGalGalb1-6Galb1-4,0


# Part 5: Scramble Cas data

In [8]:
# CcaCas13b: read the table, shuffle each sequence, save the scrambled copy
ccas = scramble_column(pd.read_csv('../CL_MixAutoML_NAR/data/20190411_CcaCas13b_data.csv'))
ccas.to_csv('scramble/ccacas13b.csv')
ccas

Unnamed: 0,id,seq,len,type,avg,avg_norm_med,avg_norm_max,ab_bel_quintile,target
0,zika_0,tgttacaggtttttggctgcgggtcgttct,30,zika,341695.25,51.349927,0.880619,1,0.880619
1,zika_1,cggggttattagttgcttttctgttcggcg,30,zika,264914.00,39.811248,0.682738,1,0.682738
2,zika_2,cggcgcgttggtttgtaatttttcgggttc,30,zika,129871.25,19.517038,0.334705,1,0.334705
3,zika_3,tggctatactttcccgttttggtggtggtt,30,zika,-45.00,-0.006763,-0.000116,0,-0.000116
4,zika_4,ggggcttgtcttttttcctgaaccgtgttg,30,zika,25937.00,3.897810,0.066845,0,0.066845
...,...,...,...,...,...,...,...,...,...
724,acyl_guide_142,actgtcaagaccggtcccgctgaagccgaa,30,acryl,-3526.50,2.476040,-0.015977,0,-0.015977
725,acyl_guide_143,cgcgcatctgatacgcatcggccagagcta,30,acryl,-3167.75,2.224153,-0.014352,0,-0.014352
726,acyl_guide_144,ctggactgcgactcgggacacttaccacga,30,acryl,-2682.75,1.883623,-0.012154,0,-0.012154
727,acyl_guide_145,gcttaaatactatcccaggggggcaccccc,30,acryl,-2924.75,2.053537,-0.013251,0,-0.013251


In [9]:
# LwaCas13b: read the table, shuffle each sequence, save the scrambled copy
lwacas = scramble_column(pd.read_csv('../CL_MixAutoML_NAR/data/20190411_LwaCas13b_data.csv'))
lwacas.to_csv('scramble/lwacas13b.csv')
lwacas

Unnamed: 0,id,seq,len,type,avg_norm_max,ab_bel_quartile,target
0,dengue_34,ctcgttctcggagcatcatgtttcgatc,28,dengue,1.000000,1,1.000000
1,dengue_8,gcatagccctgtgctgtgatagtggatc,28,dengue,0.998592,1,0.998592
2,dengue_24,gtcgaactgtgacctgttctcgttcgta,28,dengue,0.997861,1,0.997861
3,dengue_22,tgtttcggtaatggcttcctgcatgagc,28,dengue,0.992782,1,0.992782
4,dengue_84,cgcggcattgcaatttgttcttcctcga,28,dengue,0.982198,1,0.982198
...,...,...,...,...,...,...,...
540,APML long top predicted guide 16,cccgctcgccgaccgaccgtgtacgtgc,28,acryl,18725.250000,0,18725.250000
541,APML long top predicted guide 17,ccgcgcggcgatcgtgtcccctccaggc,28,acryl,21741.500000,1,21741.500000
542,APML long top predicted guide 18,cctctaggctgccgcttgccggcgcgca,28,acryl,11403.750000,0,11403.750000
543,APML long top predicted guide 19,ttggcttcccccgatggggctggcatgg,28,acryl,13778.000000,0,13778.000000
