In [None]:
### INSTRUCTIONS TO RUN
###
### To run this file you need to have the packages imported below installed in your 
### environment: pandas, regex, biopython, and OligoArrayAux
### NOTE: the OligoArrayAux package is available as an RPM and needs to be built
### on your computer.
###
### Additionally, you need the following raw data files:
###
### 13059_2018_1459_MOESM5_ESM.xlsx         from 
### Doench 2014 Supplemental Tables.xlsx    from 
### Fusi_Data.csv                           from 
### Shalem_KO_data.xlsx                     from 
### Supplementary Table 1.xlsx              from
###
###
### by Ian Hay, Jaeson Pyeon, and Zain Alam
### April 19, 2023 for CS4120 NLP Final Project

In [56]:
import pandas as pd
import re
## need biopython to run: https://anaconda.org/anaconda/biopython
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
## need OligoArrayAux to run: http://www.unafold.org/Dinamelt/software/OligoArrayAux.php
import oligo_melting as OligoMelt

## Featurization
### Functions

In [57]:
def letter_at_position(string, letter, position):
    ## takes in a string, letter (both as strings), and position as an integer
    ## returns boolean whether the given letter is at the given position in the string
    return string[position:position+1].__eq__(letter)

def tuple_at_position(string, letters, position):
    ## takes in a string, 2 letters (as a single string) and position as an integer
    ## returns boolean whether the given letters are at the given position in the string
    return string[position:position+2].__eq__(letters)

def triple_at_position(string, letters, position):
    ## takes in a string, 3 letters (as a single string) and position as an integer
    ## returns boolean whether the given letters are at the given position in the string
    return string[position:position+3].__eq__(letters)

def find_PAM(sequence, PAM):
    ## takes in sequence and PAM as strings representing genetic target sequence and the promotospacer-adjacent motif
    ## of the gene editing complex
    ## returns the position as an int or -1 if not found
    match = re.search(PAM, sequence)
    return match.start()

def gc_count(string):
    ## takes in a string
    ## returns an integer representing the number of G or C letters in that string
    count = 0
    for letter in string:
        if letter.__eq__('G') | letter.__eq__('C'):
            count=count+1
    return count

In [58]:
def v1Featurization(df, sequenceCol):

    V1_bool_parameters=["GC<10","GC>10"]

    texts = df[sequenceCol]

    for k in range(20):
        ## creates columns for each nucleotide at each position in sequence
        for x in ['A',"G","C","T"]:
            df[x + str(k+1)] = letter_at_position(texts,x,k)
            V1_bool_parameters.append(x + str(k+1))
            if k < 19:
                for y in ["A","G","C","T"]:
                    df[x + y + str(k+1)] = tuple_at_position(texts,x+y,k)
                    V1_bool_parameters.append(x + y + str(k+1))
            else:
                for y in ["A","G","C","T"]:
                    df[x + y + "?"] =  False # not technically correct but will work as placeholder
                    V1_bool_parameters.append(x + y + "?")
            df[x + "?"] = False # same with this
            V1_bool_parameters.append(x + "?")
            
    df['GC Count'] = -1 ## creates column for GC count

    df['Melting Temp Wallace'] = 0
    df['Melting Temp GC'] = 0
    df['Melting Temp NN1'] = 0
    df['Melting Temp NN2'] = 0
    df['Melting Temp NN3'] = 0
    df['Melting Temp NN4'] = 0

    df["GC<10"] = False
    df["GC>10"] = False

    
    for a in range(texts.size):
        ## loops through each sequence and computes the necessary one-hot nucleotide encoding features
        for k in range(20):
            for x in ['A',"G","C","T"]:
                df[x + str(k+1)][a] = letter_at_position(texts[a],x,k)
                if k < 19:
                    for y in ["A","G","C","T"]:
                        df[x + y + str(k+1)][a] = tuple_at_position(texts[a],x+y,k)
                        if (not df[x + y + "?"][a]):
                            df[x + y + "?"][a] = df[x + y + str(k+1)][a]
                
                if (not df[x + "?"][a]):
                    df[x + "?"][a] = df[x + str(k+1)][a]
                            
        ## generates the GC count data for each column
        df['GC Count'][a] = gc_count(texts[a])

    for a in range(texts.size):
        seq = Seq(texts[a])
        df["Melting Temp Wallace"][a] = mt.Tm_Wallace(seq)
        df["Melting Temp GC"][a] = mt.Tm_GC(seq)
        df["Melting Temp NN1"][a] = mt.Tm_NN(seq, nn_table=mt.DNA_NN1)
        df["Melting Temp NN2"][a] = mt.Tm_NN(seq, nn_table=mt.DNA_NN2)
        df["Melting Temp NN3"][a] = mt.Tm_NN(seq, nn_table=mt.DNA_NN3)
        df["Melting Temp NN4"][a] = mt.Tm_NN(seq, nn_table=mt.DNA_NN4)
        
        df["GC<10"][a] = df["GC Count"][a] < 10
        df["GC>10"][a] = df["GC Count"][a] > 10
    
    for v1 in V1_bool_parameters:
        df[v1]=df[v1].astype(int)

    return df

In [59]:
def v2Featurization(df, sequenceCol):
    texts = df[sequenceCol]

    V2_bool_parameters=[]

    df["AAAAA"] = "AAAAA" in texts ## initializes each column to false
    df["GGGGG"] = "GGGGG" in texts
    df["CCCCC"] = "CCCCC" in texts
    df["TTTTT"] = "TTTTT" in texts
    V2_bool_parameters.append(["AAAAA","GGGGG","CCCCC","TTTTT"])

    for k in range(19):
        for x in ['A',"G","C","T"]:
            for y in ['A',"G","C","T"]:
                for z in ["A","G","C","T"]:
                    
                    if k < 18:
                        df[x + y + z + str(k+1)] = False
                        V2_bool_parameters.append(x + y + z + str(k+1))
                    else:
                        df[x+y+z] = False
                        V2_bool_parameters.append(x+y+z)
    df["Delta G"] = 0

    for a in range(texts.size):
        df["AAAAA"][a] = "AAAAA" in texts[a]
        df["GGGGG"][a] = "GGGGG" in texts[a]
        df["CCCCC"][a] = "CCCCC" in texts[a]
        df["TTTTT"][a] = "TTTTT" in texts[a]
        
        for k in range(19):
            for x in ['A',"G","C","T"]:
                for y in ['A',"G","C","T"]:
                    for z in ["A","G","C","T"]:

                        if k < 18:
                            df[x + y + z + str(k+1)][a] = triple_at_position(texts[a], x+y+z, k)
                            if (not df[x+y+z][a]):
                                df[x+y+z][a] = df[x + y + z + str(k+1)][a]
    for a in range(texts.size):
        ## note that this uses the 20-n.t. guide RNA sequence

        # # Calculate melting temperature for 25uM oligos
        # (name, g, h, s, tm, seq)
        df["Delta G"][a] = OligoMelt.Duplex.calc_tm(texts[a])[1]

    for v2 in V2_bool_parameters:
        df[v2]=df[v2].astype(int)

    return df

### Doench et al 2014

In [60]:
df_doench=pd.read_excel("Doench 2014 Supplemental Tables.xlsx", sheet_name="Supp Table 10")
df_doench.head()

  for idx, row in parser.parse():


Unnamed: 0,Spacer Sequence,Expanded Sequence,Spacer_ID,Gene Symbol,Essential Gene Set,Expression Value (CCLE),Early Time Point,DMSO Day 14 Rep 1,DMSO Day 14 Rep 2,ETP - DMSO,Gene Avg,Normalized sgRNA Activity,sgRNA Score
0,GTAATGGCTTCCTCGTGAGT,GGCCGTAATGGCTTCCTCGTGAGTTGGTCC,s_40338,PARS2,aminoacyl,7.013512,3.623884,3.727199,2.739853,0.390358,1.173702,-0.783345,0.325738
1,GCTCGTGCGAGTGATAGACC,AGAAGCTCGTGCGAGTGATAGACCAGGAGA,s_40337,PARS2,aminoacyl,7.013512,3.432073,0.942266,2.950215,1.485832,1.173702,0.312129,0.162559
2,CACGGACGGTATATGGCAGG,ATGGCACGGACGGTATATGGCAGGAGGTGG,s_40336,PARS2,aminoacyl,7.013512,4.679738,3.576145,2.493495,1.644918,1.173702,0.471215,0.035653
3,TACTTCTGCGAGACACGGAC,TGGCTACTTCTGCGAGACACGGACAGGTCG,s_32891,MARS2,aminoacyl,7.065261,2.050553,1.291687,1.034652,0.887383,1.086888,-0.199505,0.311042
4,CGGCCCACGCTACTACAGTT,ACTTCGGCCCACGCTACTACAGTTCGGGCT,s_32889,MARS2,aminoacyl,7.065261,5.252744,4.041662,4.907459,0.778184,1.086888,-0.308705,0.199951


In [61]:
df_doench.shape

(1278, 13)

### Doench et al 2016

In [62]:
df_doench_2016=pd.read_excel("13059_2018_1459_MOESM5_ESM.xlsx", sheet_name="hek293t")
df_doench_2016["20mer"] = df_doench_2016["sgRNA"].str[:20]
df_doench_2016.head()

Unnamed: 0,Chromosome,Start,End,Strand,sgRNA,Normalized efficacy,20mer
0,chr16,28602245,28602267,-,GGCTGCTTTACCCGCTGTGGGGG,0.323783,GGCTGCTTTACCCGCTGTGG
1,chr16,28602131,28602153,-,TCCGGGTTGGCCTTCCACTGGGG,0.349162,TCCGGGTTGGCCTTCCACTG
2,chr16,28600418,28600440,+,CAGCATCCTTCGGAAAGCTCTGG,0.280197,CAGCATCCTTCGGAAAGCTC
3,chr16,28602206,28602228,-,CGGTAGAAGCAGGTAGTCTGGGG,0.226491,CGGTAGAAGCAGGTAGTCTG
4,chr16,28602121,28602143,+,CATCCCGCTGCCCCAGTGGAAGG,0.255358,CATCCCGCTGCCCCAGTGGA


In [63]:
df_doench_2016.shape

(2333, 7)

### Fusi et al 2015

In [64]:
df_fusi=pd.read_csv("Fusi_Data.csv")
df_fusi["20mer"] = df_fusi["30mer"].str[5:-5]
df_fusi.head()

Unnamed: 0.1,Unnamed: 0,30mer,Target gene,Percent Peptide,Amino Acid Cut position,score_drug_gene_rank,score_drug_gene_threshold,drug,predictions,20mer
0,0,CAGAAAAAAAAACACTGCAACAAGAGGGTA,CD5,72.87,360.0,0.083682,0,nodrug,0.544412,AAAAAAACACTGCAACAAGA
1,1,TTTTAAAAAACCTACCGTAAACTCGGGTCA,NF1,65.8,1868.0,0.868207,1,PLX_2uM,0.617512,AAAAACCTACCGTAAACTCG
2,2,TCAGAAAAAGCAGCGTCAGTGGATTGGCCC,CD5,84.21,416.0,0.1841,0,nodrug,0.476232,AAAAGCAGCGTCAGTGGATT
3,3,AATAAAAAATAGGATTCCCAGCTTTGGAAG,NF1,56.39,1601.0,0.432065,0,PLX_2uM,0.459882,AAAATAGGATTCCCAGCTTT
4,4,GATGAAAAATATGTAAACAGCATTTGGGAC,CUL3,4.3,33.0,0.149351,0,PLX_2uM,0.290841,AAAATATGTAAACAGCATTT


In [65]:
df_fusi.shape

(5310, 10)

### Hart et al 2016

In [66]:
df_hct=pd.read_excel("Supplementary Table 1.xlsx", sheet_name="Hct116_lib1")
df_hct.head()

Unnamed: 0,dataset,guide,seq,db,position,longSeq100Bp,Raw,Normalized
0,hart2016-Hct1162lib1Avg,CTTGCTCGCGCAGGACGAGG,CTTGCTCGCGCAGGACGAGG,hg19,chr17:33469084-33469184:-,GGGCGGGCGACCAGACTGACTCCCGCCCCACTTGCTCGCGCAGGAC...,1.969083,0.210869
1,hart2016-Hct1162lib1Avg,ACATCAGGTTACCTCTACCA,ACATCAGGTTACCTCTACCA,hg19,chr4:184605936-184606036:-,ATTTTATGTATCAAGAATTTTACTCAAAAAACATCAGGTTACCTCT...,2.482667,0.462385
2,hart2016-Hct1162lib1Avg,CTGATGCCAGCTAGTGGGCG,CTGATGCCAGCTAGTGGGCG,hg19,chr1:11736835-11736935:+,AAAGGGGTGGGTGTGTTGCCAGAGACACACCTGATGCCAGCTAGTG...,-0.575667,-1.737013
3,hart2016-Hct1162lib1Avg,CTGTTTCCCATCCTTCCGGG,CTGTTTCCCATCCTTCCGGG,hg19,chr4:25379036-25379136:+,CAGGGCCGTCCCCATGTTGCGTTTTCCGACCTGTTTCCCATCCTTC...,3.847833,1.072793
4,hart2016-Hct1162lib1Avg,AATGTATGCACAGGGAACAG,AATGTATGCACAGGGAACAG,hg19,chr12:57936689-57936789:-,ACTTTAATTTTATCAGTTTTTAAAGTAGGTAATGTATGCACAGGGA...,-1.003333,-2.195469


In [67]:
df_hct.shape

(4239, 8)

In [68]:
df_hela=pd.read_excel("Supplementary Table 1.xlsx", sheet_name="Hela")
df_hela.head()

Unnamed: 0,dataset,guide,seq,db,position,longSeq100Bp,Raw,Normalized
0,hart2016-HelaLib1Avg,ACATCAGGTTACCTCTACCA,ACATCAGGTTACCTCTACCA,hg19,chr4:184605936-184606036:-,ATTTTATGTATCAAGAATTTTACTCAAAAAACATCAGGTTACCTCT...,1.411667,-0.0164
1,hart2016-HelaLib1Avg,CTGATGCCAGCTAGTGGGCG,CTGATGCCAGCTAGTGGGCG,hg19,chr1:11736835-11736935:+,AAAGGGGTGGGTGTGTTGCCAGAGACACACCTGATGCCAGCTAGTG...,0.012667,-0.93081
2,hart2016-HelaLib1Avg,CTGTTTCCCATCCTTCCGGG,CTGTTTCCCATCCTTCCGGG,hg19,chr4:25379036-25379136:+,CAGGGCCGTCCCCATGTTGCGTTTTCCGACCTGTTTCCCATCCTTC...,4.1075,1.217073
3,hart2016-HelaLib1Avg,AATGTATGCACAGGGAACAG,AATGTATGCACAGGGAACAG,hg19,chr12:57936689-57936789:-,ACTTTAATTTTATCAGTTTTTAAAGTAGGTAATGTATGCACAGGGA...,-0.098917,-1.034615
4,hart2016-HelaLib1Avg,CCAGACTCACCCGCTTGCCC,CCAGACTCACCCGCTTGCCC,hg19,chr2:131103446-131103546:-,GCCTGGCCCGCAGCCCCAGCCTGAAGGCCCCCAGACTCACCCGCTT...,2.3345,0.428281


In [69]:
df_hela.shape

(8101, 8)

### Wang et al 2014

In [70]:
df_hl60=pd.read_excel("Supplementary Table 1.xlsx", sheet_name="Hl60")
df_hl60["20mer"] = df_hl60["seq"].str[3:]
df_hl60.head()

Unnamed: 0,dataset,guide,seq,db,position,longSeq100Bp,Raw,Normalized,20mer
0,xu2015TrainHl60,ACTR8-1,GAAGGGCGGCGAGAAGGAGAAGG,hg19,chr3:53916017-53916117:-,TGAGAAGGGTGATACGGAGAACGGAAAGGAGAAGGGCGGCGAGAAG...,-2.062623,-0.640675,GGGCGGCGAGAAGGAGAAGG
1,xu2015TrainHl60,ACTR8-2,GAGAACGGAAAGGAGAAGGGCGG,hg19,chr3:53916031-53916131:-,ATAATGACCCAGGCTGAGAAGGGTGATACGGAGAACGGAAAGGAGA...,-2.971773,-1.426366,AACGGAAAGGAGAAGGGCGG
2,xu2015TrainHl60,ACTR8-3,GAGAAGGGTGATACGGAGAACGG,hg19,chr3:53916046-53916146:-,TGGGTGTCTCCGGCCATAATGACCCAGGCTGAGAAGGGTGATACGG...,-2.036724,-0.611302,AAGGGTGATACGGAGAACGG
3,xu2015TrainHl60,ACTR8-4,GTCTCCGGCCATAATGACCCAGG,hg19,chr3:53916071-53916171:-,CTGCAGCGCTGAGGCGAGAGGTTGGTGGGTGTCTCCGGCCATAATG...,-1.150398,0.29708,TCCGGCCATAATGACCCAGG
4,xu2015TrainHl60,BCL2-1,GGAGGAGAAGATGCCCGGTGCGG,hg19,chr18:60985716-60985816:+,TGCGGCTGGATGGGGCGTGTGCCCGGGCTGGGAGGAGAAGATGCCC...,-3.202624,-1.613839,GGAGAAGATGCCCGGTGCGG


In [71]:
df_hl60.shape

(2076, 9)

### Chari et al 2015

In [72]:
df_chari=pd.read_excel("Supplementary Table 1.xlsx", sheet_name="Hek293t")
df_chari["20mer"] = df_chari["seq"].str[3:]
df_chari.head()

Unnamed: 0,dataset,guide,seq,db,position,longSeq100Bp,Raw,Normalized,20mer
0,chari2015Train293T,ABI1_chr10_27066055,ACAGGGCGCTCCATATTCGCAGG,hg19,chr10:27066024-27066124:+,GTGTAATCGATAGGTTTCCGAATATACCTTACAGGGCGCTCCATAT...,1.300487,0.21841,GGGCGCTCCATATTCGCAGG
1,chari2015Train293T,ABL2_chr1_179086468,TCTCAATTAAATCTGACGTCTGG,hg19,chr1:179086420-179086520:-,CAGCACCAGAGAGTCTTGCCTACAATACCTTCTCAATTAAATCTGA...,1.159525,0.042761,CAATTAAATCTGACGTCTGG
2,chari2015Train293T,ACKR3_chr2_237489282,CATGATTGCCAACTCCGTGGTGG,hg19,chr2:237489251-237489351:+,CTTCATTTACATTTTCATCTTCGTCATCGGCATGATTGCCAACTCC...,0.92688,-0.291517,GATTGCCAACTCCGTGGTGG
3,chari2015Train293T,ACSL3_chr2_223783789,TTTCTTTGGTCCCACGCCTGCGG,hg19,chr2:223783758-223783858:+,GATGAATGTTGGTGTATTTCTAGGATATAGTTTCTTTGGTCCCACG...,0.672197,-0.742464,CTTTGGTCCCACGCCTGCGG
4,chari2015Train293T,ACSL6_chr5_131326586,CCCACTACTATGATGATGCCCGG,hg19,chr5:131326538-131326638:-,CTGTGATTGGGTCTGGCCCTCAGCTACTTACCCACTACTATGATGA...,1.235033,0.125624,ACTACTATGATGATGCCCGG


In [73]:
df_chari.shape

(850, 9)

### Testing Data: HEL (from GNL Scorer)

In [74]:
df_hel=pd.read_excel("Supplementary Table 1.xlsx", sheet_name="HEL")
df_hel.head()

Unnamed: 0,sgRNA_sequence,30mer,modFreq
0,GAGTCGGGGTTTCGTCATGT,AGTAGAGTCGGGGTTTCGTCATGTTGGTCA,0.0
1,CGCCGCCGCTTTCGGTGATG,CTGCCGCCGCCGCTTTCGGTGATGAGGAAA,0.0
2,GGCAGCGTCGTGCACGGGTC,CCCGGGCAGCGTCGTGCACGGGTCGGGTGA,0.0
3,TGGGCGGATCACTTGACGTC,GAGGTGGGCGGATCACTTGACGTCAGGAGT,0.0
4,TTACCATAGTGTACGGGTGC,CTTTTTACCATAGTGTACGGGTGCAGGCAT,0.0


### Off-Target: Shalem et al

In [75]:
df_KO=pd.read_excel("Shalem_KO_data.xlsx")
df_KO.head()

  for idx, row in parser.parse():


Unnamed: 0,CCDS ID,Gene name,Exon number,Number of 1bp mismatches,Number of 2bp mismatches,Number of 3bp mismatches,Off-target score (OS),sgRNA sequence
0,CCDS10.1,/TNFRSF18,2,0,0,15,215.2632,TACCCTGGGACTGTACCCCC
1,CCDS10.1,/TNFRSF18,2,0,0,27,304.1053,ACCCTTGCTGCACGACCTGC
2,CCDS10.1,/TNFRSF18,5,0,0,5,60.7368,TCGCTCGCCCCGCTCTTCCT
3,CCDS100.2,/GPR157,2,0,0,6,61.7368,TGACGCCTCGGACGTGTCTG
4,CCDS100.2,/GPR157,2,0,0,7,116.1053,CGTCATAGCCAATCTTCTTC


### Isolating necessary data into one dataframe


In [76]:
df_doench_features=df_doench[["Spacer Sequence","sgRNA Score"]]

df_doench_features["sgRNA Rank"] = df_doench_features["sgRNA Score"].rank(method='max')
n = df_doench_features["sgRNA Rank"].size
df_doench_features["sgRNA Normalized"] = (df_doench_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


fusi_feature_dict = {"20mer":"Spacer Sequence",
                    "score_drug_gene_rank":"sgRNA Score"}
df_fusi_features=df_fusi[["20mer","score_drug_gene_rank"]]
df_fusi_features.rename(columns=fusi_feature_dict, inplace = True)

df_fusi_features["sgRNA Rank"] = df_fusi_features["sgRNA Score"].rank(method='max')
n = df_fusi_features["sgRNA Rank"].size
df_fusi_features["sgRNA Normalized"] = (df_fusi_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


doench_feature_dict = {"20mer":"Spacer Sequence",
                        "Normalized efficacy":"sgRNA Score"}
df_doench_2016_features = df_doench_2016[["20mer","Normalized efficacy"]]
df_doench_2016_features.rename(columns=doench_feature_dict, inplace=True)

df_doench_2016_features["sgRNA Rank"] = df_doench_2016_features["sgRNA Score"].rank(method='max')
n = df_doench_2016_features["sgRNA Rank"].size
df_doench_2016_features["sgRNA Normalized"] = (df_doench_2016_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


wang_feature_dict = {"20mer":"Spacer Sequence",
                    "Raw":"sgRNA Score"}
df_hl60_features = df_hl60[["20mer","Raw"]]
df_hl60_features.rename(columns=wang_feature_dict, inplace=True)

df_hl60_features["sgRNA Rank"] = df_hl60_features["sgRNA Score"].rank(method='max')
n = df_hl60_features["sgRNA Rank"].size
df_hl60_features["sgRNA Normalized"] = (df_hl60_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


chari_feature_dict = {"20mer":"Spacer Sequence",
                    "Raw":"sgRNA Score"}
df_chari_features = df_chari[["20mer","Raw"]]
df_chari_features.rename(columns=chari_feature_dict, inplace=True)

df_chari_features["sgRNA Rank"] = df_chari_features["sgRNA Score"].rank(method='max')
n = df_chari_features["sgRNA Rank"].size
df_chari_features["sgRNA Normalized"] = (df_chari_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


hart_feature_dict = {"guide":"Spacer Sequence",
                    "Normalized":"sgRNA Score"}
df_hct_features = df_hct[["guide","Normalized"]]
df_hela_features = df_hela[["guide","Normalized"]]
df_hct_features.rename(columns=hart_feature_dict, inplace = True)
df_hela_features.rename(columns=hart_feature_dict, inplace = True)

df_hct_features["sgRNA Rank"] = df_hct_features["sgRNA Score"].rank(method='max')
n = df_hct_features["sgRNA Rank"].size
df_hct_features["sgRNA Normalized"] = (df_hct_features["sgRNA Rank"] - (3/8)) / (n + 1/4)

df_hela_features["sgRNA Rank"] = df_hela_features["sgRNA Score"].rank(method='max')
n = df_hela_features["sgRNA Rank"].size
df_hela_features["sgRNA Normalized"] = (df_hela_features["sgRNA Rank"] - (3/8)) / (n + 1/4)


df_features = pd.concat([df_doench_features, df_fusi_features, df_hct_features, df_hela_features, df_chari_features, df_hl60_features, df_doench_2016_features]).reset_index()
df_features = df_features[["Spacer Sequence","sgRNA Normalized"]]

df_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_doench_features["sgRNA Rank"] = df_doench_features["sgRNA Score"].rank(method='max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_doench_features["sgRNA Normalized"] = (df_doench_features["sgRNA Rank"] - (3/8)) / (n + 1/4)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fusi_features.rename(columns=fusi_feature_dict, inplace = True)
A value is 

Unnamed: 0,Spacer Sequence,sgRNA Normalized
0,GTAATGGCTTCCTCGTGAGT,0.789849
1,GCTCGTGCGAGTGATAGACC,0.552807
2,CACGGACGGTATATGGCAGG,0.14913
3,TACTTCTGCGAGACACGGAC,0.767944
4,CGGCCCACGCTACTACAGTT,0.627909


In [77]:
df_features.shape

(24187, 2)

In [78]:
df_test=df_hel[["sgRNA_sequence", "modFreq"]]
test_feature_dict = {"sgRNA_sequence":"Spacer Sequence",
                    "modFreq":"sgRNA Normalized"}
df_test.rename(columns=test_feature_dict, inplace = True)
df_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.rename(columns=test_feature_dict, inplace = True)


Unnamed: 0,Spacer Sequence,sgRNA Normalized
0,GAGTCGGGGTTTCGTCATGT,0.0
1,CGCCGCCGCTTTCGGTGATG,0.0
2,GGCAGCGTCGTGCACGGGTC,0.0
3,TGGGCGGATCACTTGACGTC,0.0
4,TTACCATAGTGTACGGGTGC,0.0


In [79]:
df_test.shape

(424, 2)

In [80]:
df_features_KO=df_KO[["sgRNA sequence","Off-target score (OS)"]]
feature_dict = {"Off-target score (OS)":"sgRNA Score",
                "sgRNA sequence":"Spacer Sequence"}

df_features_KO.rename(columns=feature_dict, inplace=True)
df_features_KO["sgRNA Rank"] = df_features_KO["sgRNA Score"].rank(method='max')
n = df_features_KO["sgRNA Rank"].size
# D=(r − 3/8) / (n + 1/4)
df_features_KO["sgRNA Normalized"] = (df_features_KO["sgRNA Rank"] - (3/8)) / (n + 1/4)
df_features_KO = df_features_KO[["Spacer Sequence","sgRNA Normalized"]]
df_features_KO.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_KO.rename(columns=feature_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_KO["sgRNA Rank"] = df_features_KO["sgRNA Score"].rank(method='max')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_KO["sgRNA Normalized"] = (df_features_KO["sgRNA Rank"] - (3/8)) / (n + 1/4)


Unnamed: 0,Spacer Sequence,sgRNA Normalized
0,TACCCTGGGACTGTACCCCC,0.604353
1,ACCCTTGCTGCACGACCTGC,0.730003
2,TCGCTCGCCCCGCTCTTCCT,0.229828
3,TGACGCCTCGGACGTGTCTG,0.232793
4,CGTCATAGCCAATCTTCTTC,0.399647


In [81]:
df_features_KO.shape

(64751, 2)

### V1 Featurization

In [82]:
df_V1 = df_features.copy(deep=True) ## creates a hard copy of data to mutate in place
df_V1['Spacer Sequence'] = df_V1['Spacer Sequence'].astype('string') ## sets data of sequence to String for parsing
df_V1['Spacer Sequence']

0        GTAATGGCTTCCTCGTGAGT
1        GCTCGTGCGAGTGATAGACC
2        CACGGACGGTATATGGCAGG
3        TACTTCTGCGAGACACGGAC
4        CGGCCCACGCTACTACAGTT
                 ...         
24182    CGGCGGGCGCTTCACGCTCT
24183    CTGCAGCGGTACCGGCGAAA
24184    AGGACTTTGTCCAGGTAGCT
24185    ACTCCTTGCATGACATGAAC
24186    CAATCTGGTGCCAGCCTTCC
Name: Spacer Sequence, Length: 24187, dtype: string

In [83]:
df_V1 = v1Featurization(df_V1, 'Spacer Sequence')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + y + "?"][a] = df[x + y + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + "?"][a] = df[x + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GC Count'][a] = gc_count(texts[a])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Melting Temp Wallace"][a] = mt.Tm_Wallace(s

In [84]:
df_V1.to_csv('AIO_onTarget_V1_Featurized.csv')
df_V1.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TT?,GC Count,Melting Temp Wallace,Melting Temp GC,Melting Temp NN1,Melting Temp NN2,Melting Temp NN3,Melting Temp NN4,GC<10,GC>10
0,GTAATGGCTTCCTCGTGAGT,0.789849,0,0,0,0,0,1,1,0,...,1,10,60,50.154892,57.445249,55.860862,51.557644,53.31673,0,0
1,GCTCGTGCGAGTGATAGACC,0.552807,0,0,0,0,0,1,1,0,...,0,12,64,54.254892,61.004907,59.847295,54.962358,56.100232,0,1
2,CACGGACGGTATATGGCAGG,0.14913,0,0,0,0,0,1,0,0,...,0,12,64,54.254892,64.514516,59.135979,54.590105,58.22629,0,1
3,TACTTCTGCGAGACACGGAC,0.767944,0,0,0,0,0,1,0,0,...,1,11,62,52.204892,58.72417,58.600497,53.785797,54.92646,0,1
4,CGGCCCACGCTACTACAGTT,0.627909,0,0,0,0,0,1,0,0,...,1,12,64,54.254892,63.94652,60.89366,56.441821,59.480088,0,1


In [85]:
df_V1_test = df_test.copy(deep=True)
df_V1_test['Spacer Sequence'] = df_V1_test['Spacer Sequence'].astype('string') ## sets data of sequence to String for parsing
df_V1_test['Spacer Sequence']

0      GAGTCGGGGTTTCGTCATGT
1      CGCCGCCGCTTTCGGTGATG
2      GGCAGCGTCGTGCACGGGTC
3      TGGGCGGATCACTTGACGTC
4      TTACCATAGTGTACGGGTGC
               ...         
419    GCGTTGGGTGGTACTTGCAG
420    ATGTAGGGCTTGGCGAGTCT
421    GGTAGAAGGTGTAACTCCGG
422    GTTTAGCCAAGTATCATGCA
423    GCGCGGTAGTCGTGACCCGG
Name: Spacer Sequence, Length: 424, dtype: string

In [86]:
df_V1_test = v1Featurization(df_V1_test, 'Spacer Sequence')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + y + "?"][a] = df[x + y + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + "?"][a] = df[x + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GC Count'][a] = gc_count(texts[a])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Melting Temp Wallace"][a] = mt.Tm_Wallace(s

In [87]:
df_V1_test.to_csv('AIO_onTarget_TEST_V1_Featurized.csv')
df_V1_test.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TT?,GC Count,Melting Temp Wallace,Melting Temp GC,Melting Temp NN1,Melting Temp NN2,Melting Temp NN3,Melting Temp NN4,GC<10,GC>10
0,GAGTCGGGGTTTCGTCATGT,0.0,0,0,0,0,0,1,1,1,...,1,11,62,52.204892,62.544974,58.012148,54.271214,56.040621,0,1
1,CGCCGCCGCTTTCGGTGATG,0.0,0,0,0,0,0,1,0,0,...,1,14,68,58.354892,76.984632,66.891785,61.639884,63.393031,0,1
2,GGCAGCGTCGTGCACGGGTC,0.0,0,0,0,0,0,1,1,0,...,0,15,70,60.404892,74.488777,66.469738,63.245023,65.002345,0,1
3,TGGGCGGATCACTTGACGTC,0.0,0,0,0,0,0,1,0,0,...,1,12,64,54.254892,66.708478,60.499212,56.722325,58.508208,0,1
4,TTACCATAGTGTACGGGTGC,0.0,0,0,0,0,0,1,0,0,...,1,10,60,50.154892,56.793596,54.975062,51.262231,54.915035,0,0


In [88]:
df_V1_KO = df_features_KO.copy(deep=True) ## creates a hard copy of data to work with
df_V1_KO['Spacer Sequence'] = df_V1_KO['Spacer Sequence'].astype('string') ## sets data of sequence to String for parsing
df_V1_KO['Spacer Sequence']

0        TACCCTGGGACTGTACCCCC
1        ACCCTTGCTGCACGACCTGC
2        TCGCTCGCCCCGCTCTTCCT
3        TGACGCCTCGGACGTGTCTG
4        CGTCATAGCCAATCTTCTTC
                 ...         
64746    TTGCCGGGTACTACTACGCC
64747    CCGCGCTGTACACCACGACC
64748    CGCGGGGGCAATGCGGGCGC
64749    GCGCTCCTTTACCCTCATCG
64750    GACAACGATACCACCCCGAA
Name: Spacer Sequence, Length: 64751, dtype: string

In [89]:
df_V1_KO = v1Featurization(df_V1_KO, 'Spacer Sequence')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + y + "?"][a] = df[x + y + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x + "?"][a] = df[x + str(k+1)][a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GC Count'][a] = gc_count(texts[a])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Melting Temp Wallace"][a] = mt.Tm_Wallace(s

In [90]:
df_V1_KO.to_csv('AIO_offTarget_V1_Featurized.csv')
df_V1_KO.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TT?,GC Count,Melting Temp Wallace,Melting Temp GC,Melting Temp NN1,Melting Temp NN2,Melting Temp NN3,Melting Temp NN4,GC<10,GC>10
0,TACCCTGGGACTGTACCCCC,0.604353,0,0,0,0,0,1,0,0,...,0,13,66,56.304892,64.288218,58.295048,56.093572,62.63574,0,1
1,ACCCTTGCTGCACGACCTGC,0.730003,1,0,0,1,0,1,0,0,...,1,13,66,56.304892,68.782052,63.030395,59.903077,61.714875,0,1
2,TCGCTCGCCCCGCTCTTCCT,0.229828,0,0,0,0,0,0,0,0,...,1,14,68,58.354892,74.767577,67.19587,62.41018,64.892673,0,1
3,TGACGCCTCGGACGTGTCTG,0.232793,0,0,0,0,0,1,0,0,...,0,13,66,56.304892,67.667812,63.336328,58.819906,59.9754,0,1
4,CGTCATAGCCAATCTTCTTC,0.399647,0,0,0,0,0,1,0,0,...,1,9,58,48.104892,55.747308,53.514861,48.297668,49.37352,1,0


### V2 Featurization

In [91]:
df_V2 = df_V1.copy(deep=True)

In [93]:
df_V2 = v2Featurization(df_V2, 'Spacer Sequence')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AAAAA"][a] = "AAAAA" in texts[a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["GGGGG"][a] = "GGGGG" in texts[a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["CCCCC"][a] = "CCCCC" in texts[a]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["TTTTT"][a] = "TTTTT" in texts[a]
A value is trying to

In [94]:
df_V2.to_csv('AIO_onTarget_V2_Featurized.csv')
df_V2.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TGT,TCA,TCG,TCC,TCT,TTA,TTG,TTC,TTT,Delta G
0,GTAATGGCTTCCTCGTGAGT,0.789849,0,0,0,0,0,1,1,0,...,0,0,1,1,0,0,0,1,0,-24.63324
1,GCTCGTGCGAGTGATAGACC,0.552807,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,-26.54678
2,CACGGACGGTATATGGCAGG,0.14913,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-26.207115
3,TACTTCTGCGAGACACGGAC,0.767944,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,-25.848845
4,CGGCCCACGCTACTACAGTT,0.627909,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,-27.07234


In [95]:
df_V2.shape

(24187, 1636)

In [96]:
df_V2_test = df_V1_test.copy(deep=True)

In [97]:
df_V2_test = v2Featurization(df_V2_test, 'Spacer Sequence')

  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str

In [98]:
df_V2_test.to_csv('AIO_onTarget_TEST_V2_Featurized.csv')
df_V2_test.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TGT,TCA,TCG,TCC,TCT,TTA,TTG,TTC,TTT,Delta G
0,GAGTCGGGGTTTCGTCATGT,0.0,0,0,0,0,0,1,1,1,...,1,1,1,0,0,0,0,1,1,-25.99264
1,CGCCGCCGCTTTCGGTGATG,0.0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,1,-30.090205
2,GGCAGCGTCGTGCACGGGTC,0.0,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,-30.84152
3,TGGGCGGATCACTTGACGTC,0.0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,-27.217265
4,TTACCATAGTGTACGGGTGC,0.0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,-24.48136


In [99]:
df_V2_test.shape

(424, 1636)

In [None]:
df_V2_KO = df_V1_KO.copy(deep=True)

In [None]:
df_V2_KO = v2Featurization(df_V2_KO, 'Spacer Sequence')

  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str(k+1)] = False
  df[x + y + z + str

In [None]:
df_V2_KO.to_csv('AIO_offTarget_V2_Featurized.csv')
df_V2_KO.head()

Unnamed: 0,Spacer Sequence,sgRNA Normalized,A1,AA1,AG1,AC1,AT1,A?,G1,GA1,...,TGT,TCA,TCG,TCC,TCT,TTA,TTG,TTC,TTT,Delta G
0,TACCCTGGGACTGTACCCCC,0.604353,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,-26.433045
1,ACCCTTGCTGCACGACCTGC,0.730003,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,-28.73437
2,TCGCTCGCCCCGCTCTTCCT,0.229828,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,1,0,-29.86482
3,TGACGCCTCGGACGTGTCTG,0.232793,0,0,0,0,0,1,0,0,...,1,0,1,0,1,0,0,0,0,-28.40505
4,CGTCATAGCCAATCTTCTTC,0.399647,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,-23.16219


In [None]:
df_V2_KO.shape

(64751, 1636)