In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
import gpplot as gpp
# Configure gpplot's aesthetics once, up front, using its 'paper' context.
gpp.set_aesthetics(context = 'paper')

Load the nonessential-targeting guides that were cleaned in the previous Jupyter notebook. These guides come from the Avana library, and the screening data use the median read counts across cell lines.

In [36]:
# Guide-level screening table; later cells read its "Target Sequence" and
# "promiscuous" columns.
screening_data = pd.read_csv("../Data/avana_noness_screenresults.csv")
screening_data

Unnamed: 0,Target Sequence,On-target Gene Symbols,promiscuous
0,AAAAGAACTGAGCCAAGCAG,OR1B1,False
1,AAACAACGACAGCGAGACCG,SPACA1,False
2,AAACAGGGAGGTCAAAACGG,OR2T12,False
3,AAACGAGGCCAGTTACCCGG,PPP3R2,False
4,AAACTCTGCATGGTCCCCTA,CLRN1,False
...,...,...,...
1578,TTTGGTTATAGAAACTTGGG,MS4A13,False
1579,TTTGTAGCTGAGCACCAGCA,"IFNA10,IFNA16,IFNA17,IFNA21,IFNA4",False
1580,TTTGTCATTGCAGGTCATGG,DEFB103B,False
1581,TTTGTCTTCCAACAGGCCAA,MBL2,False


The cleaned Avana nonessential-targeting guides are submitted to CRISPick in library mode, which reports all predicted off-target sites. This enables calculation of the aggregate CFD score for each of these guides.

Retrieve the Avana nonessential guides scored for off-target activity from the server:

```scp ldrepano@login.broadinstitute.org:/broad/hptmp/gpp/sgrna-miner-extras/dev/sgrna-miner/59/37/5937db4e-b182-41ae-8b31-3bc9354ec95d/avana_noness_verbose-sgrna-designs.offtargetdisco.txt .```

Compress the file:

``` gzip avana_noness_verbose-sgrna-designs.offtargetdisco.txt```

In [37]:
# Load the CRISPick verbose off-target report. The file is read without a
# header row (header=None), so column names are supplied here, and only the
# three columns used downstream are kept. Malformed lines are reported as
# warnings instead of aborting the load.
with gzip.open('../Data/avana_noness_verbose-sgrna-designs.offtargetdisco.txt.gz') as f:
    crispick_verbose = pd.read_table(
        f,
        index_col=False,
        header=None,
        names=["Target", "Target Sequence", "Context Sequence", "Match Tier",
               "CFD Score", "PAM Sequence", "# of mismatches", "identifier"],
        usecols=["Target Sequence", "CFD Score", "# of mismatches"],
        on_bad_lines="warn",
    )

# Drop rows whose mismatch count is not numeric (presumably the "MAX OTS"
# placeholder rows -- TODO confirm against the CRISPick output format).
# The column is parsed once here and the parsed values are reused below, so
# this very large table is not re-parsed and not copied more than necessary.
mismatch_counts = pd.to_numeric(crispick_verbose["# of mismatches"], errors="coerce")
has_mismatch_count = mismatch_counts.notna()

# Build the cleaned frame in a single chain: filter, set concrete dtypes,
# and renumber the index. `assign` returns a new frame, so no chained-
# assignment warning is triggered and no explicit `.copy()` is needed.
crispick_verbose = (
    crispick_verbose.loc[has_mismatch_count]
    .assign(**{
        "# of mismatches": mismatch_counts[has_mismatch_count].astype(np.int64),
        # Still raises if a remaining CFD value is not numeric.
        "CFD Score": lambda kept: kept["CFD Score"].astype(np.float64),
    })
    .reset_index(drop=True)
)

crispick_verbose

  crispick_verbose = (pd.read_table(f,index_col=False,header=None,


Unnamed: 0,Target Sequence,CFD Score,# of mismatches
0,ACCTCGGTTCAGGTAGCACA,0.187075,3
1,ACCTCGGTTCAGGTAGCACA,0.437500,3
2,ACCTCGGTTCAGGTAGCACA,0.363636,3
3,ACCTCGGTTCAGGTAGCACA,0.440000,3
4,ACCTCGGTTCAGGTAGCACA,0.440000,3
...,...,...,...
146175686,GCAGGCTTGCCCCAGAGCCA,0.941176,1
146175687,GCAGGCTTGCCCCAGAGCCA,0.192308,1
146175688,GCAGGCTTGCCCCAGAGCCA,0.192308,1
146175689,GCAGGCTTGCCCCAGAGCCA,1.000000,0


In [38]:
# A site keeps its CFD score only when it has at most one mismatch;
# every other site contributes 0 to the aggregate.
crispick_verbose["CFD_upto1mm"] = crispick_verbose["CFD Score"].where(
    crispick_verbose["# of mismatches"] <= 1, 0
)

# Aggregate CFD per guide: sum of the retained scores for each target sequence.
crispick_agg = (
    crispick_verbose.groupby("Target Sequence")["CFD_upto1mm"]
    .sum()
    .rename("aggcfd")
    .reset_index()
)
crispick_agg

Unnamed: 0,Target Sequence,aggcfd
0,AAAAAAAATGAACTATGAAG,3.473545
1,AAAAAAAGCATTTCTTATGT,2.167371
2,AAAAAAAGTGCTGGATTTCA,8.011492
3,AAAAAAATACTTTGACATAC,1.630128
4,AAAAAAATCAGCCACGCGAC,1.016129
...,...,...
136202,TTTTTTTTCTGAGGCATGCG,0.428571
136203,TTTTTTTTTCAGCCCAGGAA,2.024615
136204,TTTTTTTTTTACCTGAATGG,1.790558
136205,TTTTTTTTTTAGCAGTCCCC,1.220833


In [39]:
#merge with actual data
merged=crispick_agg.merge(screening_data[["Target Sequence","promiscuous"]],on="Target Sequence")
merged

Unnamed: 0,Target Sequence,aggcfd,promiscuous
0,AAAAGAACTGAGCCAAGCAG,2.971561,False
1,AAACAACGACAGCGAGACCG,0.000000,False
2,AAACAGGGAGGTCAAAACGG,0.448276,False
3,AAACGAGGCCAGTTACCCGG,1.000000,False
4,AAACTCTGCATGGTCCCCTA,3.069444,False
...,...,...,...
1571,TTTGGTTATAGAAACTTGGG,0.000000,False
1572,TTTGTAGCTGAGCACCAGCA,24.395605,False
1573,TTTGTCATTGCAGGTCATGG,1.666667,False
1574,TTTGTCTTCCAACAGGCCAA,0.000000,False


In [46]:
# Scan Agg CFD cutoffs 0.0, 0.1, ..., 7.9 and record the macro-averaged F1
# obtained when guides strictly above each cutoff are called promiscuous.
metric = merged["aggcfd"].astype(np.float64)
cutoffs = [0.1 * step for step in range(80)]
f1 = [
    f1_score(merged["promiscuous"], metric > threshold, average="macro")
    for threshold in cutoffs
]
f1_best = max(f1)
bestcutoff = cutoffs[f1.index(f1_best)]  # first cutoff that reaches the best score

# The reported prediction uses a fixed cutoff, not `bestcutoff`
# (presumably carried over from a separate RS3 validation analysis -- TODO confirm).
rs3valcutoff = 4.8
merged["Agg CFD Predicted"] = merged["aggcfd"] > rs3valcutoff
fixed_cutoff_f1 = f1_score(
    merged["promiscuous"], merged["Agg CFD Predicted"], average="macro"
)
print("F1= ", fixed_cutoff_f1, " at Agg CFD cutoff ", rs3valcutoff)


F1=  0.7255183165956199  at Agg CFD cutoff  4.8


In [41]:
# Count guides for every (predicted, observed) label pair, ordered by the prediction.
label_pair_counts = merged[["Agg CFD Predicted", "promiscuous"]].value_counts()
label_pair_counts.to_frame().sort_values(by="Agg CFD Predicted")


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Agg CFD Predicted,promiscuous,Unnamed: 2_level_1
False,False,1214
False,True,23
True,False,253
True,True,86
