In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
import gpplot as gpp
# Apply gpplot's plotting aesthetics using the 'paper' context.
gpp.set_aesthetics(context = 'paper')

Load the nonessential-targeting guides that were cleaned in the prior Jupyter notebook. These guides come from the TKOv3 library; the screen is described [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8904132/#SD4).

In [3]:
# Read the nonessential-targeting guide table (one row per guide, with its
# on-target gene and the screen-derived `promiscuous` flag).
screening_path = "../Data/tkov3_noness_screenresults.csv"
screening_data = pd.read_csv(screening_path)
screening_data

Unnamed: 0,Target Sequence,On-target Gene Symbols,promiscuous
0,GAGGAGAGAGGGCTGCCGAA,ABCG8,False
1,GAGCCAGCTGCTCAAACCAA,ABCG8,False
2,ATCACACAGAACCATGGCCA,ACCSL,False
3,GTAGGCGATGTGCATTGCAG,ACTL7A,False
4,TGAAGAACATCTCCGAGCAG,ACTL7A,False
...,...,...,...
1910,ATGCAAGTAGAAGAGACCCA,ZP2,False
1911,GAGTGGCCAGCATAAGCCTG,ZP4,False
1912,GAAGCTGCTCAAGTGTCCTA,ZP4,False
1913,GAGAGACTGGACAAACACCT,ZSWIM2,False


The cleaned TKOv3 nonessential-targeting guides are copied into CRISPick's library mode, which reports all predicted off-target sites for each guide. This enables calculation of the aggregate CFD score of each guide.

Retrieve the TKOv3 nonessential-targeting guides, scored for off-target activity, from the server:

```scp ldrepano@login.broadinstitute.org:/broad/hptmp/gpp/sgrna-miner-extras/dev/sgrna-miner/7f/cd/7fcdfc85-7b79-40ed-89dd-1b44a521a08e/tkov3_noness_verbose-sgrna-designs.offtargetdisco.txt .```

Compress the file:

``` gzip tkov3_noness_verbose-sgrna-designs.offtargetdisco.txt```

In [4]:
# Load the headerless, tab-separated CRISPick verbose off-target report.
# Only the guide sequence, the per-site CFD score and the mismatch count are kept.
verbose_path = '../Data/tkov3_noness_verbose-sgrna-designs.offtargetdisco.txt.gz'
verbose_columns = [
    "Target",
    "Target Sequence",
    "Context Sequence",
    "Match Tier",
    "CFD Score",
    "PAM Sequence",
    "# of mismatches",
    "identifier",
]
with gzip.open(verbose_path) as f:
    crispick_verbose = pd.read_table(
        f,
        index_col=False,
        header=None,
        names=verbose_columns,
        usecols=["Target Sequence", "CFD Score", "# of mismatches"],
        on_bad_lines="warn",  # malformed lines are reported as warnings instead of raising
    )

# Drop every row whose "# of mismatches" value does not parse as a number.
# NOTE(review): the original note attributes these rows to NA pick orders
# ("MAX OTS" entries) — confirm against the CRISPick output format.
mismatches_numeric = pd.to_numeric(crispick_verbose["# of mismatches"], errors='coerce')
crispick_verbose = crispick_verbose[mismatches_numeric.notnull()].reset_index(drop=True).copy()

# With the non-numeric rows gone, both numeric columns can be cast to concrete dtypes.
crispick_verbose = crispick_verbose.astype(
    {"# of mismatches": np.int64, "CFD Score": np.float64}
)
crispick_verbose

  crispick_verbose = (pd.read_table(f,index_col=False,header=None,


Unnamed: 0,Target Sequence,CFD Score,# of mismatches
0,ACCTTAAGGACACGTAGTAC,0.220863,3
1,ACCTTAAGGACACGTAGTAC,0.188383,3
2,ACCTTAAGGACACGTAGTAC,0.101587,3
3,ACCTTAAGGACACGTAGTAC,0.630252,3
4,ACCTTAAGGACACGTAGTAC,0.630252,3
...,...,...,...
138918447,ATCAAGTCCAAAGTCTCAGG,0.048611,2
138918448,ATCAAGTCCAAAGTCTCAGG,0.666667,1
138918449,ATCAAGTCCAAAGTCTCAGG,0.069444,1
138918450,ATCAAGTCCAAAGTCTCAGG,1.000000,0


In [5]:
# Keep a site's CFD score only when it has at most one mismatch to the guide;
# every other reported site contributes 0 to the aggregate.
crispick_verbose["CFD_upto1mm"] = np.where(
    crispick_verbose["# of mismatches"] <= 1,
    crispick_verbose["CFD Score"],
    0,
)

# Aggregate CFD per guide: the sum of the retained CFD scores over all sites
# reported for that guide sequence. One output row per distinct "Target Sequence".
crispick_agg = (
    crispick_verbose.groupby("Target Sequence")
    .agg(aggcfd=("CFD_upto1mm", "sum"))
    .reset_index()
)
crispick_agg

Unnamed: 0,Target Sequence,aggcfd
0,AAAAAAAATGAACTATGAAG,3.473545
1,AAAAAAAGCATTTCTTATGT,2.167371
2,AAAAAAAGTGCTGGATTTCA,8.011492
3,AAAAAAATCAGCCACGCGAC,1.016129
4,AAAAAAATGGGGACATTGGA,1.476461
...,...,...
131998,TTTTTTTTCAATTCTAATAA,7.459751
131999,TTTTTTTTCTGAGGCATGCG,0.428571
132000,TTTTTTTTTCAGCCCAGGAA,2.024615
132001,TTTTTTTTTTACCTGAATGG,1.790558


In [6]:
# Attach the screen-derived `promiscuous` label to each guide's aggregate CFD.
# This is pandas' default (inner) join on the shared "Target Sequence" column,
# so only guides present in both tables are kept.
# NOTE(review): the displayed tables show 1914 screening rows but 1910 merged
# rows — presumably a few guides have no CRISPick entry; verify.
screen_labels = screening_data[["Target Sequence", "promiscuous"]]
merged = crispick_agg.merge(screen_labels, on="Target Sequence")
merged

Unnamed: 0,Target Sequence,aggcfd,promiscuous
0,AAACAAACCACCGAAACCCT,0.000000,False
1,AAACAACGACAGCGAGACCG,0.000000,False
2,AAACACTGGCATACACTCCA,0.000000,False
3,AAACAGAACAGCGAATAGCG,3.000000,False
4,AAACAGCCAAGACACAGGAG,0.621429,False
...,...,...,...
1906,TTTGGAATGAGCCAAGTCGT,1.218519,False
1907,TTTGGAGATAACGGCAGTGG,0.000000,False
1908,TTTGGAGCTGTCCTCTCGGA,0.000000,False
1909,TTTGGGCCTCAGGTACACGA,0.650000,True


In [7]:
# Count promiscuous vs. non-promiscuous guides whose aggregate CFD exceeds 4.8.
# NOTE(review): the original comment cited 4.6 as the agg CFD threshold from the
# rs3 validation data, but the filter here uses 4.8 — confirm which value is intended.
merged.loc[merged["aggcfd"] > 4.8, "promiscuous"].value_counts()

False    40
True     10
Name: promiscuous, dtype: int64

In [8]:
# Sweep 300 aggregate-CFD cutoffs (0.0, 0.1, ..., 29.9). For each one, call a guide
# "predicted promiscuous" when its aggregate CFD is strictly above the cutoff and
# score the prediction against the screen label with macro-averaged F1.
metric = merged["aggcfd"].astype(np.float64)
cutoffs = [0.1 * step for step in range(300)]
f1 = [
    f1_score(merged["promiscuous"], metric > threshold, average="macro")
    for threshold in cutoffs
]

# np.argmax returns the first maximum, so ties resolve to the lowest cutoff.
best_idx = int(np.argmax(f1))
f1_best = f1[best_idx]
bestcutoff = cutoffs[best_idx]

# Store the prediction at the best cutoff as a new column on `merged`.
merged["Agg CFD Predicted"] = merged["aggcfd"] > bestcutoff
print("F1= ", f1_best, " at Agg CFD cutoff ", bestcutoff)

F1=  0.5635208495880513  at Agg CFD cutoff  4.5


Could the poor performance be explained by the scarcity of promiscuous guides in this dataset?

In [9]:
# Tabulate predicted vs. observed promiscuity: one row per
# (Agg CFD Predicted, promiscuous) combination with its guide count,
# ordered by the predicted label.
(
    merged[["Agg CFD Predicted", "promiscuous"]]
    .value_counts()
    .to_frame()
    .sort_values(by="Agg CFD Predicted")
)


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Agg CFD Predicted,promiscuous,Unnamed: 2_level_1
False,False,1774
False,True,82
True,False,43
True,True,12
