In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
from cycler import cycler
from matplotlib_venn import venn3  
from matplotlib_venn.layout.venn3 import DefaultLayoutAlgorithm

import gpplot as gpp
gpp.set_aesthetics(context = 'paper')

Combine the first set of CRISPick runs to assemble Jacquere, in order to:
- Assess the guides selected from the picking scheme, troubleshooting any odd patterns observed
- Assess if 4 guides per gene is necessary, or if this improved picking scheme enables a more compact library

In [2]:
#CRISPick output of three design runs: 
## 1. all protein coding genes in ensembl
## 2. those in NCBI but not ensembl
## 3. those unique to CHESS
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so a column mixing numbers and text (e.g. "Aggregate CFD Score", which
# holds "MAX" next to numeric scores) ends up with one consistent dtype and no parser warning.
ensembl_designs=pd.read_table("../Data/ensembl_113_protein_coding_aggcfd_targetlocal-sgrna-designs.txt",low_memory=False)
NCBI_designs=pd.read_table("../Data/ncbi_IDS_forjacquere-sgrna-designs.txt",low_memory=False)
CHESS_designs= pd.read_table("../Data/chess_targetlocal_aggcfd-sgrna-designs.txt",low_memory=False)

#replace exon ranges with identifiers for CHESS genes
#NOTE: this step is disabled, so CHESS "Input" values remain the coordinate strings used as CRISPick input
#CHESS_identifier= pd.read_csv("../Jacquere_library/design_annotations/CHESS3.1.3_unique_protein_coding_genes_coordinates_with_identifier.csv",header=None)
#CHESS_designs=CHESS_designs.merge(CHESS_identifier,left_on="Input",right_on=1)
#CHESS_designs["Input"]=CHESS_designs[0]
#CHESS_designs=CHESS_designs.drop([0,1],axis=1)

# Stack the three runs; the concat keys record which catalog each row came from,
# and are copied into a trailing "Source" column before the index is flattened.
jacquere_og=pd.concat([ensembl_designs,NCBI_designs,CHESS_designs],keys=["GENCODE","RefSeq","CHESS"])
jacquere_og["Source"]=jacquere_og.index.get_level_values(0)
jacquere_og=jacquere_og.reset_index(drop=True)


#remove any guides whose Aggregate CFD is reported as "MAX" or is > 4.8
MAX_AGGREGATE_CFD=4.8
has_numeric_cfd=jacquere_og["Aggregate CFD Score"]!="MAX"
jacquere=jacquere_og[has_numeric_cfd].reset_index(drop=True)
jacquere["Aggregate CFD Score"]=jacquere["Aggregate CFD Score"].astype(float)
jacquere=jacquere[jacquere["Aggregate CFD Score"]<=MAX_AGGREGATE_CFD].reset_index(drop=True)


jacquere

  ensembl_designs=pd.read_table("../Data/ensembl_113_protein_coding_aggcfd_targetlocal-sgrna-designs.txt")


Unnamed: 0,Input,Quota,Target Taxon,Target Gene ID,Target Gene Symbol,Target Transcript,Target Reference Coords,Target Alias,CRISPR Mechanism,Target Domain,...,On-Target Efficacy Score,On-Target Rank,Preselected As,Matching Active Arrayed Oligos,Matching Arrayed Constructs,Pools Containing Matching Construct,Pick Order,Picking Round,Picking Notes,Source
0,ENSG00000000003,4,9606,ENSG00000000003,TSPAN6,ENST00000373020.9,,,CRISPRko,CDS,...,1.0210,2,,,,"CP1889, CP2115",1,1,,GENCODE
1,ENSG00000000003,4,9606,ENSG00000000003,TSPAN6,ENST00000373020.9,,,CRISPRko,CDS,...,0.9891,3,,,BRDN0003487346,"CP1889, CP2114, CP2115",2,1,,GENCODE
2,ENSG00000000003,4,9606,ENSG00000000003,TSPAN6,ENST00000373020.9,,,CRISPRko,CDS,...,0.9622,4,,,,"CP1889, CP2114, CP2115",3,1,,GENCODE
3,ENSG00000000003,4,9606,ENSG00000000003,TSPAN6,ENST00000373020.9,,,CRISPRko,CDS,...,0.6634,9,,,,,4,1,,GENCODE
4,ENSG00000000005,4,9606,ENSG00000000005,TNMD,ENST00000373031.5,,,CRISPRko,CDS,...,1.1120,1,,,,,1,1,,GENCODE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81674,chrX:-:140091677-140092911,4,9606,,,,NC_000023.11:-:140091677-140092911,chrX:-:140091677-140092911,CRISPRko,CDS,...,0.4118,31,,,,,4,1,,CHESS
81675,chrY:-:1391992-1392341;1395129-1395548;1396371...,4,9606,,,,NC_000024.10:-:1391992-1392341;1395129-1395548...,chrY:-:1391992-1392341;1395129-1395548;1396371...,CRISPRko,CDS,...,0.9535,1,,,,,1,4,Previously skipped due to: Off-target CFD100 m...,CHESS
81676,chrY:-:1391992-1392341;1395129-1395548;1396371...,4,9606,,,,NC_000024.10:-:1391992-1392341;1395129-1395548...,chrY:-:1391992-1392341;1395129-1395548;1396371...,CRISPRko,CDS,...,0.8926,2,,,,,2,4,Previously skipped due to: Off-target CFD100 m...,CHESS
81677,chrY:-:1391992-1392341;1395129-1395548;1396371...,4,9606,,,,NC_000024.10:-:1391992-1392341;1395129-1395548...,chrY:-:1391992-1392341;1395129-1395548;1396371...,CRISPRko,CDS,...,0.8820,3,,,,,3,4,Previously skipped due to: Off-target CFD100 m...,CHESS


In [3]:
# Negative controls: 900 intergenic guides and 100 non-targeting guides.
# Each set gets constant placeholder annotations so that it can be
# concatenated with the targeting guides in later cells.
intergenic_labels={
    "Input":"ONE_SITE_INTERGENIC",
    "Source":"ONE_SITE_INTERGENIC",
    "Target Gene Symbol":"ONE_SITE_INTERGENIC",
    "On-Target Ruleset":"RS3seq-Chen2013+RS3target",
    "Target Gene ID":"ONE_SITE_INTERGENIC",
}
intergenics=pd.read_csv("../Data/Negative Controls/jacquere_900_intergenics.csv")
for column,label in intergenic_labels.items():
    intergenics[column]=label

nontargeting_labels={
    "Input":"NO_SITE",
    "Source":"NO_SITE",
    "Target Gene Symbol":"NO_SITE",
    "Target Gene ID":"NO_SITE",
}
nontargeting= pd.read_table("../Data/Negative Controls/jacquere_100_nontargeting-sgrna-designs.txt")
# Keep only rows without a gene symbol: this removes the one targeting guide
# that was required for CRISPick to run.
lacks_gene_symbol=nontargeting["Target Gene Symbol"].isna()
nontargeting=nontargeting[lacks_gene_symbol].reset_index(drop=True)
for column,label in nontargeting_labels.items():
    nontargeting[column]=label



OSError: [Errno 89] Operation canceled

In [None]:
# Export table for screening Jacquere with a variable number of guides per gene:
# built from the full (quota 4) library, before the quota is reduced.
subsampling_columns=["Target Gene Symbol","Input","Source","Pick Order","sgRNA Sequence"]

jacquere_quotasubsampling=jacquere[subsampling_columns].copy()
symbol_missing=jacquere["Target Gene Symbol"].isna()
jacquere_quotasubsampling.loc[symbol_missing,"Target Gene Symbol"]="N/A"

# Negative controls have no pick order; note this adds the column to the shared frames in place.
for controls in (intergenics,nontargeting):
    controls["Pick Order"]="N/A"

jacquere_quotasubsampling=pd.concat([
    jacquere_quotasubsampling,
    intergenics[subsampling_columns],
    nontargeting[subsampling_columns],
])
#jacquere_quotasubsampling.to_csv("../Jacquere_uptoquota4.csv",index=False)


In [None]:
# Histogram of on-target efficacy scores, with a dashed reference line at 0.2.
rs3_ax=jacquere["On-Target Efficacy Score"].hist(bins=50,grid=False)
rs3_ax.axvline(x=0.2,c="red",linestyle="--",label="RS3=0.2")
rs3_ax.set_title("distribution of rs3 scores in jacquere (4 guides/gene)")
rs3_ax.legend()

[Picking rounds](https://portals.broadinstitute.org/gppx/crispick/platform/how-it-works). Round 1 or 2 is ideal (the criterion relaxes from 5-65 to 5-80 along the protein-coding region; both are acceptable).

In [None]:
# Share of library rows that were picked in round 1 or round 2.
picked_in_round_1_or_2=jacquere["Picking Round"].isin([1,2])
pct_round_1_or_2=100*int(picked_in_round_1_or_2.sum())/len(jacquere)
print("% of Jacquere in picking round 1 or 2:", pct_round_1_or_2)


There are distinct Ensembl IDs that are associated with the same gene symbol but are optimally targeted by distinct sets of guides. These should be kept separate in Jacquere, and thus our suggested mappings should use Ensembl IDs. Note that the values below represent the 4 guide/gene library.

In [None]:
# How many gene symbols are shared by more than one Ensembl gene ID, and how many
# of the genes behind those symbols have a guide set that no other target uses?
jacquere_ensembl=jacquere[jacquere["Source"]=="GENCODE"]
gene_symbol_counts=pd.DataFrame(jacquere_ensembl["Target Gene Symbol"].value_counts())

# Count distinct Ensembl IDs ("Input") per symbol directly. Inferring this from
# "more than 4 guide rows" misses symbols whose IDs received fewer than 4 guides each.
ensembl_ids_per_symbol=jacquere_ensembl.groupby("Target Gene Symbol")["Input"].nunique()
same_genesymbol_diff_ID_symbols=ensembl_ids_per_symbol[ensembl_ids_per_symbol>1].reset_index()["Target Gene Symbol"]
print("# gene symbols associated with >1 ensembl gene ID", len(same_genesymbol_diff_ID_symbols))

#get all sgRNAs (up to 4) associated with a given target
# The guide set is stored as one sorted, "_"-joined string: Python lists are unhashable
# (value_counts() raises TypeError on them) and sorting makes the set order-independent.
guides_per_input=pd.DataFrame(jacquere.groupby(['Input','Target Gene Symbol','Source'])['sgRNA Sequence'].apply(lambda guides:"_".join(sorted(guides)))).reset_index()
shared_symbol_targets=guides_per_input[guides_per_input["Target Gene Symbol"].isin(same_genesymbol_diff_ID_symbols)]
unique_guidesets_shared_genesymbol=pd.DataFrame(shared_symbol_targets["sgRNA Sequence"].value_counts())

# Numerator: guide sets used by exactly one target (= genes with a unique guide set).
# Denominator: all genes that share a symbol (not the number of distinct guide sets).
n_genes_with_unique_guide_set=int((unique_guidesets_shared_genesymbol["count"]==1).sum())
n_genes_sharing_symbol=len(shared_symbol_targets)
pct_unique_guide_sets=100*n_genes_with_unique_guide_set/n_genes_sharing_symbol if n_genes_sharing_symbol else float("nan")
print("% of genes that share a gene symbol that have unique sets of guides", pct_unique_guide_sets)

## Determining Ideal Quota for Jacquere

Subsampling analysis: identify how well the top 1 vs 2 vs 3 vs 4 guides in Jacquere yield gene z-scores appropriate for their predicted viability effect (essential vs nonessential) in the DeWeirdt 2022 tiling data.

In [None]:
# Per-sgRNA table; later cells read its "sgRNA Sequence", "On-target Gene Symbols"
# and "z_scored_sgRNA_lfc" columns.
rs3val_zscores_path="../../2. Developing promiscuous classifier/Data/rs3valdata_updatedmappings_zscore.csv"
rs3val_zscores=pd.read_csv(rs3val_zscores_path)
rs3val_zscores

In [None]:
#get essential vs nonessential classifications 

def _load_control_genes(csv_path):
    """Read a control-gene CSV and reduce its "Gene" column to bare gene names.

    Everything from the first "(" onwards is dropped and all spaces are removed.
    Returns the loaded DataFrame with the cleaned "Gene" column.
    """
    controls=pd.read_csv(csv_path)
    controls["Gene"]=controls["Gene"].apply(lambda entry: entry.split("(")[0])
    #remove space from gene name 
    controls["Gene"]=controls["Gene"].str.replace(" ","")
    return controls


noness_genes=_load_control_genes("../../2. Developing promiscuous classifier/Data/AchillesNonessentialControls.csv")
noness_gene_list=noness_genes["Gene"].tolist()

essential_genes=_load_control_genes("../../2. Developing promiscuous classifier/Data/AchillesCommonEssentialControls.csv")
essential_gene_list=essential_genes["Gene"].tolist()

# Flag each sgRNA row by whether its on-target gene symbol is in either control list
on_target_symbols=rs3val_zscores["On-target Gene Symbols"]
rs3val_zscores["is_essential"]=on_target_symbols.isin(essential_gene_list)
rs3val_zscores["is_nonessential"]=on_target_symbols.isin(noness_gene_list)

print("# essential:",int(rs3val_zscores["is_essential"].sum()))
print("# nonessential:",int(rs3val_zscores["is_nonessential"].sum()))

Get library subsets

In [None]:
# Attach z-scores to every Jacquere guide that also appears in the z-score table
# (inner join on the guide sequence), then subset by CRISPick pick order.
jacquere_with_z_scores=jacquere.merge(rs3val_zscores,on="sgRNA Sequence")

pick_order=jacquere_with_z_scores["Pick Order"]
jacquere_top1=jacquere_with_z_scores.loc[pick_order==1].reset_index(drop=True)
jacquere_top2=jacquere_with_z_scores.loc[pick_order.isin([1,2])].reset_index(drop=True)
jacquere_top3=jacquere_with_z_scores.loc[pick_order.isin([1,2,3])].reset_index(drop=True)
# the full merged table is the 4 guides/gene library (same object, not a copy)
jacquere_top4=jacquere_with_z_scores


In [None]:
#Get average z-score per gene, separately for 1 vs 2 vs 3 vs 4 guides per gene in Jacquere
# not using stouffer's b/c biased for gene set size 

def _mean_z_per_gene(library, gene_column="Target Gene Symbol"):
    """Average the sgRNA z-scores of `library` per gene.

    Rows are grouped by `gene_column` together with the "is_essential" and
    "is_nonessential" flags. Returns one row per group with:
      mean_z   -- mean of "z_scored_sgRNA_lfc"
      n_sgrnas -- number of non-null "z_scored_sgRNA_lfc" values averaged
    """
    return (library.groupby([gene_column,"is_essential","is_nonessential"])
                   .agg(mean_z = ('z_scored_sgRNA_lfc', 'mean'),
                        n_sgrnas = ('z_scored_sgRNA_lfc', 'count'))
                   .reset_index())


# The top-1 table now also carries n_sgrnas, so all four tables share one schema.
gene_avg_z_jacqueretop1 = _mean_z_per_gene(jacquere_top1)
gene_avg_z_jacqueretop2 = _mean_z_per_gene(jacquere_top2)
gene_avg_z_jacqueretop3 = _mean_z_per_gene(jacquere_top3)
gene_avg_z_jacqueretop4 = _mean_z_per_gene(jacquere_top4)


In [None]:
#add randomly picked guides to comparison
random.seed(1)
sgRNAs_selected=[]
for gene in rs3val_zscores["On-target Gene Symbols"].unique():
    #all sgRNAs that target that gene 
    gene_sgRNAs=rs3val_zscores[rs3val_zscores["On-target Gene Symbols"]==gene]["sgRNA Sequence"].tolist()
    # Genes with fewer than 4 guides are skipped entirely. Previously the append ran
    # outside this check, which re-added the previous gene's sample (or raised
    # NameError when the very first gene had fewer than 4 guides).
    if len(gene_sgRNAs)<4:
        continue
    #select 4 of these sgRNAs randomly
    random_4=random.sample(gene_sgRNAs,4)
    sgRNAs_selected.extend(random_4)  # extend in place instead of rebuilding the list each iteration
random_library=rs3val_zscores[rs3val_zscores["sgRNA Sequence"].isin(sgRNAs_selected)].reset_index(drop=True).copy()

# mean z-score (and number of guides averaged) per gene in the random library
gene_avg_z_random_4 = (random_library.groupby(['On-target Gene Symbols',"is_essential","is_nonessential"])
                     .agg(mean_z = ('z_scored_sgRNA_lfc', 'mean'),
                          n_sgrnas = ('z_scored_sgRNA_lfc', 'count'))
                     .reset_index())


In [None]:
# add Brunello to comparison
brunello_annotation_path="../../5. Assessment of  CRISPRko Cas9 Genome Wide Libraries/Data/Original Annotations of CRISPRko Cas9 Genome Wide Libraries/STable 21 Brunello.xlsx"
brunello=pd.read_excel(brunello_annotation_path,sheet_name="Sheet1")
brunello_guides=brunello["sgRNA Target Sequence"].tolist()

# z-score rows whose guide sequence is part of Brunello
is_brunello_guide=rs3val_zscores["sgRNA Sequence"].isin(brunello_guides)
brunello_with_z_scores=rs3val_zscores[is_brunello_guide].reset_index(drop=True)

# mean z-score (and number of guides averaged) per gene
brunello_gene_groups=brunello_with_z_scores.groupby(['On-target Gene Symbols',"is_essential","is_nonessential"])
gene_avg_z_brunello=brunello_gene_groups.agg(
    mean_z=('z_scored_sgRNA_lfc', 'mean'),
    n_sgrnas=('z_scored_sgRNA_lfc', 'count'),
).reset_index()


In [None]:
# add Gattinara to comparison
gattinara_contents_path="../../5. Assessment of  CRISPRko Cas9 Genome Wide Libraries/Data/Original Annotations of CRISPRko Cas9 Genome Wide Libraries/broadgpp-gattinara-library-contents.txt"
gattinara=pd.read_table(gattinara_contents_path)

# keep only barcodes whose annotated gene symbol does not mention NEG_CONTROL
not_negative_control=gattinara["Annotated Gene Symbol"].str.count("NEG_CONTROL")==0
gattinara_guides=gattinara[not_negative_control]["Barcode Sequence"].tolist()

# z-score rows whose guide sequence is part of Gattinara
is_gattinara_guide=rs3val_zscores["sgRNA Sequence"].isin(gattinara_guides)
gattinara_with_z_scores=rs3val_zscores[is_gattinara_guide].reset_index(drop=True)

# mean z-score (and number of guides averaged) per gene
gattinara_gene_groups=gattinara_with_z_scores.groupby(['On-target Gene Symbols',"is_essential","is_nonessential"])
gene_avg_z_gattinara=gattinara_gene_groups.agg(
    mean_z=('z_scored_sgRNA_lfc', 'mean'),
    n_sgrnas=('z_scored_sgRNA_lfc', 'count'),
).reset_index()


Get false negative (% of essential genes with z-score not passing threshold) rates to assess optimal quota

In [None]:
def _essential_false_negative_rates(gene_avg_z, z_cutoffs=(-2, -3, -4)):
    """Compute false-negative rates among the essential genes of one library.

    Parameters
    ----------
    gene_avg_z : DataFrame with a boolean "is_essential" column and a "mean_z" column.
    z_cutoffs  : mean-z thresholds; a gene is called essential when mean_z < cutoff.

    Returns
    -------
    (essentials, rates)
        essentials -- the essential-gene rows, with one boolean column per cutoff
                      named "z<-2", "z<-3", ... (True when the gene passes the cutoff)
        rates      -- list, in the order of `z_cutoffs`, of the fraction of essential
                      genes that do NOT pass the cutoff

    Raises
    ------
    ValueError if `gene_avg_z` contains no essential genes (the rate is undefined).
    """
    essentials=gene_avg_z[gene_avg_z["is_essential"]].reset_index(drop=True)
    if len(essentials)==0:
        raise ValueError("no essential genes found; cannot compute a false negative rate")
    rates=[]
    for cutoff in z_cutoffs:
        passes_cutoff=f"z<{cutoff}"
        essentials[passes_cutoff]=essentials["mean_z"]<cutoff
        n_missed=len(essentials[essentials[passes_cutoff]==False])
        rates.append(n_missed/len(essentials))
    return essentials, rates


# One call per library; each unpacks into the essential-gene table and the
# false-negative rates at z < -2, z < -3 and z < -4.
gene_avg_z_jacqueretop1_essentials,(jacqueretop1_FNR_zneg2,jacqueretop1_FNR_zneg3,jacqueretop1_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_jacqueretop1)

gene_avg_z_jacqueretop2_essentials,(jacqueretop2_FNR_zneg2,jacqueretop2_FNR_zneg3,jacqueretop2_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_jacqueretop2)

gene_avg_z_jacqueretop3_essentials,(jacqueretop3_FNR_zneg2,jacqueretop3_FNR_zneg3,jacqueretop3_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_jacqueretop3)

gene_avg_z_jacqueretop4_essentials,(jacqueretop4_FNR_zneg2,jacqueretop4_FNR_zneg3,jacqueretop4_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_jacqueretop4)

gene_avg_z_random_4_essentials,(random_4_FNR_zneg2,random_4_FNR_zneg3,random_4_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_random_4)

gene_avg_z_brunello_essentials,(brunello_FNR_zneg2,brunello_FNR_zneg3,brunello_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_brunello)

gene_avg_z_gattinara_essentials,(gattinara_FNR_zneg2,gattinara_FNR_zneg3,gattinara_FNR_zneg4)=_essential_false_negative_rates(gene_avg_z_gattinara)


In [None]:
# False-negative rates per library (rows) and z-score cutoff (columns), as whole percentages.
libraries=["Random \nQuota=4","Brunello \nQuota=4","Gattinara \nQuota=2","Jacquere \nQuota=1","Jacquere \nQuota=2","Jacquere \nQuota=3","Jacquere \nQuota=4"]
FNR_df=pd.DataFrame({"Z<-2":[random_4_FNR_zneg2,brunello_FNR_zneg2,gattinara_FNR_zneg2,jacqueretop1_FNR_zneg2,jacqueretop2_FNR_zneg2,jacqueretop3_FNR_zneg2,jacqueretop4_FNR_zneg2],
             "Z<-3":[random_4_FNR_zneg3,brunello_FNR_zneg3,gattinara_FNR_zneg3,jacqueretop1_FNR_zneg3,jacqueretop2_FNR_zneg3,jacqueretop3_FNR_zneg3,jacqueretop4_FNR_zneg3],
              "Z<-4":[random_4_FNR_zneg4,brunello_FNR_zneg4,gattinara_FNR_zneg4,jacqueretop1_FNR_zneg4,jacqueretop2_FNR_zneg4,jacqueretop3_FNR_zneg4,jacqueretop4_FNR_zneg4]
             },index=libraries)
# convert decimals to percentages; round before casting because astype(int) truncates,
# so a value such as 0.57*100 == 56.99999999999999 would otherwise be shown as 56
FNR_df=FNR_df.multiply(100).round().astype(int)
FNR_df

In [None]:
# Grouped bar chart: one group of bars per library, one bar per z-score cutoff.
bar_positions = np.arange(len(libraries))  # left-most bar position of each library group
bar_width = 0.25

fig, ax = plt.subplots(layout='constrained')
ax.set_prop_cycle(cycler('color', plt.cm.tab20.colors))

for series_index, (zcutoff, FNR) in enumerate(FNR_df.items()):
    # shift each cutoff's bars one bar-width to the right of the previous cutoff's
    rects = ax.bar(bar_positions + bar_width * series_index, FNR, bar_width, label=zcutoff)
    ax.bar_label(rects, padding=3)

# Titles, legend and tick labels; the y ticks are hidden because every bar carries its own value label.
ax.set_title('Identifying essential genes from mean sgRNA Z-score',fontsize=16)
ax.set_ylabel('False Negative Rate (%)',fontsize=16)
ax.set_xticks(bar_positions + bar_width, libraries)  # centre the tick under the middle bar of each group
ax.legend(loc='upper right', ncols=3,title="Cutoff to classify gene as essential")
ax.set_yticks([])
ax.set_ylim([0,FNR_df.max().max()+5])  # headroom above the tallest bar
ax.set_xlabel("sgRNA Library",fontsize=16)
#plt.savefig("../Figures/jacquere_subsampling_FNR.png",bbox_inches="tight",dpi=600)
#data source: DeWeirdt 2022 

Note that this plot should **not** be interpreted to suggest that **statistical significance** decreases with the use of more guides; in reality, screens are not interpreted with the average z-score across guides, but rather with a downstream analysis method that assigns greater significance to genes with evidence from numerous guides. Rather, this plot reveals that the inclusion of 4th guide picks decreases guide quality, thus incorporating more guides that may not necessarily reflect the true phenotype of the gene. 

## Generate outputs of Jacquere, assembled

Subset to the top 3 guide picks, join with the negative controls, and generate an output file showing the optimal guide picks for each target. Features repeated guides/rows since certain guides are picked to target multiple genes


In [None]:
# Keep only the first three guide picks per target
is_top3_pick=jacquere["Pick Order"].isin([1,2,3])
jacquere_quota3=jacquere.loc[is_top3_pick].reset_index(drop=True)
#export for comparison to other libraries
#jacquere_quota3.to_csv("../Data/jacquere_assembled_crispick.csv",index=False)

In [None]:
# Per-target annotation table: top-3 targeting guides followed by the negative
# controls, with the CRISPick "Input" column exported as "Target Gene ID".
jacquere_export_columns=["Input","Target Gene Symbol","sgRNA Sequence","On-Target Ruleset","On-Target Efficacy Score","Aggregate CFD Score","Source"]

negative_controls=pd.concat([controls[jacquere_export_columns] for controls in (intergenics,nontargeting)])

jacquere_quota3_export=(
    pd.concat([jacquere_quota3[jacquere_export_columns],negative_controls])
      .rename(columns={"Input":"Target Gene ID"})
)
#jacquere_quota3_export.to_csv("../Jacquere_PerTargetAnnotations.csv",index=False)


Report as well the intended targets as **sets** of guides, so as to reveal which genes are intended to be targeted by the same optimal set of guides. This is an artifact of selecting the "target local guide selection" option in CRISPick.

In [None]:
# Get # guides picked to target >1 gene
guiderepeatcounts=pd.DataFrame(jacquere_quota3["sgRNA Sequence"].value_counts())
# Both numerator and denominator count unique guide sequences (previously unique
# repeated guides were divided by the total number of rows, which mixes units).
n_unique_guides=len(guiderepeatcounts)
n_guides_for_one_gene=int((guiderepeatcounts["count"]==1).sum())
pct_guides_for_one_gene=100*n_guides_for_one_gene/n_unique_guides if n_unique_guides else float("nan")
print("% of guides picked for exactly 1 gene:", pct_guides_for_one_gene)

# Get genes that share identical sets of guides
# dropna=False keeps targets that have no gene symbol; by default groupby silently
# drops every row whose grouping key is missing.
guides_per_input=jacquere_quota3.groupby(['Input','Target Gene Symbol','Source'],dropna=False)['sgRNA Sequence'].agg(lambda x:"_".join(sorted(x))).reset_index()
guides_per_input["guide set"]=guides_per_input["sgRNA Sequence"]

# One row per distinct guide set, listing every target that received it.
# n_genes_targeted counts "Input" because the symbol can be missing and
# "count" ignores missing values.
repeated_guide_sets=pd.DataFrame(guides_per_input.groupby(["guide set"]).agg(geneSymbols_targeted=("Target Gene Symbol",list),geneIDs_targeted=("Input",list),source_genes_targeted=("Source",list),n_genes_targeted=("Input","count"))).sort_values(by="n_genes_targeted",ascending=False).reset_index()
#repeated_guide_sets.to_csv("../Jacquere_PerGuideSetAnnotations.csv",index=False)

#report how many guide sets were picked for 1 vs. 2 etc genes
guide_set_repeat_counts=pd.DataFrame(guides_per_input["sgRNA Sequence"].value_counts())
guide_set_repeat_counts["count"].value_counts()

Generate reports detailing **all NCBI and Ensembl genes that guides map to** rather than just the genes that each guide is *intended* to target. Mappings from internal GPP LIMS.

In [None]:
#Jacquere guides mapped back to ensembl to see if untargeted genes are targeted by guides intended for other genes
jacquere_ensembl_mapped= pd.read_csv("../../5. Assessment of  CRISPRko Cas9 Genome Wide Libraries/Data/GENCODE47 Mappings of CRISPRko Cas9 Genome Wide Libraries/jacquere_quota3_adhoc_sgRNA_disco_GRCh38_Ensembl_SpyoCas9Ko_strict.csv")
#Above, but mapped to NCBI
jacquere_NCBI_mapped= pd.read_csv("../Data/jacquere_quota3_adhoc_sgRNA_disco_GRCh38_NCBI_SpyoCas9Ko_strict.csv")


# Attach the mapped gene symbols/IDs of each guide from both annotation sources.
# NOTE(review): this cell reassigns jacquere_quota3, so re-running it without first
# re-running the cell that creates jacquere_quota3 would merge the same columns again.
jacquere_quota3=jacquere_quota3.merge(jacquere_ensembl_mapped[["Target Sequence","On-target Gene Symbols","On-target Gene IDs"]],
                      left_on="sgRNA Sequence", right_on="Target Sequence",
                      how="left").drop(["Target Sequence"],axis=1).rename(columns={"On-target Gene Symbols":"Ensembl.v113 mapped gene symbols",
                                                                                  "On-target Gene IDs":"Ensembl.v113 mapped gene IDs"})
jacquere_quota3=jacquere_quota3.merge(jacquere_NCBI_mapped[["Target Sequence","On-target Gene Symbols","On-target Gene IDs"]],
                      left_on="sgRNA Sequence", right_on="Target Sequence",
                      how="left").drop(["Target Sequence"],axis=1).rename(columns={"On-target Gene Symbols":"NCBI 08-2024 mapped gene symbols",
                                                                                  "On-target Gene IDs":"NCBI 08-2024 mapped gene IDs"})

jacquere_quota3=jacquere_quota3.fillna("")
# After fillna("") a missing symbol is an empty string, so test for "" here
# (isna() can no longer match and would leave "Gene targeted" blank).
jacquere_quota3["Gene targeted"]=np.where(jacquere_quota3["Target Gene Symbol"]=="", jacquere_quota3["Input"],jacquere_quota3["Target Gene Symbol"])

# Sets give constant-time membership tests inside the row-wise apply below;
# the list versions are kept under their original names.
intended_target_symbols=jacquere_og["Target Gene Symbol"].tolist()
intended_target_symbol_set=set(intended_target_symbols)
jacquere_quota3["All Jacquere target symbols that guide maps to"]=jacquere_quota3.apply(lambda x: set(gene for gene in [x["Gene targeted"]]+x["NCBI 08-2024 mapped gene symbols"].split(",")+x["Ensembl.v113 mapped gene symbols"].split(",") if gene in intended_target_symbol_set),axis=1)
intended_target_IDs=jacquere_og["Input"].tolist()
intended_target_ID_set=set(intended_target_IDs)
jacquere_quota3["All Jacquere target IDs that guide maps to"]=jacquere_quota3.apply(lambda x: set(gene for gene in [x["Input"]]+x["NCBI 08-2024 mapped gene IDs"].split(",")+x["Ensembl.v113 mapped gene IDs"].split(",") if gene in intended_target_ID_set),axis=1)

# One row per (library row, target ID the guide maps to)
jacquere_for_gene_counts= jacquere_quota3.explode("All Jacquere target IDs that guide maps to")
jacquere_for_gene_counts["Target ID"]=jacquere_for_gene_counts["All Jacquere target IDs that guide maps to"]

print("# targeting guides in jacquere ", len(jacquere_for_gene_counts["sgRNA Sequence"].unique()))
print("# genes targeted", len(jacquere_for_gene_counts["Input"].unique()))

# A guide picked for several genes has one library row per gene, and each of those rows
# lists the same mapped targets; count every (guide, target) pair once so that shared
# guides do not inflate the number of guides per gene.
guide_target_pairs=jacquere_for_gene_counts.drop_duplicates(subset=["sgRNA Sequence","Target ID"])
num_guides_gene=pd.DataFrame(guide_target_pairs["Target ID"].value_counts()).reset_index()
num_guides_gene["# Guides targeting gene"]=num_guides_gene["count"]
#4 is a placeholder here for genes targeted by >3 guides
num_guides_gene["# Guides targeting gene"]=np.where(num_guides_gene["# Guides targeting gene"]>3,4,num_guides_gene["# Guides targeting gene"])
# Look the catalog up from the design table instead of pattern-matching the ID text:
# CHESS inputs are coordinate strings (no "CHS" prefix) and would be labelled RefSeq.
source_by_target_id=jacquere_og.drop_duplicates(subset="Input").set_index("Input")["Source"]
num_guides_gene["Source"]=num_guides_gene["Target ID"].map(source_by_target_id)
source_quota_counts=pd.DataFrame(num_guides_gene[["Source","# Guides targeting gene"]].value_counts()).reset_index()

#add in genes targeted by 0 guides
has_guide=jacquere_og["Input"].isin(jacquere_for_gene_counts["Target ID"].tolist())
genes_0_jacquere_guides=jacquere_og[~has_guide][["Source","Input","Target Gene Symbol"]].drop_duplicates()
source_counts_0_genes=pd.DataFrame(genes_0_jacquere_guides["Source"].value_counts()).reset_index()
source_counts_0_genes["# Guides targeting gene"]=0
# reindex fixes the row order (4+,3,2,1,0) and guarantees every row and catalog column
# exists, so the hard-coded row labels below always line up with the data.
source_quota_counts_table=(pd.concat([source_quota_counts,source_counts_0_genes])
                             .pivot(index="# Guides targeting gene",columns="Source",values="count")
                             .reindex(index=[4,3,2,1,0],columns=["GENCODE","RefSeq","CHESS"])
                             .fillna(0)
                             .astype(int))
source_quota_counts_table["Total"]=source_quota_counts_table.sum(axis=1)
source_quota_counts_table=source_quota_counts_table.rename(index={4:"4+"})

#plot
table_columns=["Total","GENCODE","RefSeq","CHESS"]
fig, ax = plt.subplots(figsize=(15,2))
ax.table(cellText=source_quota_counts_table[table_columns].values, 
         colLabels=source_quota_counts_table[table_columns].columns, 
         loc='center',colColours=['lightblue','0.8','0.8','0.8'],colWidths=[.1,.1,.1,.1],
         rowLabels=["# Targeted by >3 guides","# Targeted by 3 guides","# Targeted by 2 guides","# Targeted by 1 guide","# Not targetable"])
ax.axis('off')
fig.tight_layout()
plt.title("Genes targeted in Jacquere (subsequent catalogs report those not recognized by prior) ")
gpp.savefig("../Figures/jacquere_gene_counts_source_quota.pdf",bbox_inches="tight",dpi=600)

Output list of untargetable genes

In [None]:
#genes_0_jacquere_guides.to_csv("../Jacquere_untargetable_genes.csv",index=False)


Generate report in which guides are not repeated, and each line indicates all genes that the guide maps to. This is used to construct the Clone Pool Order to GPP Production for Jacquere 

In [None]:
# Collapse to one row per guide sequence, collecting every target ID and every
# set of target symbols recorded for that guide.
jacquere_no_guide_repeats=jacquere_for_gene_counts[["sgRNA Sequence","Target ID","All Jacquere target symbols that guide maps to"]].groupby("sgRNA Sequence").agg(list).reset_index()

# A guide picked for several genes contributes the same target IDs once per pick;
# dict.fromkeys drops the repeats while keeping first-seen order.
jacquere_no_guide_repeats["Target Gene ID"]=jacquere_no_guide_repeats["Target ID"].apply(lambda ids:"|".join(dict.fromkeys(str(i) for i in ids)))
# Take the union of the symbol sets from all of the guide's rows (not only the first
# row), sorted so that the exported string is the same on every run.
jacquere_no_guide_repeats["Target Gene Symbol"]=jacquere_no_guide_repeats["All Jacquere target symbols that guide maps to"].apply(lambda symbol_sets:"|".join(sorted({str(symbol) for symbols in symbol_sets for symbol in symbols})))

#add in the negative controls
per_guide_columns=["sgRNA Sequence","Target Gene Symbol","Target Gene ID"]
jacquere_no_guide_repeats_export=pd.concat([jacquere_no_guide_repeats[per_guide_columns],intergenics[per_guide_columns],nontargeting[per_guide_columns]])

#jacquere_no_guide_repeats_export.to_csv("../Jacquere_PerGuideAnnotations.csv",index=False)


In [None]:
jacquere_guides=jacquere_quota3["sgRNA Sequence"].tolist() #note: only includes targeting guides 
# Test membership against a set: scanning the Brunello list once per Jacquere guide
# is quadratic, while a set lookup gives the same result in constant time per guide.
brunello_guide_set=set(brunello_guides)
jacquere_guides_in_brunello=[g for g in jacquere_guides if g in brunello_guide_set]
print("% of jacquere guides that are in brunello:", 100*len(jacquere_guides_in_brunello)/len(jacquere_guides))

In [None]:
# Three-way Venn diagram of the guide sequences in each library
library_guide_sets=[set(brunello_guides),set(gattinara_guides),set(jacquere_guides)]
library_labels=('Brunello (2016)','Gattinara (2018)','Jacquere (2025)')
v=venn3(
    library_guide_sets,
    set_labels=library_labels,
    layout_algorithm=DefaultLayoutAlgorithm(),
    set_colors=("y","g","b"),
)
plt.title("Overlap of Targeting Guides in CRISPRko Cas9 Libraries")
gpp.savefig("../Figures/GPP_CRISPRko_cas9_genomewide_library_overlap.pdf",bbox_inches="tight",dpi=600)

### Identify if any guides violate gnomAD filtering

In [None]:
# Each row of jacquere_quota3 is a guide pick, so the percentage is over guides (the
# message previously said "genes"). A pick is flagged when its "Picking Notes" mention
# "Ancestry"; na=False treats missing notes as not flagged.
mentions_ancestry=jacquere_quota3["Picking Notes"].str.contains("Ancestry",regex=False,na=False)
print("% of targeting guides in Jacquere that violate gnomAD ancestry filter",100*int(mentions_ancestry.sum())/len(jacquere_quota3))

