In [1]:
import pandas as pd
import numpy as np
import re
import gzip


Assessing the effect of the library design choice in Jacquere to tolerate multi-target guides on guide selection for paralogs. 


In [2]:
#Retrieve the genes targeted by each guide in Jacquere 
jacquere_perguide=pd.read_csv("../../4. Jacquere Design/Jacquere_PerGuideAnnotations.csv")

#Assess the prevalence of multi-target guides in Jacquere:
#a guide counts as multi-target when its "Target Gene ID" entry holds more than one "ENS" identifier.
#.copy() gives an independent frame, so the column assignment further down does not
#trigger pandas' SettingWithCopyWarning.
multi_target_guides= jacquere_perguide[jacquere_perguide["Target Gene ID"].str.count("ENS")>1].copy()
print("# of guides in jacquere picked for numerous genes:",len(multi_target_guides))
#scale to percent BEFORE rounding: 100*round(x,2) can print float artifacts such as 7.000000000000001
print("% of total library:",round(100*len(multi_target_guides)/len(jacquere_perguide),0))

#split the "|"-delimited target field into a list of IDs (one list per guide)
multi_target_guides["Target Gene ID"]=multi_target_guides["Target Gene ID"].str.split("|")
#get each unique guide/target pair as a row
multi_target_guides_long=multi_target_guides.explode("Target Gene ID")
#keep ensembl gene IDs only 
multi_target_guides_long=multi_target_guides_long[multi_target_guides_long["Target Gene ID"].str.count("ENS")>0].reset_index(drop=True)
#get list of all targets that share guides with other targets
targets_that_share_guides= multi_target_guides_long["Target Gene ID"].unique().tolist()
print("\n# of genes that share guides with others in jacquere:",len(targets_that_share_guides))

#get total # of genes targeted for context 
#(same split -> explode -> keep-"ENS" steps as above, applied to the whole library)
jacquere_perguide["Target Gene ID"]=jacquere_perguide["Target Gene ID"].str.split("|")
jacquere_perguide_long=jacquere_perguide.explode("Target Gene ID")
jacquere_perguide_long=jacquere_perguide_long[jacquere_perguide_long["Target Gene ID"].str.count("ENS")>0].reset_index(drop=True)
all_targets= jacquere_perguide_long["Target Gene ID"].unique().tolist()
print("% of all genes targeted in jacquere:",round(100*len(targets_that_share_guides)/len(all_targets),0))

# of guides in jacquere picked for numerous guides: 3925
% of total library: 6.0

# of genes that share guides with others in jacquere: 2689
% of all genes targeted in jacquere: 13.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multi_target_guides["Target Gene ID"]=multi_target_guides["Target Gene ID"].apply(lambda x: x.split("|"))


## Identify how excluding multimapping guides increases the number of untargetable genes

Additionally examining how excluding multimapping guides removes *effective* guides necessary to target genes. These metrics are featured in the results section of the manuscript. For the purpose of this analysis, we define effective guides as those with RS3 > 0.2, and we disregard the presence of restriction sites in candidate guides, the spacing between guides, and the variability of the target according to gnomAD. 

In [None]:
#Candidate-guide table evaluated by CRISPick (gzip-compressed, tab-delimited)
filepath="../Data/CRISPick evaluated CRISPRko Cas9 Genome Wide Libraries/sgRNA_design_9606_GRCh38_SpyoCas9_CRISPRko_RS3seq-Chen2013+RS3target_Ensembl_20241028.txt.gz"
with gzip.open(filepath) as f:
    all_ensembl113_candidate_guides = pd.read_table(f)

#total # genes that can be targeted by any guides (i.e. have a PAM)
#NOTE(review): assumes each distinct "Input" value corresponds to one gene -- confirm against the CRISPick schema
all_inputs=all_ensembl113_candidate_guides["Input"].unique()
total_possible_targets= len(all_inputs)

#dataframe of guides that have no off-target matches in other genes 
has_no_other_match=all_ensembl113_candidate_guides['Other Target Matches'].isna()
no_other_target_matches= all_ensembl113_candidate_guides[has_no_other_match].reset_index(drop=True)

#guides passing the RS3 > 0.2 efficacy cutoff, with and without multimapping guides
effective_guides=all_ensembl113_candidate_guides[all_ensembl113_candidate_guides["On-Target Efficacy Score"]>0.2]
effective_single_target_guides=no_other_target_matches[no_other_target_matches["On-Target Efficacy Score"]>0.2]

#genes still covered after each filter
n_single_target_inputs=len(no_other_target_matches["Input"].unique())
n_effective_inputs=len(effective_guides["Input"].unique())
n_effective_single_target_inputs=len(effective_single_target_guides["Input"].unique())

print("# genes that at least have a PAM sequence:",total_possible_targets)
print("# genes that lose all candidate guides by removing multimapping guides:",total_possible_targets-n_single_target_inputs)
print("# genes that lose all candidate guides by removing RS3 <0.2 guides:",total_possible_targets-n_effective_inputs)
print("# genes that lose all candidate guides by removing RS3 <0.2 and multimapping guides:",total_possible_targets-n_effective_single_target_inputs)


The metrics above demonstrate that multi-target guides are often the effective guides needed to target a gene: removing multi-target guides causes 1,124 genes to not receive any guides, and 1,331 genes to not receive any *effective* guides. 

### Case study: Jacquere guide selections for paralogs

Fc gamma receptor genes FCGR2A/B/C

In [3]:
#Load the per-target Jacquere annotations (used below to look up the guides picked for each paralog)
jacquere_pertarget=pd.read_csv(
    "../../4. Jacquere Design/Jacquere_PerTargetAnnotations.csv"
)


In [4]:
#Retrieve % identity between the paralogs FCGR2B and FCGR2C
#File was downloaded from BioMart on 4/14/2025 with respect to Ensembl 113 (GENCODE47 gene annotations)

#the export is comma-delimited, so read_csv is the direct reader for it
biomart_ensembl113=pd.read_csv("../Data/mart_export_ensembl113_paralogs.txt")
#get average of AB and BA percent identity 
percent_identity_columns=["Paralogue %id. target Human gene identical to query gene",
                          "Paralogue %id. query gene identical to target Human gene"]
biomart_ensembl113["Mean percent identity"]=biomart_ensembl113[percent_identity_columns].mean(axis=1)
FCGR2B_GeneID="ENSG00000072694"
FCGR2C_GeneID="ENSG00000244682"

#row(s) where FCGR2B is the query gene and FCGR2C is its listed paralogue
is_FCGR2B_to_FCGR2C=(biomart_ensembl113["Gene stable ID"]==FCGR2B_GeneID)&(biomart_ensembl113["Human paralogue gene stable ID"]==FCGR2C_GeneID)
FCGR2B_FCGR2C_identity=biomart_ensembl113.loc[is_FCGR2B_to_FCGR2C,"Mean percent identity"]
#fail with a readable message instead of a bare IndexError if the pair is missing from the export
if FCGR2B_FCGR2C_identity.empty:
    raise ValueError("No BioMart paralogue row found for "+FCGR2B_GeneID+" -> "+FCGR2C_GeneID)
print("FCGR2B : FCGR2C mean % identity:",FCGR2B_FCGR2C_identity.values[0])


FCGR2B : FCGR2C mean % identity: 82.4993


In [5]:
#Jacquere rows for FCGR2B and FCGR2C, ordered by guide sequence so that a guide shared by
#both genes appears on adjacent rows
FCGR2_B_C_gene_ids=[FCGR2B_GeneID,FCGR2C_GeneID]
targets_FCGR2_B_or_C=jacquere_pertarget["Target Gene ID"].isin(FCGR2_B_C_gene_ids)
FCGR2_B_C_jacquere=jacquere_pertarget.loc[targets_FCGR2_B_or_C].sort_values(by="sgRNA Sequence")
FCGR2_B_C_jacquere

Unnamed: 0,Target Gene ID,Target Gene Symbol,sgRNA Sequence,On-Target Ruleset,On-Target Efficacy Score,Aggregate CFD Score,Source
3578,ENSG00000072694,FCGR2B,AAAGCACAGTCAGATGCACA,RS3seq-Chen2013+RS3target,0.9547,2.0222,GENCODE
55051,ENSG00000244682,FCGR2C,AGATTCCCATTGTGGAACCA,RS3seq-Chen2013+RS3target,0.3248,0.7446,GENCODE
3579,ENSG00000072694,FCGR2B,GGTGCTCCAGACCCCTCACC,RS3seq-Chen2013+RS3target,0.8991,3.6679,GENCODE
55049,ENSG00000244682,FCGR2C,GGTGCTCCAGACCCCTCACC,RS3seq-Chen2013+RS3target,0.8942,3.6679,GENCODE
55050,ENSG00000244682,FCGR2C,TCAGAGTCACAGAGTCCTCT,RS3seq-Chen2013+RS3target,0.4117,1.2727,GENCODE
3580,ENSG00000072694,FCGR2B,TGGAGCACGTTGATCCACTG,RS3seq-Chen2013+RS3target,0.4101,1.1875,GENCODE


In [6]:
#from CRISPick: all possible guides for FCGRB, C 
#could maybe use this to make figure: highlight all options, report other genes targeted, Agg CFD, RS3 score + add % similarity
all_FCGR2_B_C_guides=pd.read_table("../Data/FCGR2_B_C_allguides-sgrna-designs.txt")

#alphanumeric tokens enclosed in parentheses
#NOTE(review): presumably gene symbols (the tables displayed below show values like "FCGR2A,FCGR2C") -- confirm against the raw file
parenthesized_token=re.compile(r"\(([a-zA-Z0-9]+)\)")

def _join_parenthesized_tokens(other_target_matches):
    """Return the parenthesized alphanumeric tokens found in the value, comma-joined ("" if none).

    The value is converted with str() first, so missing values (NaN) simply yield "".
    """
    return ",".join(parenthesized_token.findall(str(other_target_matches)))

all_FCGR2_B_C_guides["Other Target Matches"]=all_FCGR2_B_C_guides["Other Target Matches"].apply(_join_parenthesized_tokens)


In [7]:
#show full cell contents (no truncation of long values) in displayed tables
pd.set_option('display.max_colwidth', None)

#every candidate guide designed for FCGR2B
is_FCGR2B_guide=all_FCGR2_B_C_guides["Target Gene Symbol"]=="FCGR2B"
all_FCGR2B_guides=all_FCGR2_B_C_guides.loc[is_FCGR2B_guide].reset_index(drop=True)

#restrict to the guides Jacquere lists for FCGR2B
jacquere_FCGR2B_sequences=jacquere_pertarget.loc[jacquere_pertarget["Target Gene ID"]==FCGR2B_GeneID,"sgRNA Sequence"].tolist()
all_FCGR2B_guides_in_Jacquere=all_FCGR2B_guides[all_FCGR2B_guides["sgRNA Sequence"].isin(jacquere_FCGR2B_sequences)]

#summary columns, ordered by the round in which each guide was picked
FCGR2B_summary_columns=["sgRNA Sequence","Other Target Matches","On-Target Efficacy Score","Aggregate CFD Score",'Off-Target Tier I CFD100 Hits','Picking Round']
all_FCGR2B_guides_in_Jacquere[FCGR2B_summary_columns].sort_values(by="Picking Round",ascending=True)


Unnamed: 0,sgRNA Sequence,Other Target Matches,On-Target Efficacy Score,Aggregate CFD Score,Off-Target Tier I CFD100 Hits,Picking Round
30,TGGAGCACGTTGATCCACTG,,0.4101,1.1875,0,1.0
1,AAAGCACAGTCAGATGCACA,"FCGR2A,FCGR2C",0.9547,2.0222,2,5.0
3,GGTGCTCCAGACCCCTCACC,"FCGR2A,FCGR2C",0.8991,3.6679,2,5.0


In [8]:
#every candidate guide designed for FCGR2C
is_FCGR2C_guide=all_FCGR2_B_C_guides["Target Gene Symbol"]=="FCGR2C"
all_FCGR2C_guides=all_FCGR2_B_C_guides.loc[is_FCGR2C_guide].reset_index(drop=True)

#restrict to the guides Jacquere lists for FCGR2C
jacquere_FCGR2C_sequences=jacquere_pertarget.loc[jacquere_pertarget["Target Gene ID"]==FCGR2C_GeneID,"sgRNA Sequence"].tolist()
all_FCGR2C_guides_in_Jacquere=all_FCGR2C_guides[all_FCGR2C_guides["sgRNA Sequence"].isin(jacquere_FCGR2C_sequences)]

#summary columns, ordered by the round in which each guide was picked
FCGR2C_summary_columns=["sgRNA Sequence","Other Target Matches","On-Target Efficacy Score","Aggregate CFD Score",'Off-Target Tier I CFD100 Hits','Picking Round']
all_FCGR2C_guides_in_Jacquere[FCGR2C_summary_columns].sort_values(by="Picking Round",ascending=True)


Unnamed: 0,sgRNA Sequence,Other Target Matches,On-Target Efficacy Score,Aggregate CFD Score,Off-Target Tier I CFD100 Hits,Picking Round
20,TCAGAGTCACAGAGTCCTCT,,0.4117,1.2727,0,1.0
27,AGATTCCCATTGTGGAACCA,,0.3248,0.7446,0,1.0
1,GGTGCTCCAGACCCCTCACC,"FCGR2B,FCGR2A",0.8942,3.6679,2,5.0
