## This notebook designs prime editing guide RNAs (pegRNAs) with PEGG for the H2M database.  

Kexin Dong  

May 1, 2024  

In [1]:
import bioh2m as h2m



In [2]:
from pegg import prime
import pandas as pd

In [30]:
# Load the human mutation table that serves as PEGG input.
df_all = pd.read_csv(
    filepath_or_buffer='/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_human.csv',
)

In [3]:
# Reference genome files (file names indicate GRCh37.p13 for human, GRCm39 for mouse).
path_h_ref = '/Users/kexindong/Documents/GitHub/Database/RefGenome/ncbi-2023-09-12/GCF_000001405.25_GRCh37.p13_genomic.fna.gz'
path_m_ref = '/Users/kexindong/Documents/GitHub/Database/RefGenome/mouse-2023-09-13/GCF_000001635.27_GRCm39_genomic.fna.gz'

# genome_loader returns a pair; only the first element (the chrom_dict passed
# to prime.run later) is used in this notebook. The second element lands in
# `i` and is overwritten by the second call.
chrom_dict_h, i = prime.genome_loader(path_h_ref)
chrom_dict_m, i = prime.genome_loader(path_m_ref)

# For Human  

In [32]:
# Keep every row whose Chromosome is not 'M' and renumber the index from 0.
# (presumably 'M' marks mitochondrial variants — confirm against the input table)
df_all = df_all.loc[df_all['Chromosome'].ne('M')].reset_index(drop=True)

In [34]:
# Coerce the columns PEGG reads to plain Python types: alleles as str,
# coordinates as int (int() raises if a position is missing/NaN).
for _col in ('Reference_Allele', 'Tumor_Seq_Allele2'):
    df_all[_col] = list(map(str, df_all[_col]))
for _col in ('Start_Position', 'End_Position'):
    df_all[_col] = list(map(int, df_all[_col]))

# Design pegRNAs for the human table against the human reference.
# The settings below are identical to the ones used in the mouse section.
df_pe_h = prime.run(
    df_all,
    'cBioPortal',
    chrom_dict=chrom_dict_h,
    PAM='NGN',
    rankby='RF_Score',
    pegRNAs_per_mut=5,
    RTT_lengths=[5, 10, 15, 25, 30],
    PBS_lengths=[8, 10, 13, 15],
    min_RHA_size=1,
    RE_sites=None,
    polyT_threshold=4,
    proto_size=19,
    context_size=120,
    before_proto_context=5,
    sensor_length=60,
    sensor_orientation='reverse-complement',
    sensor=True,
)

In [38]:
# Number of distinct IDs in the human PEGG result (a missing ID, if any, counts as one value).
df_pe_h['ID'].nunique(dropna=False)

5177

In [41]:
# Rows per gene symbol in the human input table, most frequent first.
df_all.loc[:, 'Hugo_Symbol'].value_counts()

Hugo_Symbol
TP53      588
APC       118
PIK3CA    100
PTEN       95
ARID1A     85
         ... 
FGF14       1
FGA         1
ERCC1       1
ERRFI1      1
SETD1B      1
Name: count, Length: 614, dtype: int64

In [44]:
# Persist the human PEGG result without the DataFrame index.
df_pe_h.to_csv(
    path_or_buf='/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_human_result.csv',
    index=False,
)

# For Mouse  

In [33]:
# Load the mouse mutation table; this rebinds df_all, replacing the human table.
df_all = pd.read_csv(
    filepath_or_buffer='/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_mouse.csv',
)

In [34]:
# Relabel Variant_Type 'TNP' as 'ONP'; every other value passes through unchanged.
# NOTE(review): the human section above has no equivalent step — confirm that is intended.
dict_of_type = {'TNP': 'ONP'}
df_all['Variant_Type'] = df_all['Variant_Type'].map(lambda vt: dict_of_type.get(vt, vt))

In [35]:
# Keep every row whose Chromosome is not 'M' and renumber the index from 0.
# (presumably 'M' marks mitochondrial variants — confirm against the input table)
df_all = df_all.loc[df_all['Chromosome'].ne('M')].reset_index(drop=True)

In [36]:
# Coerce the columns PEGG reads to plain Python types: alleles as str,
# coordinates as int (int() raises if a position is missing/NaN).
for _col in ('Reference_Allele', 'Tumor_Seq_Allele2'):
    df_all[_col] = list(map(str, df_all[_col]))
for _col in ('Start_Position', 'End_Position'):
    df_all[_col] = list(map(int, df_all[_col]))

# Design pegRNAs for the mouse table against the mouse reference.
# The settings below are identical to the ones used in the human section.
df_pe_m = prime.run(
    df_all,
    'cBioPortal',
    chrom_dict=chrom_dict_m,
    PAM='NGN',
    rankby='RF_Score',
    pegRNAs_per_mut=5,
    RTT_lengths=[5, 10, 15, 25, 30],
    PBS_lengths=[8, 10, 13, 15],
    min_RHA_size=1,
    RE_sites=None,
    polyT_threshold=4,
    proto_size=19,
    context_size=120,
    before_proto_context=5,
    sensor_length=60,
    sensor_orientation='reverse-complement',
    sensor=True,
)

# Show the resulting table as the cell output.
df_pe_m



Unnamed: 0,mutation_idx,Hugo_Symbol,tx_id_m,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,Variant_Type,ID,Chromosome,...,indel_size,PEGG2_Score,RF_Score,pegRNA_rank,sensor_wt,sensor_alt,sensor_orientation,sensor_error,contains_polyT_terminator,pegRNA_id
0,0,Kras,ENSMUST00000111710.8,145192497,145192497,C,T,SNP,AACR-H0000001-M01-V01,6,...,0,20.660669,30.350048,1.0,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGGTGGCGTAGG...,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGATGGCGTAGG...,reverse-complement,No Error,False,pegRNA_0
1,0,Kras,ENSMUST00000111710.8,145192497,145192497,C,T,SNP,AACR-H0000001-M01-V01,6,...,0,20.310904,27.444430,2.0,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGGTGGCGTAGG...,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGATGGCGTAGG...,reverse-complement,No Error,False,pegRNA_1
2,0,Kras,ENSMUST00000111710.8,145192497,145192497,C,T,SNP,AACR-H0000001-M01-V01,6,...,0,21.117170,24.007503,3.0,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGGTGGCGTAGG...,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGATGGCGTAGG...,reverse-complement,No Error,False,pegRNA_2
3,0,Kras,ENSMUST00000111710.8,145192497,145192497,C,T,SNP,AACR-H0000001-M01-V01,6,...,0,20.669312,23.256523,4.0,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGGTGGCGTAGG...,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGATGGCGTAGG...,reverse-complement,No Error,False,pegRNA_3
4,0,Kras,ENSMUST00000111710.8,145192497,145192497,C,T,SNP,AACR-H0000001-M01-V01,6,...,0,20.302260,22.063635,5.0,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGGTGGCGTAGG...,AAATGACTGAGTATAAACTTGTGGTGGTTGGAGCTGATGGCGTAGG...,reverse-complement,No Error,False,pegRNA_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23530,4708,Bcl9,ENSMUST00000046521.14,97117238,97117238,C,T,SNP,AACR-H0005030-M01-V01,3,...,0,17.825962,27.045849,1.0,GAGCACTGCTGGACGACCACCTGCTCCTGTTTCCGCCTCTTCTCTT...,GAGCACTGCTGGACGACCACCTGCTCCTGTTTTCGCCTCTTCTCTT...,reverse-complement,No Error,True,pegRNA_23530
23531,4708,Bcl9,ENSMUST00000046521.14,97117238,97117238,C,T,SNP,AACR-H0005030-M01-V01,3,...,0,17.151807,21.992069,2.0,GAGTTTTATGAAGAGAAGAGGCGGAAACAGGAGCAGGTGGTCGTCC...,GAGTTTTATGAAGAGAAGAGGCGAAAACAGGAGCAGGTGGTCGTCC...,reverse-complement,No Error,False,pegRNA_23531
23532,4708,Bcl9,ENSMUST00000046521.14,97117238,97117238,C,T,SNP,AACR-H0005030-M01-V01,3,...,0,17.394716,21.161375,3.0,GAGCACTGCTGGACGACCACCTGCTCCTGTTTCCGCCTCTTCTCTT...,GAGCACTGCTGGACGACCACCTGCTCCTGTTTTCGCCTCTTCTCTT...,reverse-complement,No Error,True,pegRNA_23532
23533,4708,Bcl9,ENSMUST00000046521.14,97117238,97117238,C,T,SNP,AACR-H0005030-M01-V01,3,...,0,17.173775,19.967213,4.0,GAGTTTTATGAAGAGAAGAGGCGGAAACAGGAGCAGGTGGTCGTCC...,GAGTTTTATGAAGAGAAGAGGCGAAAACAGGAGCAGGTGGTCGTCC...,reverse-complement,No Error,False,pegRNA_23533


In [37]:
# Number of distinct IDs in the mouse PEGG result (a missing ID, if any, counts as one value).
df_pe_m['ID'].nunique(dropna=False)

4707

In [None]:
# Rows per gene symbol in the mouse input table, most frequent first.
# NOTE(review): this cell has no execution count and its saved output is
# identical to the human section's — re-run it to refresh the output.
df_all.loc[:, 'Hugo_Symbol'].value_counts()

Hugo_Symbol
TP53      588
APC       118
PIK3CA    100
PTEN       95
ARID1A     85
         ... 
FGF14       1
FGA         1
ERCC1       1
ERRFI1      1
SETD1B      1
Name: count, Length: 614, dtype: int64

In [39]:
# Persist the mouse PEGG result without the DataFrame index.
df_pe_m.to_csv(
    path_or_buf='/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_mouse_result.csv',
    index=False,
)

# Bind human and mouse data  

In [41]:
# Reload the saved human PEGG result from disk (df_pe_m is taken from memory below).
df_pe_h = pd.read_csv(
    filepath_or_buffer='/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_human_result.csv',
)

In [46]:
# Give the species-specific transcript-ID columns one shared name so the
# two result tables line up column-for-column when concatenated.
df_pe_h = df_pe_h.rename({'tx_id_h': 'tx_id'}, axis='columns')
df_pe_m = df_pe_m.rename({'tx_id_m': 'tx_id'}, axis='columns')

In [51]:
# Stack human rows on top of mouse rows, drop the pegRNA_id and tx_id columns,
# and write the combined table without the index.
(
    pd.concat([df_pe_h, df_pe_m])
    .drop(columns=['pegRNA_id', 'tx_id'])
    .to_csv(
        '/Users/kexindong/Documents/GitHub/Output/h2m_database/PEGG/pe_result_all.csv',
        index=False,
    )
)

# Statistics  