# Generate CSN Complex (COP9 Signalosome) Libraries

In [1]:
import be_scan
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
# Relative path to the sgRNA test data directory.
sgrna_dir = '../../../tests/test_data/sgrna_data/'

# Gene inputs, protein inputs and generated libraries all live under one CSN
# folder, so each directory string is derived from that shared root.
csn_root = '../../../../../../Downloads/CSN/'
gene_dir, protein_dir, libraries_dir = (
    csn_root + subfolder for subfolder in ('Genes/', 'Proteins/', 'Libraries/')
)

# Subunit suffixes used to build the "COPS<suffix>" names in later cells.
subunits = '1 2 3 4 5 6 7A 7B 8'.split()

In [3]:
# One (gene name, gene FASTA path, protein FASTA path) tuple per subunit.
# COPS1 is written out by hand because its input files carry a 'GSP1' prefix
# rather than the 'COPS<suffix>' prefix used for every other subunit.
cop9 = [('COPS1', gene_dir+'GSP1_Input.fasta', protein_dir+'GSP1_Protein.fasta')]
cop9.extend(
    (f"COPS{s}", gene_dir+f"COPS{s}_Input.fasta", protein_dir+f"COPS{s}_Protein.fasta")
    for s in subunits[1:]
)

# (edit_from, edit_to) base pairs, one per base editor: A>G and C>T.
editors = [("A", "G"), ("C", "T")]

In [4]:
# Generate an annotated guide library for every (subunit, base editor) pair.
# Each be_scan.sgrna.guides call writes one CSV into libraries_dir.

# Reference genome passed to every call; hoisted because it never changes
# between iterations.
genome_file = "../../../../reference_genomes/GCF_000001405.26/ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna"

# for each subunit: (gene name, gene FASTA, protein FASTA)
for name, gene, protein in cop9:
    # for each base editor (ABE, CBE): (edit_from, edit_to)
    for e1, e2 in editors:
        be_scan.sgrna.guides(
                gene_filepath = gene,
                genome_file  = genome_file,
                protein_filepath = protein,
                cas_type      = "SpG",
                edit_from     = e1,
                edit_to       = e2,
                gene_name     = name,
                # File name must be "<name>_<X>to<Y>_annotated_guides.csv" with no
                # trailing underscore before ".csv": the merge cell reads exactly
                # that name from libraries_dir, and a mismatch makes a fresh
                # Restart & Run All fail to find the files written here.
                output_name   = '_'.join([name, e1+'to'+e2, 'annotated_guides.csv']),
                output_dir    = libraries_dir,
                )

Create gene object from ../../../../../../Downloads/CSN/Genes/GSP1_Input.fasta
Parsing exons: 14 exons found
Preprocessing sucessful
Guides generated and duplicates removed
40116757 lines processed from ../../../../reference_genomes/GCF_000001405.26/ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna
Guides checked against reference genome
Guides annotated
Complete! Library generated from ../../../../../../Downloads/CSN/Genes/GSP1_Input.fasta
Create gene object from ../../../../../../Downloads/CSN/Genes/GSP1_Input.fasta
Parsing exons: 14 exons found
Preprocessing sucessful
Guides generated and duplicates removed
40116757 lines processed from ../../../../reference_genomes/GCF_000001405.26/ncbi_dataset/data/GCF_000001405.26/GCF_000001405.26_GRCh38_genomic.fna
Guides checked against reference genome
Guides annotated
Complete! Library generated from ../../../../../../Downloads/CSN/Genes/GSP1_Input.fasta
Create gene object from ../../../../../../Downloads/CSN/Genes/COPS2_

In [5]:
# Merge each subunit's A>G and C>T guide tables into one library file, then
# re-annotate the merged file for both edit types at once.

# (A>G guides path, C>T guides path, merged output file name) per subunit.
libraries = [
    (
        libraries_dir+f"COPS{s}_AtoG_annotated_guides.csv",
        libraries_dir+f"COPS{s}_CtoT_annotated_guides.csv",
        f"COPS{s}_library.csv",
    )
    for s in subunits
]

# Walk subunits and their file triples in lockstep instead of keeping a
# separate positional counter.
for subunit, (abe, cbe, out) in zip(subunits, libraries):
    be_scan.sgrna.merge_guide_df(guide_df1_filepath = abe,
                                 guide_df2_filepath = cbe,
                                 output_dir = libraries_dir,
                                 output_name = out,
                                )

    # The merged file is read back and written to the same name in the same
    # directory, so annotation replaces the merged CSV in place.
    # NOTE(review): for COPS1 this uses COPS1_Protein.fasta, while the
    # guide-generation inputs use GSP1_Protein.fasta; confirm both files exist
    # and hold the same sequence.
    be_scan.sgrna.annotate_guides(guides_file     = libraries_dir+out,
                                 gene_filepath    = '',
                                 protein_filepath = protein_dir + f"COPS{subunit}_Protein.fasta",
                                 edit_from        = 'AC',
                                 edit_to          = 'GT',
                                 output_dir = libraries_dir,
                                 output_name = out,
                                 )


Guides annotated
Guides annotated
Guides annotated
Guides annotated
Guides annotated
Guides annotated
Guides annotated
Guides annotated
Guides annotated


In [7]:
# Stack the per-subunit libraries into a single CSN-wide table and save it.
cops = [pd.read_csv(libraries_dir+f"COPS{s}_library.csv") for s in subunits]

# ignore_index=True renumbers the combined rows 0..N-1. Without it each
# subunit's rows keep their own 0-based labels, so the combined frame (and the
# index column written to the CSV below) would contain duplicate row labels.
csn_lib = pd.concat(cops, ignore_index=True)
csn_lib.to_csv(libraries_dir+'CSN_Library_.csv')


In [None]:
# this is actually not the most efficient way to run this code
#    a more efficient order would be gen_guides, annotate, merge ABE CBE, annotate dual,
#    then check, so that check takes in all guides and only iterates through the genome once