## W2-CRISPR-cBEST workflow


Base editing with CRISPR-BEST is a DSB-free method that uses a single sgRNA to target a specific genomic location with single-nucleotide resolution (Figure 3A). This can enable in vivo protein engineering or, as it is most often used, the introduction of early stop codons [10,12]. It can be used to introduce targeted point mutations in the genome of Streptomyces, facilitating studies on gene function and protein engineering. To get started quickly, users can download the pCRISPR-cBEST plasmid file and the genome file for S. coelicolor A3(2); StreptoCAD will then generate the plasmids based on the genes you choose to target (Figure 3, S5).

In [1]:
import sys
import os

# Ensure the src directory is in the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import os
import pandas as pd
from pydna.dseqrecord import Dseqrecord
from datetime import datetime
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO




from streptocad.sequence_loading.sequence_loading import load_and_process_gene_sequences, load_and_process_plasmid, load_and_process_genome_sequences, annotate_dseqrecord, check_and_convert_input,process_specified_gene_sequences_from_record
from streptocad.utils import polymerase_dict,ProjectDirectory, extract_metadata_to_dataframe
from streptocad.crispr.guideRNAcas3_9_12 import extract_sgRNAs, SgRNAargs
from streptocad.cloning.ssDNA_bridging import assemble_plasmids_by_ssDNA_bridging, make_ssDNA_oligos
from streptocad.crispr.crispr_best import identify_base_editing_sites, filter_sgrnas_for_base_editing, process_base_editing
from streptocad.cloning.plasmid_processing import annotate_plasmid_with_sgrnas
from streptocad.primers.primer_generation import checking_primers, create_idt_order_dataframe, primers_to_IDT


## INPUT

In [2]:
# Inputs
# 1 Add genome of choice (genbank, fasta); only the first record returned by the loader is used
path_to_genome = '../../data/genomes/Streptomyces_coelicolor_A3_chromosome.gb'
genome = load_and_process_genome_sequences(path_to_genome)[0]

# 2 Add plasmid
# NOTE(review): the file name below starts with a space - confirm it matches the file on disk
path_to_plasmid = '../../data/plasmids/ pCRISPR-cBEST Sequences.gbk'
clean_plasmid = load_and_process_plasmid(path_to_plasmid)

# 3 Choose targets to knock out (list): either locus tags (commented example)
#   or 'start-end' coordinate ranges, which are turned into 'chosen_region_N' targets further down
#genes_to_KO = ['SCO5087','SCO5088', 'SCO5089']
genes_to_KO = ['80000-100000', '4000-7000', '9000-14000','15000-20000']


#### Advanced settings ####
# 4 Choose polymerase and target melting temperature FOR CHECKING PRIMERS
chosen_polymerase = polymerase_dict['Phusion High-Fidelity DNA Polymerase (GC Buffer)']


# Target Tm for the checking primers (presumably in degrees Celsius - TODO confirm)
melting_temperature = 65
# Primer concentration handed to the primer design (unit not visible here - TODO confirm)
primer_concentration = 0.4
primer_number_increment = 1
# Size of the region flanking each target that is used when designing checking primers
flanking_region_number = 500

# 5 Filtering metrics for sgRNAs
# GC-content bounds (fractions) and off-target limits passed to the sgRNA search
gc_upper = 0.99
gc_lower = 0.01
# presumably the length of the seed sequence used for the off-target search - TODO confirm
off_target_seed = 13
# presumably the maximum number of off-target hits an sgRNA may have - TODO confirm
off_target_upper = 10
cas_type='cas9'
number_of_sgRNAs_per_group = 5
# Setting for keeping only sgRNAs whose edit creates a stop codon
only_stop_codons = True

# 6 Choose overlapping sequences for our plasmid; we can use the following.
# As per the article **"CRISPR–Cas9, CRISPRi and CRISPR-BEST-mediated genetic manipulation in streptomycetes"** we need the following oligos:
# CGGTTGGTAGGATCGACGGC **-N20-** GTTTTAGAGCTAGAAATAGC
up_homology = Dseqrecord('CGGTTGGTAGGATCGACGGC')
dw_homology = Dseqrecord('GTTTTAGAGCTAGAAATAGC')

# Computation

In [3]:
len(genome.features)


25824

In [4]:
# Normalise the user selection. check_and_convert_input returns the parsed
# targets, the target names to use from here on, and a flag telling whether
# the targets still have to be annotated onto the genome record.
target_dict, genes_to_KO, annotation_input = check_and_convert_input(genes_to_KO)

print(annotation_input)
if annotation_input:
    # Add the requested targets to the genome record as features so that the
    # downstream steps can address them by name.
    genome = annotate_dseqrecord(genome, target_dict)

# Feature count after the optional annotation step (compare with the count before).
len(genome.features)


True
this is the target dict [{'chosen_region_1': [80000, 100000]}, {'chosen_region_2': [4000, 7000]}, {'chosen_region_3': [9000, 14000]}, {'chosen_region_4': [15000, 20000]}]
chosen_region_1 [80000, 100000]
chosen_region_2 [4000, 7000]
chosen_region_3 [9000, 14000]
chosen_region_4 [15000, 20000]


25828

In [5]:
from Bio.SeqRecord import SeqRecord

def check_locus_tags_in_record(seq_record: "SeqRecord", locus_tags: list) -> dict:
    """
    Report which of the requested locus tags occur on CDS features of a record.

    Parameters
    ----------
    seq_record : SeqRecord
        Record whose ``features`` are scanned. Only features whose ``type``
        is ``"CDS"`` are considered.
    locus_tags : list
        Locus tags to look for.

    Returns
    -------
    dict
        Maps every requested locus tag to ``True`` if some CDS feature has it
        as the first value of its ``locus_tag`` qualifier, otherwise ``False``.
    """
    found_locus_tags = {tag: False for tag in locus_tags}

    for feature in seq_record.features:
        if feature.type != "CDS":
            continue

        # A CDS without a locus_tag qualifier (or with an empty one) cannot
        # match anything; skip it instead of indexing into an empty list,
        # which used to raise IndexError.
        tag_values = feature.qualifiers.get("locus_tag")
        if not tag_values:
            continue

        locus_tag = tag_values[0]
        if locus_tag in found_locus_tags:
            found_locus_tags[locus_tag] = True

    return found_locus_tags


locus_tags_presence = check_locus_tags_in_record(genome, ['chosen_region_1', 'chosen_region_2','chosen_region_3','chosen_region_4'])
locus_tags_presence


{'chosen_region_1': True,
 'chosen_region_2': True,
 'chosen_region_3': True,
 'chosen_region_4': True}

In [6]:
genes_to_KO

['chosen_region_1', 'chosen_region_2', 'chosen_region_3', 'chosen_region_4']

We need to check the input genes so we can find sgRNAs. 

In [7]:

# Collect the sgRNA search settings. The thresholds and the Cas type are taken
# from the INPUT cell so that changing them there actually changes the search.
# NOTE(review): the recorded run of this cell failed with
# "SgRNAargs.__init__() missing 1 required positional argument: 'revcomp'".
# A value for `revcomp` has to be supplied here - confirm the expected value
# against the SgRNAargs signature.
args = SgRNAargs(genome,
                genes_to_KO,
                step=['find', 'filter'],
                gc_upper=gc_upper,
                gc_lower=gc_lower,
                off_target_seed=off_target_seed,
                off_target_upper=off_target_upper,
                cas_type=cas_type,
                )

# Run the search; the result is a table with one row per candidate sgRNA.
sgrna_df = extract_sgRNAs(args)
sgrna_df

TypeError: SgRNAargs.__init__() missing 1 required positional argument: 'revcomp'

In [None]:
# import tempfile

# #### wip 

# def save_file(name, content):
#     """Save the Dseqrecord to a file."""
#     with open(name, "w") as fp:
#         SeqIO.write(content, fp, "genbank")


# # Make a temporary filepath if you want to 
# with tempfile.TemporaryDirectory() as tempdir:

#     # Check if annotation is needed and make a temporary path to the file
#     if annotation_input == True:
#         path_to_genome = os.path.join(tempdir, str(genome.name))
#         # Save uploaded files to the temporary directory
#         save_file(path_to_genome, genome)

#     print(genes_to_KO)
#     # Initialize SgRNAargs with desired parameters
#     args = SgRNAargs(path_to_genome, 
#                      locus_tag=genes_to_KO,
#                      step=['find', 'filter'],
#                      gc_upper=gc_upper,
#                      gc_lower=gc_lower,
#                      off_target_seed=off_target_seed,
#                      off_target_upper=off_target_upper,
#                      cas_type=cas_type)
    
#     # Call the function to extract sgRNAs
#     sgrna_df = extract_sgRNAs(args)

# sgrna_df

['chosen_region_1', 'chosen_region_2', 'chosen_region_3', 'chosen_region_4']
sgRNA generated were outside the designated border in chosen_region_1. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_1. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in chosen_region_2. To incorporate this extent borders. Skippi

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
3517,NC_003888,chosen_region_1,80001,1,1,6574,0.60,TGG,GCCACATTCACTACGACCCC,TCACTACGACCCC,0
2538,NC_003888,chosen_region_1,80001,1,1,16046,0.65,GGG,CCCTGATCCGAGTGTGTGCC,CCGAGTGTGTGCC,0
2539,NC_003888,chosen_region_1,80001,1,1,16045,0.60,CGG,ACCCTGATCCGAGTGTGTGC,TCCGAGTGTGTGC,0
2540,NC_003888,chosen_region_1,80001,1,1,16018,0.55,GGG,ACGACAATCCGGTGATCAGC,TCCGGTGATCAGC,0
2541,NC_003888,chosen_region_1,80001,1,1,16017,0.50,CGG,TACGACAATCCGGTGATCAG,ATCCGGTGATCAG,0
...,...,...,...,...,...,...,...,...,...,...,...
5096,NC_003888,chosen_region_3,9001,1,-1,1710,0.80,AGG,GGAATCGGGCGAGGGCGGCC,GGCGAGGGCGGCC,9
4158,NC_003888,chosen_region_1,80001,1,1,137,0.85,TGG,CGAGCACACGGCGCGGGCGC,ACGGCGCGGGCGC,9
1982,NC_003888,chosen_region_1,80001,1,-1,18880,0.80,TGG,CACCTACGGCGCGCTCGGCG,GGCGCGCTCGGCG,9
6674,NC_003888,chosen_region_4,15001,1,1,3011,0.90,GGG,CGGCGCCGCCGACGCCGACG,GCCGACGCCGACG,10


In [None]:
value_counts = sgrna_df['locus_tag'].value_counts()
value_counts

locus_tag
chosen_region_1    4164
chosen_region_3    1131
chosen_region_4    1097
chosen_region_2     634
Name: count, dtype: int64

In [None]:
len(genome.features)

25828

In [None]:
# Fetch the sequence of every requested target from the genome record and
# keep only the targets for which a sequence came back.
gene_sequences = process_specified_gene_sequences_from_record(genome, genes_to_KO)
genes_to_KO_dict = {
    tag: gene_sequences[tag]
    for tag in genes_to_KO
    if tag in gene_sequences
}

# Mark the editable positions of each sgRNA (adds the editable_cytosines column)
sgrna_df_with_editing = identify_base_editing_sites(sgrna_df)

# Keep only the sgRNAs that result in base-editing
filtered_sgrna_df_for_base_editing = filter_sgrnas_for_base_editing(
    sgrna_df_with_editing
)
filtered_sgrna_df_for_base_editing

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines
3517,NC_003888,chosen_region_1,80001,1,1,6574,0.60,TGG,GCCACATTCACTACGACCCC,TCACTACGACCCC,0,359
2538,NC_003888,chosen_region_1,80001,1,1,16046,0.65,GGG,CCCTGATCCGAGTGTGTGCC,CCGAGTGTGTGCC,0,389
2539,NC_003888,chosen_region_1,80001,1,1,16045,0.60,CGG,ACCCTGATCCGAGTGTGTGC,TCCGAGTGTGTGC,0,34910
2540,NC_003888,chosen_region_1,80001,1,1,16018,0.55,GGG,ACGACAATCCGGTGATCAGC,TCCGGTGATCAGC,0,5910
2541,NC_003888,chosen_region_1,80001,1,1,16017,0.50,CGG,TACGACAATCCGGTGATCAG,ATCCGGTGATCAG,0,3610
...,...,...,...,...,...,...,...,...,...,...,...,...
5096,NC_003888,chosen_region_3,9001,1,-1,1710,0.80,AGG,GGAATCGGGCGAGGGCGGCC,GGCGAGGGCGGCC,9,610
4158,NC_003888,chosen_region_1,80001,1,1,137,0.85,TGG,CGAGCACACGGCGCGGGCGC,ACGGCGCGGGCGC,9,579
1982,NC_003888,chosen_region_1,80001,1,-1,18880,0.80,TGG,CACCTACGGCGCGCTCGGCG,GGCGCGCTCGGCG,9,34710
6674,NC_003888,chosen_region_4,15001,1,1,3011,0.90,GGG,CGGCGCCGCCGACGCCGACG,GCCGACGCCGACG,10,467910


In [None]:
# Apply the C-to-T edits of every sgRNA and list the resulting amino-acid
# changes; here all sgRNAs are kept, not only those creating a stop codon.
mutated_sgrna_df = process_base_editing(
    filtered_sgrna_df_for_base_editing,
    genes_to_KO_dict,
    only_stop_codons=False,
)
mutated_sgrna_df



Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
3517,NC_003888,chosen_region_1,80001,1,1,6574,0.60,TGG,GCCACATTCACTACGACCCC,TCACTACGACCCC,0,359,"P2186L, H2187Y, S2188L"
2538,NC_003888,chosen_region_1,80001,1,1,16046,0.65,GGG,CCCTGATCCGAGTGTGTGCC,CCGAGTGTGTGCC,0,389,S5345F
2539,NC_003888,chosen_region_1,80001,1,1,16045,0.60,CGG,ACCCTGATCCGAGTGTGTGC,TCCGAGTGTGTGC,0,34910,"P5343L, S5345F"
2540,NC_003888,chosen_region_1,80001,1,1,16018,0.55,GGG,ACGACAATCCGGTGATCAGC,TCCGGTGATCAGC,0,5910,"Q5335*, S5336F"
2541,NC_003888,chosen_region_1,80001,1,1,16017,0.50,CGG,TACGACAATCCGGTGATCAG,ATCCGGTGATCAG,0,3610,"R5334*, Q5335*, S5336F"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5096,NC_003888,chosen_region_3,9001,1,-1,1710,0.80,AGG,GGAATCGGGCGAGGGCGGCC,GGCGAGGGCGGCC,9,610,D569N
4158,NC_003888,chosen_region_1,80001,1,1,137,0.85,TGG,CGAGCACACGGCGCGGGCGC,ACGGCGCGGGCGC,9,579,"A41V, H42Y"
1982,NC_003888,chosen_region_1,80001,1,-1,18880,0.80,TGG,CACCTACGGCGCGCTCGGCG,GGCGCGCTCGGCG,9,34710,"A6291T, V6292I, G6293N"
6674,NC_003888,chosen_region_4,15001,1,1,3011,0.90,GGG,CGGCGCCGCCGACGCCGACG,GCCGACGCCGACG,10,467910,"R999C, R1000C, R1001*"


In [None]:
# Apply the C-to-T edits again, this time honouring the `only_stop_codons`
# setting from the INPUT cell instead of a hard-coded value, so the table used
# for the rest of the workflow follows the user's choice.
mutated_sgrna_df = process_base_editing(
    filtered_sgrna_df_for_base_editing,
    genes_to_KO_dict,
    only_stop_codons=only_stop_codons,
)
mutated_sgrna_df



Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
4173,NC_003888,chosen_region_2,4001,1,-1,35,0.75,AGG,CCCCAGACACCACGCACGCC,CACCACGCACGCC,1,34810,"G9D, V10I, W11*"
4172,NC_003888,chosen_region_2,4001,1,-1,34,0.70,GGG,CCCAGACACCACGCACGCCA,ACCACGCACGCCA,1,37910,"G9N, V10I, W11*"
5938,NC_003888,chosen_region_4,15001,1,-1,54,0.70,AGG,GCTCAACCTCCAGGCCGGAC,CTCCAGGCCGGAC,1,47810,"W15*, R16K"
4166,NC_003888,chosen_region_1,80001,1,1,80,0.55,AGG,GTTGTCCGTCAGCACTTCGA,GTCAGCACTTCGA,0,6710,"R23C, Q24*"
4165,NC_003888,chosen_region_1,80001,1,1,81,0.50,GGG,TTGTCCGTCAGCACTTCGAA,TCAGCACTTCGAA,1,569,"R23C, Q24*"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2132,NC_003888,chosen_region_1,80001,1,1,19620,0.85,AGG,GCCGACCCGCGCGCTGGTCG,CGCGCGCTGGTCG,1,367810,"R6535*, P6536F, A6537V"
2133,NC_003888,chosen_region_1,80001,1,1,19614,0.85,TGG,CGCTGAGCCGACCCGCGCGC,CCGACCCGCGCGC,0,389,R6535*
2053,NC_003888,chosen_region_1,80001,1,-1,19624,0.75,CGG,ACCTCGACCAGCGCGCGGGT,CCAGCGCGCGGGT,0,3589,"W6539*, R6541K"
2065,NC_003888,chosen_region_1,80001,1,-1,19714,0.55,AGG,GCCAGCATCTCGACTTCTTC,TCTCGACTTCTTC,0,369,"R6569K, C6570Y, W6571*"


In [None]:
# Keep at most `number_of_sgRNAs_per_group` sgRNAs for each locus_tag,
# taking the first rows of every group in their current order.
sgrnas_by_locus = mutated_sgrna_df.groupby('locus_tag')
filtered_df = sgrnas_by_locus.head(number_of_sgRNAs_per_group)
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
4173,NC_003888,chosen_region_2,4001,1,-1,35,0.75,AGG,CCCCAGACACCACGCACGCC,CACCACGCACGCC,1,34810,"G9D, V10I, W11*"
4172,NC_003888,chosen_region_2,4001,1,-1,34,0.7,GGG,CCCAGACACCACGCACGCCA,ACCACGCACGCCA,1,37910,"G9N, V10I, W11*"
5938,NC_003888,chosen_region_4,15001,1,-1,54,0.7,AGG,GCTCAACCTCCAGGCCGGAC,CTCCAGGCCGGAC,1,47810,"W15*, R16K"
4166,NC_003888,chosen_region_1,80001,1,1,80,0.55,AGG,GTTGTCCGTCAGCACTTCGA,GTCAGCACTTCGA,0,6710,"R23C, Q24*"
4165,NC_003888,chosen_region_1,80001,1,1,81,0.5,GGG,TTGTCCGTCAGCACTTCGAA,TCAGCACTTCGAA,1,569,"R23C, Q24*"
5932,NC_003888,chosen_region_3,9001,1,1,118,0.65,CGG,TGCCCGATCTTCCAGGCCAG,TCTTCCAGGCCAG,1,3459,"A34V, R35*, S36F"
4160,NC_003888,chosen_region_1,80001,1,1,130,0.75,CGG,CGAAGGCCGAGCACACGGCG,CGAGCACACGGCG,0,78,R40*
4159,NC_003888,chosen_region_1,80001,1,1,131,0.75,GGG,GAAGGCCGAGCACACGGCGC,GAGCACACGGCGC,0,67,R40*
5931,NC_003888,chosen_region_3,9001,1,1,146,0.6,CGG,ATTCGACGCAGAGACCGCAC,GCAGAGACCGCAC,1,479,"R44*, R45C"
4182,NC_003888,chosen_region_2,4001,1,-1,148,0.7,CGG,CATCCCAGGAAGGCAGCGCG,GGAAGGCAGCGCG,1,456,"W48*, D49N"


## Output

In [None]:
# Make oligos: one ssDNA bridging oligo per selected sgRNA, flanked by the
# upstream/downstream overlaps chosen in the INPUT cell.
list_of_ssDNAs = make_ssDNA_oligos(filtered_df,
                                   upstream_ovh=up_homology,
                                   downstream_ovh=dw_homology)

# Cut the plasmid with NcoI and use the longest fragment as the backbone.
from Bio.Restriction import NcoI
linearized_plasmid = max(clean_plasmid.cut(NcoI), key=len, default=None)
if linearized_plasmid is None:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise ValueError("NcoI digestion returned no fragment; the plasmid cannot be linearized.")

# Assemble one plasmid per oligo into the linearized backbone.
sgRNA_vectors = assemble_plasmids_by_ssDNA_bridging(list_of_ssDNAs, linearized_plasmid)


In [None]:
# Build a meaningful name for each assembled plasmid from the sgRNA it
# carries (locus tag + sgRNA position).
targeting_info = [
    f"pCRISPR-BEST_{row['locus_tag']}_p{row['sgrna_loc']}"
    for _, row in filtered_df.iterrows()
]

# Names are matched to plasmids by position, so a length mismatch would
# silently mislabel plasmids; stop early instead.
if len(targeting_info) != len(sgRNA_vectors):
    raise ValueError(
        f"Expected one plasmid per sgRNA, got {len(sgRNA_vectors)} plasmids "
        f"for {len(targeting_info)} sgRNAs."
    )

# The description is the same for every plasmid, so build it once.
plasmid_description = f'Assembled plasmid targeting {", ".join(genes_to_KO)} for base-editing, assembled using StreptoCAD.'

for number, (vector, info) in enumerate(zip(sgRNA_vectors, targeting_info), start=1):
    vector.name = f'{info}_#{number}'
    vector.id = vector.name  # Using the same value for ID as for name for simplicity
    vector.description = plasmid_description

# annotate plasmids
for plasmid in sgRNA_vectors:
    annotate_plasmid_with_sgrnas(plasmid, filtered_df)

# PRINT: set to True to write every plasmid to a GenBank file
print_plasmids = False
if print_plasmids:
    for vector in sgRNA_vectors:
        vector.write(f"../../data/plasmids/sgRNA_plasmids_pCRISPR_cBEST/{vector.id}.gb")

In [None]:

def _integration_label(row):
    """Return the integration label (locus tag and sgRNA position) for one sgRNA row."""
    return f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})"


# One label per selected sgRNA, in table order
integration_names = filtered_df.apply(_integration_label, axis=1).tolist()

# Summarise the assembled plasmids in a metadata table
plasmid_metadata_df = extract_metadata_to_dataframe(
    sgRNA_vectors,
    'CRISPR-cBEST',
    integration_names,
)

plasmid_metadata_df

Unnamed: 0,plasmid_name,date,original_plasmid,integration,size
0,pCRISPR-BEST_chosen_region_2_p35_#1,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_2(35),12263
1,pCRISPR-BEST_chosen_region_2_p34_#2,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_2(34),12263
2,pCRISPR-BEST_chosen_region_4_p54_#3,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_4(54),12263
3,pCRISPR-BEST_chosen_region_1_p80_#4,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_1(80),12263
4,pCRISPR-BEST_chosen_region_1_p81_#5,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_1(81),12263
5,pCRISPR-BEST_chosen_region_3_p118_#6,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_3(118),12263
6,pCRISPR-BEST_chosen_region_1_p130_#7,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_1(130),12263
7,pCRISPR-BEST_chosen_region_1_p131_#8,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_1(131),12263
8,pCRISPR-BEST_chosen_region_3_p146_#9,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_3(146),12263
9,pCRISPR-BEST_chosen_region_2_p148_#10,2024-07-19,CRISPR-cBEST,sgRNA_chosen_region_2(148),12263


# IDT primers

In [None]:
idt_df1 = primers_to_IDT(list_of_ssDNAs)
idt_df1

Unnamed: 0,Name,Sequence,Concentration,Purification
0,chosen_region_2_loc_35,CGGTTGGTAGGATCGACGGCCCCCAGACACCACGCACGCCGTTTTA...,25nm,STD
1,chosen_region_2_loc_34,CGGTTGGTAGGATCGACGGCCCCAGACACCACGCACGCCAGTTTTA...,25nm,STD
2,chosen_region_4_loc_54,CGGTTGGTAGGATCGACGGCGCTCAACCTCCAGGCCGGACGTTTTA...,25nm,STD
3,chosen_region_1_loc_80,CGGTTGGTAGGATCGACGGCGTTGTCCGTCAGCACTTCGAGTTTTA...,25nm,STD
4,chosen_region_1_loc_81,CGGTTGGTAGGATCGACGGCTTGTCCGTCAGCACTTCGAAGTTTTA...,25nm,STD
5,chosen_region_3_loc_118,CGGTTGGTAGGATCGACGGCTGCCCGATCTTCCAGGCCAGGTTTTA...,25nm,STD
6,chosen_region_1_loc_130,CGGTTGGTAGGATCGACGGCCGAAGGCCGAGCACACGGCGGTTTTA...,25nm,STD
7,chosen_region_1_loc_131,CGGTTGGTAGGATCGACGGCGAAGGCCGAGCACACGGCGCGTTTTA...,25nm,STD
8,chosen_region_3_loc_146,CGGTTGGTAGGATCGACGGCATTCGACGCAGAGACCGCACGTTTTA...,25nm,STD
9,chosen_region_2_loc_148,CGGTTGGTAGGATCGACGGCCATCCCAGGAAGGCAGCGCGGTTTTA...,25nm,STD


In [None]:

# Design checking primers for every target, using the primer settings
# chosen in the INPUT cell.
primer_settings = dict(
    flanking_region=flanking_region_number,
    target_tm=melting_temperature,
    primer_concentration=primer_concentration,
    polymerase=chosen_polymerase,
)
checking_primers_df = checking_primers(genome, genes_to_KO, **primer_settings)
checking_primers_df

Unnamed: 0,Locus Tag,f_primer_name,r_primer_name,f_primer_sequences(5-3),r_primer_sequences(5-3),f_tm,r_tm,ta
0,chosen_region_1,chosen_region_1_fwd_checking_primer,chosen_region_1_rev_checking_primer,CGGGGTGCCCGT,CGCAGTCCGATGTGACC,56,59,59
1,chosen_region_2,chosen_region_2_fwd_checking_primer,chosen_region_2_rev_checking_primer,GACCGCGCTGGGTG,TCCGGCCGCATCG,59,57,60
2,chosen_region_3,chosen_region_3_fwd_checking_primer,chosen_region_3_rev_checking_primer,ATCCCGCAGCTGCAGG,GAGGAGCGGCAATGGTCATC,61,63,65
3,chosen_region_4,chosen_region_4_fwd_checking_primer,chosen_region_4_rev_checking_primer,TTGCTGCAACCTGCCTTG,TAGGTGCCTCCGTCGA,61,58,62


In [None]:
idt_df2 = create_idt_order_dataframe(checking_primers_df)
idt_df2

Unnamed: 0,Name,Sequence,Concentration,Purification
0,chosen_region_1_fwd_checking_primer,CGGGGTGCCCGT,25nm,STD
1,chosen_region_2_fwd_checking_primer,GACCGCGCTGGGTG,25nm,STD
2,chosen_region_3_fwd_checking_primer,ATCCCGCAGCTGCAGG,25nm,STD
3,chosen_region_4_fwd_checking_primer,TTGCTGCAACCTGCCTTG,25nm,STD
4,chosen_region_1_rev_checking_primer,CGCAGTCCGATGTGACC,25nm,STD
5,chosen_region_2_rev_checking_primer,TCCGGCCGCATCG,25nm,STD
6,chosen_region_3_rev_checking_primer,GAGGAGCGGCAATGGTCATC,25nm,STD
7,chosen_region_4_rev_checking_primer,TAGGTGCCTCCGTCGA,25nm,STD


In [None]:
# Combine the sgRNA oligos and the checking primers into one IDT order sheet.
# ignore_index renumbers the rows; without it both tables keep their own
# 0..n labels and the combined sheet has duplicate index values.
full_idt = pd.concat([idt_df1, idt_df2], ignore_index=True)
full_idt

Unnamed: 0,Name,Sequence,Concentration,Purification
0,chosen_region_2_loc_35,CGGTTGGTAGGATCGACGGCCCCCAGACACCACGCACGCCGTTTTA...,25nm,STD
1,chosen_region_2_loc_34,CGGTTGGTAGGATCGACGGCCCCAGACACCACGCACGCCAGTTTTA...,25nm,STD
2,chosen_region_4_loc_54,CGGTTGGTAGGATCGACGGCGCTCAACCTCCAGGCCGGACGTTTTA...,25nm,STD
3,chosen_region_1_loc_80,CGGTTGGTAGGATCGACGGCGTTGTCCGTCAGCACTTCGAGTTTTA...,25nm,STD
4,chosen_region_1_loc_81,CGGTTGGTAGGATCGACGGCTTGTCCGTCAGCACTTCGAAGTTTTA...,25nm,STD
5,chosen_region_3_loc_118,CGGTTGGTAGGATCGACGGCTGCCCGATCTTCCAGGCCAGGTTTTA...,25nm,STD
6,chosen_region_1_loc_130,CGGTTGGTAGGATCGACGGCCGAAGGCCGAGCACACGGCGGTTTTA...,25nm,STD
7,chosen_region_1_loc_131,CGGTTGGTAGGATCGACGGCGAAGGCCGAGCACACGGCGCGTTTTA...,25nm,STD
8,chosen_region_3_loc_146,CGGTTGGTAGGATCGACGGCATTCGACGCAGAGACCGCACGTTTTA...,25nm,STD
9,chosen_region_2_loc_148,CGGTTGGTAGGATCGACGGCCATCCCAGGAAGGCAGCGCGGTTTTA...,25nm,STD


## Folder with all the generated I/O

In [None]:


from datetime import timezone

# Files that were used as input for this run
input_files = [
    {"name": "input_genome.gb", "content": genome},
    {"name": "input_plasmid.gb", "content": clean_plasmid}
]

# Everything that was generated by this run
output_files = [
    {"name": "cBEST_w_sgRNAs.gb", "content": sgRNA_vectors}, # LIST OF Dseqrecords
    {"name": "primer_df.csv", "content": checking_primers_df},
    {"name": "full_idt.csv", "content": full_idt},
    {"name": "mutated_sgrna_df.csv", "content": mutated_sgrna_df},
    {"name": "filtered_df.csv", "content": filtered_df},
    {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},

]

# Settings of this run, recorded so the results can be reproduced
input_values = {
    "genes_to_knockout": genes_to_KO,
    "polymerase_settings": {
        "chosen_polymerase": chosen_polymerase,
        "melting_temperature": melting_temperature,
        "primer_concentration": primer_concentration,
        "primer_number_increment": primer_number_increment,
        "flanking_region_number": flanking_region_number
    },
    "filtering_metrics": {
        "gc_upper": gc_upper,
        "gc_lower": gc_lower,
        "off_target_seed": off_target_seed,
        "off_target_upper": off_target_upper,
        "cas_type": cas_type,
        "number_of_sgRNAs_per_group": number_of_sgRNAs_per_group,
        # Recorded as well, since it is one of the sgRNA settings of this run
        "only_stop_codons": only_stop_codons
    },
    "overlapping_sequences": {
        "up_homology": str(up_homology),
        "dw_homology": str(dw_homology)
    }
}


# Paths to Markdown files
markdown_file_paths = [
    "../../protocols/conjugation_protcol.md",
    "../../protocols/single_target_crispr_plasmid_protcol.md"

]

# Date and time (UTC). datetime.utcnow() is deprecated; build an aware UTC
# time and drop the tzinfo so the ISO string keeps the same format as before.
# NOTE(review): ISO timestamps contain ':' characters - confirm that
# ProjectDirectory produces a valid name from this on all platforms.
timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

# Create project directory structure
project_directory = ProjectDirectory(
    project_name=f"CRISPR_cBEST_workflow_{timestamp}",
    input_files=input_files,
    output_files=output_files,
    input_values=input_values,
    markdown_file_paths=markdown_file_paths
)


# Do you want to save the folder?
save_zip_folder = False

if save_zip_folder:
    # Generate the project directory structure and get the zip content
    zip_content = project_directory.create_directory_structure(create_directories=False)

    # Save the zip file to disk (optional)
    with open("project_structure.zip", "wb") as f:
        f.write(zip_content)