## W2-CRISPR-cBEST workflow


Base editing with CRISPR-BEST is a DSB-free method that uses a single sgRNA to target a specific genomic location with single-nucleotide resolution (Figure 3A). This enables in vivo protein engineering or, as it is often used, the introduction of early stop codons [10,12]. It can be used to introduce targeted point mutations in the genome of *Streptomyces*, facilitating studies on gene function and protein engineering. To get started quickly, users can download the pCRISPR-cBEST plasmid file and the genome file for *S. coelicolor* A3(2); StreptoCAD will then generate the plasmids based on the genes chosen as targets (Figure 3, S5).

In [1]:
import sys
import os

# Ensure the src directory is in the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import os
import pandas as pd
from pydna.dseqrecord import Dseqrecord
from datetime import datetime
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO




from streptocad.sequence_loading.sequence_loading import load_and_process_gene_sequences, load_and_process_plasmid, load_and_process_genome_sequences, annotate_dseqrecord, check_and_convert_input
from streptocad.utils import polymerase_dict,ProjectDirectory, extract_metadata_to_dataframe
from streptocad.crispr.guideRNAcas3_9_12 import extract_sgRNAs, SgRNAargs
from streptocad.cloning.ssDNA_bridging import assemble_plasmids_by_ssDNA_bridging, make_ssDNA_oligos
from streptocad.crispr.crispr_best import identify_base_editing_sites, filter_sgrnas_for_base_editing, process_base_editing
from streptocad.cloning.plasmid_processing import annotate_plasmid_with_sgrnas
from streptocad.primers.primer_generation import checking_primers, create_idt_order_dataframe, primers_to_IDT


## INPUT

In [2]:
# Inputs
# 1 Add genome of choice (genbank, fasta)
path_to_genome = '../../data/genomes/Streptomyces_coelicolor_A3_chromosome.gb'
# Only the first record returned by the loader is used as the genome.
genome = load_and_process_genome_sequences(path_to_genome)[0]

# 2 Add plasmid
# NOTE(review): the file name below starts with a space (' pCRISPR-cBEST ...') -- confirm it matches the file on disk.
path_to_plasmid = '../../data/plasmids/ pCRISPR-cBEST Sequences.gbk'
clean_plasmid = load_and_process_plasmid(path_to_plasmid)

# 3 Choose genes to knock out (list)
genes_to_KO = ['SCO5087','SCO5088', 'SCO5089']

#### Advanced settings ####
# 4 Choose polymerase and target melting temperature FOR CHECKING PRIMERS
chosen_polymerase = polymerase_dict['Phusion High-Fidelity DNA Polymerase (GC Buffer)']


# Passed to checking_primers as target_tm / primer_concentration / flanking_region.
melting_temperature = 65
primer_concentration = 0.4 
primer_number_increment = 1
flanking_region_number = 500

# 5 Filtering metrics for sgRNAs (forwarded to SgRNAargs)
gc_upper = 0.99
gc_lower = 0.01
off_target_seed = 13  # presumably the seed length used for off-target counting -- TODO confirm
off_target_upper = 0
cas_type='cas9'
# Maximum number of sgRNAs kept per locus_tag when the results are trimmed.
number_of_sgRNAs_per_group = 5
# Presumably restricts results to sgRNAs whose edits create a stop codon (see process_base_editing) -- confirm.
only_stop_codons = True

# 6 Choose overlapping sequences for our plasmid; we can use the following.
# As per the article **"CRISPR–Cas9, CRISPRi and CRISPR-BEST-mediated genetic manipulation in streptomycetes"** we need the following oligos:
# CGGTTGGTAGGATCGACGGC **-N20-** GTTTTAGAGCTAGAAATAGC
up_homology = Dseqrecord('CGGTTGGTAGGATCGACGGC')
dw_homology = Dseqrecord('GTTTTAGAGCTAGAAATAGC')

# Computation

In [3]:
# Display the loaded genome record.
genome

Dseqrecord(-8667507)

In [4]:
# Example usage of check_and_convert_input with coordinate ranges ('start-end' strings).
# It returns three values, unpacked here as the per-target dicts, the replacement
# target names and a flag saying whether the genome must be annotated.
genes_to_KO = ['1000-20000', '4000-10000']
target_dict, genes_to_KO, annotation_input  = check_and_convert_input(genes_to_KO)
target_dict

[{'target_region_1': [1000, 20000]}, {'target_region_2': [4000, 10000]}]

In [5]:
# Display the target names returned by check_and_convert_input.
genes_to_KO

['target_region_1', 'target_region_2']

In [6]:
# Display the annotation flag returned by check_and_convert_input.
annotation_input

True

In [23]:
import tempfile


def save_file(name, content):
    """Write ``content`` to the path ``name`` in GenBank format.

    ``name`` is opened in text write mode, so an existing file is overwritten.
    ``content`` is handed unchanged to ``SeqIO.write``.
    """
    with open(name, "w") as handle:
        SeqIO.write(content, handle, "genbank")

genes_to_KO = ['1000-20000', '4000-10000']
# Convert the raw targets with check_and_convert_input before entering the with block
target_dict, genes_to_KO, annotation_input = check_and_convert_input(genes_to_KO)

print(target_dict, genes_to_KO, annotation_input)

# The temporary directory has to stay alive until extract_sgRNAs has finished:
# the annotated genome is written into it and read back through its path, and
# the directory (with the file) is deleted as soon as the with block exits.
with tempfile.TemporaryDirectory() as tempdir:

    # Check if annotation is needed
    if annotation_input:
        genome_path = os.path.join(tempdir, str(genome.name))
        genome = annotate_dseqrecord(genome, target_dict)

        # Save the annotated genome to the temporary directory
        save_file(genome_path, genome)

        temp_genome_path = genome_path
    else:
        # Targets are used as-is, so the original genome file is searched directly
        temp_genome_path = path_to_genome

    # Initialize SgRNAargs with desired parameters
    args = SgRNAargs(temp_genome_path,
                     genes_to_KO,
                     step=['find', 'filter'],
                     gc_upper=gc_upper,
                     gc_lower=gc_lower,
                     off_target_seed=off_target_seed,
                     off_target_upper=off_target_upper,
                     cas_type=cas_type)

    # Call the function to extract sgRNAs while the genome file still exists
    sgrna_df = extract_sgRNAs(args)

print(sgrna_df)


[{'target_region_1': [1000, 20000]}, {'target_region_2': [4000, 10000]}] ['target_region_1', 'target_region_2'] True
sgRNA generated were outside the designated border in target_region_1. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in target_region_1. To incorporate this extent borders. Skipping to next locus tag.
Pam was found outside designated locus_tag: target_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in target_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in target_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in target_region_2. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in target_region_2. To incorpora

In [8]:
genes_to_KO = ['SCO5087', 'SCO5088', 'SCO5089']

# Gather the search/filter settings once, then hand them to SgRNAargs
sgrna_search_settings = {
    'step': ['find', 'filter'],
    'gc_upper': gc_upper,
    'gc_lower': gc_lower,
    'off_target_seed': off_target_seed,
    'off_target_upper': off_target_upper,
    'cas_type': cas_type,
}
args = SgRNAargs(path_to_genome, genes_to_KO, **sgrna_search_settings)

# Run the sgRNA search on the original genome file and show the result
sgrna_df = extract_sgRNAs(args)
sgrna_df

Pam was found outside designated locus_tag: SCO5087. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in SCO5087. To incorporate this extent borders. Skipping to next locus tag.
Pam was found outside designated locus_tag: SCO5087. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in SCO5087. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in SCO5087. To incorporate this extent borders. Skipping to next locus tag.
Pam was found outside designated locus_tag: SCO5088. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in SCO5088. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in SCO5088. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generate

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
0,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,26,0.80,CGG,TCCACCGGCGCCGCGTCCAG,GCGCCGCGTCCAG,0
400,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,879,0.70,CGG,GACGTCGACGTCCTCGGGAC,ACGTCCTCGGGAC,0
399,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,873,0.75,CGG,GACGTCCTCGGGACCGGTCC,TCGGGACCGGTCC,0
398,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,853,0.75,CGG,CGGCGTCGTTCAGCGCCAGC,GTTCAGCGCCAGC,0
397,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,838,0.75,AGG,CCAGCCGGATCGCGCGTTCC,GATCGCGCGTTCC,0
...,...,...,...,...,...,...,...,...,...,...,...
32,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,341,0.70,CGG,TCCATCCGGTCGAGCTCCCG,GGTCGAGCTCCCG,0
34,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,364,0.75,CGG,CCACGGCGAACTGCGAGGCC,GAACTGCGAGGCC,0
209,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,641,0.70,GGG,GTCTCCACCGGCTGCACCTC,CCGGCTGCACCTC,0
35,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,369,0.70,AGG,ACAGGCCACGGCGAACTGCG,ACGGCGAACTGCG,0


In [9]:
# Load all gene sequences and keep those belonging to the requested locus tags;
# tags that are missing from the loaded sequences are skipped.
gene_sequences = load_and_process_gene_sequences(path_to_genome)
genes_to_KO_dict = {}
for locus_tag in genes_to_KO:
    if locus_tag in gene_sequences:
        genes_to_KO_dict[locus_tag] = gene_sequences[locus_tag]

# Add the base-editing site information to the sgRNA table
sgrna_df_with_editing = identify_base_editing_sites(sgrna_df)

# Keep only the sgRNAs that result in base-editing, then display them
filtered_sgrna_df_for_base_editing = filter_sgrnas_for_base_editing(
    sgrna_df_with_editing
)
filtered_sgrna_df_for_base_editing

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines
0,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,26,0.80,CGG,TCCACCGGCGCCGCGTCCAG,GCGCCGCGTCCAG,0,3569
400,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,879,0.70,CGG,GACGTCGACGTCCTCGGGAC,ACGTCCTCGGGAC,0,369
399,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,873,0.75,CGG,GACGTCCTCGGGACCGGTCC,TCGGGACCGGTCC,0,3679
398,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,853,0.75,CGG,CGGCGTCGTTCAGCGCCAGC,GTTCAGCGCCAGC,0,47
397,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,838,0.75,AGG,CCAGCCGGATCGCGCGTTCC,GATCGCGCGTTCC,0,56
...,...,...,...,...,...,...,...,...,...,...,...,...
32,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,341,0.70,CGG,TCCATCCGGTCGAGCTCCCG,GGTCGAGCTCCCG,0,367
34,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,364,0.75,CGG,CCACGGCGAACTGCGAGGCC,GAACTGCGAGGCC,0,47
209,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,641,0.70,GGG,GTCTCCACCGGCTGCACCTC,CCGGCTGCACCTC,0,35689
35,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,369,0.70,AGG,ACAGGCCACGGCGAACTGCG,ACGGCGAACTGCG,0,679


In [10]:
# Process the DataFrame to apply C-to-T mutations.
# only_stop_codons is False here, so this is a preview of the mutations without
# the stop-codon restriction; the next cell assigns mutated_sgrna_df again.
mutated_sgrna_df = process_base_editing(filtered_sgrna_df_for_base_editing, 
                                        genes_to_KO_dict, 
                                        only_stop_codons = False)
mutated_sgrna_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
0,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,26,0.80,CGG,TCCACCGGCGCCGCGTCCAG,GCGCCGCGTCCAG,0,3569,V8I
400,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,879,0.70,CGG,GACGTCGACGTCCTCGGGAC,ACGTCCTCGGGAC,0,369,"V291I, D292N, V293I"
399,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,873,0.75,CGG,GACGTCCTCGGGACCGGTCC,TCGGGACCGGTCC,0,3679,"E289K, D290N, V291I"
398,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,853,0.75,CGG,CGGCGTCGTTCAGCGCCAGC,GTTCAGCGCCAGC,0,47,"D283N, A284T"
397,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,838,0.75,AGG,CCAGCCGGATCGCGCGTTCC,GATCGCGCGTTCC,0,56,R278Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,341,0.70,CGG,TCCATCCGGTCGAGCTCCCG,GGTCGAGCTCCCG,0,367,"R112Q, M113I"
34,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,364,0.75,CGG,CCACGGCGAACTGCGAGGCC,GAACTGCGAGGCC,0,47,"A120T, V121M"
209,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,641,0.70,GGG,GTCTCCACCGGCTGCACCTC,CCGGCTGCACCTC,0,35689,"S209F, T210I"
35,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,369,0.70,AGG,ACAGGCCACGGCGAACTGCG,ACGGCGAACTGCG,0,679,"V121I, A122T"


In [11]:
# Process the DataFrame to apply C-to-T mutations, this time honouring the
# `only_stop_codons` setting from the input cell instead of a hard-coded True,
# so that changing the setting actually changes the sgRNAs used downstream.
mutated_sgrna_df = process_base_editing(filtered_sgrna_df_for_base_editing,
                                        genes_to_KO_dict,
                                        only_stop_codons=only_stop_codons)
mutated_sgrna_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
294,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,67,0.75,AGG,ACCAGTACGGCGCCAGGCCG,CGGCGCCAGGCCG,0,38,W22*
624,Streptomyces_coelicolor_A3_chromosome,SCO5089,5532449,1,1,162,0.8,CGG,CCGCGGCGCGACTCGAGAGC,GCGACTCGAGAGC,0,479,"A49V, A50V, R51*"
623,Streptomyces_coelicolor_A3_chromosome,SCO5089,5532449,1,1,167,0.7,CGG,GCGCGACTCGAGAGCCGGTA,TCGAGAGCCGGTA,0,479,"R51*, L52F"
573,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,1,216,0.7,CGG,CGCAGACGGACCCCTCCACA,GGACCCCTCCACA,0,37,"Q67*, T68M"
17,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,210,0.6,GGG,GAGCAGTTCCCAGAACTGCC,TCCCAGAACTGCC,0,4910,"W67*, E68K"
322,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,252,0.8,CGG,CTGGAGGGCCCAGTCCGCGG,GCCCAGTCCGCGG,0,910,"W81*, A82T"
567,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,1,262,0.75,CGG,GGCCCTCCAGGACGCGAAGG,CAGGACGCGAAGG,0,34578,"A82V, L83F, Q84*"
258,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,283,0.75,AGG,CCGTTCACAGGTCGCGGCGG,CAGGTCGCGGCGG,0,68,"S90L, Q91*"
335,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,361,0.6,CGG,ACCACAGCTTGCGGAACTCC,CTTGCGGAACTCC,0,358,W120*
550,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,1,438,0.6,CGG,ACACCGGCCAGATCTCCATC,CCAGATCTCCATC,0,4589,"T141I, Q143*"


In [12]:
# Keep at most `number_of_sgRNAs_per_group` sgRNAs per locus_tag
# (groupby(...).head(n) takes the first n rows of each group in the current row order).
filtered_df = mutated_sgrna_df.groupby('locus_tag').head(number_of_sgRNAs_per_group)
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,editable_cytosines,mutations
294,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,67,0.75,AGG,ACCAGTACGGCGCCAGGCCG,CGGCGCCAGGCCG,0,38,W22*
624,Streptomyces_coelicolor_A3_chromosome,SCO5089,5532449,1,1,162,0.8,CGG,CCGCGGCGCGACTCGAGAGC,GCGACTCGAGAGC,0,479,"A49V, A50V, R51*"
623,Streptomyces_coelicolor_A3_chromosome,SCO5089,5532449,1,1,167,0.7,CGG,GCGCGACTCGAGAGCCGGTA,TCGAGAGCCGGTA,0,479,"R51*, L52F"
573,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,1,216,0.7,CGG,CGCAGACGGACCCCTCCACA,GGACCCCTCCACA,0,37,"Q67*, T68M"
17,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,210,0.6,GGG,GAGCAGTTCCCAGAACTGCC,TCCCAGAACTGCC,0,4910,"W67*, E68K"
322,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,252,0.8,CGG,CTGGAGGGCCCAGTCCGCGG,GCCCAGTCCGCGG,0,910,"W81*, A82T"
567,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,1,262,0.75,CGG,GGCCCTCCAGGACGCGAAGG,CAGGACGCGAAGG,0,34578,"A82V, L83F, Q84*"
258,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,283,0.75,AGG,CCGTTCACAGGTCGCGGCGG,CAGGTCGCGGCGG,0,68,"S90L, Q91*"
335,Streptomyces_coelicolor_A3_chromosome,SCO5088,5531201,1,-1,361,0.6,CGG,ACCACAGCTTGCGGAACTCC,CTTGCGGAACTCC,0,358,W120*
54,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,522,0.7,CGG,GTCGACCTCCCAGTCACGGC,TCCCAGTCACGGC,0,367910,"W171*, E172K, V173I, D174N"


## Output

In [13]:
# Make oligos: one ssDNA per selected sgRNA, flanked by the two homology arms
list_of_ssDNAs = make_ssDNA_oligos(filtered_df,
                                   upstream_ovh=up_homology,
                                   downstream_ovh=dw_homology)

# Cut plasmid with NcoI and keep the largest fragment as the backbone
from Bio.Restriction import NcoI
plasmid_fragments = list(clean_plasmid.cut(NcoI))
if not plasmid_fragments:
    # Fail with a clear message instead of an IndexError on an empty result
    raise ValueError("Digesting the plasmid with NcoI produced no fragments; "
                     "check that the plasmid contains an NcoI site.")
# max() returns the first longest fragment, the same one a stable descending sort puts first
linearized_plasmid = max(plasmid_fragments, key=len)

# Assemble plasmid: one vector per ssDNA oligo
sgRNA_vectors = assemble_plasmids_by_ssDNA_bridging(list_of_ssDNAs, linearized_plasmid)


In [14]:
# Build one name stem per selected sgRNA (locus tag + sgRNA position),
# in the same row order as filtered_df
targeting_info = [
    f"pCRISPR-BEST_{row['locus_tag']}_p{row['sgrna_loc']}"
    for _, row in filtered_df.iterrows()
]

# Give every assembled plasmid a name, ID and description
for position, vector in enumerate(sgRNA_vectors):
    vector.name = f'{targeting_info[position]}_#{position+1}'
    vector.id = vector.name  # ID mirrors the name for simplicity
    vector.description = f'Assembled plasmid targeting {", ".join(genes_to_KO)} for base-editing, assembled using StreptoCAD.'

# Annotate plasmids with the sgRNAs from filtered_df
for plasmid in sgRNA_vectors:
    annotate_plasmid_with_sgrnas(plasmid, filtered_df)

# Optionally write each plasmid to disk (disabled by default)
print_plasmids = False
if print_plasmids:
    for vector in sgRNA_vectors:
        vector.write(f"../../data/plasmids/sgRNA_plasmids_pCRISPR_cBEST/{vector.id}.gb")

In [15]:

def _integration_label(row):
    """Return the integration label for one row of the sgRNA table."""
    return f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})"


# One integration label per selected sgRNA, in table order
integration_names = filtered_df.apply(_integration_label, axis=1).tolist()

# Summarise the assembled plasmids in a metadata table and display it
plasmid_metadata_df = extract_metadata_to_dataframe(
    sgRNA_vectors,
    'CRISPR-cBEST',
    integration_names,
)

plasmid_metadata_df

Unnamed: 0,plasmid_name,date,original_plasmid,integration,size
0,pCRISPR-BEST_SCO5088_p67_#1,2024-07-19,CRISPR-cBEST,sgRNA_SCO5088(67),12263
1,pCRISPR-BEST_SCO5089_p162_#2,2024-07-19,CRISPR-cBEST,sgRNA_SCO5089(162),12263
2,pCRISPR-BEST_SCO5089_p167_#3,2024-07-19,CRISPR-cBEST,sgRNA_SCO5089(167),12263
3,pCRISPR-BEST_SCO5088_p216_#4,2024-07-19,CRISPR-cBEST,sgRNA_SCO5088(216),12263
4,pCRISPR-BEST_SCO5087_p210_#5,2024-07-19,CRISPR-cBEST,sgRNA_SCO5087(210),12263
5,pCRISPR-BEST_SCO5088_p252_#6,2024-07-19,CRISPR-cBEST,sgRNA_SCO5088(252),12263
6,pCRISPR-BEST_SCO5088_p262_#7,2024-07-19,CRISPR-cBEST,sgRNA_SCO5088(262),12263
7,pCRISPR-BEST_SCO5087_p283_#8,2024-07-19,CRISPR-cBEST,sgRNA_SCO5087(283),12263
8,pCRISPR-BEST_SCO5088_p361_#9,2024-07-19,CRISPR-cBEST,sgRNA_SCO5088(361),12263
9,pCRISPR-BEST_SCO5087_p522_#10,2024-07-19,CRISPR-cBEST,sgRNA_SCO5087(522),12263


# IDT primers

In [16]:
# Convert the ssDNA oligos into an IDT order table and display it.
idt_df1 = primers_to_IDT(list_of_ssDNAs)
idt_df1

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5088_loc_67,CGGTTGGTAGGATCGACGGCACCAGTACGGCGCCAGGCCGGTTTTA...,25nm,STD
1,SCO5089_loc_162,CGGTTGGTAGGATCGACGGCCCGCGGCGCGACTCGAGAGCGTTTTA...,25nm,STD
2,SCO5089_loc_167,CGGTTGGTAGGATCGACGGCGCGCGACTCGAGAGCCGGTAGTTTTA...,25nm,STD
3,SCO5088_loc_216,CGGTTGGTAGGATCGACGGCCGCAGACGGACCCCTCCACAGTTTTA...,25nm,STD
4,SCO5087_loc_210,CGGTTGGTAGGATCGACGGCGAGCAGTTCCCAGAACTGCCGTTTTA...,25nm,STD
5,SCO5088_loc_252,CGGTTGGTAGGATCGACGGCCTGGAGGGCCCAGTCCGCGGGTTTTA...,25nm,STD
6,SCO5088_loc_262,CGGTTGGTAGGATCGACGGCGGCCCTCCAGGACGCGAAGGGTTTTA...,25nm,STD
7,SCO5087_loc_283,CGGTTGGTAGGATCGACGGCCCGTTCACAGGTCGCGGCGGGTTTTA...,25nm,STD
8,SCO5088_loc_361,CGGTTGGTAGGATCGACGGCACCACAGCTTGCGGAACTCCGTTTTA...,25nm,STD
9,SCO5087_loc_522,CGGTTGGTAGGATCGACGGCGTCGACCTCCCAGTCACGGCGTTTTA...,25nm,STD


In [17]:

# Generate checking primers for each target gene, using the configured flanking
# region, target melting temperature, primer concentration and polymerase.
checking_primers_df = checking_primers(path_to_genome, genes_to_KO, 
                                       flanking_region=flanking_region_number,
                                       target_tm = melting_temperature, 
                                        primer_concentration = primer_concentration, 
                                        polymerase = chosen_polymerase)
checking_primers_df

Unnamed: 0,Locus Tag,f_primer_name,r_primer_name,f_primer_sequences(5-3),r_primer_sequences(5-3),f_tm,r_tm,ta
0,SCO5087,SCO5087_fwd_checking_primer,SCO5087_rev_checking_primer,GACGATTCGGCCCGTG,CAGGGCGTCCAGGC,59,58,61
1,SCO5088,SCO5088_fwd_checking_primer,SCO5088_rev_checking_primer,CGCATCCACGCCGAGA,CAGCGTCCGGCGG,61,58,62
2,SCO5089,SCO5089_fwd_checking_primer,SCO5089_rev_checking_primer,CGTACGGCGAACTCGC,GTCCCCGGACGGC,59,57,60


In [18]:
# Build the IDT order table for the checking primers and display it.
idt_df2 = create_idt_order_dataframe(checking_primers_df)
idt_df2

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_fwd_checking_primer,GACGATTCGGCCCGTG,25nm,STD
1,SCO5088_fwd_checking_primer,CGCATCCACGCCGAGA,25nm,STD
2,SCO5089_fwd_checking_primer,CGTACGGCGAACTCGC,25nm,STD
3,SCO5087_rev_checking_primer,CAGGGCGTCCAGGC,25nm,STD
4,SCO5088_rev_checking_primer,CAGCGTCCGGCGG,25nm,STD
5,SCO5089_rev_checking_primer,GTCCCCGGACGGC,25nm,STD


In [19]:
# Combine the sgRNA oligos and the checking primers into one IDT order sheet.
# ignore_index=True renumbers the rows 0..n-1; without it both input tables keep
# their own 0-based index and the combined sheet has duplicate row labels.
full_idt = pd.concat([idt_df1, idt_df2], ignore_index=True)
full_idt

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5088_loc_67,CGGTTGGTAGGATCGACGGCACCAGTACGGCGCCAGGCCGGTTTTA...,25nm,STD
1,SCO5089_loc_162,CGGTTGGTAGGATCGACGGCCCGCGGCGCGACTCGAGAGCGTTTTA...,25nm,STD
2,SCO5089_loc_167,CGGTTGGTAGGATCGACGGCGCGCGACTCGAGAGCCGGTAGTTTTA...,25nm,STD
3,SCO5088_loc_216,CGGTTGGTAGGATCGACGGCCGCAGACGGACCCCTCCACAGTTTTA...,25nm,STD
4,SCO5087_loc_210,CGGTTGGTAGGATCGACGGCGAGCAGTTCCCAGAACTGCCGTTTTA...,25nm,STD
5,SCO5088_loc_252,CGGTTGGTAGGATCGACGGCCTGGAGGGCCCAGTCCGCGGGTTTTA...,25nm,STD
6,SCO5088_loc_262,CGGTTGGTAGGATCGACGGCGGCCCTCCAGGACGCGAAGGGTTTTA...,25nm,STD
7,SCO5087_loc_283,CGGTTGGTAGGATCGACGGCCCGTTCACAGGTCGCGGCGGGTTTTA...,25nm,STD
8,SCO5088_loc_361,CGGTTGGTAGGATCGACGGCACCACAGCTTGCGGAACTCCGTTTTA...,25nm,STD
9,SCO5087_loc_522,CGGTTGGTAGGATCGACGGCGTCGACCTCCCAGTCACGGCGTTTTA...,25nm,STD


## Folder with all the generated I/O

In [20]:


# Files that were used as input for this workflow run
input_files = [
    {"name": "input_genome.gb", "content": genome},
    {"name": "input_plasmid.gb", "content": clean_plasmid}
]

# Files generated by this workflow run
output_files = [
    {"name": "cBEST_w_sgRNAs.gb", "content": sgRNA_vectors}, # LIST OF Dseqrecords
    {"name": "primer_df.csv", "content": checking_primers_df},
    {"name": "full_idt.csv", "content": full_idt},
    {"name": "mutated_sgrna_df.csv", "content": mutated_sgrna_df},
    {"name": "filtered_df.csv", "content": filtered_df},
    {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},

]

# Settings used for this run, stored alongside the files
input_values = {
    "genes_to_knockout": genes_to_KO,
    "polymerase_settings": {
        "chosen_polymerase": chosen_polymerase,
        "melting_temperature": melting_temperature,
        "primer_concentration": primer_concentration,
        "primer_number_increment": primer_number_increment,
        "flanking_region_number": flanking_region_number
    },
    "filtering_metrics": {
        "gc_upper": gc_upper,
        "gc_lower": gc_lower,
        "off_target_seed": off_target_seed,
        "off_target_upper": off_target_upper,
        "cas_type": cas_type,
        "number_of_sgRNAs_per_group": number_of_sgRNAs_per_group
    },
    "overlapping_sequences": {
        "up_homology": str(up_homology),
        "dw_homology": str(dw_homology)
    }
}


# Paths to Markdown files
markdown_file_paths = [
    "../../protocols/conjugation_protcol.md",
    "../../protocols/single_target_crispr_plasmid_protcol.md"

]

# Date and time (UTC).
# datetime.utcnow() is deprecated since Python 3.12; taking an aware UTC time and
# dropping the tzinfo gives the same naive value, so the ISO string keeps its format.
from datetime import timezone
timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

# Create project directory structure
project_directory = ProjectDirectory(
    project_name=f"CRISPR_cBEST_workflow_{timestamp}",
    input_files=input_files,
    output_files=output_files,
    input_values=input_values,
    markdown_file_paths=markdown_file_paths
)


# Do you want to save the folder?
save_zip_folder = False 

if save_zip_folder: 
    # Generate the project directory structure and get the zip content
    zip_content = project_directory.create_directory_structure(create_directories=False)

    # Save the zip file to disk (optional)
    with open("project_structure.zip", "wb") as f:
        f.write(zip_content)