# W6-CRISPR-Cas3-in-frame-deletion

In Workflow 6, CRISPR-Cas3 is paired with a PCR and Gibson protocol to achieve in-frame deletions22(Figure 7A). Just like in workflow 5, this workflow can be used to knock out specific genes in streptomycetes but with the crucial difference of using Cas3 which has been shown to have higher efficiencies for targeted gene deletions in for example BGCs and random-sized deletion in streptomyces22. Due to homologies, and high GC content, this protocol relies on two steps of Gibson cloning to incorporate the sgRNA and the upstream and downstream repair regions. To get started, users can download the pCRISPR-Cas3 file and the genome file (S. coelicolor A3). StreptoAIM will then generate the necessary primers, plasmid files, and checking primers to verify full gene deletions (S5). 


In [1]:
import sys
import os
from Bio.Restriction import StuI
import pandas as pd
from datetime import datetime

# Ensure the src directory is in the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)


from pydna.dseqrecord import Dseqrecord

from streptocad.sequence_loading.sequence_loading import (
    load_and_process_plasmid,
    load_and_process_genome_sequences
)

from streptocad.utils import polymerase_dict, create_primer_df_from_dict,ProjectDirectory,extract_metadata_to_dataframe
from streptocad.primers.primer_generation import create_idt_order_dataframe, make_primer_records, primers_to_IDT, checking_primers
from streptocad.crispr.guideRNAcas3_9_12 import extract_sgRNAs, SgRNAargs
from streptocad.cloning.cas3_plasmid_cloning import generate_cas3_primers, cas3_plasmid_pcrs, assemble_cas3_plasmids
from streptocad.cloning.gibson_cloning import (
    find_up_dw_repair_templates,
    assemble_multiple_plasmids_with_repair_templates_for_deletion,
    update_primer_names
)

## INPUT

In [2]:
# Inputs
# 1 Add genome of choice (genbank, fasta)
path_to_genome = '../../data/genomes/Streptomyces_coelicolor_A3_chromosome.gb'
genome = load_and_process_genome_sequences(path_to_genome)[0]

# 2 Add plasmid 
path_to_plasmid = '../../data/plasmids/pCRISPR_Cas3.gbk'
clean_plasmid = load_and_process_plasmid(path_to_plasmid)

# 3 Choose genes to knock out (list)
genes_to_KO = ['SCO5087', 'SCO5089', 'SCO5090']

#### Advanced settings ####
# 4 Filtering metrics for sgRNAs
gc_upper = 0.8
gc_lower = 0.2
off_target_seed = 8
off_target_upper = 10
cas_type='cas3'
number_of_sgRNAs_per_group = 2

# 5 Choose polymerase and target melting temperature
chosen_polymerase = polymerase_dict['Phusion High-Fidelity DNA Polymerase (GC Buffer)']
melting_temperature = 65
primer_concentration = 0.4 
primer_number_increment = 23
flanking_region = 500

# 6 Choose overlapping sequences for our plasmid we can use the following
#As per the article **"CRISPR–Cas9, CRISPRi and CRISPR-BEST-mediated genetic manipulation in streptomycetes"** we need the following oligoes: 
#CGGTTGGTAGGATCGACGGC **-N20-** GTTTTAGAGCTAGAAATAGC
up_homology = Dseqrecord('CGGTTGGTAGGATCGACGGC')
dw_homology = Dseqrecord('GTTTTAGAGCTAGAAATAGC')

# Computation

In [3]:

# Initialize SgRNAargs with desired parameters
args = SgRNAargs(path_to_genome, 
                genes_to_KO,
                step=['find', 'filter'],
                gc_upper = gc_upper,
                gc_lower = gc_lower,
                off_target_seed = off_target_seed,
                off_target_upper = off_target_upper,
                cas_type=cas_type
                )

sgrna_df = extract_sgRNAs(args)
sgrna_df

sgRNA generated were too smallSCO5090. To incorporate this extent borders. Skipping to next locus tag.


Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
28,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,131,0.558824,TTC,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGT,AATCCGAA,0
2,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0
6,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,306,0.735294,TTC,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTC,ACAGGTCG,0
0,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,94,0.705882,TTC,GAGCCTCCTTCGAGCCACGGGGCCGACGATGACG,GAGCCTCC,1
24,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,962,0.676471,TTC,AGGCCCGTCATGTGGTACGCGTTGCAGCGCGTCG,AGGCCCGT,1
14,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,850,0.705882,TTC,GACGGTACGCGGGACGGCTTCGTGCTGGCCGAAG,GACGGTAC,1
26,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,201,0.794118,TTC,CCAGAACTGCCGGGTGCCGTTCCCGCCGGGGGCG,CCAGAACT,1
1,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,105,0.735294,TTC,GAGCCACGGGGCCGACGATGACGACGACCACCGG,GAGCCACG,2
29,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,113,0.647059,TTC,GATGCGTTCGTCCGGTGGTCGTCGTCATCGTCGG,GATGCGTT,2
19,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,1390,0.735294,TTC,CAGAGCGCGATGGTGCTGCGCGACGCCGAGACCG,CAGAGCGC,3


In [4]:
# Filter the DataFrame to retain only up to 5 sgRNA sequences per locus_tag
filtered_df = sgrna_df.groupby('locus_tag').head(number_of_sgRNAs_per_group)
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
28,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,131,0.558824,TTC,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGT,AATCCGAA,0
2,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0
34,Streptomyces_coelicolor_A3_chromosome,SCO5090,5532706,1,-1,199,0.764706,TTC,CGGATCTGGGCGCGCGTGGGCGGCCGGGTGAAGA,CGGATCTG,3
38,Streptomyces_coelicolor_A3_chromosome,SCO5090,5532706,1,-1,505,0.735294,TTC,ACGTTCGAGGACACGCTCCGGGTGCCGTCCGGGG,ACGTTCGA,4


## Output

In [5]:
filtered_df_w_primers = generate_cas3_primers(filtered_df)
filtered_df_w_primers

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,Fwd Primer,Rev Primer
28,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,1,131,0.558824,TTC,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGT,AATCCGAA,0,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGTGTCGCcCggCaa...,ACCGGACGAACGCATCGAAGGAGCTGTTCGGATTGTTTCAATCCAC...
2,Streptomyces_coelicolor_A3_chromosome,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCcCggCaa...,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...
34,Streptomyces_coelicolor_A3_chromosome,SCO5090,5532706,1,-1,199,0.764706,TTC,CGGATCTGGGCGCGCGTGGGCGGCCGGGTGAAGA,CGGATCTG,3,CGGATCTGGGCGCGCGTGGGCGGCCGGGTGAAGAGTCGCcCggCaa...,TCTTCACCCGGCCGCCCACGCGCGCCCAGATCCGGTTTCAATCCAC...
38,Streptomyces_coelicolor_A3_chromosome,SCO5090,5532706,1,-1,505,0.735294,TTC,ACGTTCGAGGACACGCTCCGGGTGCCGTCCGGGG,ACGTTCGA,4,ACGTTCGAGGACACGCTCCGGGTGCCGTCCGGGGGTCGCcCggCaa...,CCCCGGACGGCACCCGGAGCGTGTCCTCGAACGTGTTTCAATCCAC...


In [6]:
amplicons = cas3_plasmid_pcrs(clean_plasmid, filtered_df)

amplicons

[[Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)]]

In [7]:
# cut plasmid
assembled_cas3_plasmids = assemble_cas3_plasmids(clean_plasmid, amplicons)
assembled_cas3_plasmids

[Contig(o12766), Contig(o12766), Contig(o12766), Contig(o12766)]

In [8]:

# Constructing a meaningful name, ID, and description for the assembled plasmid using user input
targeting_info = []
for index, row in filtered_df.iterrows():
    formatted_str = f"pCas3_{row['locus_tag']}({row['sgrna_loc']})"
    targeting_info.append(formatted_str)

for i in range(len(assembled_cas3_plasmids)):
    assembled_cas3_plasmids[i].name = f'{targeting_info[i]}'
    assembled_cas3_plasmids[i].id = assembled_cas3_plasmids[i].name  # Using the same value for ID as for name for simplicity
    assembled_cas3_plasmids[i].description = f'Assembled plasmid targeting {", ".join(genes_to_KO)} for single gene knockout, assembled using StreptoCAD.'


In [9]:
assembled_cas3_plasmids[0].name

'pCas3_SCO5087(131)'

In [10]:
assembled_cas3_plasmids[0].description

'Assembled plasmid targeting SCO5087, SCO5089, SCO5090 for single gene knockout, assembled using StreptoCAD.'

In [11]:
assembled_cas3_plasmids[0].id

'pCas3_SCO5087(131)'

In [12]:
print_plasmids = False

if print_plasmids: 
    for vector in assembled_cas3_plasmids: 
        vector.write(f"../../data/plasmids/sgRNA_plasmids_pCRISPR–Cas3/{vector.id}.gb")

### IDT primers

In [13]:
primer_records = make_primer_records(filtered_df_w_primers)
primer_records

[Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54)]

In [14]:
idt_primers_cas3=primers_to_IDT(primer_records)
idt_primers_cas3

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_loc_131_fwd,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGTGTCGCcCggCaa...,25nm,STD
1,SCO5087_loc_131_rev,ACCGGACGAACGCATCGAAGGAGCTGTTCGGATTGTTTCAATCCAC...,25nm,STD
2,SCO5087_loc_163_fwd,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCcCggCaa...,25nm,STD
3,SCO5087_loc_163_rev,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...,25nm,STD
4,SCO5090_loc_199_fwd,CGGATCTGGGCGCGCGTGGGCGGCCGGGTGAAGAGTCGCcCggCaa...,25nm,STD
5,SCO5090_loc_199_rev,TCTTCACCCGGCCGCCCACGCGCGCCCAGATCCGGTTTCAATCCAC...,25nm,STD
6,SCO5090_loc_505_fwd,ACGTTCGAGGACACGCTCCGGGTGCCGTCCGGGGGTCGCcCggCaa...,25nm,STD
7,SCO5090_loc_505_rev,CCCCGGACGGCACCCGGAGCGTGTCCTCGAACGTGTTTCAATCCAC...,25nm,STD


# In frame deletion

In [15]:
in_frame_deletion = True 

In [16]:
if in_frame_deletion:
    # Make repair templates
    print('repair_templates_data')
    repair_templates_data = find_up_dw_repair_templates(genome, 
                                                        genes_to_KO, 
                                                        target_tm=melting_temperature, 
                                                        primer_tm_kwargs={'conc':primer_concentration, 'prodcode':chosen_polymerase} )
    # Digest the plasmids
    print('Digest the plasmids')
    processed_records = [Dseqrecord(record, circular=True).cut(StuI)[0] for record in assembled_cas3_plasmids]

    # Rename them appropriately
    print('Rename them appropriately')

    for i in range(len(processed_records)):
        processed_records[i].name = assembled_cas3_plasmids[i].name

    # Assembly 
    print('assembly_data')

    assembly_data = assemble_multiple_plasmids_with_repair_templates_for_deletion(genes_to_KO, processed_records, 
                                                                                repair_templates_data, 
                                                                                overlap=40)
    # updating the primer names to something systematic.
    update_primer_names(assembly_data)

    # Parse through the primer df
    primer_df = create_primer_df_from_dict(assembly_data)

    #Find_unique primers
    unique_df = primer_df.drop_duplicates(keep='first')
    idt_df = create_idt_order_dataframe(unique_df, concentration="25nm", purification="STD")

    # Contigs
    assembled_contigs = []
    for data in assembly_data: 
        contig_record = data['contig']
        contig_record.id = f"{data['name']}_w_rep"
        contig_record.name = f"{data['name']}_w_rep"

        assembled_contigs.append(contig_record)

    print_plasmids = False

    if print_plasmids:
        # Write to genbank files
        output_directory = "../../data/plasmids/sgRNA_plasmids_pCRISPR–Cas3_with_repair_templates/"
        for contig in assembled_contigs: 
            contig.write(f"{output_directory}/{contig.id}.gb")

repair_templates_data
Digest the plasmids
Rename them appropriately
assembly_data


In [17]:
if in_frame_deletion:
    integration_names = filtered_df.apply(lambda row: f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})", axis=1).tolist()
    plasmid_metadata_df = extract_metadata_to_dataframe(assembled_contigs,
                                                        'CRISPR-Cas9',
                                                        integration_names)
else: 
    integration_names = filtered_df.apply(lambda row: f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})", axis=1).tolist()
    plasmid_metadata_df = extract_metadata_to_dataframe(assembled_cas3_plasmids,
                                                        'CRISPR-Cas9',
                                                        integration_names)

plasmid_metadata_df

Unnamed: 0,plasmid_name,date,original_plasmid,integration,size
0,pCas3_SCO5087(131)_w_rep,2024-07-18,CRISPR-Cas9,sgRNA_SCO5087(131),6425
1,pCas3_SCO5087(163)_w_rep,2024-07-18,CRISPR-Cas9,sgRNA_SCO5087(163),6425
2,pCas3_SCO5090(199)_w_rep,2024-07-18,CRISPR-Cas9,sgRNA_SCO5090(199),6425
3,pCas3_SCO5090(505)_w_rep,2024-07-18,CRISPR-Cas9,sgRNA_SCO5090(505),6425


In [18]:
# Getting checking primers
checking_primers_df = checking_primers(path_to_genome, genes_to_KO, 
                                       flanking_region=flanking_region,
                                       target_tm = melting_temperature, 
                                       primer_concentration = primer_concentration, 
                                       polymerase = chosen_polymerase)
checking_primers_df

Unnamed: 0,Locus Tag,f_primer_name,r_primer_name,f_primer_sequences(5-3),r_primer_sequences(5-3),f_tm,r_tm,ta
0,SCO5087,SCO5087_fwd_checking_primer,SCO5087_rev_checking_primer,GACGATTCGGCCCGTG,CAGGGCGTCCAGGC,59,58,61
1,SCO5089,SCO5089_fwd_checking_primer,SCO5089_rev_checking_primer,CGTACGGCGAACTCGC,GTCCCCGGACGGC,59,57,60
2,SCO5090,SCO5090_fwd_checking_primer,SCO5090_rev_checking_primer,GGCGGCGGCCC,GACGTCCCCCGCGAAC,57,62,61


In [19]:
# making the primers into dseqrecords
checking_primers_df_idt = create_idt_order_dataframe(checking_primers_df)
checking_primers_df_idt


Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_fwd_checking_primer,GACGATTCGGCCCGTG,25nm,STD
1,SCO5089_fwd_checking_primer,CGTACGGCGAACTCGC,25nm,STD
2,SCO5090_fwd_checking_primer,GGCGGCGGCCC,25nm,STD
3,SCO5087_rev_checking_primer,CAGGGCGTCCAGGC,25nm,STD
4,SCO5089_rev_checking_primer,GTCCCCGGACGGC,25nm,STD
5,SCO5090_rev_checking_primer,GACGTCCCCCGCGAAC,25nm,STD


In [22]:
if in_frame_deletion: 
    full_idt = pd.concat([idt_primers_cas3,idt_df, checking_primers_df_idt],ignore_index=True)
else: 
    full_idt = pd.concat([idt_primers_cas3,checking_primers_df_idt],ignore_index=True)

full_idt

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_loc_131_fwd,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGTGTCGCcCggCaa...,25nm,STD
1,SCO5087_loc_131_rev,ACCGGACGAACGCATCGAAGGAGCTGTTCGGATTGTTTCAATCCAC...,25nm,STD
2,SCO5087_loc_163_fwd,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCcCggCaa...,25nm,STD
3,SCO5087_loc_163_rev,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...,25nm,STD
4,SCO5090_loc_199_fwd,CGGATCTGGGCGCGCGTGGGCGGCCGGGTGAAGAGTCGCcCggCaa...,25nm,STD
5,SCO5090_loc_199_rev,TCTTCACCCGGCCGCCCACGCGCGCCCAGATCCGGTTTCAATCCAC...,25nm,STD
6,SCO5090_loc_505_fwd,ACGTTCGAGGACACGCTCCGGGTGCCGTCCGGGGGTCGCcCggCaa...,25nm,STD
7,SCO5090_loc_505_rev,CCCCGGACGGCACCCGGAGCGTGTCCTCGAACGTGTTTCAATCCAC...,25nm,STD
8,primer_1,ATGCAGGAGAAGTCGGTGCTGAACAACCAGCACAAGCAGGGACGCC...,25nm,STD
9,primer_3,GGCGGTCGAAGGGAGATGGGGCGTCCTCATCACCGG,25nm,STD


## Folder with all the generated I/O

In [21]:

input_files = [
    {"name": "input_genome.gb", "content": genome},
    {"name": "input_plasmid.gb", "content": clean_plasmid}
]


if in_frame_deletion: 
    output_files = [
        {"name": "BEST_w_sgRNAs.gb", "content": assembled_contigs}, # LIST OF Dseqrecords
        {"name": "primer_df.csv", "content": primer_df},
        {"name": "full_idt.csv", "content": full_idt},
        {"name": "sgrna_df.csv", "content": sgrna_df},
        {"name": "filtered_sgrna_df.csv", "content": filtered_df},
        {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},

    ]
else: 
    output_files = [
        {"name": "cBEST_w_sgRNAs.gb", "content": assembled_cas3_plasmids}, # LIST OF Dseqrecords
        {"name": "full_idt.csv", "content": full_idt},
        {"name": "sgrna_df.csv", "content": sgrna_df},
        {"name": "filtered_df.csv", "content": filtered_df},
        {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},
    ]
         
input_values = {
    "genes_to_knockout": genes_to_KO,

    "filtering_metrics": {
        "gc_upper": gc_upper,
        "gc_lower": gc_lower,
        "off_target_seed": off_target_seed,
        "off_target_upper": off_target_upper,
        "cas_type": cas_type,
        "number_of_sgRNAs_per_group": number_of_sgRNAs_per_group,

    },  
    "polymerase_settings": {
        "chosen_polymerase": chosen_polymerase,
        "melting_temperature": melting_temperature,
        "primer_concentration": primer_concentration,
        "primer_number_increment": primer_number_increment,
        "flanking_region": flanking_region
    },
    "overlapping_sequences": {
        "up_homology": str(up_homology),
        "dw_homology": str(dw_homology)
    }
}

# Paths to Markdown files
markdown_file_paths = [
    "../../protocols/conjugation_protcol.md",
    "../../protocols/single_target_crispr_plasmid_protcol.md"

]

timestamp = datetime.utcnow().isoformat()

# Create project directory structure
project_directory = ProjectDirectory(
    project_name=f"CRISPR_cas9_inframe_deletion_workflow_{timestamp}",
    input_files=input_files,
    output_files=output_files,
    input_values=input_values,
    markdown_file_paths=markdown_file_paths
)

# DO You want to save the folder? 
save_zip_folder = False 

if save_zip_folder: 
    # Generate the project directory structure and get the zip content
    zip_content = project_directory.create_directory_structure(create_directories=False)

    # Save the zip file to disk (optional)
    with open("project_structure.zip", "wb") as f:
        f.write(zip_content)