# W6-CRISPR-Cas3-in-frame-deletion

In Workflow 6, CRISPR-Cas3 is paired with a PCR and Gibson protocol to achieve in-frame deletions22(Figure 7A). Just like in workflow 5, this workflow can be used to knock out specific genes in streptomycetes but with the crucial difference of using Cas3 which has been shown to have higher efficiencies for targeted gene deletions in for example BGCs and random-sized deletion in streptomyces22. Due to homologies, and high GC content, this protocol relies on two steps of Gibson cloning to incorporate the sgRNA and the upstream and downstream repair regions. To get started, users can download the pCRISPR-Cas3 file and the genome file (S. coelicolor A3). StreptoAIM will then generate the necessary primers, plasmid files, and checking primers to verify full gene deletions (S5). 


In [1]:
import sys
import os
from Bio.Restriction import * 
import pandas as pd
from datetime import datetime

# Ensure the src directory is in the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)


from pydna.dseqrecord import Dseqrecord

from streptocad.sequence_loading.sequence_loading import (
    load_and_process_plasmid,
    load_and_process_genome_sequences, 
    check_and_convert_input, 
    annotate_dseqrecord)

from streptocad.utils import polymerase_dict, create_primer_df_from_dict,ProjectDirectory,extract_metadata_to_dataframe
from streptocad.primers.primer_generation import create_idt_order_dataframe, make_primer_records, primers_to_IDT, find_best_check_primers_from_genome
from streptocad.crispr.guideRNAcas3_9_12 import extract_sgRNAs, SgRNAargs
from streptocad.cloning.cas3_plasmid_cloning import generate_cas3_protospacer_primers, cas3_plasmid_pcrs, assemble_cas3_plasmids
from streptocad.cloning.gibson_cloning import (
    find_up_dw_repair_templates,
    assemble_multiple_plasmids_with_repair_templates_for_deletion,
    update_primer_names
)
from streptocad.cloning.plasmid_processing import determine_workflow_order_for_plasmids

## INPUT

In [2]:
# Inputs
# 1 Add genome of choice (genbank, fasta)
path_to_genome = '../../data/genomes/Streptomyces_coelicolor_A3_chromosome.gb'
genome = load_and_process_genome_sequences(path_to_genome)[0]

# 2 Add plasmid 
path_to_plasmid = '../../data/plasmids/pCRISPR_Cas3.gbk'
clean_plasmid = load_and_process_plasmid(path_to_plasmid)
print(clean_plasmid.name)

# 3 Choose genes to knock out (list)
genes_to_KO = ['SCO5087', 'SCO5088',]
#genes_to_KO = ['80000-100000', '4000-7000', '9000-14000','15000-20000']

#### Advanced settings ####
# 4 Filtering metrics for sgRNAs
gc_upper = 0.8
gc_lower = 0.2
off_target_seed = 8
off_target_upper = 10
cas_type='cas3'
number_of_sgRNAs_per_group = 2

# 5 Choose polymerase and target melting temperature
chosen_polymerase = polymerase_dict['Phusion High-Fidelity DNA Polymerase (GC Buffer)']
melting_temperature = 65
primer_concentration = 0.4 
primer_number_increment = 23
flanking_region = 500

# 6 Choose overlapping sequences for our plasmid we can use the following
# the part of the primers that anneal to the generate the crRNA and protospacer
forward_protospacer_overhang = 'GTCGCCCGGCAAAACCGG'
reverse_protospacer_overhang = 'GTTTCAATCCACGCGCCCGT'

# The universal part of the 
forward_backbone_overhang = 'GAGCTCATAAGTTCCTATTCCGAAG'
reverse_backbone_overhang = 'AAGAAGTGGGTGTCGGACGC'

Enzyme_for_repair_template_integration = EcoRI
repair_length = 1000
overlap_length = 40 


pCRISPR-Cas3


# Computation

In [3]:
target_dict, genes_to_KO, annotation_input = check_and_convert_input(genes_to_KO)

print(annotation_input)
if annotation_input == True:
    genome = annotate_dseqrecord(genome, target_dict)


len(genome.features)

False


25824

In [4]:
# Initialize SgRNAargs with desired parameters
args = SgRNAargs(genome, 
                genes_to_KO,
                step=['find', 'filter'],
                gc_upper = gc_upper,
                gc_lower = gc_lower,
                off_target_seed = off_target_seed,
                off_target_upper = off_target_upper,
                cas_type=cas_type
                )

sgrna_df = extract_sgRNAs(args)
sgrna_df

sgRNA generated were too smallSCO5088. To incorporate this extent borders. Skipping to next locus tag.


Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
2,NC_003888.3,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0
6,NC_003888.3,SCO5087,5529801,1,-1,306,0.735294,TTC,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTC,ACAGGTCG,0
28,NC_003888.3,SCO5087,5529801,1,1,131,0.558824,TTC,AATCCGAACAGCTCCTTCGATGCGTTCGTCCGGT,AATCCGAA,0
0,NC_003888.3,SCO5087,5529801,1,-1,94,0.705882,TTC,GAGCCTCCTTCGAGCCACGGGGCCGACGATGACG,GAGCCTCC,1
14,NC_003888.3,SCO5087,5529801,1,-1,850,0.705882,TTC,GACGGTACGCGGGACGGCTTCGTGCTGGCCGAAG,GACGGTAC,1
24,NC_003888.3,SCO5087,5529801,1,1,962,0.676471,TTC,AGGCCCGTCATGTGGTACGCGTTGCAGCGCGTCG,AGGCCCGT,1
26,NC_003888.3,SCO5087,5529801,1,1,201,0.794118,TTC,CCAGAACTGCCGGGTGCCGTTCCCGCCGGGGGCG,CCAGAACT,1
1,NC_003888.3,SCO5087,5529801,1,-1,105,0.735294,TTC,GAGCCACGGGGCCGACGATGACGACGACCACCGG,GAGCCACG,2
29,NC_003888.3,SCO5087,5529801,1,1,113,0.647059,TTC,GATGCGTTCGTCCGGTGGTCGTCGTCATCGTCGG,GATGCGTT,2
19,NC_003888.3,SCO5087,5529801,1,-1,1390,0.735294,TTC,CAGAGCGCGATGGTGCTGCGCGACGCCGAGACCG,CAGAGCGC,3


In [5]:
# Filter the DataFrame to retain only up to 5 sgRNA sequences per locus_tag
filtered_df = sgrna_df.groupby('locus_tag').head(number_of_sgRNAs_per_group)
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count
2,NC_003888.3,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0
6,NC_003888.3,SCO5087,5529801,1,-1,306,0.735294,TTC,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTC,ACAGGTCG,0
43,NC_003888.3,SCO5088,5531201,1,-1,1219,0.705882,TTC,AACTCGGCGGCGGTCCTGCGCCGTTTCGCACCGA,AACTCGGC,3
34,NC_003888.3,SCO5088,5531201,1,-1,370,0.676471,TTC,ACCCACCGGGAGTTCCGCAAGCTGTGGTCCGAGG,ACCCACCG,4


## Output

In [6]:
filtered_df_w_primers = generate_cas3_protospacer_primers(filtered_df,
                                                          fwd_overhang= forward_protospacer_overhang, 
                                                          rev_overhang=reverse_protospacer_overhang)
filtered_df_w_primers

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,Fwd Primer,Rev Primer
2,NC_003888.3,SCO5087,5529801,1,-1,163,0.617647,TTC,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCG,GGATTGAA,0,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCCCGGCAA...,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...
6,NC_003888.3,SCO5087,5529801,1,-1,306,0.735294,TTC,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTC,ACAGGTCG,0,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTCGTCGCCCGGCAA...,GACCGGGTCGAAGTCGGCCTCCGCCGCGACCTGTGTTTCAATCCAC...
43,NC_003888.3,SCO5088,5531201,1,-1,1219,0.705882,TTC,AACTCGGCGGCGGTCCTGCGCCGTTTCGCACCGA,AACTCGGC,3,AACTCGGCGGCGGTCCTGCGCCGTTTCGCACCGAGTCGCCCGGCAA...,TCGGTGCGAAACGGCGCAGGACCGCCGCCGAGTTGTTTCAATCCAC...
34,NC_003888.3,SCO5088,5531201,1,-1,370,0.676471,TTC,ACCCACCGGGAGTTCCGCAAGCTGTGGTCCGAGG,ACCCACCG,4,ACCCACCGGGAGTTCCGCAAGCTGTGGTCCGAGGGTCGCCCGGCAA...,CCTCGGACCACAGCTTGCGGAACTCCCGGTGGGTGTTTCAATCCAC...


In [7]:
amplicons = cas3_plasmid_pcrs(clean_plasmid, 
                              filtered_df, 
                              universal_fwd_seq=forward_backbone_overhang, 
                              universal_rev_seq=reverse_backbone_overhang
                              )

amplicons

[[Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)]]

In [8]:
amplicons = cas3_plasmid_pcrs(clean_plasmid, filtered_df)

amplicons

[[Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)],
 [Amplicon(1489), Amplicon(460)]]

In [9]:
# cut plasmid and incorporate amplicons
assembled_cas3_plasmids = assemble_cas3_plasmids(clean_plasmid, amplicons)
assembled_cas3_plasmids

[Contig(o12766), Contig(o12766), Contig(o12766), Contig(o12766)]

In [10]:
# Constructing a meaningful name, ID, and description for the assembled plasmid using user input
targeting_info = []
for index, row in filtered_df.iterrows():
    formatted_str = f"pCas3_{row['locus_tag']}({row['sgrna_loc']})"
    targeting_info.append(formatted_str)

for i in range(len(assembled_cas3_plasmids)):
    assembled_cas3_plasmids[i].name = f'{targeting_info[i]}'
    assembled_cas3_plasmids[i].id = assembled_cas3_plasmids[i].name  
    assembled_cas3_plasmids[i].description = f'CRISPR-Cas3 targeting {", ".join(genes_to_KO)} for single gene knockout, assembled using StreptoCAD.'


In [11]:
assembled_cas3_plasmids[0].name

'pCas3_SCO5087(163)'

In [12]:
assembled_cas3_plasmids[0].description

'CRISPR-Cas3 targeting SCO5087, SCO5088 for single gene knockout, assembled using StreptoCAD.'

In [13]:
assembled_cas3_plasmids[0].id

'pCas3_SCO5087(163)'

In [14]:
print_plasmids = False

if print_plasmids: 
    for vector in assembled_cas3_plasmids: 
        vector.write(f"../../data/plasmids/sgRNA_plasmids_pCRISPR–Cas3/{vector.id}.gb")

### IDT primers

In [15]:
primer_records = make_primer_records(filtered_df_w_primers)
primer_records

[Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54),
 Dseqrecord(-52),
 Dseqrecord(-54)]

In [16]:
idt_primers_cas3=primers_to_IDT(primer_records)
idt_primers_cas3

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_loc_163_fwd,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCCCGGCAA...,25nm,STD
1,SCO5087_loc_163_rev,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...,25nm,STD
2,SCO5087_loc_306_fwd,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTCGTCGCCCGGCAA...,25nm,STD
3,SCO5087_loc_306_rev,GACCGGGTCGAAGTCGGCCTCCGCCGCGACCTGTGTTTCAATCCAC...,25nm,STD
4,SCO5088_loc_1219_fwd,AACTCGGCGGCGGTCCTGCGCCGTTTCGCACCGAGTCGCCCGGCAA...,25nm,STD
5,SCO5088_loc_1219_rev,TCGGTGCGAAACGGCGCAGGACCGCCGCCGAGTTGTTTCAATCCAC...,25nm,STD
6,SCO5088_loc_370_fwd,ACCCACCGGGAGTTCCGCAAGCTGTGGTCCGAGGGTCGCCCGGCAA...,25nm,STD
7,SCO5088_loc_370_rev,CCTCGGACCACAGCTTGCGGAACTCCCGGTGGGTGTTTCAATCCAC...,25nm,STD


# In frame deletion

In [17]:
in_frame_deletion = True 


In [18]:
print(assembled_cas3_plasmids)

[Contig(o12766), Contig(o12766), Contig(o12766), Contig(o12766)]


In [19]:
Dseqrecord(assembled_cas3_plasmids[3], circular=True).cut(EcoRI)

(Dseqrecord(-12770),)

#### Getting repair templates

In [20]:
if in_frame_deletion:
    

    # Make repair templates
    repair_templates_data = find_up_dw_repair_templates(genome, 
                                                        genes_to_KO, 
                                                        target_tm=melting_temperature, 
                                                        primer_tm_kwargs={'conc':primer_concentration, 'prodcode':chosen_polymerase} , 
                                                        repair_length=repair_length)

    # Digest the plasmids
    processed_records = [sorted(Dseqrecord(record, circular=True).cut(Enzyme_for_repair_template_integration), key=lambda x: len(x), reverse=True)[0] for record in assembled_cas3_plasmids]

    # Rename them appropriately
    for i in range(len(processed_records)):
        processed_records[i].name = assembled_cas3_plasmids[i].name
        print(processed_records[i].name)

    # Assembly 
    assembly_data = assemble_multiple_plasmids_with_repair_templates_for_deletion(genes_to_KO, 
                                                                                processed_records, 
                                                                                repair_templates_data, 
                                                                                overlap=overlap_length)
    # updating the primer names to something systematic.
    update_primer_names(assembly_data)

    # Parse through the primer df
    primer_df = create_primer_df_from_dict(assembly_data)

    #Find_unique primers
    unique_df = primer_df.drop_duplicates(keep='first')
    idt_df = create_idt_order_dataframe(unique_df, concentration="25nm", purification="STD")

    # Contigs
    assembled_contigs = []
    for data in assembly_data: 
        contig_record = data['contig']
        contig_record.id = f"{data['name']}_w_rep"
        contig_record.name = f"{data['name']}_w_rep"
        assembled_contigs.append(contig_record)


# print our df that contains primers for the adding repair templates to the vecors.         
unique_df

pCas3_SCO5087(163)
pCas3_SCO5087(306)
pCas3_SCO5088(1219)
pCas3_SCO5088(370)

Processing gene: SCO5087
Checking plasmid: pCas3_SCO5087(163)
Match found in plasmid: pCas3_SCO5087(163)
Repair template match found for gene: SCO5087
Record added for gene: SCO5087
Checking plasmid: pCas3_SCO5087(306)
Match found in plasmid: pCas3_SCO5087(306)
Repair template match found for gene: SCO5087
Record added for gene: SCO5087
Checking plasmid: pCas3_SCO5088(1219)
Checking plasmid: pCas3_SCO5088(370)

Processing gene: SCO5088
Checking plasmid: pCas3_SCO5087(163)
Checking plasmid: pCas3_SCO5087(306)
Checking plasmid: pCas3_SCO5088(1219)
Match found in plasmid: pCas3_SCO5088(1219)
Repair template match found for gene: SCO5088
Record added for gene: SCO5088
Checking plasmid: pCas3_SCO5088(370)
Match found in plasmid: pCas3_SCO5088(370)
Repair template match found for gene: SCO5088
Record added for gene: SCO5088


Unnamed: 0,template,direction,f_primer_anneal(5-3),r_primer_anneal(5-3),f_tm,r_tm,ta,f_primer_sequences(5-3),r_primer_sequences(5-3),f_primer_name,r_primer_name
0,SCO5087,upstream,GACGCCGGAAATCCAG,CCCATCTCCCTTCGAC,56,54,58,AGAAGCTTTCTAGAGGATCCCCGGGTACCGAGCTCGAATTGACGCC...,GACTCCGGTGATGAGGACGCCCCATCTCCCTTCGAC,SCO5087_repair_up_forwar_p,SCO5087_repair_up_reverse_p
1,SCO5087,downstream,GCGTCCTCATCACCGG,GAGTAGAGGCGGCCC,58,57,61,GGCGGTCGAAGGGAGATGGGGCGTCCTCATCACCGG,AACGTCTGGAAAGACGACAAAACTTTAGATCTGGGGAATTGAGTAG...,SCO5087_repair_dw_forwar_p,SCO5087_repair_dw_reverse_p
4,SCO5088,upstream,ACCCGGACACCCTC,GCGGCCGCCCCCG,55,66,59,AGAAGCTTTCTAGAGGATCCCCGGGTACCGAGCTCGAATTACCCGG...,TTCTCCAGTTCTCGGGGTCGGCGGCCGCCCCCG,SCO5088_repair_up_forwar_p,SCO5088_repair_up_reverse_p
5,SCO5088,downstream,CGACCCCGAGAACTGG,TACACGATGCTCCTCGC,58,58,61,GAGACCGCGGGGGCGGCCGCCGACCCCGAGAACTGG,AACGTCTGGAAAGACGACAAAACTTTAGATCTGGGGAATTTACACG...,SCO5088_repair_dw_forwar_p,SCO5088_repair_dw_reverse_p


Check for restriction enzyme sites

In [21]:
if in_frame_deletion:
    integration_names = filtered_df.apply(lambda row: f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})", axis=1).tolist()
    plasmid_metadata_df = extract_metadata_to_dataframe(assembled_contigs,
                                                        clean_plasmid,
                                                        integration_names)
    workflow_df = determine_workflow_order_for_plasmids(assembled_cas3_plasmids, 
                                                        assembled_contigs,
                                                        [Enzyme_for_repair_template_integration,], [ "NcoI","BstBI" ]) 
    
    plasmid_metadata_df = pd.merge(plasmid_metadata_df, workflow_df, on='plasmid_name', how='inner')  # or 'left'/'right' depending on what you need

else: 
    integration_names = filtered_df.apply(lambda row: f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})", axis=1).tolist()
    plasmid_metadata_df = extract_metadata_to_dataframe(assembled_cas3_plasmids,
                                                        clean_plasmid,
                                                        integration_names)

plasmid_metadata_df

Unnamed: 0,plasmid_name,date,original_plasmid,integration,size,sgRNA plasmid,which workflow to proceed with,sgRNA plasmid #EcoRI sites,repair template plasmid #NcoI sites,repair template plasmid #BstBI sites
0,pCas3_SCO5087(163)_w_rep,2024-10-11,pCRISPR-Cas3,sgRNA_SCO5087(163),14770,pCas3_SCO5087(163),Proceed with sgRNA integration first,0,0,1
1,pCas3_SCO5087(306)_w_rep,2024-10-11,pCRISPR-Cas3,sgRNA_SCO5087(306),14770,pCas3_SCO5087(306),Proceed with sgRNA integration first,0,0,1
2,pCas3_SCO5088(1219)_w_rep,2024-10-11,pCRISPR-Cas3,sgRNA_SCO5088(1219),14770,pCas3_SCO5088(1219),Proceed with sgRNA integration first,0,2,2
3,pCas3_SCO5088(370)_w_rep,2024-10-11,pCRISPR-Cas3,sgRNA_SCO5088(370),14770,pCas3_SCO5088(370),Proceed with sgRNA integration first,0,2,2


In [22]:
# Getting checking primers
checking_primers_df = find_best_check_primers_from_genome(genome, 
                                       genes_to_KO, 
                                       flanking_region=flanking_region,
                                       target_tm = melting_temperature, 
                                        primer_concentration = primer_concentration, 
                                        polymerase = chosen_polymerase)
checking_primers_df

Unnamed: 0,locus tag,f_primer_name,r_primer_name,f_primer_sequences(5-3),r_primer_sequences(5-3),f_tm,r_tm,ta,flanking_region,annealing_temperature,...,homodimer_reverse_tm,homodimer_reverse_deltaG (kcal/mol),heterodimer_tm,heterodimer_deltaG (kcal/mol),hairpin_forward_structure_found,hairpin_forward_tm,hairpin_forward_deltaG (kcal/mol),hairpin_reverse_structure_found,hairpin_reverse_tm,hairpin_reverse_deltaG (kcal/mol)
0,SCO5088,SCO5088_fwd_checking_primer,SCO5088_rev_checking_primer,CGCATCCACGCCGAGA,CAGCGTCCGGCGG,61,58,62,500,62,...,22.507821,-4.506029,22.023864,-4.603239,False,0.0,0.0,False,0.0,0.0
1,SCO5087,SCO5087_fwd_checking_primer,SCO5087_rev_checking_primer,GACGATTCGGCCCGTG,CAGGGCGTCCAGGC,59,58,61,500,61,...,9.706246,-3.415427,1.918567,-2.396641,False,0.0,0.0,False,0.0,0.0


In [23]:
# making the primers into dseqrecords
checking_primers_df_idt = create_idt_order_dataframe(checking_primers_df)
checking_primers_df_idt


Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5088_fwd_checking_primer,CGCATCCACGCCGAGA,25nm,STD
1,SCO5087_fwd_checking_primer,GACGATTCGGCCCGTG,25nm,STD
2,SCO5088_rev_checking_primer,CAGCGTCCGGCGG,25nm,STD
3,SCO5087_rev_checking_primer,CAGGGCGTCCAGGC,25nm,STD


In [24]:
if in_frame_deletion: 
    full_idt = pd.concat([idt_primers_cas3,idt_df, checking_primers_df_idt],ignore_index=True)
else: 
    full_idt = pd.concat([idt_primers_cas3,checking_primers_df_idt],ignore_index=True)

full_idt

Unnamed: 0,Name,Sequence,Concentration,Purification
0,SCO5087_loc_163_fwd,GGATTGAAGCGCAGAGTCGTCATCACGGGCGTCGGTCGCCCGGCAA...,25nm,STD
1,SCO5087_loc_163_rev,CGACGCCCGTGATGACGACTCTGCGCTTCAATCCGTTTCAATCCAC...,25nm,STD
2,SCO5087_loc_306_fwd,ACAGGTCGCGGCGGAGGCCGACTTCGACCCGGTCGTCGCCCGGCAA...,25nm,STD
3,SCO5087_loc_306_rev,GACCGGGTCGAAGTCGGCCTCCGCCGCGACCTGTGTTTCAATCCAC...,25nm,STD
4,SCO5088_loc_1219_fwd,AACTCGGCGGCGGTCCTGCGCCGTTTCGCACCGAGTCGCCCGGCAA...,25nm,STD
5,SCO5088_loc_1219_rev,TCGGTGCGAAACGGCGCAGGACCGCCGCCGAGTTGTTTCAATCCAC...,25nm,STD
6,SCO5088_loc_370_fwd,ACCCACCGGGAGTTCCGCAAGCTGTGGTCCGAGGGTCGCCCGGCAA...,25nm,STD
7,SCO5088_loc_370_rev,CCTCGGACCACAGCTTGCGGAACTCCCGGTGGGTGTTTCAATCCAC...,25nm,STD
8,SCO5087_repair_up_forwar_p,AGAAGCTTTCTAGAGGATCCCCGGGTACCGAGCTCGAATTGACGCC...,25nm,STD
9,SCO5087_repair_dw_forwar_p,GGCGGTCGAAGGGAGATGGGGCGTCCTCATCACCGG,25nm,STD


## Folder with all the generated I/O

In [25]:
input_files = [
    {"name": "input_genome.gb", "content": genome},
    {"name": "input_plasmid.gb", "content": clean_plasmid}
]


if in_frame_deletion: 
    output_files = [
        {"name": "BEST_w_sgRNAs.gb", "content": assembled_contigs}, # LIST OF Dseqrecords
        {"name": "primer_df.csv", "content": primer_df},
        {"name": "full_idt.csv", "content": full_idt},
        {"name": "sgrna_df.csv", "content": sgrna_df},
        {"name": "filtered_sgrna_df.csv", "content": filtered_df},
        {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},

    ]
else: 
    output_files = [
        {"name": "cBEST_w_sgRNAs.gb", "content": assembled_cas3_plasmids}, # LIST OF Dseqrecords
        {"name": "full_idt.csv", "content": full_idt},
        {"name": "sgrna_df.csv", "content": sgrna_df},
        {"name": "filtered_df.csv", "content": filtered_df},
        {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},
        {"name": "workflow_order_df.csv", "content": workflow_df},

    ]
         
input_values = {
    "genes_to_knockout": genes_to_KO,

    "filtering_metrics": {
        "gc_upper": gc_upper,
        "gc_lower": gc_lower,
        "off_target_seed": off_target_seed,
        "off_target_upper": off_target_upper,
        "cas_type": cas_type,
        "number_of_sgRNAs_per_group": number_of_sgRNAs_per_group,

    },  
    "polymerase_settings": {
        "chosen_polymerase": chosen_polymerase,
        "melting_temperature": melting_temperature,
        "primer_concentration": primer_concentration,
        "primer_number_increment": primer_number_increment,
        "flanking_region": flanking_region
    },
    "overlapping_sequences": {
        "up_homology": str(forward_backbone_overhang),
        "dw_homology": str(reverse_backbone_overhang)
    }
}

# Paths to Markdown files
markdown_file_paths = [
    "../../protocols/conjugation_protcol.md",
    "../../protocols/single_target_crispr_plasmid_protcol.md"

]

timestamp = datetime.utcnow().isoformat()

# Create project directory structure
project_directory = ProjectDirectory(
    project_name=f"CRISPR_cas3_inframe_deletion_workflow_{timestamp}",
    input_files=input_files,
    output_files=output_files,
    input_values=input_values,
    markdown_file_paths=markdown_file_paths
)

# DO You want to save the folder? 
save_zip_folder = True 

if save_zip_folder: 
    # Generate the project directory structure and get the zip content
    zip_content = project_directory.create_directory_structure(create_directories=False)

    # Save the zip file to disk (optional)
    with open("project_structure.zip", "wb") as f:
        f.write(zip_content)

