# W3-CRISPRi-workflow LLPMBPKK_05956 CRISPRi


Transcriptional interference is another powerful tool for assessing gene function. A guide RNA directs a dead-Cas9 (dCas9)/sgRNA complex to bind the + (Watson) strand of the promoter and 5′-UTR of the targeted gene, thereby sterically blocking the binding and progression of RNA polymerase. Workflow 4 uses CRISPR interference (CRISPRi) with ssDNA bridging to reversibly inactivate genes at the transcriptional level [12,29] (Figure 5A). This approach targets regions upstream of the transcriptional start site (TSS), positioning the dCas9-sgRNA complex near the TSS (default: 100 bp upstream) to sterically hinder transcription. This allows gene function to be studied through controlled knockdown. To get started, users can download the pCRISPR-dCas9 plasmid and the *S. coelicolor* A3(2) genome. StreptoCAD will then generate all necessary components, including primers and plasmids.

In [1]:
import os
import sys
from datetime import datetime, timezone

from pydna.dseqrecord import Dseqrecord


# Make the repository root (two levels above the working directory) importable,
# so that the `streptocad` package resolves when the notebook is run in place.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.insert(0, project_root)

from streptocad.sequence_loading.sequence_loading import (
    load_and_process_gene_sequences,
    load_and_process_genome_sequences,
    load_and_process_plasmid, 
    check_and_convert_input,
    annotate_dseqrecord,
    process_specified_gene_sequences_from_record)


from streptocad.cloning.ssDNA_bridging import assemble_plasmids_by_ssDNA_bridging, make_ssDNA_oligos
from streptocad.crispr.guideRNA_crispri import extract_sgRNAs_for_crispri, SgRNAargs
from streptocad.primers.primer_generation import  primers_to_IDT
from streptocad.utils import ProjectDirectory,extract_metadata_to_dataframe

## INPUT

In [2]:
# ---- Inputs ----------------------------------------------------------------
# 1. Genome of choice (GenBank or FASTA). Only the first element returned by
#    the loader is used.
path_to_genome = '../wet_lab_notebooks/data_for_drylab/Go40110_flye_polish_actinoannotPFAM_antismash7/Go40110_flye_polish_actinoannotPFAM.gbk'
genome_records = load_and_process_genome_sequences(path_to_genome)
genome = genome_records[0]

# 2. Plasmid backbone that will receive the sgRNA.
# NOTE(review): the introduction refers to pCRISPR-dCas9, but this path points
# to pCRISPR-Cas9.gb — confirm that this is the intended backbone for CRISPRi.
path_to_plasmid = '../../web_app/assets/pCRISPR-Cas9.gb'
clean_plasmid = load_and_process_plasmid(path_to_plasmid)

# 3. Genes to knock down (list of locus tags).
genes_to_KO = ['LLPMBPKK_05956']  # other candidates: 'LLPMBPKK_00515', 'LLPMBPKK_00435', 'SCO5090'


# ---- Advanced settings -----------------------------------------------------
# 4. Filtering metrics for sgRNAs
gc_upper = 0.8
gc_lower = 0.2
off_target_seed = 13
off_target_upper = 1
cas_type = 'cas9'
# NOTE(review): in this notebook this value is only recorded in the export
# metadata; the sampling cell applies its own limit of 5 sgRNAs per locus.
number_of_sgRNAs_per_group = 3
extension_to_promoter_region = 200

# 5. Overlapping sequences flanking the protospacer in the ssDNA bridging oligo.
# As per the article "CRISPR–Cas9, CRISPRi and CRISPR-BEST-mediated genetic
# manipulation in streptomycetes", the oligo layout is:
#   CGGTTGGTAGGATCGACGGC -N20- GTTTTAGAGCTAGAAATAGC
up_homology = Dseqrecord('CGGTTGGTAGGATCGACGGC')
dw_homology = Dseqrecord('GTTTTAGAGCTAGAAATAGC')

In [3]:
# Sanity check: print the identifier of the loaded plasmid record.
print(clean_plasmid.id)


.


# Computation

In [4]:
# Normalise the user-supplied targets. `check_and_convert_input` returns a
# target dictionary, the (possibly rewritten) list of target names and a flag
# telling whether the genome must be annotated with those targets first.
# NOTE(review): presumably the flag is set when targets are given as sequences
# or coordinates rather than existing locus tags — confirm against the helper.
target_dict, genes_to_KO, annotation_input = check_and_convert_input(genes_to_KO)

print(annotation_input)
if annotation_input:
    # Add the targets as features so that later steps can look them up by name.
    genome = annotate_dseqrecord(genome, target_dict)


# Number of features on the genome record (after optional annotation).
len(genome.features)

False


18013

In [5]:

# Configure the CRISPRi sgRNA search. All tunable values come from the input
# cell so that changing them there actually affects the search (and so that the
# exported metadata matches what was run).
args = SgRNAargs(
    genome,
    genes_to_KO,
    step=['find', 'filter'],
    gc_upper=gc_upper,
    gc_lower=gc_lower,
    off_target_seed=off_target_seed,
    off_target_upper=off_target_upper,
    cas_type=cas_type,
    extension_to_promoter_region=extension_to_promoter_region,
    target_non_template_strand=True,
    # NOTE(review): presumably the search window in bp around the TSS; the
    # introduction mentions a default of 100 bp upstream — confirm 70/60.
    upstream_tss=70,
    dwstream_tss=60,
)

# Find and filter candidate sgRNAs for every requested locus tag.
sgrna_df = extract_sgRNAs_for_crispri(args)
sgrna_df

sgRNA generated were outside the designated border in LLPMBPKK_05956. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in LLPMBPKK_05956. To incorporate this extent borders. Skipping to next locus tag.
Pam was found outside designated locus_tag: LLPMBPKK_05956. To incorporate this extent borders. Skipping to next locus tag.
sgRNA generated were outside the designated border in LLPMBPKK_05956_upstream. To incorporate this extent borders. Skipping to next locus tag.




Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,region
140,.,LLPMBPKK_05956,6815841,1,-1,-47,0.75,AGG,TGCGGGCAGGGGTGTCCGTG,AGGGGTGTCCGTG,0,upstream
145,.,LLPMBPKK_05956,6815841,1,-1,-29,0.8,CGG,TAGCGCGCCGTGGGCGCGTG,CCGTGGGCGCGTG,0,upstream
146,.,LLPMBPKK_05956,6815841,1,-1,-20,0.8,GGG,CCACCCGCGTAGCGCGCCGT,CGTAGCGCGCCGT,1,upstream
147,.,LLPMBPKK_05956,6815841,1,-1,-19,0.8,TGG,TCCACCCGCGTAGCGCGCCG,GCGTAGCGCGCCG,1,upstream
0,.,LLPMBPKK_05956,6816041,1,-1,60,0.7,CGG,CAGTCCGGTGGCCACCGCAT,GTGGCCACCGCAT,0,CDS


In [6]:
# Retain at most `max_sgrnas_per_locus` sgRNAs per locus_tag.
# `groupby(...).sample(n=5)` raises a ValueError as soon as one locus has fewer
# than 5 candidates, so instead shuffle each locus group with a fixed seed and
# keep the first rows of every group; smaller groups are simply kept whole.
max_sgrnas_per_locus = 5
filtered_df = (
    sgrna_df
    .groupby('locus_tag')
    .sample(frac=1, random_state=42)   # seeded shuffle within each locus
    .groupby('locus_tag')
    .head(max_sgrnas_per_locus)        # tolerates loci with < 5 candidates
    .sort_values(by='sgrna_loc')       # present guides ordered by position
)
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,region
140,.,LLPMBPKK_05956,6815841,1,-1,-47,0.75,AGG,TGCGGGCAGGGGTGTCCGTG,AGGGGTGTCCGTG,0,upstream
145,.,LLPMBPKK_05956,6815841,1,-1,-29,0.8,CGG,TAGCGCGCCGTGGGCGCGTG,CCGTGGGCGCGTG,0,upstream
146,.,LLPMBPKK_05956,6815841,1,-1,-20,0.8,GGG,CCACCCGCGTAGCGCGCCGT,CGTAGCGCGCCGT,1,upstream
147,.,LLPMBPKK_05956,6815841,1,-1,-19,0.8,TGG,TCCACCCGCGTAGCGCGCCG,GCGTAGCGCGCCG,1,upstream
0,.,LLPMBPKK_05956,6816041,1,-1,60,0.7,CGG,CAGTCCGGTGGCCACCGCAT,GTGGCCACCGCAT,0,CDS


In [7]:
import pandas as pd

# Reference sgRNA table exported from CRISPy-web for the same upstream region;
# it is compared against the guides selected by this notebook.
crispy_web_csv = 'CRISPyweb__goe_targeting_upstream_LLPMBPKK_05956_.csv'
crispy_web_sgrnas = pd.read_csv(crispy_web_csv)
crispy_web_sgrnas

Unnamed: 0,ID,Start,End,Strand,ORF,Sequence,PAM,C to T mutations,A to G mutations,0bp mismatches,1bp mismatches,2bp mismatches
0,CY00000009,29,52,-1,-,TGCGGGCAGGGGTGTCCGTG,AGG,,,0,2,50
1,CY00000014,47,70,-1,-,TAGCGCGCCGTGGGCGCGTG,CGG,,,0,6,151
2,CY00000022,56,79,-1,-,CCACCCGCGTAGCGCGCCGT,GGG,,,1,3,76
3,CY00000023,57,80,-1,-,TCCACCCGCGTAGCGCGCCG,TGG,,,1,9,215
4,CY00000012,136,159,-1,-,CAGTCCGGTGGCCACCGCAT,CGG,,,0,4,66


In [8]:
# Cross-check the guides selected here against the CRISPy-web export.

# 1) Reduce both tables to the three shared fields, using CRISPy-web's names.
column_map = {'sgrna': 'Sequence', 'pam': 'PAM', 'sgrna_strand': 'Strand'}
streptocad_rows = filtered_df[list(column_map)].rename(columns=column_map)
crispy_rows = crispy_web_sgrnas[['Sequence', 'PAM', 'Strand']]

# 2) Represent every row as a (Sequence, PAM, Strand) tuple; sets ignore order.
streptocad_set = set(streptocad_rows.itertuples(index=False, name=None))
crispy_set = set(crispy_rows.itertuples(index=False, name=None))

# 3) Report either agreement or the rows unique to each side.
if streptocad_set == crispy_set:
    print("✅ Both tables have the same (Sequence, PAM, Strand) rows.")
else:
    print("❌ They differ!")
    differences = (
        ("\nRows in filtered_df but not in crispy_web_sgrnas:", streptocad_set - crispy_set),
        ("\nRows in crispy_web_sgrnas but not in filtered_df:", crispy_set - streptocad_set),
    )
    for heading, unmatched in differences:
        if unmatched:
            print(heading)
            for row in unmatched:
                print(row)

✅ Both tables have the same (Sequence, PAM, Strand) rows.


## Output

In [9]:
# Build one ssDNA bridging oligo per selected sgRNA: up_homology + sgRNA + dw_homology.
list_of_ssDNAs = make_ssDNA_oligos(filtered_df, upstream_ovh=up_homology,
                                   downstream_ovh=dw_homology)
if len(list_of_ssDNAs) == 0:
    # Fail with a clear message instead of an IndexError on the line below.
    raise ValueError("No ssDNA oligos were generated; check that filtered_df contains sgRNAs.")
print(list_of_ssDNAs[0].name)

# Linearize the plasmid with NcoI and keep the longest fragment as backbone.
from Bio.Restriction import NcoI
digest_fragments = list(clean_plasmid.cut(NcoI))
if not digest_fragments:
    # Fail with a clear message instead of an IndexError when NcoI does not cut.
    raise ValueError("NcoI digestion produced no fragments; the plasmid cannot be linearized.")
linearized_plasmid = max(digest_fragments, key=len)

# Assemble one sgRNA vector per oligo by ssDNA bridging into the backbone.
sgRNA_vectors = assemble_plasmids_by_ssDNA_bridging(list_of_ssDNAs, linearized_plasmid)
sgRNA_vectors

LLPMBPKK_05956_loc_-47


[Contig(o11279),
 Contig(o11279),
 Contig(o11279),
 Contig(o11279),
 Contig(o11279)]

In [10]:
# Constructing a meaningful name, ID, and description for the assembled plasmid using user input
targeting_info = []
for index, row in filtered_df.iterrows():
    formatted_str = f"CRISPRi_{row['locus_tag']}_p{row['sgrna_loc']}"
    targeting_info.append(formatted_str)

for i in range(len(sgRNA_vectors)):
    sgRNA_vectors[i].name = f'p{targeting_info[i]}_#{i+1}'
    sgRNA_vectors[i].id = sgRNA_vectors[i].name  # Using the same value for ID as for name for simplicity
    sgRNA_vectors[i].description = f'Assembled plasmid targeting {", ".join(genes_to_KO)} for single gene KNOCK-DOWN, assembled using StreptoCAD.'


In [11]:
# Show the table of selected sgRNAs again for reference.
filtered_df

Unnamed: 0,strain_name,locus_tag,gene_loc,gene_strand,sgrna_strand,sgrna_loc,gc,pam,sgrna,sgrna_seed_sequence,off_target_count,region
140,.,LLPMBPKK_05956,6815841,1,-1,-47,0.75,AGG,TGCGGGCAGGGGTGTCCGTG,AGGGGTGTCCGTG,0,upstream
145,.,LLPMBPKK_05956,6815841,1,-1,-29,0.8,CGG,TAGCGCGCCGTGGGCGCGTG,CCGTGGGCGCGTG,0,upstream
146,.,LLPMBPKK_05956,6815841,1,-1,-20,0.8,GGG,CCACCCGCGTAGCGCGCCGT,CGTAGCGCGCCGT,1,upstream
147,.,LLPMBPKK_05956,6815841,1,-1,-19,0.8,TGG,TCCACCCGCGTAGCGCGCCG,GCGTAGCGCGCCG,1,upstream
0,.,LLPMBPKK_05956,6816041,1,-1,60,0.7,CGG,CAGTCCGGTGGCCACCGCAT,GTGGCCACCGCAT,0,CDS


In [12]:
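# # DISABLED DRAFT (kept for reference): annotate the plasmids with the sgRNA.
# # NOTE(review): in this draft the `qualifiers`/`feature` block sits outside the
# # inner row loop, so it would only use the last `row`/`location` — fix before re-enabling.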
# from Bio.Seq import Seq

# def find_sgRNA_in_plasmid(plasmid_seq, sgRNA_seq, strand):
#     """
#     Find the sgRNA sequence in the plasmid. Returns (start, end, strand) or None if not found.
#     - plasmid_seq: Bio.Seq object (can also use str)
#     - sgRNA_seq: str (should be 5'-3' sequence)
#     - strand: +1 (sense) or -1 (antisense)
#     """
#     # Search on forward strand
#     idx = str(plasmid_seq).find(sgRNA_seq)
#     if idx != -1 and strand == 1:
#         return idx, idx + len(sgRNA_seq), 1
#     # Search on reverse complement
#     rc_sgRNA_seq = str(Seq(sgRNA_seq).reverse_complement())
#     idx = str(plasmid_seq).find(rc_sgRNA_seq)
#     if idx != -1 and strand == -1:
#         return idx, idx + len(rc_sgRNA_seq), -1
#     # Not found
#     return None

# from Bio.SeqFeature import SeqFeature, FeatureLocation
# filtered_df = filtered_df.reset_index(drop=True)

# for j, plasmid in enumerate(sgRNA_vectors):
#     for i, row in filtered_df.iterrows():
#         sgRNA_seq = str(row['sgrna'])
#         strand = int(row['sgrna_strand'])
#         res = find_sgRNA_in_plasmid(Dseqrecord(plasmid.seq), sgRNA_seq, strand)
#         if res:
#             start, end, strand = res
#             location = FeatureLocation(start, end, strand=strand)
#             print(location)
#             # Add feature with proper qualifiers...
#             # (Add only if this feature doesn't exist yet, to avoid duplicates)
#             print(f"Found {sgRNA_seq} in {plasmid.name} at {start}-{end} (strand {strand})")
#         else:
#             print(f"sgRNA {sgRNA_seq} not found in {plasmid.name}")


#     qualifiers = {
#         "locus_tag": row["locus_tag"],
#         "pam": row["pam"],
#         "sgrna_sequence": row["sgrna"],
#         "seed_sequence": row["sgrna_seed_sequence"],
#         "off_target_count": str(row["off_target_count"]),
#         "region": row["region"],
#         "note": f"sgRNA targeting {row['locus_tag']} at {row['sgrna_loc']} ({row['region']})"
#     }

#     feature = SeqFeature(
#         location=location,
#         type="sgRNA",
#         qualifiers=qualifiers
#     )
#     plasmid.features.append(feature)


In [13]:
# Toggle: write every assembled plasmid to plasmids/<id>.gb as GenBank.
print_plasmids = True

if print_plasmids:
    plasmid_dir = "plasmids"
    # Create the output folder on first use; writing into a missing directory
    # would otherwise fail on a fresh checkout.
    os.makedirs(plasmid_dir, exist_ok=True)
    for vector in sgRNA_vectors:
        # When a file with the same name but a different sequence already
        # exists, the write reports a "Sequence change" and keeps the previous
        # file under an *_OLD_* name (see the cell output).
        vector.write(f"{plasmid_dir}/{vector.id}.gb")



⚠,Sequence change,Sequence change.1
Filename,plasmids/pCRISPRi_LLPMBPKK_05956_p-47_#1.gb,plasmids/pCRISPRi_LLPMBPKK_05956_p-47_#1_OLD_1755081159817839.gb
Saved,2025-08-13T12:32:39.845451,2025-07-24T11:53:20.118708
Length,11279,11276
uSEGUID,CvyJ8Vq49Dncq9Ow0uS76_mGiJ4,3FXsforzoL7ruEBRvpoKmvlwcZw
cSEGUID,47ZMiGtX_uPlytF19e91LjohsVU,FZfohrsBwje4Xxqit-bYc0m9HYc


⚠,Sequence change,Sequence change.1
Filename,plasmids/pCRISPRi_LLPMBPKK_05956_p-29_#2.gb,plasmids/pCRISPRi_LLPMBPKK_05956_p-29_#2_OLD_1755081159851255.gb
Saved,2025-08-13T12:32:39.869860,2025-07-24T12:05:28.478516
Length,11279,11276
uSEGUID,-BOZ7k5hE44KOAY256INu1t3GkM,zGzXeG5sBYPY0HC_yHe3bXg_dmw
cSEGUID,x92XZbZAfxxj0DEnAKZgUljlaS8,sbs59QsY5WlraWzr1Y8PoxKR2ec


In [14]:
def _integration_label(row):
    """Return the integration label 'sgRNA_<locus_tag>(<sgrna_loc>)' for one sgRNA row."""
    return f"sgRNA_{row['locus_tag']}({row['sgrna_loc']})"


# One label per selected sgRNA, in the same order as the assembled vectors.
integration_names = filtered_df.apply(_integration_label, axis=1).tolist()

# Summarise the assembled plasmids (name, date, parent plasmid, insert, size).
plasmid_metadata_df = extract_metadata_to_dataframe(
    sgRNA_vectors,
    clean_plasmid,
    integration_names,
)

plasmid_metadata_df

Unnamed: 0,plasmid_name,date,original_plasmid,integration,size
0,pCRISPRi_LLPMBPKK_05956_p-47_#1,2025-08-13,.,sgRNA_LLPMBPKK_05956(-47),11279
1,pCRISPRi_LLPMBPKK_05956_p-29_#2,2025-08-13,.,sgRNA_LLPMBPKK_05956(-29),11279
2,pCRISPRi_LLPMBPKK_05956_p-20_#3,2025-08-13,.,sgRNA_LLPMBPKK_05956(-20),11279
3,pCRISPRi_LLPMBPKK_05956_p-19_#4,2025-08-13,.,sgRNA_LLPMBPKK_05956(-19),11279
4,pCRISPRi_LLPMBPKK_05956_p60_#5,2025-08-13,.,sgRNA_LLPMBPKK_05956(60),11279


### IDT primers

In [15]:
# Convert the ssDNA bridging oligos into an order table for IDT
# (columns shown in the output: Name, Sequence, Concentration, Purification).
idt_primers=primers_to_IDT(list_of_ssDNAs)
idt_primers

Unnamed: 0,Name,Sequence,Concentration,Purification
0,LLPMBPKK_05956_loc_-47,CGGTTGGTAGGATCGACGGCTGCGGGCAGGGGTGTCCGTGGTTTTA...,25nm,STD
1,LLPMBPKK_05956_loc_-29,CGGTTGGTAGGATCGACGGCTAGCGCGCCGTGGGCGCGTGGTTTTA...,25nm,STD
2,LLPMBPKK_05956_loc_-20,CGGTTGGTAGGATCGACGGCCCACCCGCGTAGCGCGCCGTGTTTTA...,25nm,STD
3,LLPMBPKK_05956_loc_-19,CGGTTGGTAGGATCGACGGCTCCACCCGCGTAGCGCGCCGGTTTTA...,25nm,STD
4,LLPMBPKK_05956_loc_60,CGGTTGGTAGGATCGACGGCCAGTCCGGTGGCCACCGCATGTTTTA...,25nm,STD


## Folder with all the generated I/O

In [16]:
# Toggle: bundle all inputs, outputs, settings and protocols into a project zip.
generate_data_folder = False 

if generate_data_folder:
    # Sequence records that went into the workflow.
    input_files = [
        {"name": "input_genome.gb", "content": genome},
        {"name": "input_plasmid.gb", "content": clean_plasmid}
    ]

    # Everything the workflow produced. The GenBank entry is named after this
    # workflow (CRISPRi); the previous "cBEST" label came from the CRISPR-BEST
    # workflow and mislabelled the exported plasmids.
    output_files = [
        {"name": "CRISPRi_w_sgRNAs.gb", "content": sgRNA_vectors}, # LIST OF Dseqrecords
        {"name": "full_idt.csv", "content": idt_primers},
        {"name": "sgrna_df.csv", "content": sgrna_df},
        {"name": "filtered_df.csv", "content": filtered_df},
        {"name": "plasmid_metadata_df.csv", "content": plasmid_metadata_df},
    ]

    # User-facing settings recorded alongside the results for reproducibility.
    input_values = {
        "genes_to_knockout": genes_to_KO,

        "filtering_metrics": {
            "gc_upper": gc_upper,
            "gc_lower": gc_lower,
            "off_target_seed": off_target_seed,
            "off_target_upper": off_target_upper,
            "cas_type": cas_type,
            "number_of_sgRNAs_per_group": number_of_sgRNAs_per_group,
            'extension_to_promoter_region':extension_to_promoter_region,

        },
        "overlapping_sequences": {
            "up_homology": str(up_homology),
            "dw_homology": str(dw_homology)
        }
    }

    # Paths to Markdown files (wet-lab protocols shipped with the results).
    markdown_file_paths = [
        "../../protocols/conjugation_protcol.md",
        "../../protocols/single_target_crispr_plasmid_protcol.md"

    ]

    # Date and time (UTC). `datetime.utcnow()` is deprecated since Python 3.12;
    # take an aware UTC time and drop the tzinfo so the ISO string keeps the
    # same naive format as before (no "+00:00" suffix).
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # Create project directory structure
    project_directory = ProjectDirectory(
        project_name=f"CRISPRi_workflow_{timestamp}",
        input_files=input_files,
        output_files=output_files,
        input_values=input_values,
        markdown_file_paths=markdown_file_paths
    )

    # Do you want to save the folder?
    save_zip_folder = True 

    if save_zip_folder: 
        # Generate the project directory structure and get the zip content
        zip_content = project_directory.create_directory_structure(create_directories=False)

        # Save the zip file to disk (optional)
        with open("project_structure.zip", "wb") as f:
            f.write(zip_content)