<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/SD_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!apt-get install -y vienna-rna
!pip install viennarna biopython

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
vienna-rna is already the newest version (2.4.17+dfsg-2build2).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Collecting viennarna
  Downloading ViennaRNA-2.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading ViennaRNA-2.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: viennar

In [16]:
import RNA
import random
import re
from multiprocessing import Pool, cpu_count
import pandas as pd

# 1.OSD 후보 선택

In [17]:
# DNA → RNA 변환

def dna_to_rna(seq):
    return seq.replace("T", "U").replace("t","u")

In [18]:
# full mrna 조합

def build_full_mrna(UTR50, AU10, SD, spacer, linker_rna, cds_rna):
    return UTR50 + AU10 + SD + spacer + "AUG" + linker_rna + cds_rna

In [19]:
# Random generators (AU10 / SD6 / spacer6)

def random_AU10():
    return "".join(random.choice("AU") for _ in range(10))

def random_SD6():
    return "".join(random.choice("ACGU") for _ in range(6))

def random_spacer6():
    return "".join(random.choice("ACGU") for _ in range(6))


In [20]:
# ViennaRNA binding & folding

def mfe_hairpin(seq):
    return RNA.fold(seq)[1]  # returns (structure, energy)

def duplex(a, b):
    return RNA.duplexfold(a, b).energy

## O-ASD 생성 관련 함수

In [21]:
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))


In [22]:
def revcomp_rna(seq):
    comp = {"A":"U","U":"A","C":"G","G":"C"}
    return "".join(comp[x] for x in seq)[::-1]


In [23]:
# O-ASD Validity Check

WT_ASD = "ACCUCCUUA"   # E. coli 16S 3’ tail
WT_SD  = "AGGAGG"      # native Shine-Dalgarno

def gc_percent(seq):
    return (seq.count("G")+seq.count("C")) / len(seq) * 100

def no_palindrome(seq, k=3):
    for L in range(k, 6):
        for i in range(len(seq)-L):
            frag = seq[i:i+L]
            if frag == frag[::-1]:
                return False
    return True

def valid_asd(asd):
    if hamming(asd, WT_ASD) < 4: return False
    if not (30 <= gc_percent(asd) <= 55): return False
    if not no_palindrome(asd): return False

    hp = mfe_hairpin(asd)
    if hp <= -8: return False
    if hp <= -5: return False

    return True


In [24]:
# O-ASD 평가 함수

def evaluate_asd(asd):
    o_sd = revcomp_rna(asd)
    E_orth   = duplex(o_sd, asd)
    E_cross1 = duplex(WT_ASD, o_sd)
    E_cross2 = duplex(asd, WT_SD)
    hp       = mfe_hairpin(asd)
    combined = E_orth - hp

    return {
        "ASD": asd,
        "O_SD": o_sd,
        "E_orth": E_orth,
        "E_cross1": E_cross1,
        "E_cross2": E_cross2,
        "hairpin": hp,
        "combined": combined
    }


In [25]:
# O-ASD 생성기

def generate_OASD(n=2000, length=9):
    result = []
    for _ in range(n):
        asd = "".join(random.choice("ACGU") for _ in range(length))
        if valid_asd(asd):
            result.append(evaluate_asd(asd))
    return result


## O-ASD Top10 생성

In [26]:
oasds = generate_OASD(n=3000)
top10_OASD = sorted(oasds, key=lambda x: x["combined"])[:10]

for i, t in enumerate(top10_OASD):
    print(f"TOP O-ASD #{i+1} → {t['ASD']}  (O-SD: {t['O_SD']})")


TOP O-ASD #1 → UAGGCAGUA  (O-SD: UACUGCCUA)
TOP O-ASD #2 → UGCAGCUAU  (O-SD: AUAGCUGCA)
TOP O-ASD #3 → AGCUAGCAU  (O-SD: AUGCUAGCU)
TOP O-ASD #4 → UGGCCUUAU  (O-SD: AUAAGGCCA)
TOP O-ASD #5 → ACUUAGGCU  (O-SD: AGCCUAAGU)
TOP O-ASD #6 → AGGUAGCUU  (O-SD: AAGCUACCU)
TOP O-ASD #7 → AAGUAGGCA  (O-SD: UGCCUACUU)
TOP O-ASD #8 → AUGGCCAAU  (O-SD: AUUGGCCAU)
TOP O-ASD #9 → UACUGACUC  (O-SD: GAGUCAGUA)
TOP O-ASD #10 → UGCAAUGGU  (O-SD: ACCAUUGCA)


In [35]:
from Bio import SeqIO

records = list(SeqIO.parse("/content/cds_from_genomic.fna", "fasta"))
print("CDS 개수:", len(records))
for rec in records[:5]:
    print(rec.id, len(rec.seq))


CDS 개수: 4318
lcl|NC_000913.3_cds_NP_414542.1_1 66
lcl|NC_000913.3_cds_NP_414543.1_2 2463
lcl|NC_000913.3_cds_NP_414544.1_3 933
lcl|NC_000913.3_cds_NP_414545.1_4 1287
lcl|NC_000913.3_cds_NP_414546.1_5 297


In [45]:
from google.colab import files
import random

uploaded = files.upload()
fname = list(uploaded.keys())[0]
print("Uploaded:", fname)

# CDS 딕셔너리 저장
cds_dict = {}

with open(fname, "r") as f:
    current_id = None
    seq_accum = []

    for line in f:
        line = line.strip()
        if line.startswith(">"):
            # 기존 CDS 저장
            if current_id and len(seq_accum) > 0:
                cds_dict[current_id] = "".join(seq_accum)
                seq_accum = []

            current_id = line[1:]  # '>' 제거
        else:
            seq_accum.append(line)

    # 마지막 CDS 저장
    if current_id and len(seq_accum) > 0:
        cds_dict[current_id] = "".join(seq_accum)

print("총 CDS 개수:", len(cds_dict))


Saving cds_from_genomic.fna to cds_from_genomic.fna
Uploaded: cds_from_genomic.fna
총 CDS 개수: 4318


In [46]:
# Random CDS 선택
selected_id = random.choice(list(cds_dict.keys()))
cds_dna = cds_dict[selected_id]

print("선택된 CDS:", selected_id)
print("CDS length:", len(cds_dna))


선택된 CDS: lcl|NC_000913.3_cds_NP_415476.1_933 [gene=matP] [locus_tag=b0956] [db_xref=UniProtKB/Swiss-Prot:P0A8N0] [protein=macrodomain Ter protein] [protein_id=NP_415476.1] [location=1018485..1018937] [gbkey=CDS]
CDS length: 453


In [47]:
def dna_to_rna(seq):
    return seq.replace("T", "U")

cds_rna = dna_to_rna(cds_dna)

print("RNA CDS length:", len(cds_rna))


RNA CDS length: 453


In [37]:
# Linker DNA → RNA 변환

# ================================
# 2) Linker DNA 입력 + RNA 변환
# ================================

# linker DNA
linker_dna_full = (
    "CCGAAACGTGGTAAAAAAGGTGCTGTTGCTGAAGACGGTGACGAACTGCG"
    "TACCGAACCGGAAGCTAAAAAATCTAAAACCGCTGCTAAAAAAAACGACA"
    "AAGAAGCTGCTGGTGAA"
)

# 실제 사용할 길이 (10~30bp에서 고를 것)
LINKER_LEN = 30
linker_dna = linker_dna_full[:LINKER_LEN]

linker_rna = dna_to_rna(linker_dna)

print("Linker DNA len:", len(linker_dna))
print("Linker RNA len:", len(linker_rna))
print("Linker RNA:", linker_rna)


Linker DNA len: 30
Linker RNA len: 30
Linker RNA: CCGAAACGUGGUAAAAAAGGUGCUGUUGCU


# 2. SD 후보 평가 함수 (Filter 1–3 포함)

In [48]:
def evaluate_SD(O_ASD, UTR50, linker_rna, cds_rna):
    AU = random_AU10()
    SD = random_SD6()
    sp = random_spacer6()

    o_sd = SD
    ASD  = O_ASD

    full = build_full_mrna(UTR50, AU, SD, sp, linker_rna, cds_rna)

    hp = mfe_hairpin(full[40:80])  # SD 주변 window

    E_orth   = duplex(o_sd, ASD)
    E_cross1 = duplex(WT_ASD, o_sd)
    E_cross2 = duplex(ASD, WT_SD)
    combined = E_orth - hp

    # Filter 1 — orthogonality
    if E_orth > -8: return None
    if E_cross1 < -4: return None
    if E_cross2 < -4: return None

    # Filter 2 — hairpin
    if hp <= -8: return None
    if hp <= -5: return None

    # Filter 3 — combined
    if combined > -4: return None

    return {
        "AU": AU,
        "SD": SD,
        "sp": sp,
        "ASD": ASD,
        "E_orth": E_orth,
        "E_cross1": E_cross1,
        "E_cross2": E_cross2,
        "hairpin": hp,
        "combined": combined
    }


## 멀티프로세싱 SD 스크리닝

In [49]:
def scan_SD_for_ASD(O_ASD, UTR50, linker_rna, cds_rna, n_samples=3000, workers=4):
    args = [(O_ASD, UTR50, linker_rna, cds_rna)] * n_samples
    with Pool(workers) as p:
        res = p.starmap(evaluate_SD, args)
    return [r for r in res if r]


# 3. 전체 SD 후보 30~100개 얻기

In [50]:
UTR50 = "TATTATCCCCATAAGATTAGCGGATCCTACCTGACGCTTTTTATCGCAAC"

all_SD = []
for ASD in top10_OASD:
    sd_list = scan_SD_for_ASD(ASD["ASD"], UTR50, linker_rna, cds_rna, n_samples=3000, workers=4)
    all_SD.extend(sd_list)

top_SD_candidates = sorted(all_SD, key=lambda x: x["combined"])[:100]


In [51]:
# 최종 실험 후보 5~10개 선택

final_SD = top_SD_candidates[:10]

for i, f in enumerate(final_SD):
    print(f"\n=== Final SD Candidate #{i+1} ===")
    for k, v in f.items():
        print(k, ":", v)



=== Final SD Candidate #1 ===
AU : AAAAUAAUUA
SD : GCCUAC
sp : CUAGAU
ASD : AAGUAGGCA
E_orth : -10.1
E_cross1 : 2.2
E_cross2 : 1.2
hairpin : -0.10000000149011612
combined : -9.999999998509884

=== Final SD Candidate #2 ===
AU : AAAAUAAAAA
SD : GCUAGC
sp : CCUAUG
ASD : AGCUAGCAU
E_orth : -10.1
E_cross1 : 1.3
E_cross2 : 0.0
hairpin : -0.800000011920929
combined : -9.29999998807907

=== Final SD Candidate #3 ===
AU : UAUUUUAAAA
SD : ACUGCC
sp : AAACAA
ASD : UAGGCAGUA
E_orth : -9.4
E_cross1 : 2.2
E_cross2 : 1.2
hairpin : -0.20000000298023224
combined : -9.199999997019768

=== Final SD Candidate #4 ===
AU : AUUUUAUAAA
SD : GCUGCC
sp : UACAAU
ASD : UGCAGCUAU
E_orth : -8.9
E_cross1 : 2.2
E_cross2 : 0.0
hairpin : -0.4000000059604645
combined : -8.499999994039536

=== Final SD Candidate #5 ===
AU : UUAUAAAAUU
SD : ACUGCC
sp : AGCUUU
ASD : UAGGCAGUA
E_orth : -9.4
E_cross1 : 2.2
E_cross2 : 1.2
hairpin : -1.0
combined : -8.4

=== Final SD Candidate #6 ===
AU : UAAUAAAUUA
SD : GCUACU
sp : AUCCCC
A

## 최종 mRNA 길이

In [52]:
print("CDS RNA length:", len(cds_rna))


CDS RNA length: 453


In [53]:
print("Linker length:", len(linker_rna))

Linker length: 30


In [56]:
Full_length = 50 + 10 + 6 + 6 + 3 + len(linker_rna) + len(cds_rna)
Full_length


558