<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/300%2C000%EA%B0%9C%20%EC%83%98%ED%94%8C%EB%A7%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install -y vienna-rna
!pip install viennarna biopython

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libgsl27 libgslcblas0 libncbi6 ncbi-data readseq
Suggested packages:
  gsl-ref-psdoc | gsl-doc-pdf | gsl-doc-info | gsl-ref-html
The following NEW packages will be installed:
  libgsl27 libgslcblas0 libncbi6 ncbi-data readseq vienna-rna
0 upgraded, 6 newly installed, 0 to remove and 41 not upgraded.
Need to get 11.1 MB of archives.
After this operation, 130 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libgslcblas0 amd64 2.7.1+dfsg-3 [94.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libgsl27 amd64 2.7.1+dfsg-3 [1,000 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-data all 6.1.20170106+dfsg1-9 [3,519 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libncbi6 amd64 6.1.20170106+dfsg1-9 [3,992 kB]
Get:5 http://archive.ubuntu.com/

In [2]:
import RNA
import random
import re
from multiprocessing import Pool, cpu_count
import pandas as pd

# 1.OSD 후보 선택

In [3]:
# DNA → RNA 변환

def dna_to_rna(seq):
    return seq.replace("T", "U").replace("t","u")

In [4]:
# full mrna 조합

def build_full_mrna(UTR50, AU10, SD, spacer, linker_rna, cds_rna):
    return UTR50 + AU10 + SD + spacer + "AUG" + linker_rna + cds_rna

In [5]:
# Random generators (AU10 / SD6 / spacer6)

def random_AU10():
    return "".join(random.choice("AU") for _ in range(10))

def random_SD6():
    return "".join(random.choice("ACGU") for _ in range(6))

def random_spacer6():
    return "".join(random.choice("ACGU") for _ in range(6))


In [6]:
# ViennaRNA binding & folding

def mfe_hairpin(seq):
    return RNA.fold(seq)[1]  # returns (structure, energy)

def duplex(a, b):
    return RNA.duplexfold(a, b).energy

## O-ASD 생성 관련 함수

In [7]:
def hamming(a, b):
    return sum(x != y for x, y in zip(a, b))


In [8]:
def revcomp_rna(seq):
    comp = {"A":"U","U":"A","C":"G","G":"C"}
    return "".join(comp[x] for x in seq)[::-1]


In [9]:
# O-ASD Validity Check

WT_ASD = "ACCUCCUUA"   # E. coli 16S 3’ tail
WT_SD  = "AGGAGG"      # native Shine-Dalgarno

def gc_percent(seq):
    return (seq.count("G")+seq.count("C")) / len(seq) * 100

def no_palindrome(seq, k=3):
    for L in range(k, 6):
        for i in range(len(seq)-L):
            frag = seq[i:i+L]
            if frag == frag[::-1]:
                return False
    return True

def valid_asd(asd):
    if hamming(asd, WT_ASD) < 4: return False
    if not (30 <= gc_percent(asd) <= 55): return False
    if not no_palindrome(asd): return False

    hp = mfe_hairpin(asd)
    if hp <= -8: return False
    if hp <= -5: return False

    return True


In [10]:
# O-ASD 평가 함수

def evaluate_asd(asd):
    o_sd = revcomp_rna(asd)
    E_orth   = duplex(o_sd, asd)
    E_cross1 = duplex(WT_ASD, o_sd)
    E_cross2 = duplex(asd, WT_SD)
    hp       = mfe_hairpin(asd)
    combined = E_orth - hp

    return {
        "ASD": asd,
        "O_SD": o_sd,
        "E_orth": E_orth,
        "E_cross1": E_cross1,
        "E_cross2": E_cross2,
        "hairpin": hp,
        "combined": combined
    }


In [11]:
# O-ASD 생성기

def generate_OASD(n=2000, length=9):
    result = []
    for _ in range(n):
        asd = "".join(random.choice("ACGU") for _ in range(length))
        if valid_asd(asd):
            result.append(evaluate_asd(asd))
    return result


## O-ASD Top10 생성

In [12]:
oasds = generate_OASD(n=3000)
top10_OASD = sorted(oasds, key=lambda x: x["combined"])[:10]

for i, t in enumerate(top10_OASD):
    print(f"TOP O-ASD #{i+1} → {t['ASD']}  (O-SD: {t['O_SD']})")


TOP O-ASD #1 → UACUGCCUA  (O-SD: UAGGCAGUA)
TOP O-ASD #2 → UGCUAGUCU  (O-SD: AGACUAGCA)
TOP O-ASD #3 → UACUGCCAU  (O-SD: AUGGCAGUA)
TOP O-ASD #4 → UCAUGGCAA  (O-SD: UUGCCAUGA)
TOP O-ASD #5 → UUCAGCCAU  (O-SD: AUGGCUGAA)
TOP O-ASD #6 → UUGCUAGCU  (O-SD: AGCUAGCAA)
TOP O-ASD #7 → AGCAUCCAA  (O-SD: UUGGAUGCU)
TOP O-ASD #8 → UUAGGCAGU  (O-SD: ACUGCCUAA)
TOP O-ASD #9 → UGGCCUUAU  (O-SD: AUAAGGCCA)
TOP O-ASD #10 → UUGCCUACU  (O-SD: AGUAGGCAA)


In [15]:
from google.colab import files
import random

uploaded = files.upload()
fname = list(uploaded.keys())[0]
print("Uploaded:", fname)

# CDS 딕셔너리 저장
cds_dict = {}

with open(fname, "r") as f:
    current_id = None
    seq_accum = []

    for line in f:
        line = line.strip()
        if line.startswith(">"):
            # 기존 CDS 저장
            if current_id and len(seq_accum) > 0:
                cds_dict[current_id] = "".join(seq_accum)
                seq_accum = []

            current_id = line[1:]  # '>' 제거
        else:
            seq_accum.append(line)

    # 마지막 CDS 저장
    if current_id and len(seq_accum) > 0:
        cds_dict[current_id] = "".join(seq_accum)

print("총 CDS 개수:", len(cds_dict))


Saving cds_from_genomic.fna to cds_from_genomic.fna
Uploaded: cds_from_genomic.fna
총 CDS 개수: 4318


In [16]:
# Random CDS 선택
selected_id = random.choice(list(cds_dict.keys()))
cds_dna = cds_dict[selected_id]

print("선택된 CDS:", selected_id)
print("CDS length:", len(cds_dna))


선택된 CDS: lcl|NC_000913.3_cds_NP_418247.1_3740 [gene=hemX] [locus_tag=b3803] [db_xref=UniProtKB/Swiss-Prot:P09127] [protein=PF04375 family protein HemX] [protein_id=NP_418247.1] [location=complement(3987885..3989066)] [gbkey=CDS]
CDS length: 1182


In [17]:
def dna_to_rna(seq):
    return seq.replace("T", "U")

cds_rna = dna_to_rna(cds_dna)

print("RNA CDS length:", len(cds_rna))


RNA CDS length: 1182


In [18]:
# Linker DNA → RNA 변환

# ================================
# 2) Linker DNA 입력 + RNA 변환
# ================================

# linker DNA
linker_dna_full = (
    "CCGAAACGTGGTAAAAAAGGTGCTGTTGCTGAAGACGGTGACGAACTGCG"
    "TACCGAACCGGAAGCTAAAAAATCTAAAACCGCTGCTAAAAAAAACGACA"
    "AAGAAGCTGCTGGTGAA"
)

# 실제 사용할 길이 (10~30bp에서 고를 것)
LINKER_LEN = 30
linker_dna = linker_dna_full[:LINKER_LEN]

linker_rna = dna_to_rna(linker_dna)

print("Linker DNA len:", len(linker_dna))
print("Linker RNA len:", len(linker_rna))
print("Linker RNA:", linker_rna)


Linker DNA len: 30
Linker RNA len: 30
Linker RNA: CCGAAACGUGGUAAAAAAGGUGCUGUUGCU


# 2. SD 후보 평가 함수 (Filter 1–3 포함)

In [19]:
def evaluate_SD(O_ASD, UTR50, linker_rna, cds_rna):
    AU = random_AU10()
    SD = random_SD6()
    sp = random_spacer6()

    o_sd = SD
    ASD  = O_ASD

    full = build_full_mrna(UTR50, AU, SD, sp, linker_rna, cds_rna)

    hp = mfe_hairpin(full[40:80])  # SD 주변 window

    E_orth   = duplex(o_sd, ASD)
    E_cross1 = duplex(WT_ASD, o_sd)
    E_cross2 = duplex(ASD, WT_SD)
    combined = E_orth - hp

    # Filter 1 — orthogonality
    if E_orth > -8: return None
    if E_cross1 < -4: return None
    if E_cross2 < -4: return None

    # Filter 2 — hairpin
    if hp <= -8: return None
    if hp <= -5: return None

    # Filter 3 — combined
    if combined > -4: return None

    return {
        "AU": AU,
        "SD": SD,
        "sp": sp,
        "ASD": ASD,
        "E_orth": E_orth,
        "E_cross1": E_cross1,
        "E_cross2": E_cross2,
        "hairpin": hp,
        "combined": combined
    }


## 멀티프로세싱 SD 스크리닝

In [20]:
def scan_SD_for_ASD(O_ASD, UTR50, linker_rna, cds_rna, n_samples=300000, workers=6):
    args = [(O_ASD, UTR50, linker_rna, cds_rna)] * n_samples
    with Pool(workers) as p:
        res = p.starmap(evaluate_SD, args)
    return [r for r in res if r]


# 3. 전체 SD 후보 30~100개 얻기

In [22]:
UTR50 = "TATTATCCCCATAAGATTAGCGGATCCTACCTGACGCTTTTTATCGCAAC"

all_SD = []
for ASD in top10_OASD:
    sd_list = scan_SD_for_ASD(ASD["ASD"], UTR50, linker_rna, cds_rna, n_samples=300000, workers=6)
    all_SD.extend(sd_list)

top_SD_candidates = sorted(all_SD, key=lambda x: x["combined"])[:100]


In [23]:
# 최종 실험 후보 5~10개 선택

final_SD = top_SD_candidates[:10]

for i, f in enumerate(final_SD):
    print(f"\n=== Final SD Candidate #{i+1} ===")
    for k, v in f.items():
        print(k, ":", v)



=== Final SD Candidate #1 ===
AU : UAUUAUAUAA
SD : GCUAGC
sp : CCGACC
ASD : UUGCUAGCU
E_orth : -9.4
E_cross1 : 1.3
E_cross2 : -0.5
hairpin : 0.0
combined : -9.4

=== Final SD Candidate #2 ===
AU : AAUAAAAAUA
SD : GCCAUG
sp : CCUAUA
ASD : UCAUGGCAA
E_orth : -9.7
E_cross1 : 2.2
E_cross2 : 0.0
hairpin : -0.30000001192092896
combined : -9.39999998807907

=== Final SD Candidate #3 ===
AU : UAUAAUAAAU
SD : GCCAUG
sp : UCAAUA
ASD : UCAUGGCAA
E_orth : -9.7
E_cross1 : 2.2
E_cross2 : 0.0
hairpin : -0.30000001192092896
combined : -9.39999998807907

=== Final SD Candidate #4 ===
AU : AAAAAUAAUA
SD : GGAUGC
sp : UUCUUU
ASD : AGCAUCCAA
E_orth : -10.1
E_cross1 : -3.1
E_cross2 : -3.4
hairpin : -0.8999999761581421
combined : -9.200000023841858

=== Final SD Candidate #5 ===
AU : AUUAAUUUAA
SD : GCCAUG
sp : ACCUUU
ASD : UCAUGGCAA
E_orth : -9.7
E_cross1 : 2.2
E_cross2 : 0.0
hairpin : -0.5
combined : -9.2

=== Final SD Candidate #6 ===
AU : UAUAAAAUAA
SD : GCCAUG
sp : AAUGAU
ASD : UCAUGGCAA
E_orth : -9.7

## 최종 mRNA 길이

In [24]:
print("CDS RNA length:", len(cds_rna))


CDS RNA length: 1182


In [25]:
print("Linker length:", len(linker_rna))

Linker length: 30


In [26]:
Full_length = 50 + 10 + 6 + 6 + 3 + len(linker_rna) + len(cds_rna)
Full_length


1287