<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/SD_design1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

UTR50 (고정 50bp)
+ AU-rich (10bp, A/U 랜덤)
+ SD (6bp 랜덤, A/G/C/T 사용)
+ Spacer (6bp 랜덤)
+ AUG (고정)
+ Unstructured peptide (10~30bp 고정 or 선택)
+ Coding sequence (E. coli CDS에서 가져온 것)

In [1]:
!apt-get install -y vienna-rna
!pip install viennarna biopython


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libgsl27 libgslcblas0 libncbi6 ncbi-data readseq
Suggested packages:
  gsl-ref-psdoc | gsl-doc-pdf | gsl-doc-info | gsl-ref-html
The following NEW packages will be installed:
  libgsl27 libgslcblas0 libncbi6 ncbi-data readseq vienna-rna
0 upgraded, 6 newly installed, 0 to remove and 41 not upgraded.
Need to get 11.1 MB of archives.
After this operation, 130 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libgslcblas0 amd64 2.7.1+dfsg-3 [94.4 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libgsl27 amd64 2.7.1+dfsg-3 [1,000 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ncbi-data all 6.1.20170106+dfsg1-9 [3,519 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libncbi6 amd64 6.1.20170106+dfsg1-9 [3,992 kB]
Get:5 http://archive.ubuntu.com/

In [2]:
import RNA
import random
import pandas as pd
from multiprocessing import Pool, cpu_count
from Bio import SeqIO
from pathlib import Path

In [46]:
import RNA

def rna_fold_py(seq):
    """
    ViennaRNA Python binding을 사용해 ΔG 계산
    """
    fc = RNA.fold_compound(seq)
    structure, mfe = fc.mfe()
    # structure: dot-bracket
    # mfe: minimum free energy (float)
    return mfe

In [47]:
def rna_duplex_py(a, b):
    """
    RNA.duplexfold() 사용한 2개 RNA 결합 ΔG 계산
    """
    duplex = RNA.duplexfold(a, b)
    return duplex.energy  # kcal/mol

# 1. E. coli coding sequence 파싱

In [49]:
from google.colab import files
uploaded = files.upload()


Saving cds_from_genomic.fna to cds_from_genomic.fna


In [50]:
CDS_FILE = list(uploaded.keys())[0]
CDS_FILE

'cds_from_genomic.fna'

In [51]:
# E. coli CDS 파일 파싱 코드

def load_cds_fasta(path):
    cds_list = []
    seq = ""

    with open(path) as f:
        for line in f:
            if line.startswith(">"):
                if seq:
                    cds_list.append(seq.replace("T","U"))
                seq = ""
            else:
                seq += line.strip()

    if seq:
        cds_list.append(seq.replace("T","U"))

    return cds_list

CDS_LIST = load_cds_fasta(CDS_FILE)
len(CDS_LIST)


4318

# 2.Unstructured peptide linker 준비 (DNA → RNA)

In [69]:
# Unstructured peptide DNA(10~30bp) 생성 함수

FULL_LINKER_DNA = (
"CCGAAACGTGGTAAAAAAGGTGCTGTTGCTGAAGACGGTGACGAACTGCG"
"TACCGAACCGGAAGCTAAAAAATCTAAAACCGCTGCTAAAAAAAACGACA"
"AAGAAGCTGCTGGTGAA"
)

def get_linker(n=20):
    dna = FULL_LINKER_DNA[:n]
    return dna.replace("T","U")



#3. ViennaRNA 실행

In [70]:
import subprocess

def rna_duplex(a, b):
    """두 RNA의 ΔG(dG) 계산"""
    cmd = f"printf '{a}\n{b}' | /usr/bin/RNAduplex"
    result = subprocess.check_output(cmd, shell=True).decode()
    dg = float(result.split("(")[-1].replace(")",""))
    return dg

def rna_fold(seq):
    """RNAfold를 이용하여 2차 구조 ΔG 계산"""
    cmd = f"printf '{seq}' | /usr/bin/RNAfold --noPS"
    out = subprocess.check_output(cmd, shell=True).decode().split("\n")
    energy = float(out[1].split("(")[-1].replace(")",""))
    return energy


In [71]:
# 랜덤서열

import random

def random_AU10():
    return "".join(random.choice("AU") for _ in range(10))

def random_sd():
    return "".join(random.choice("ATGC") for _ in range(6))

def random_spacer6():
    return "".join(random.choice("ATGC") for _ in range(6))

def random_asd():
    return "".join(random.choice("ATGC") for _ in range(6))


# 4. Full mRNA 조립 함수

In [72]:
UTR50 = "TATTATCCCCATAAGATTAGCGGATCCTACCTGACGCTTTTTATCGCAAC"  # 50bp

def build_mrna(UTR50, AU10, SD, Spacer, AUG, Linker, CDS):
    return UTR50 + AU10 + SD + Spacer + AUG + Linker + CDS


# 5. 필터링

In [73]:
WT_ASD = "ACCUCCUUA"
WT_SD  = "AGGAGG"


In [74]:
def pass_filters(dg_ortho, dg_wt_asd, dg_wt_sd_asd, dg_hairpin):
    # Filter 1
    if not (-12 < dg_wt_asd < -5):
        return False
    if dg_ortho > -8.5:
        return False
    if (dg_ortho - dg_wt_asd) < 3:
        return False

    # Filter 2
    if dg_hairpin < -8:
        return False

    # Filter 3
    combined = dg_ortho - dg_hairpin
    if combined > -4:
        return False

    return True


In [75]:
def evaluate_one(args):
    ASD, SD, idx = args

    AU10   = random_AU10()
    Spacer = random_spacer6()
    Linker = get_linker(20)
    CDS    = CDS_LIST[idx % len(CDS_LIST)]

    mRNA = build_mrna(UTR50, AU10, SD, Spacer, "AUG", Linker, CDS)

    # ΔG 계산 (ViennaRNA Python binding)
    dg_ortho     = rna_duplex_py(SD, ASD)
    dg_wt_asd    = rna_duplex_py(SD, WT_ASD)
    dg_wt_sd_asd = rna_duplex_py(WT_SD, ASD)
    dg_hairpin   = rna_fold_py(mRNA)

    if pass_filters(dg_ortho, dg_wt_asd, dg_wt_sd_asd, dg_hairpin):
        return {
            "ASD": ASD,
            "SD": SD,
            "AU10": AU10,
            "Spacer": Spacer,
            "Linker": Linker,
            "CDS_length": len(CDS),
            "E_orth": dg_ortho,
            "E_WT_ASD": dg_wt_asd,
            "E_WT_SD_ASD": dg_wt_sd_asd,
            "E_hairpin": dg_hairpin,
            "Combined": dg_ortho - dg_hairpin
        }

    return None



In [82]:
def evaluate_one_debug(args):
    ASD, SD, idx = args

    AU10   = random_AU10()
    Spacer = random_spacer6()
    Linker = get_linker(20)
    CDS    = random.choice(CDS_LIST)

    mRNA = build_mrna(UTR50, AU10, SD, Spacer, "AUG", Linker, CDS)

    # ΔG 계산
    try:
        E_orth         = rna_duplex(SD, ASD)
        E_cross1       = rna_duplex(WT_ASD, SD)
        E_cross2       = rna_duplex(ASD, WT_SD)
        sd_start = mRNA.index(SD)
        local_region = mRNA[sd_start-20 : sd_start+40]
        E_hairpin = rna_fold(local_region)

    except:
        print("ERROR running RNA tools")
        return None

    print("\n====== Candidate Check ======")
    print("SD:", SD)
    print("ASD:", ASD)
    print("E_orth         :", E_orth)
    print("E_cross1 (WT-ASD:SD):", E_cross1)
    print("E_cross2 (ASD:WT-SD):", E_cross2)
    print("E_hairpin       :", E_hairpin)

    # -------------------------
    # Filter 1
    # -------------------------
    if E_cross1 < -3:
        print("❌ Reject: WT cross-binding too strong")
        return None

    if E_orth > -8.5:
        print("❌ Reject: Ortho binding too weak")
        return None

    if (E_orth - E_cross1) < 3:
        print("❌ Reject: Ortho-WT Δ insufficient")
        return None

    # -------------------------
    # Filter 2 — hairpin
    # -------------------------
    if E_hairpin < -8:
        print("❌ Reject: Hairpin too stable")
        return None

    if E_hairpin > -2:
        print("❌ Reject: Hairpin not stable enough (rare error?)")
        return None

    # -------------------------
    # Filter 3 — Combined
    # -------------------------
    combined = E_orth - E_hairpin
    print("Combined:", combined)

    if combined > -4:
        print("❌ Reject: Combined too weak")
        return None

    print("✅ PASS")
    return True


In [77]:
from multiprocessing import Pool, cpu_count
import pandas as pd

def run_search(n_asd=20, n_sd=200, n_workers=None):
    # --------------------------------------------------------
    # 1) ASD / SD 랜덤 후보 생성
    # --------------------------------------------------------
    ASD_list = [random_asd() for _ in range(n_asd)]
    SD_list  = [random_sd() for _ in range(n_sd)]

    # 모든 조합 생성
    tasks = [(ASD, SD, j)
             for j, SD in enumerate(SD_list)
             for ASD in ASD_list]

    print("총 작업 개수:", len(tasks))

    # --------------------------------------------------------
    # 2) 사용할 코어 개수 결정
    # --------------------------------------------------------
    if n_workers is None:
        n_workers = cpu_count()   # 전체 코어 자동 사용

    print(f"사용할 코어 수: {n_workers}")

    # --------------------------------------------------------
    # 3) 멀티프로세싱 실행
    # --------------------------------------------------------
    with Pool(n_workers) as p:
        results = p.map(evaluate_one, tasks)

    # --------------------------------------------------------
    # 4) 필터 통과한 후보만 반환
    # --------------------------------------------------------
    results = [r for r in results if r is not None]
    return pd.DataFrame(results)



In [78]:
def print_top_candidates(df, k=10):
    """
    df: 결과 DataFrame
    k : 상위 k개 출력
    """

    # 가장 좋은 후보를 combined (낮을수록 strong) 기준으로 정렬
    df = df.sort_values(by="Combined")

    top = df.head(k).reset_index(drop=True)

    for idx, row in top.iterrows():
        print(f"\n=== Candidate #{idx+1} ===")
        print("AU (RNA, 10nt)             :", row["AU10"])
        print("SD (RNA, 6nt)              :", row["SD"])
        print("Spacer (RNA, 6nt)          :", row["Spacer"])

        # 자동 계산: O-ASD = revcomp(SD)
        def revcomp(seq):
            comp = {"A":"U","U":"A","G":"C","C":"G"}
            return "".join(comp[b] for b in seq[::-1])

        o_asd = revcomp(row["SD"])

        print("O-ASD (RNA, revcomp of SD) :", o_asd)

        print("E_orth (O-ASD:O-SD)        : {:.2f} kcal/mol".format(row["dG_ortho"]))
        print("E_cross WT-ASD:O-SD        : {:.2f} kcal/mol".format(row["dG_WT_ASD"]))
        print("E_cross O-ASD:WT-SD_min    : {:.2f} kcal/mol".format(row["dG_WT_SD_ASD"]))
        print("E_hairpin (full mRNA)      : {:.2f} kcal/mol".format(row["dG_hairpin"]))
        print("Combined Score             : {:.2f}".format(row["Combined"]))

    return top


In [84]:
import RNA
import random

WT_ASD = "ACCUCCUUA"

def random_asd(length=9):
    return "".join(random.choice("ACGU") for _ in range(length))

def hamming(a,b):
    return sum(x!=y for x,y in zip(a,b))

def mfe_hairpin(seq):
    fold = RNA.fold(seq)
    return fold[1]  # ΔG

def duplex_energy(a,b):
    d = RNA.duplexfold(a,b)
    return d.energy

def valid_asd(asd):
    # distance from WT
    if hamming(asd, WT_ASD) < 4:
        return False

    # hairpin constraints
    hp = mfe_hairpin(asd)
    if hp < -8: return False
    if hp <= -5: return False

    # avoid palindromes
    for k in range(3,6):
        for i in range(len(asd)-k):
            frag = asd[i:i+k]
            if frag == frag[::-1]:
                return False
    return True


def evaluate_asd(asd):
    # place a representative O-SD (reverse complement)
    o_sd = AS_to_SD(asd)

    E_orth = duplex_energy(o_sd, asd)
    E_cross1 = duplex_energy(o_sd, WT_ASD)
    E_cross2 = duplex_energy(WT_ASD, asd)
    hp = mfe_hairpin(asd)

    combined = E_orth - hp

    return {
        "ASD":asd,
        "O_SD":o_sd,
        "E_orth":E_orth,
        "E_cross1":E_cross1,
        "E_cross2":E_cross2,
        "hairpin":hp,
        "combined":combined
    }

def AS_to_SD(asd):
    comp = {"A":"U","U":"A","C":"G","G":"C"}
    return "".join(comp[x] for x in asd)[::-1]


# --- MAIN ---
candidates = []
for _ in range(2000):
    a = random_asd()
    if valid_asd(a):
        candidates.append(evaluate_asd(a))

# 정렬
top = sorted(candidates, key=lambda x: x["combined"])[:10]

for c in top:
    print("\nASD:",c["ASD"])
    print("O-SD:",c["O_SD"])
    print("E_orth:",c["E_orth"])
    print("E_cross1:",c["E_cross1"])
    print("E_cross2:",c["E_cross2"])
    print("hairpin MFE:",c["hairpin"])
    print("combined:",c["combined"])



ASD: AGGCCAGGC
O-SD: GCCUGGCCU
E_orth: -18.4
E_cross1: -1.5
E_cross2: -4.1
hairpin MFE: 0.0
combined: -18.4

ASD: GGCAGCCAC
O-SD: GUGGCUGCC
E_orth: -17.8
E_cross1: -1.5
E_cross2: -1.8
hairpin MFE: 0.0
combined: -17.8

ASD: GCAUCCGGC
O-SD: GCCGGAUGC
E_orth: -17.3
E_cross1: -3.3
E_cross2: -1.7
hairpin MFE: 0.0
combined: -17.3

ASD: GGCCUACGC
O-SD: GCGUAGGCC
E_orth: -17.3
E_cross1: -2.6
E_cross2: -1.5
hairpin MFE: 0.0
combined: -17.3

ASD: CCGGAUGCC
O-SD: GGCAUCCGG
E_orth: -17.2
E_cross1: -1.5
E_cross2: -3.3
hairpin MFE: 0.0
combined: -17.2

ASD: CGGUCCAGC
O-SD: GCUGGACCG
E_orth: -17.1
E_cross1: -3.0
E_cross2: -2.4
hairpin MFE: 0.0
combined: -17.1

ASD: AGCAGGCCA
O-SD: UGGCCUGCU
E_orth: -16.7
E_cross1: -1.5
E_cross2: -2.9
hairpin MFE: 0.0
combined: -16.7

ASD: ACUAGGCCC
O-SD: GGGCCUAGU
E_orth: -16.4
E_cross1: -2.1
E_cross2: -2.4
hairpin MFE: 0.0
combined: -16.4

ASD: GGCAAGCCG
O-SD: CGGCUUGCC
E_orth: -16.8
E_cross1: -1.7
E_cross2: -1.5
hairpin MFE: -0.5
combined: -16.3

ASD: UGCCAGCAG
O-

In [61]:
df = run_search(n_asd=4, n_sd=10, n_workers=2)

총 작업 개수: 40
사용할 코어 수: 2


In [63]:
top10 = print_top_candidates(df, k=10)

KeyError: 'Combined'

# 실제 계산 + 필터링 + 저장

In [None]:
import RNA
import pandas as pd
import random
from Bio import SeqIO
from multiprocessing import Pool, cpu_count

# =========================================================
# 1️⃣ 고정 서열 및 파라미터 설정
# =========================================================
UTR_PBAD = "AAATTTGCTGAAAGGAGGAAATAATAATGATGATGTAAAGCTTTAGGAGAT".replace("T","U")
ANTI_SD_WT = "UCCUCC"
ANTI_SD_ORTHO = "UGGAUA"
ΔG_MIN, ΔG_MAX = -12.0, -3.0

# 업로드된 E. coli K-12 CDS 파일 경로 지정
CDS_FILE = "/content/cds_from_genomic.fna"  # 실제 업로드 파일 이름으로 수정

# Intermediate/Flexible region (직접 변환한 단백질 서열 → RNA로 입력)
INTERMEDIATE_REGION = "AUGGCCUUGAAAGAUUGCUCUUUGAGGAUCCUUUGGAGGAGUUAAGUGA"  # 예시


# =========================================================
# 2️⃣ 유틸 함수
# =========================================================
def random_seq(bases, length):
    return ''.join(random.choice(bases) for _ in range(length))

def load_ecoli_cds(file_path):
    cds_list = []
    for record in SeqIO.parse(file_path, "fasta"):
        seq = str(record.seq).replace("T","U")
        cds_list.append(seq)
    return cds_list


# =========================================================
# 3️⃣ mRNA 조합 생성 함수
# =========================================================
def build_mrna(utr, sd, cds):
    au = random_seq(["A","U"], 10)
    spacer = random_seq(["A","U","G","C"], 6)
    return utr + au + sd + spacer + "AUG" + INTERMEDIATE_REGION + cds


# =========================================================
# 4️⃣ ΔG 계산 함수
# =========================================================
def compute_energy(seq, sd):
    mfe = RNA.fold(seq)[1]
    dg_wt = RNA.duplexfold(sd, ANTI_SD_WT).energy
    dg_ortho = RNA.duplexfold(sd, ANTI_SD_ORTHO).energy
    return mfe, dg_wt, dg_ortho, dg_ortho - dg_wt


# =========================================================
# 5️⃣ 병렬 처리 함수
# =========================================================
def worker(args):
    utr, sd, cds = args
    mrna = build_mrna(utr, sd, cds)
    mfe, dg_wt, dg_ortho, diff = compute_energy(mrna, sd)
    return {
        "SD_seq": sd,
        "ΔG_hairpin": mfe,
        "ΔG_WT": dg_wt,
        "ΔG_Ortho": dg_ortho,
        "Δ(Ortho-WT)": diff,
        "mRNA_seq": mrna
    }

def run_parallel(utr, sd_list, cds_list):
    combos = [(utr, sd, cds) for sd in sd_list for cds in cds_list]
    with Pool(processes=cpu_count()) as pool:
        results = pool.map(worker, combos)
    return results


# =========================================================
# 6️⃣ 필터링 함수
# =========================================================
def filter_results(df):
    return df[
        (df["ΔG_Ortho"].between(-10, -5)) &
        (df["ΔG_WT"] > -3) &
        (df["Δ(Ortho-WT)"] < -5) &
        (df["ΔG_hairpin"] > -35)
    ]


# =========================================================
# 7️⃣ 메인 파이프라인 실행
# =========================================================
def main():
    print("E. coli CDS 불러오는 중...")
    ecoli_cds_list = load_ecoli_cds(CDS_FILE)
    print(f"총 {len(ecoli_cds_list)}개의 CDS 시퀀스 로드 완료")

    print("SD 후보 시퀀스 생성 중...")
    sd_list = [random_seq(["A","U","G","C"], 6) for _ in range(500)]

    print("Multiprocessing 계산 시작...")
    results = run_parallel(UTR_PBAD, sd_list, ecoli_cds_list[:20])  # 샘플 20개만

    df = pd.DataFrame(results)
    df.to_csv("raw_binding_results.csv", index=False)
    print(f"계산 완료: raw_binding_results.csv ({len(df)}개 결과)")

    print("ΔG 조건 필터링 중...")
    final_df = filter_results(df)
    final_df.to_csv("final_mRNA_candidates.csv", index=False)
    print(f"최종 후보 {len(final_df)}개 저장 완료: final_mRNA_candidates.csv")

    # 최적 서열 자동 선택
    if len(final_df) > 0:
        best = final_df.loc[
            (final_df["ΔG_hairpin"].between(-28, -25)),
            "Δ(Ortho-WT)"
        ].idxmin()
        final_seq = final_df.loc[best]
        final_seq.to_frame().T.to_csv("final_selected_sequence.csv", index=False)
        print("\n [최종 선정 서열]")
        print(final_seq[["SD_seq","ΔG_hairpin","ΔG_WT","ΔG_Ortho","Δ(Ortho-WT)"]])
        print(f"\nFull mRNA sequence:\n{final_seq['mRNA_seq']}\n")
        print("final_selected_sequence.csv 저장 완료")


# =========================================================
# 실행
# =========================================================
if __name__ == "__main__":
    main()


E. coli CDS 불러오는 중...
총 4318개의 CDS 시퀀스 로드 완료
SD 후보 시퀀스 생성 중...
Multiprocessing 계산 시작...
