<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/200%2C000%EA%B0%9C_%EC%83%98%ED%94%8C_Base_line.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# ============================
# Cell 1. Wild-type (WT) analysis: distribution of SD-ASD duplex dG
#  - Requires the MG1655 genome FASTA and its GFF annotation
#  - Produces: WT_ASD_RNA, DG_MIN, DG_MAX, CROSS_CUTOFF, WT_SD_SUBSET
# ============================

!pip install -q ViennaRNA
!pip install -q biopython

import RNA
from Bio import SeqIO
import numpy as np
import random

# Input files read by load_genome_and_features(): the genome FASTA and the
# GFF annotation for the same assembly (Colab-style /content paths).
genome_fasta_path = "/content/GCF_000005845.2_ASM584v2_genomic.fna"
gff_path          = "/content/genomic.gff"

def dna_to_rna(seq: str) -> str:
    """Return *seq* upper-cased with every T written as U."""
    uppercased = seq.upper()
    return uppercased.translate(str.maketrans("T", "U"))

def revcomp_dna(seq: str) -> str:
    """Return the reverse complement of a DNA string.

    Letter case is preserved. Characters other than A/C/G/T (either case)
    are kept unchanged in their reversed position.
    """
    partner = {
        "A": "T", "C": "G", "G": "C", "T": "A",
        "a": "t", "c": "g", "g": "c", "t": "a",
    }
    return "".join(partner.get(base, base) for base in reversed(seq))

def is_sd_candidate(motif: str) -> bool:
    """Decide whether *motif* qualifies as a Shine-Dalgarno candidate.

    The motif is normalised to upper-case DNA (U -> T) and accepted only if
    all of the following hold:

    * its length is 4, 5 or 6;
    * it contains at least ``int(0.6 * length)`` purines (A or G); because
      the product is truncated, the minimum is 2, 3 and 3 for lengths
      4, 5 and 6;
    * it contains one of the core patterns ``GG``, ``AGG`` or ``GAG``.
    """
    dna = motif.upper().replace("U", "T")
    size = len(dna)
    if not 4 <= size <= 6:
        return False

    purine_count = sum(1 for base in dna if base in "AG")
    if purine_count < int(0.6 * size):
        return False

    # "AGG" always implies "GG"; it is kept so the accepted cores are listed
    # exactly as in the original rule set.
    return any(core in dna for core in ("GG", "AGG", "GAG"))

def calc_duplex_energy(asd_rna: str, sd_dna: str) -> float:
    """Return the ViennaRNA ``duplexfold`` energy of an ASD/SD pair.

    Args:
        asd_rna: anti-SD strand, passed to ``RNA.duplexfold`` unchanged.
        sd_dna: SD motif in the DNA alphabet; it is converted with
            ``dna_to_rna`` before folding.
    """
    return RNA.duplexfold(asd_rna, dna_to_rna(sd_dna)).energy

def load_genome_and_features():
    """Load the genome FASTA and GFF annotation named by the module-level paths.

    Returns:
        A ``(genome_dict, features)`` pair. ``genome_dict`` maps each FASTA
        record id to its upper-cased sequence string. ``features`` holds one
        ``(seqid, ftype, start, end, strand, attrs)`` tuple per 9-column GFF
        row, with ``start`` and ``end`` converted to ``int``.
    """
    genome_dict = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(genome_fasta_path, "fasta")
    }

    features = []
    with open(gff_path, "r") as handle:
        for raw in handle:
            # Skip '#' comment/directive lines and blank lines.
            if raw.startswith("#") or not raw.strip():
                continue
            fields = raw.rstrip().split("\t")
            # Anything that is not a full 9-column GFF row is ignored.
            if len(fields) != 9:
                continue
            # GFF columns: 0 seqid, 2 type, 3 start, 4 end, 6 strand,
            # 8 attributes. Source, score and phase are not used.
            seqid, ftype, start, end = fields[0], fields[2], fields[3], fields[4]
            strand, attrs = fields[6], fields[8]
            features.append((seqid, ftype, int(start), int(end), strand, attrs))
    return genome_dict, features

def extract_wt_asd(genome_dict, features):
    """Return the last 10 nt of the first annotated 16S rRNA, as RNA.

    Features are scanned in order. The first one of type ``rRNA`` whose
    attribute string mentions "16S ribosomal RNA" and whose sequence id is
    present in *genome_dict* supplies the answer. Minus-strand genes are
    reverse-complemented first, so the tail is taken in transcript
    orientation.

    Raises:
        RuntimeError: if no such feature exists.
    """
    for seqid, ftype, start, end, strand, attrs in features:
        if ftype != "rRNA" or "16S ribosomal RNA" not in attrs:
            continue
        if seqid not in genome_dict:
            continue

        # GFF coordinates are 1-based and inclusive.
        gene = genome_dict[seqid][start - 1:end]
        if strand == "-":
            gene = revcomp_dna(gene)
        # Only the first match is ever used, so return immediately.
        return dna_to_rna(gene[-10:])

    raise RuntimeError("16S rRNA에서 ASD 후보를 찾지 못했습니다.")

def get_upstream_20(genome_dict, seqid, start, end, strand, window=20):
    """Return the *window* bases immediately 5' of a feature, in its own orientation.

    Args:
        genome_dict: mapping of sequence id to sequence string.
        seqid: sequence the feature lies on.
        start, end: 1-based inclusive feature coordinates on the forward strand.
        strand: ``"+"`` or ``"-"``.
        window: number of upstream bases to return.

    Returns:
        The upper-cased upstream sequence (reverse-complemented for
        minus-strand features), or ``None`` when *seqid* is unknown, the
        strand is neither ``"+"`` nor ``"-"``, or the window would run past
        either end of the sequence.
    """
    chrom = genome_dict.get(seqid)
    if chrom is None or strand not in ("+", "-"):
        return None

    if strand == "+":
        first_base = start - 1  # 0-based index of the feature's first base
        if first_base < window:
            return None
        return chrom[first_base - window:first_base].upper()

    # Minus strand: the upstream region sits just after `end` on the forward
    # strand and has to be reverse-complemented.
    flank_begin = end  # 0-based index of the base right after the feature
    if flank_begin + window > len(chrom):
        return None
    return revcomp_dna(chrom[flank_begin:flank_begin + window]).upper()

def find_wt_sd_and_energies(genome_dict, features, WT_ASD_RNA):
    """Pick the strongest SD-like motif upstream of every CDS.

    For each ``CDS`` feature, the 20 nt upstream region is scanned for all
    4-, 5- and 6-mers that satisfy ``is_sd_candidate``. Each is scored
    against *WT_ASD_RNA* with ``calc_duplex_energy``, and the motif with the
    lowest energy is kept. On a tie the motif met first wins (shorter
    lengths first, then left to right). CDSs with no usable upstream region
    or no candidate motif are skipped.

    Returns:
        ``(energies, motifs)``: a float NumPy array of the best energies and
        the parallel list of motif strings.

    Raises:
        RuntimeError: if no CDS yields a candidate.
    """
    best_energies = []
    best_motifs = []

    for seqid, ftype, start, end, strand, _attrs in features:
        if ftype != "CDS":
            continue
        region = get_upstream_20(genome_dict, seqid, start, end, strand, 20)
        if region is None:
            continue
        region = region.upper()

        # Score every qualifying k-mer, preserving enumeration order.
        scored = []
        for size in (4, 5, 6):
            for offset in range(len(region) - size + 1):
                kmer = region[offset:offset + size]
                if is_sd_candidate(kmer):
                    scored.append((calc_duplex_energy(WT_ASD_RNA, kmer), kmer))

        if scored:
            # min() returns the first of equal minima, so the earliest-seen
            # motif wins ties.
            top_dG, top_motif = min(scored, key=lambda pair: pair[0])
            best_energies.append(top_dG)
            best_motifs.append(top_motif)

    if not best_energies:
        raise RuntimeError("어떤 CDS에서도 SD 후보를 찾지 못했습니다.")
    return np.array(best_energies, float), best_motifs

# ---- Run the WT analysis ----
# Load the genome and annotation, then report what was read.
genome_dict, features = load_genome_and_features()
print(f"[INFO] Loaded chromosomes: {list(genome_dict.keys())}")
print(f"[INFO] Total features in GFF: {len(features)}")

# WT anti-SD: last 10 nt of the first annotated 16S rRNA, as RNA.
WT_ASD_RNA = extract_wt_asd(genome_dict, features)
print(f"[INFO] WT_ASD_RNA (16S 3' tail) = {WT_ASD_RNA}")

# Best SD-like motif (and its duplex energy against the WT ASD) per CDS.
energies, WT_SD_DNA_LIST = find_wt_sd_and_energies(genome_dict, features, WT_ASD_RNA)

mean_dG = float(energies.mean())
std_dG  = float(energies.std())

# Target window for the designed pair: WT mean +/- 0.4
# (this is where the previously observed range [-6.110, -5.310] came from).
DG_MIN  = mean_dG - 0.400
DG_MAX  = mean_dG + 0.400

# Orthogonality cutoff: a cross-interaction is accepted only if its dG is at
# least one standard deviation weaker (less negative) than the WT mean.
CROSS_CUTOFF = mean_dG + std_dG

print("\n=== WT SD–WT ASD \u0394G 통계 ===")
print(f"Count : {len(energies)}")
print(f"Mean  : {mean_dG:.3f} kcal/mol")
print(f"Std   : {std_dG:.3f} kcal/mol")
print(f"TARGET_DG range   : [{DG_MIN:.3f}, {DG_MAX:.3f}]")
print(f"CROSS_CUTOFF (weak): \u0394G >= {CROSS_CUTOFF:.3f} kcal/mol")

# Subset of WT SD motifs used for the orthogonality test (keeps the
# per-candidate cost down).
# NOTE(review): random.sample draws from the global RNG and no seed is set in
# this cell, so the subset (and anything computed from it) can differ between
# runs unless a seed is set beforehand.
WT_SD_SUBSET_SIZE = 300
if len(WT_SD_DNA_LIST) > WT_SD_SUBSET_SIZE:
    WT_SD_SUBSET = random.sample(WT_SD_DNA_LIST, WT_SD_SUBSET_SIZE)
else:
    # Fewer motifs than the cap: use a copy of the whole list.
    WT_SD_SUBSET = WT_SD_DNA_LIST[:]

print(f"[INFO] WT_SD_SUBSET size for orthogonality check: {len(WT_SD_SUBSET)}")

[INFO] Loaded chromosomes: ['NC_000913.3']
[INFO] Total features in GFF: 9523
[INFO] WT_ASD_RNA (16S 3' tail) = CACCUCCUUA

=== WT SD–WT ASD ΔG 통계 ===
Count : 4101
Mean  : -5.710 kcal/mol
Std   : 1.968 kcal/mol
TARGET_DG range   : [-6.110, -5.310]
CROSS_CUTOFF (weak): ΔG >= -3.742 kcal/mol
[INFO] WT_SD_SUBSET size for orthogonality check: 300


In [10]:
# ============================================
# Cell 2. O-SD design (final version, including hairpin / folded-structure screening)
# ============================================

import RNA, random, math
import multiprocessing as mp

# --- 공통 유틸 ---
def dna_to_rna(seq: str) -> str:
    """Upper-case *seq* and spell every T as U (DNA -> RNA alphabet)."""
    # Splitting on "T" and re-joining with "U" replaces every occurrence.
    return "U".join(seq.upper().split("T"))

def rna_revcomp(seq: str) -> str:
    """Return the reverse complement of an RNA string.

    A<->U and C<->G are swapped with case preserved. Any other character is
    left as-is in its reversed position.
    """
    pairing = dict(zip("AUCGaucg", "UAGCuagc"))
    flipped = seq[::-1]
    return "".join([pairing.get(base, base) for base in flipped])

def calc_duplex_energy(asd_rna: str, sd_dna: str) -> float:
    """Fold an ASD/SD pair with ``RNA.duplexfold`` and return its energy.

    *sd_dna* is given in the DNA alphabet and converted with ``dna_to_rna``
    first; *asd_rna* is used unchanged.
    """
    hybrid = RNA.duplexfold(asd_rna, dna_to_rna(sd_dna))
    return hybrid.energy

def random_AU(n: int) -> str:
    """Return *n* bases, each drawn with ``random.choice`` from "AT".

    The result is in the DNA alphabet (T rather than U).
    """
    # map() calls random.choice("AT") once per element, in order, so the
    # global RNG is consumed one draw per base.
    return "".join(map(random.choice, ["AT"] * n))

def random_nt(n: int) -> str:
    """Return *n* DNA bases, each drawn with ``random.choice`` from "ATGC"."""
    pick = random.choice
    return "".join([pick("ATGC") for _ in range(n)])


# --- Fixed DNA sequences used to assemble every design ---
# 50-nt leader placed at the 5' end of each construct; the random AU tract,
# SD and spacer are appended after it.
UTR_50 = "ATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCC"

# Sequence inserted directly after the "ATG" start codon, ahead of CODING_SEQ.
EMPTY_ORF = "CTGCTGGGTGAGCTTTCTCCGTAAACTTAAAGGAAAAGATTCCGTTGAAAGATTCAAAGCTATCGTTCAGCGTATACAAGAGACTTCCTCCTGAGACTCGTGTTCCCGTACCGAACTCT"
# NOTE(review): this literal still contains a placeholder ("...생략...", i.e.
# "omitted") instead of the complete coding sequence, so every `full_dna`
# built from it includes non-nucleotide text. Replace it with the full
# sequence before using `full_dna` downstream.
CODING_SEQ = """ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGAT...생략...(너가 준 전체)""".replace("\n","")


# Echo the Cell 1 results that this cell depends on.
print("[INFO] WT_ASD_RNA =", WT_ASD_RNA)
print("[INFO] DG_MIN, DG_MAX =", DG_MIN, DG_MAX)
print("[INFO] CROSS_CUTOFF =", CROSS_CUTOFF)


# ============================
# 1. O-SD 후보 1개 생성 + 전체 평가
# ============================

def make_and_score_candidate(seed=None):
    """Draw one random O-SD design and score it.

    A design is a random 10-nt A/T tract, a random 6-nt SD and a random 6-nt
    spacer (drawn in that order from the global ``random`` module). The
    orthogonal anti-SD (O-ASD) is the reverse complement of the SD.

    The local-structure screen runs first: it needs a single fold, whereas
    the orthogonality scan needs one duplex fold per WT SD motif. Candidates
    that would be rejected anyway therefore never pay for the scan. Draw
    order and returned values are unaffected by this ordering.

    Args:
        seed: if not ``None``, the global ``random`` module is reseeded with
            it before drawing.

    Returns:
        ``None`` when the MFE of ``UTR_50 + AU + SD + spacer`` (folded as
        RNA) is below -3.0. Otherwise a dict with keys:

        * ``AU_DNA``, ``SD_DNA``, ``spacer_DNA`` - the random parts (DNA);
        * ``full_dna`` - ``UTR_50 + AU + SD + spacer + "ATG" + EMPTY_ORF +
          CODING_SEQ``;
        * ``E_orth`` - duplex energy O-ASD : O-SD (the design target);
        * ``E_cross1`` - duplex energy WT-ASD : O-SD;
        * ``E_cross2_min`` - lowest duplex energy O-ASD : WT-SD over
          ``WT_SD_SUBSET`` (``math.inf`` if the subset is empty);
        * ``mfe_local`` - the local-window MFE described above.
    """
    if seed is not None:
        random.seed(seed)

    au_dna = random_AU(10)
    sd_dna = random_nt(6)
    spacer_dna = random_nt(6)

    # (D) Local structure screen. Only the region around the SD matters, so
    # fold UTR50 + AU10 + SD + spacer. A very stable fold (very negative dG)
    # would bury the SD, so such candidates are rejected here.
    local_window_rna = dna_to_rna(UTR_50 + au_dna + sd_dna + spacer_dna)
    _structure, mfe_local = RNA.fold(local_window_rna)
    if mfe_local < -3.0:
        return None  # hairpin too strong -> reject candidate

    # O-ASD is derived from the SD, not drawn independently.
    ortho_asd_rna = rna_revcomp(dna_to_rna(sd_dna))

    # (A) O-ASD : O-SD binding (the primary target).
    E_orth = calc_duplex_energy(ortho_asd_rna, sd_dna)

    # (B) Orthogonality check 1: WT-ASD : O-SD.
    E_cross1 = calc_duplex_energy(WT_ASD_RNA, sd_dna)

    # (C) Orthogonality check 2: strongest O-ASD : WT-SD interaction over
    # the WT SD subset.
    E_cross2_min = min(
        (calc_duplex_energy(ortho_asd_rna, wt_sd) for wt_sd in WT_SD_SUBSET),
        default=math.inf,
    )

    # Full DNA construct for this design.
    full_dna = UTR_50 + au_dna + sd_dna + spacer_dna + "ATG" + EMPTY_ORF + CODING_SEQ

    return {
        "AU_DNA": au_dna,
        "SD_DNA": sd_dna,
        "spacer_DNA": spacer_dna,
        "full_dna": full_dna,
        "E_orth": E_orth,
        "E_cross1": E_cross1,
        "E_cross2_min": E_cross2_min,
        "mfe_local": mfe_local,
    }


# ============================
# 2. Filter 조건 정의
# ============================

def passes_filters(c):
    """Return True if candidate dict *c* satisfies every acceptance rule.

    ``None`` (a candidate already rejected upstream) never passes. Otherwise
    the candidate is rejected as soon as one of these holds, checked in
    order:

    1. ``E_orth`` lies outside ``[DG_MIN, DG_MAX]`` (target O-ASD : O-SD
       binding window);
    2. ``E_cross1`` is below ``CROSS_CUTOFF`` (WT-ASD : O-SD too strong);
    3. ``E_cross2_min`` is below ``CROSS_CUTOFF`` (O-ASD : WT-SD too strong);
    4. ``mfe_local`` is below -3.0 (local hairpin too stable).
    """
    if c is None:
        return False

    rejected = (
        not (DG_MIN <= c["E_orth"] <= DG_MAX)
        or c["E_cross1"] < CROSS_CUTOFF
        or c["E_cross2_min"] < CROSS_CUTOFF
        or c["mfe_local"] < -3.0
    )
    return not rejected


# ============================
# 3. 멀티프로세싱 worker
# ============================

def worker_generate(n_samples: int, seed_offset: int):
    """Generate *n_samples* candidates in one worker and keep those that pass.

    The global ``random`` module is seeded with *seed_offset* once, up
    front. Candidates are then drawn and filtered one at a time.

    Returns:
        The list of candidate dicts accepted by ``passes_filters``, in the
        order they were generated.
    """
    random.seed(seed_offset)
    # Lazy generator: each candidate is filtered before the next is drawn.
    drawn = (make_and_score_candidate() for _ in range(n_samples))
    return [cand for cand in drawn if passes_filters(cand)]


# ============================
# 4. 병렬 실행
# ============================

TOTAL_SAMPLES = 200000     # start with a 200,000-sample test run
# Leave one core free, but always use at least one worker.
N_PROCESSES   = max(1, mp.cpu_count() - 1)
# Integer division: any remainder is dropped, so the number of samples
# actually tried can be slightly below TOTAL_SAMPLES (hence the "~" below).
SAMPLES_PER_WORKER = TOTAL_SAMPLES // N_PROCESSES

print(f"[INFO] Using {N_PROCESSES} workers, {SAMPLES_PER_WORKER} samples per worker")

if __name__ == "__main__":
    # Each worker gets the same sample count and its own seed (i * 7777).
    with mp.Pool(N_PROCESSES) as pool:
        results = pool.starmap(worker_generate,
                               [(SAMPLES_PER_WORKER, i*7777) for i in range(N_PROCESSES)])

    # Flatten the per-worker lists of accepted candidates.
    candidates = [c for sub in results for c in sub]

    print(f"\n[INFO] 총 시도 샘플 수 : ~{TOTAL_SAMPLES}")
    print(f"[INFO] 통과한 후보 수 : {len(candidates)}")

    # Centre of the target window; candidates are ranked by how close
    # E_orth is to it.
    TARGET_DG_MEAN = (DG_MIN + DG_MAX)/2

    candidates_sorted = sorted(
        candidates,
        key=lambda c: abs(c["E_orth"] - TARGET_DG_MEAN)
    )

    # Report up to ten best-ranked candidates, shown in the RNA alphabet.
    TOP_K = min(10, len(candidates_sorted))
    print(f"\n[TOP {TOP_K} 후보 요약] (RNA 서열 기준)")
    for idx, c in enumerate(candidates_sorted[:TOP_K], 1):
        AU_RNA     = dna_to_rna(c["AU_DNA"])
        SD_RNA     = dna_to_rna(c["SD_DNA"])
        spacer_RNA = dna_to_rna(c["spacer_DNA"])
        # Recomputed here: the candidate dict does not store the O-ASD.
        O_ASD      = rna_revcomp(SD_RNA)

        print(f"\n--- Candidate #{idx} ---")
        print("AU (RNA):", AU_RNA)
        print("SD (RNA):", SD_RNA)
        print("Spacer (RNA):", spacer_RNA)
        print("O-ASD:", O_ASD)
        print("E_orth:", c["E_orth"])
        print("E_cross1:", c["E_cross1"])
        print("E_cross2_min:", c["E_cross2_min"])
        print("Hairpin ΔG:", c["mfe_local"])


[INFO] WT_ASD_RNA = CACCUCCUUA
[INFO] DG_MIN, DG_MAX = -6.110290173128505 -5.310290173128505
[INFO] CROSS_CUTOFF = -3.7424900198527995
[INFO] Using 1 workers, 200000 samples per worker

[INFO] 총 시도 샘플 수 : ~200000
[INFO] 통과한 후보 수 : 7289

[TOP 10 후보 요약] (RNA 서열 기준)

--- Candidate #1 ---
AU (RNA): UUUAAAAUAU
SD (RNA): ACGUAC
Spacer (RNA): GUUAUA
O-ASD: GUACGU
E_orth: -5.7
E_cross1: 0.5
E_cross2_min: -2.5
Hairpin ΔG: -2.569999933242798

--- Candidate #2 ---
AU (RNA): UUUUAAAAUA
SD (RNA): ACGUAC
Spacer (RNA): ACGAUG
O-ASD: GUACGU
E_orth: -5.7
E_cross1: 0.5
E_cross2_min: -2.5
Hairpin ΔG: -2.0

--- Candidate #3 ---
AU (RNA): AAAAUAUAUU
SD (RNA): CUUAGC
Spacer (RNA): CCUGGG
O-ASD: GCUAAG
E_orth: -5.7
E_cross1: 1.3
E_cross2_min: -3.7
Hairpin ΔG: -1.7000000476837158

--- Candidate #4 ---
AU (RNA): UAAAAUUAAU
SD (RNA): ACGUAC
Spacer (RNA): GCGGAA
O-ASD: GUACGU
E_orth: -5.7
E_cross1: 0.5
E_cross2_min: -2.5
Hairpin ΔG: -2.0

--- Candidate #5 ---
AU (RNA): AUAUAAUAUU
SD (RNA): CCAAUC
Spacer (RNA): A