<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/2%EC%B0%A8%EA%B5%AC%EC%A1%B0%2C_%ED%97%A4%EC%96%B4%ED%95%80%EC%9D%80_%EA%B3%A0%EB%A0%A4%EC%95%88%ED%95%B4%EB%B4%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# ============================
# Cell 1. WT 분석 (SD–ASD ΔG 분포)
#  - MG1655 genome + GFF 필요
#  - 결과: WT_ASD_RNA, DG_MIN, DG_MAX, CROSS_CUTOFF, WT_SD_SUBSET
# ============================

!apt-get -y install viennarna > /dev/null 2>&1
!pip install -q biopython

import RNA
from Bio import SeqIO
import numpy as np
import random

genome_fasta_path = "/content/GCF_000005845.2_ASM584v2_genomic.fna"
gff_path          = "/content/genomic.gff"

def dna_to_rna(seq: str) -> str:
    """Return *seq* upper-cased with every T rewritten as U."""
    upper_seq = seq.upper()
    return upper_seq.translate(str.maketrans("T", "U"))

def revcomp_dna(seq: str) -> str:
    """Return the reverse complement of a DNA string, keeping letter case.

    Only A/C/G/T (upper or lower case) are complemented; any other
    character is carried through unchanged and merely reversed in position.
    """
    pairing = {
        "A": "T", "C": "G", "G": "C", "T": "A",
        "a": "t", "c": "g", "g": "c", "t": "a",
    }
    return "".join(pairing.get(base, base) for base in reversed(seq))

def is_sd_candidate(motif: str) -> bool:
    """Decide whether *motif* qualifies as a Shine-Dalgarno candidate.

    The motif is normalised to upper-case DNA (U -> T) and accepted only if
    all of the following hold:
      * its length is 4, 5 or 6;
      * it contains at least int(0.6 * length) purines (A or G);
      * it contains one of the cores "GG", "AGG" or "GAG".
    """
    dna = motif.upper().replace("U", "T")
    size = len(dna)
    if not 4 <= size <= 6:
        return False

    purine_count = sum(dna.count(base) for base in "AG")
    if purine_count < int(0.6 * size):
        return False

    return any(core in dna for core in ("GG", "AGG", "GAG"))

def calc_duplex_energy(asd_rna: str, sd_dna: str) -> float:
    """Return the `energy` of RNA.duplexfold for an ASD (RNA) and an SD motif.

    The SD motif is given as DNA and is converted with dna_to_rna before
    being handed to ViennaRNA. Callers in this file report the value in
    kcal/mol.
    """
    return RNA.duplexfold(asd_rna, dna_to_rna(sd_dna)).energy

def load_genome_and_features():
    """Load the genome FASTA and the GFF annotation named by the module paths.

    Reads `genome_fasta_path` and `gff_path` (module-level variables).

    Returns:
        (genome_dict, features) where genome_dict maps each FASTA record id
        to its upper-cased sequence string, and features is a list of
        (seqid, ftype, start, end, strand, attrs) tuples with start/end
        converted to int exactly as written in the GFF.
    """
    genome_dict = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(genome_fasta_path, "fasta")
    }

    features = []
    with open(gff_path, "r") as handle:
        for raw_line in handle:
            # Comment/directive lines and blank lines carry no feature.
            if raw_line.startswith("#") or not raw_line.strip():
                continue
            fields = raw_line.rstrip().split("\t")
            # Anything that is not a 9-column feature row is ignored.
            if len(fields) != 9:
                continue
            seqid, _source, ftype, start, end, _score, strand, _phase, attrs = fields
            features.append((seqid, ftype, int(start), int(end), strand, attrs))
    return genome_dict, features

def extract_wt_asd(genome_dict, features):
    """Return the last 10 nt of the first annotated 16S rRNA, as RNA.

    A feature qualifies when its type is "rRNA", its attribute string
    contains "16S ribosomal RNA", and its seqid is present in genome_dict.
    Minus-strand genes are reverse-complemented before the tail is taken.

    Raises:
        RuntimeError: if no qualifying 16S rRNA feature exists.
    """
    for seqid, ftype, start, end, strand, attrs in features:
        is_16s = ftype == "rRNA" and "16S ribosomal RNA" in attrs
        if not is_16s or seqid not in genome_dict:
            continue
        # GFF coordinates are used as 1-based inclusive: slice [start-1:end].
        gene_dna = genome_dict[seqid][start - 1:end]
        if strand == "-":
            gene_dna = revcomp_dna(gene_dna)
        # Only the first qualifying gene is ever used, so stop at it.
        return dna_to_rna(gene_dna[-10:])   # RNA
    raise RuntimeError("16S rRNA에서 ASD 후보를 찾지 못했습니다.")

def get_upstream_20(genome_dict, seqid, start, end, strand, window=20):
    """Return the `window` bases immediately upstream of a feature, 5'->3'.

    `start`/`end` are treated as 1-based inclusive coordinates. For "+" the
    bases just before `start` are returned; for "-" the bases just after
    `end` are taken and reverse-complemented. The result is upper-cased.

    Returns None when the seqid is unknown, the strand is neither "+" nor
    "-", or the full window does not fit inside the sequence.
    """
    chrom = genome_dict.get(seqid)
    if chrom is None or strand not in ("+", "-"):
        return None

    if strand == "+":
        gene_start = start - 1              # 0-based index of the first base
        if gene_start < window:
            return None
        return chrom[gene_start - window:gene_start].upper()

    flank_begin = end                       # 0-based index just past the last base
    if flank_begin + window > len(chrom):
        return None
    return revcomp_dna(chrom[flank_begin:flank_begin + window]).upper()

def find_wt_sd_and_energies(genome_dict, features, WT_ASD_RNA):
    """Find the strongest SD-like motif upstream of every CDS and its ΔG.

    For each "CDS" feature the 20 nt upstream window is scanned for all
    4-, 5- and 6-mers accepted by is_sd_candidate. Each candidate is scored
    against WT_ASD_RNA with calc_duplex_energy and the motif with the lowest
    ΔG is kept (on ties the motif met first wins; 4-mers are scanned before
    5-mers before 6-mers). CDSs without a usable window or without any
    candidate motif contribute nothing.

    Args:
        genome_dict: mapping seqid -> sequence string.
        features: iterable of (seqid, ftype, start, end, strand, attrs).
        WT_ASD_RNA: anti-SD sequence (RNA) used as the duplex partner.

    Returns:
        (energies, wt_sd_list): a float numpy array of the best ΔG per CDS
        and the list of the corresponding motifs, in feature order.

    Raises:
        RuntimeError: if no CDS yields any SD candidate.
    """
    energies = []
    wt_sd_list = []

    # WT_ASD_RNA is fixed for the whole call and there are at most
    # 4**4 + 4**5 + 4**6 distinct ACGT motifs, yet the same motifs recur in
    # the windows of many CDSs. Cache ΔG per motif so each distinct motif
    # is folded only once (assumes duplexfold is deterministic for
    # identical inputs, so results are unchanged).
    dG_by_motif = {}

    for seqid, ftype, start, end, strand, attrs in features:
        if ftype != "CDS":
            continue
        upstream = get_upstream_20(genome_dict, seqid, start, end, strand, 20)
        if upstream is None:
            continue
        window = upstream.upper()
        best_motif = None
        best_dG = None
        for k in (4, 5, 6):
            for i in range(len(window) - k + 1):
                m = window[i:i + k]
                if not is_sd_candidate(m):
                    continue
                if m not in dG_by_motif:
                    dG_by_motif[m] = calc_duplex_energy(WT_ASD_RNA, m)
                dG = dG_by_motif[m]
                # Strict "<" keeps the first motif seen among equal energies.
                if (best_dG is None) or (dG < best_dG):
                    best_dG = dG
                    best_motif = m
        if best_motif is not None:
            energies.append(best_dG)
            wt_sd_list.append(best_motif)

    if not energies:
        raise RuntimeError("어떤 CDS에서도 SD 후보를 찾지 못했습니다.")
    return np.array(energies, float), wt_sd_list

# ---- Run ----
genome_dict, features = load_genome_and_features()
print(f"[INFO] Loaded chromosomes: {list(genome_dict.keys())}")
print(f"[INFO] Total features in GFF: {len(features)}")

WT_ASD_RNA = extract_wt_asd(genome_dict, features)
print(f"[INFO] WT_ASD_RNA (16S 3' tail) = {WT_ASD_RNA}")

energies, WT_SD_DNA_LIST = find_wt_sd_and_energies(genome_dict, features, WT_ASD_RNA)

# Population mean / standard deviation (ddof=0) of the per-CDS best ΔG values.
mean_dG = float(np.mean(energies))
std_dG = float(np.std(energies))

# Target window: WT mean ± 0.4 (this is where the [-6.110, -5.310] range
# shown in the recorded output comes from).
DG_MIN, DG_MAX = mean_dG - 0.400, mean_dG + 0.400

# Orthogonality criterion: only binding at least 1σ weaker than the WT mean
# is allowed.
CROSS_CUTOFF = mean_dG + std_dG

print("\n".join([
    "\n=== WT SD–WT ASD ΔG 통계 ===",
    f"Count : {len(energies)}",
    f"Mean  : {mean_dG:.3f} kcal/mol",
    f"Std   : {std_dG:.3f} kcal/mol",
    f"TARGET_DG range   : [{DG_MIN:.3f}, {DG_MAX:.3f}]",
    f"CROSS_CUTOFF (weak): ΔG >= {CROSS_CUTOFF:.3f} kcal/mol",
]))

# Subset of WT SD motifs used for the orthogonality test (keeps the
# downstream computation small). The sample is drawn from the unseeded
# global RNG, so it differs between runs.
WT_SD_SUBSET_SIZE = 300
WT_SD_SUBSET = (
    random.sample(WT_SD_DNA_LIST, WT_SD_SUBSET_SIZE)
    if len(WT_SD_DNA_LIST) > WT_SD_SUBSET_SIZE
    else list(WT_SD_DNA_LIST)
)

print(f"[INFO] WT_SD_SUBSET size for orthogonality check: {len(WT_SD_SUBSET)}")


[INFO] Loaded chromosomes: ['NC_000913.3']
[INFO] Total features in GFF: 9523
[INFO] WT_ASD_RNA (16S 3' tail) = CACCUCCUUA

=== WT SD–WT ASD ΔG 통계 ===
Count : 4101
Mean  : -5.710 kcal/mol
Std   : 1.968 kcal/mol
TARGET_DG range   : [-6.110, -5.310]
CROSS_CUTOFF (weak): ΔG >= -3.742 kcal/mol
[INFO] WT_SD_SUBSET size for orthogonality check: 300


In [15]:
# ============================
# Cell 2. O-SD 설계 (O-ASD 자동 생성 + 직교성 + 멀티프로세싱)
#  - Cell 1에서 WT_ASD_RNA, DG_MIN/DG_MAX, CROSS_CUTOFF, WT_SD_SUBSET 이미 계산됨
# ============================

import RNA, random, math
import multiprocessing as mp

# --- 공통 유틸 재사용 ---

def dna_to_rna(seq: str) -> str:
    """Upper-case *seq* and spell it as RNA by turning every T into U."""
    pieces_between_t = seq.upper().split("T")
    return "U".join(pieces_between_t)

def rna_revcomp(seq: str) -> str:
    """Return the reverse complement of an RNA string, keeping letter case.

    Only A/U/C/G (upper or lower case) are complemented; every other
    character (including T) passes through unchanged, just reversed.
    """
    pairing = dict(zip("AUCGaucg", "UAGCuagc"))
    return "".join(pairing.get(nt, nt) for nt in reversed(seq))

def calc_duplex_energy(asd_rna: str, sd_dna: str) -> float:
    """Fold an ASD (RNA) against an SD motif (DNA) and return the duplex energy.

    The SD motif is converted to RNA with dna_to_rna, then both strands go
    to RNA.duplexfold; the `energy` attribute of the result is returned.
    """
    folded = RNA.duplexfold(asd_rna, dna_to_rna(sd_dna))
    return folded.energy

def random_AU(n: int) -> str:
    """Return *n* bases drawn uniformly from "A"/"T" (DNA spelling of A/U).

    Uses the module-level `random` generator, one random.choice per base.
    """
    letters = []
    for _ in range(n):
        letters.append(random.choice("AT"))
    return "".join(letters)

def random_nt(n: int) -> str:
    """Return *n* DNA bases drawn uniformly from "ATGC".

    Uses the module-level `random` generator, one random.choice per base.
    """
    letters = []
    for _ in range(n):
        letters.append(random.choice("ATGC"))
    return "".join(letters)

# --- 설계에 사용할 고정 시퀀스 (DNA) ---

UTR_50 = "ATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCC"

EMPTY_ORF = "CTGCTGGGTGAGCTTTCTCCGTAAACTTAAAGGAAAAGATTCCGTTGAAAGATTCAAAGCTATCGTTCAGCGTATACAAGAGACTTCCTCCTGAGACTCGTGTTCCCGTACCGAACTCT"
CODING_SEQ = "ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGATGGCAGGTTTCACCGCCGGTAATGAAAAAGGCGAACTGGTGGTGCTTGGACGCAACGGTTCCGACTACTCTGCTGCGGTGCTGGCTGCCTGTTTACGCGCCGATTGTTGCGAGATTTGGACGGACGTTGACGGGGTCTATACCTGCGACCCGCGTCAGGTGCCCGATGCGAGGTTGTTGAAGTCGATGTCCTACCAGGAAGCGATGGAGCTTTCCTACTTCGGCGCTAAAGTTCTTCACCCCCGCACCATTACCCCCATCGCCCAGTTCCAGATCCCTTGCCTGATTAAAAATACCGGAAATCCTCAAGCACCAGGTACGCTCATTGGTGCCAGCCGTGATGAAGACGAATTACCGGTCAAGGGCATTTCCAATCTGAATAACATGGCAATGTTCAGCGTTTCTGGTCCGGGGATGAAAGGGATGGTCGGCATGGCGGCGCGCGTCTTTGCAGCGATGTCACGCGCCCGTATTTCCGTGGTGCTGATTACGCAATCATCTTCCGAATACAGCATCAGTTTCTGCGTTCCACAAAGCGACTGTGTGCGAGCTGAACGGGCAATGCAGGAAGAGTTCTACCTGGAACTGAAAGAAGGCTTACTGGAGCCGCTGGCAGTGACGGAACGGCTGGCCATTATCTCGGTGGTAGGTGATGGTATGCGCACCTTGCGTGGGATCTCGGCGAAATTCTTTGCCGCACTGGCCCGCGCCAATATCAACATTGTCGCCATTGCTCAGGGATCTTCTGAACGCTCAATCTCTGTCGTGGTAAATAACGATGATGCGACCACTGGCGTGCGCGTTACTCATCAGATGCTGTTCAATACCGATCAGGTTATCGAAGTGTTTGTGATTGGCGTCGGTGGCGTTGGCGGTGCGCTGCTGGAGCAACTGAAGCGTCAGCAAAGCTGGCTGAAGAATAAACATATCGACTTACGTGTCTGCGGTGTTGCCAACTCGAAGGCTCTGCTCACCAATGTACATGGCCTTAATCTGGAAAACTGGCAGGAAGAACTGGCGCAAGCCAAAGAGCCGTTTAATCTCGGGCGCTTAATTCGCCTCGTGAAAGAATATCATCTGCTGAACCCGGTCATTGTTGACTGCACTTCCAGCCAGGCAGTGGCGGATCAATATGCCGACTTCCTGCGCGAAGGTTTCCACGTTGTCACGCCGAACAAAAAGGCCAACACCTCGTCGATGGATTACTACCATCAGTTGCGTTATGCGGCGGAAAAATCGCGGCGTAAATTCCTCTATGACACCAACGTTGGGGCTGGATTACCGGTTATTGAGAACCTGCAAAATCTGCTCAATGCAGGTGATGAATTGATGAAGTTCTCCGGCATTCTTTCTGGTTCGCTTTCTTATATCTTCGGCAAGTTAGACGAAGGCATGAGTTTCTCCGAGGCGACCACGCTGGCGCGGGAA
ATGGGTTATACCGAACCGGACCCGCGAGATGATCTTTCTGGTATGGATGTGGCGCGTAAACTATTGATTCTCGCTCGTGAAACGGGACGTGAACTGGAGCTGGCGGATATTGAAATTGAACCTGTGCTGCCCGCAGAGTTTAACGCCGAGGGTGATGTTGCCGCTTTTATGGCGAATCTGTCACAACTCGACGATCTCTTTGCCGCGCGCGTGGCGAAGGCCCGTGATGAAGGAAAAGTTTTGCGCTATGTTGGCAATATTGATGAAGATGGCGTCTGCCGCGTGAAGATTGCCGAAGTGGATGGTAATGATCCGCTGTTCAAAGTGAAAAATGGCGAAAACGCCCTGGCCTTCTATAGCCACTATTATCAGCCGCTGCCGTTGGTACTGCGCGGATATGGTGCGGGCAATGACGTTACAGCTGCCGGTGTCTTTGCTGATCTGCTACGTACCCTCTCATGGAAGTTAGGAGTCTGA"

# Values from the WT analysis (uses what Cell 1 computed)
print("[INFO] WT_ASD_RNA =", WT_ASD_RNA)
print("[INFO] DG_MIN, DG_MAX =", DG_MIN, DG_MAX)
print("[INFO] CROSS_CUTOFF =", CROSS_CUTOFF)

# ---- 후보 1개 생성 + 평가 ----

def make_and_score_candidate(seed=None):
    """Draw one random O-SD design and compute its three duplex energies.

    A design is a 10-nt A/T tract, a 6-nt SD and a 6-nt spacer (all DNA).
    The orthogonal ASD is the RNA reverse complement of the SD.

    Args:
        seed: if not None, the global `random` generator is re-seeded with
            it before drawing.

    Returns:
        dict with the drawn parts ("AU_DNA", "SD_DNA", "spacer_DNA"), the
        assembled construct ("full_dna") and the energies:
          * "E_orth": O-ASD vs O-SD (should fall in the target range);
          * "E_cross_WTASD_OSD": WT ASD vs O-SD (orthogonality check 1);
          * "E_cross_OASD_WTSD_min": lowest ΔG of O-ASD against any motif in
            WT_SD_SUBSET (orthogonality check 2); math.inf if the subset is
            empty.
    """
    if seed is not None:
        random.seed(seed)

    # Draw order (AU tract, SD, spacer) fixes how the RNG stream is consumed.
    au_dna, sd_dna, spacer_dna = random_AU(10), random_nt(6), random_nt(6)

    # The O-ASD is the reverse complement of the O-SD (on the RNA level).
    ortho_asd_rna = rna_revcomp(dna_to_rna(sd_dna))

    construct_parts = (UTR_50, au_dna, sd_dna, spacer_dna, "ATG", EMPTY_ORF, CODING_SEQ)

    return {
        "AU_DNA": au_dna,
        "SD_DNA": sd_dna,
        "spacer_DNA": spacer_dna,
        "full_dna": "".join(construct_parts),
        # (1) O-ASD : O-SD binding strength.
        "E_orth": calc_duplex_energy(ortho_asd_rna, sd_dna),
        # (2) WT-ASD : O-SD; too strong a binding means the design is rejected later.
        "E_cross_WTASD_OSD": calc_duplex_energy(WT_ASD_RNA, sd_dna),
        # (3) O-ASD : WT SDs; the strongest (lowest) ΔG over the subset.
        "E_cross_OASD_WTSD_min": min(
            (calc_duplex_energy(ortho_asd_rna, wt_sd) for wt_sd in WT_SD_SUBSET),
            default=math.inf,
        ),
    }

def passes_filters(cand):
    """Return True when a scored candidate satisfies all three design filters.

    Filters (using the module-level DG_MIN, DG_MAX and CROSS_CUTOFF):
      1. cand["E_orth"] lies inside [DG_MIN, DG_MAX] (inclusive);
      2. cand["E_cross_WTASD_OSD"] is not below CROSS_CUTOFF;
      3. cand["E_cross_OASD_WTSD_min"] is not below CROSS_CUTOFF.
    """
    in_target_range = DG_MIN <= cand["E_orth"] <= DG_MAX
    if not in_target_range:
        return False

    # Both cross-reactivity energies must be weak, i.e. not below the cutoff.
    cross_keys = ("E_cross_WTASD_OSD", "E_cross_OASD_WTSD_min")
    return not any(cand[key] < CROSS_CUTOFF for key in cross_keys)

# ---- 멀티프로세싱 worker ----

def worker_generate(n_samples: int, seed_offset: int):
    """Generate *n_samples* random candidates and return those passing the filters.

    The global `random` generator is seeded once with *seed_offset*; the
    candidates are then drawn one after another with make_and_score_candidate
    and kept, in draw order, when passes_filters accepts them.
    """
    random.seed(seed_offset)
    drawn = (make_and_score_candidate() for _ in range(n_samples))
    return [cand for cand in drawn if passes_filters(cand)]

# ---- Parallel run ----

TOTAL_SAMPLES      = 200000   # total number of attempts (raise it if you want more)
N_PROCESSES        = max(1, mp.cpu_count() - 1)
# Integer division: any remainder is dropped, hence the "~" in the report below.
SAMPLES_PER_WORKER = TOTAL_SAMPLES // N_PROCESSES

print(f"[INFO] Using {N_PROCESSES} processes, {SAMPLES_PER_WORKER} samples/worker")

if __name__ == "__main__":
    # One (sample count, seed) job per worker; seeds are spaced 10007 apart.
    worker_jobs = [
        (SAMPLES_PER_WORKER, worker_idx * 10007)
        for worker_idx in range(N_PROCESSES)
    ]
    with mp.Pool(processes=N_PROCESSES) as pool:
        results = pool.starmap(worker_generate, worker_jobs)

    # Flatten the per-worker lists, keeping worker order.
    candidates = []
    for accepted in results:
        candidates.extend(accepted)

    print(f"\n[INFO] 총 시도 샘플 수   : ~{TOTAL_SAMPLES}")
    print(f"[INFO] 조건 통과 후보 수 : {len(candidates)}")

    # Rank by how close E_orth is to the centre of the target range
    # (which is the WT mean).
    TARGET_DG_MEAN = (DG_MIN + DG_MAX) / 2.0
    candidates_sorted = sorted(
        candidates,
        key=lambda cand: abs(cand["E_orth"] - TARGET_DG_MEAN),
    )

    TOP_K = min(10, len(candidates_sorted))
    print(f"\n[TOP {TOP_K} 후보 요약]  (RNA 서열 기준 출력)")

    for idx, c in enumerate(candidates_sorted[:TOP_K], 1):
        # Sequences are stored as DNA; report them as RNA.
        AU_RNA     = dna_to_rna(c["AU_DNA"])
        SD_RNA     = dna_to_rna(c["SD_DNA"])
        spacer_RNA = dna_to_rna(c["spacer_DNA"])
        sd_rna     = SD_RNA
        ortho_asd  = rna_revcomp(sd_rna)

        print(f"\n--- Candidate #{idx} ---")
        for label, value in (
            ("AU (RNA, 10nt)             :", AU_RNA),
            ("SD (RNA, 6nt)              :", SD_RNA),
            ("spacer (RNA, 6nt)          :", spacer_RNA),
            ("O-ASD (RNA, revcomp of SD) :", ortho_asd),
        ):
            print(label, value)
        for template, key in (
            ("E_orth (O-ASD:O-SD)        : {:.2f} kcal/mol", "E_orth"),
            ("E_cross WT-ASD:O-SD        : {:.2f} kcal/mol", "E_cross_WTASD_OSD"),
            ("E_cross O-ASD:WT-SD_min    : {:.2f} kcal/mol", "E_cross_OASD_WTSD_min"),
        ):
            print(template.format(c[key]))


[INFO] WT_ASD_RNA = CACCUCCUUA
[INFO] DG_MIN, DG_MAX = -6.110290173128505 -5.310290173128505
[INFO] CROSS_CUTOFF = -3.7424900198527995
[INFO] Using 1 processes, 200000 samples/worker

[INFO] 총 시도 샘플 수   : ~200000
[INFO] 조건 통과 후보 수 : 16455

[TOP 10 후보 요약]  (RNA 서열 기준 출력)

--- Candidate #1 ---
AU (RNA, 10nt)             : UUUAAAAUAU
SD (RNA, 6nt)              : ACGUAC
spacer (RNA, 6nt)          : GUUAUA
O-ASD (RNA, revcomp of SD) : GUACGU
E_orth (O-ASD:O-SD)        : -5.70 kcal/mol
E_cross WT-ASD:O-SD        : 0.50 kcal/mol
E_cross O-ASD:WT-SD_min    : -2.60 kcal/mol

--- Candidate #2 ---
AU (RNA, 10nt)             : UUUUAAAAUA
SD (RNA, 6nt)              : ACGUAC
spacer (RNA, 6nt)          : ACGAUG
O-ASD (RNA, revcomp of SD) : GUACGU
E_orth (O-ASD:O-SD)        : -5.70 kcal/mol
E_cross WT-ASD:O-SD        : 0.50 kcal/mol
E_cross O-ASD:WT-SD_min    : -2.60 kcal/mol

--- Candidate #3 ---
AU (RNA, 10nt)             : UUUAAUAAAA
SD (RNA, 6nt)              : GUUCAC
spacer (RNA, 6nt)          : 