<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/143%2C360_%EC%83%98%ED%94%8C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install viennarna


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import csv
import os
import time
from functools import lru_cache
from itertools import product
from multiprocessing import Pool, cpu_count

import matplotlib.pyplot as plt
import RNA

# -----------------------------
# Fixed sequences
# -----------------------------
# Fixed DNA segments. build_window_rna() concatenates them in the order
# UTR + AU-rich + SD + spacer + AUG + ORF prefix before converting to RNA.
UTR_DNA = "ATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCC"  # 50bp
AU_RICH_DNA = "TTAATTAA"  # placed directly upstream of the SD 6-mer
AUG_DNA = "ATG"  # start codon, placed directly after the spacer

# Source sequence for the ORF part of the folding window; only the first
# `orf_prefix_len` characters are used by build_window_rna().
EMPTY_ORF_DNA_PREFIX = (
    "CTGCTGGGTGAGCTTTCTCCGTAAACTTAAAGGAAAAGATTCCGTTGAAAGATT"
    "CAAAGCTATCGTTCAGCGTATACAAGAGACTTCCTCCTGAGACTCGTGTTCCC"
    "GTACCGAACTCT"
)

# Wild-type pair used for the cross-reactivity duplex energies.
WT_SD_DNA = "AGGAGG"
WT_ASD_CORE_DNA = "CTCCTT"  # core

def dna_to_rna(seq: str) -> str:
    """Return *seq* upper-cased with every ``T`` rewritten as ``U``."""
    uppercased = seq.upper()
    # Splitting on "T" and re-joining with "U" substitutes every occurrence.
    return "U".join(uppercased.split("T"))

def gc_count(seq: str) -> int:
    """Count the ``G`` and ``C`` characters in *seq*, ignoring case."""
    upper = seq.upper()
    return upper.count("G") + upper.count("C")

def generate_sd_6mers_gc_40_60():
    """Enumerate every DNA 6-mer whose G+C count is exactly 3 or 4.

    Sequences are produced in ``itertools.product`` order over A, C, G, T.

    NOTE(review): for length 6 a count of 3 is 50% GC and a count of 4 is
    ~66.7% GC, so the accepted set is wider than the 40-60% in the function
    name. Confirm which one is intended before changing the filter, because
    the downstream totals (2240 SDs) depend on the current behaviour.
    """
    candidates = ("".join(bases) for bases in product("ACGT", repeat=6))
    return [kmer for kmer in candidates if 3 <= gc_count(kmer) <= 4]  # 2240

def generate_spacer_6mers_at_only():
    """Return all 6-mers over the alphabet {A, T}, in ``product`` order."""
    at_tuples = product("AT", repeat=6)
    return list(map("".join, at_tuples))  # 64

def duplex_dG(seq1_rna: str, seq2_rna: str) -> float:
    """Return the ``energy`` of ``RNA.duplexfold(seq1_rna, seq2_rna)`` as a float."""
    duplex = RNA.duplexfold(seq1_rna, seq2_rna)
    return float(duplex.energy)

def build_window_rna(sd_dna: str, spacer_dna: str, orf_prefix_len: int = 60) -> str:
    """Assemble the folding window and return it as RNA.

    The window is UTR + AU-rich element + *sd_dna* + *spacer_dna* + start
    codon + the first *orf_prefix_len* characters of the ORF prefix, joined
    in that order and passed through ``dna_to_rna``.
    """
    orf_head = EMPTY_ORF_DNA_PREFIX[:orf_prefix_len]
    segments = (UTR_DNA, AU_RICH_DNA, sd_dna, spacer_dna, AUG_DNA, orf_head)
    return dna_to_rna("".join(segments))

# -----------------------------
# Run settings
# -----------------------------
OUTDIR = "/content/orthogonal_sd_results"
os.makedirs(OUTDIR, exist_ok=True)

# O-ASD core under test (6 nt). DNA input containing T is accepted because
# it is converted to RNA below.
OASD_INPUT = "AAAAAA"
OASD_RNA = dna_to_rna(OASD_INPUT)
# Validate with explicit exceptions: `assert` is stripped under `python -O`,
# and a wrong-length or non-nucleotide core would otherwise be passed on
# to the duplex calculations unnoticed.
if len(OASD_RNA) != 6:
    raise ValueError(
        f"OASD_INPUT must be exactly 6 nt, got {len(OASD_RNA)}: {OASD_INPUT!r}"
    )
if not set(OASD_RNA) <= set("ACGU"):
    raise ValueError(
        f"OASD_INPUT may only contain A/C/G/T/U, got: {OASD_INPUT!r}"
    )

# Leave one core free, cap at 8 workers, and never go below 1.
NPROC = max(1, min(8, cpu_count() - 1))
CHUNKSIZE = 2000  # keep this large to reduce IPC overhead
PROGRESS_EVERY = 5000  # print a progress line every this many results

# -----------------------------
# Enumerate SD / spacer
# -----------------------------
SDS = generate_sd_6mers_gc_40_60()
SPACERS = generate_spacer_6mers_at_only()

n_sd = len(SDS)        # 2240
n_sp = len(SPACERS)    # 64
TOTAL = n_sd * n_sp    # 143,360

print(f"[INFO] SD count: {n_sd}")
print(f"[INFO] Spacer count: {n_sp}")
print(f"[INFO] Total combos: {TOTAL:,}")
print(f"[INFO] Using NPROC={NPROC}, CHUNKSIZE={CHUNKSIZE}")

# RNA forms of the wild-type pair, computed once for the worker function.
WT_SD_RNA = dna_to_rna(WT_SD_DNA)
WT_ASD_CORE_RNA = dna_to_rna(WT_ASD_CORE_DNA)

# -----------------------------
# Worker uses only an integer index -> no giant jobs list
# -----------------------------
@lru_cache(maxsize=None)
def _cached_duplex_dG(seq1_rna: str, seq2_rna: str) -> float:
    """Memoized ``duplex_dG``; each worker process keeps its own cache."""
    return duplex_dG(seq1_rna, seq2_rna)


def eval_by_index(i: int):
    """Evaluate one (SD, spacer) combination identified by a flat index.

    The index is decoded as ``SDS[i // n_sp]`` and ``SPACERS[i % n_sp]``.

    Returns a tuple
    ``(sd, sp, dG_osd_oasd, dG_osd_wtasd, dG_wtsd_oasd, neg_count_cross,
    pass_relaxed, pass_strict, mfe, struct)`` where

    * ``pass_relaxed`` is 1 when ``-10.0 <= dG_osd_oasd <= -8.5``;
    * ``pass_strict`` is 1 when ``pass_relaxed`` holds and both cross
      energies are strictly positive;
    * ``neg_count_cross`` counts how many cross energies are strictly
      negative (0, 1 or 2);
    * ``struct`` and ``mfe`` are the two values returned by ``RNA.fold``
      for the assembled window.

    Raises:
        IndexError: if *i* is negative or past the last combination.
            Without this check a negative index would be accepted and
            wrap around through Python's negative list indexing.
    """
    sd_idx, sp_idx = divmod(i, n_sp)
    if i < 0 or sd_idx >= len(SDS):
        raise IndexError(f"combination index out of range: {i}")
    sd = SDS[sd_idx]
    sp = SPACERS[sp_idx]

    osd_rna = dna_to_rna(sd)

    # The duplex energies do not depend on the spacer, and the WT-SD:O-ASD
    # energy does not depend on the index at all, so memoize them instead of
    # recomputing the same value for each of the spacers of one SD.
    dG_osd_oasd = _cached_duplex_dG(osd_rna, OASD_RNA)
    dG_osd_wtasd = _cached_duplex_dG(osd_rna, WT_ASD_CORE_RNA)
    dG_wtsd_oasd = _cached_duplex_dG(WT_SD_RNA, OASD_RNA)

    pass_relaxed = int(-10.0 <= dG_osd_oasd <= -8.5)
    pass_strict = int(pass_relaxed and (dG_osd_wtasd > 0.0) and (dG_wtsd_oasd > 0.0))
    neg_count_cross = int(dG_osd_wtasd < 0.0) + int(dG_wtsd_oasd < 0.0)

    window_rna = build_window_rna(sd, sp, orf_prefix_len=60)
    struct, mfe = RNA.fold(window_rna)  # keep the structure too (optional)
    return (sd, sp, dG_osd_oasd, dG_osd_wtasd, dG_wtsd_oasd, neg_count_cross, pass_relaxed, pass_strict, mfe, struct)

# -----------------------------
# Stream-write CSV while computing (avoids holding all rows in memory and
# leaves partial results on disk while the run is in progress)
# -----------------------------
# Output file is named after the O-ASD core being tested.
tag = f"oasd_{OASD_RNA}"
csv_path = os.path.join(OUTDIR, f"{tag}.csv")

# Column order matches the tuple returned by eval_by_index().
fieldnames = [
    "sd_dna", "spacer_dna",
    "dG_osd_oasd", "dG_osd_wtasd", "dG_wtsd_oasd",
    "neg_count_cross", "pass_relaxed", "pass_strict",
    "mfe_window", "struct_window"
]

t0 = time.time()
# Running tallies for the summary printed after the loop.
strict_hits = 0
relaxed_hits = 0
neg0 = neg1 = neg2 = 0

# Per-row numeric values kept in memory for the histograms drawn afterwards
# (one float per combination in each list; the CSV rows themselves are not kept).
dG1 = []
dG2 = []
dG3 = []
mfe_list = []

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(fieldnames)

    with Pool(processes=NPROC) as pool:
        # imap_unordered yields results as workers finish them, so the CSV
        # rows are not guaranteed to be in index order. k counts rows from 1.
        for k, row in enumerate(pool.imap_unordered(eval_by_index, range(TOTAL), chunksize=CHUNKSIZE), start=1):
            sd, sp, dg1, dg2, dg3, negc, prel, pstr, mfe, struct = row

            # Energies are written with 4 decimal places; flags stay integers.
            w.writerow([sd, sp, f"{dg1:.4f}", f"{dg2:.4f}", f"{dg3:.4f}", negc, prel, pstr, f"{mfe:.4f}", struct])

            # stats: pstr/prel are 0 or 1, so adding them counts the hits
            strict_hits += pstr
            relaxed_hits += prel
            if negc == 0: neg0 += 1
            elif negc == 1: neg1 += 1
            else: neg2 += 1

            dG1.append(dg1); dG2.append(dg2); dG3.append(dg3); mfe_list.append(mfe)

            if k % PROGRESS_EVERY == 0:
                elapsed = time.time() - t0
                print(f"[PROGRESS] {k:,}/{TOTAL:,}  elapsed={elapsed/60:.1f} min")

t1 = time.time()
print(f"\n[INFO] Done. elapsed = {(t1-t0)/60:.2f} min")
print(f"[INFO] CSV saved: {csv_path}")

print("\n=== SUMMARY ===")
print(f"pass_relaxed (dG1 in [-10,-8.5]): {relaxed_hits:,}/{TOTAL:,}")
print(f"pass_strict  (target + cross>0):  {strict_hits:,}/{TOTAL:,}")
print(f"cross negative counts: neg0={neg0:,}, neg1={neg1:,}, neg2={neg2:,}")

# -----------------------------
# Save plots
# -----------------------------
def plot_hist(vals, title, outpath, xlabel):
    """Draw an 80-bin histogram of *vals* and save it to *outpath*.

    Args:
        vals: values to histogram.
        title: figure title.
        outpath: destination path passed to ``plt.savefig`` (written at 180 dpi).
        xlabel: x-axis label; the y-axis is always labelled "count".

    The figure is closed even when plotting or saving raises, so a failed
    save does not leave an open figure behind.
    """
    fig = plt.figure()
    try:
        plt.hist(vals, bins=80)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel("count")
        plt.tight_layout()
        plt.savefig(outpath, dpi=180)
    finally:
        # Close this specific figure rather than "whatever is current".
        plt.close(fig)

# One histogram per collected series, all written into OUTDIR and prefixed
# with the run tag.
plot_hist(
    vals=dG1,
    title=f"{tag} dG(O-SD:O-ASD)",
    outpath=os.path.join(OUTDIR, f"{tag}_dG_osd_oasd.png"),
    xlabel="kcal/mol",
)
plot_hist(
    vals=dG2,
    title=f"{tag} dG(O-SD:WT-ASDcore)",
    outpath=os.path.join(OUTDIR, f"{tag}_dG_osd_wtasd.png"),
    xlabel="kcal/mol",
)
plot_hist(
    vals=dG3,
    title=f"{tag} dG(WT-SD:O-ASD)",
    outpath=os.path.join(OUTDIR, f"{tag}_dG_wtsd_oasd.png"),
    xlabel="kcal/mol",
)
plot_hist(
    vals=mfe_list,
    title=f"{tag} MFE window",
    outpath=os.path.join(OUTDIR, f"{tag}_mfe_window.png"),
    xlabel="kcal/mol",
)

print(f"\n[INFO] Plots saved in: {OUTDIR}")
!ls -lh {OUTDIR} | head -n 20


[INFO] SD count: 2240
[INFO] Spacer count: 64
[INFO] Total combos: 143,360
[INFO] Using NPROC=1, CHUNKSIZE=2000
[PROGRESS] 5,000/143,360  elapsed=1.7 min
[PROGRESS] 10,000/143,360  elapsed=2.8 min
[PROGRESS] 15,000/143,360  elapsed=4.5 min
[PROGRESS] 20,000/143,360  elapsed=5.6 min
[PROGRESS] 25,000/143,360  elapsed=7.2 min
[PROGRESS] 30,000/143,360  elapsed=8.4 min
[PROGRESS] 35,000/143,360  elapsed=10.0 min
[PROGRESS] 40,000/143,360  elapsed=11.1 min
[PROGRESS] 45,000/143,360  elapsed=12.7 min
[PROGRESS] 50,000/143,360  elapsed=13.8 min
[PROGRESS] 55,000/143,360  elapsed=15.5 min
[PROGRESS] 60,000/143,360  elapsed=16.6 min
[PROGRESS] 65,000/143,360  elapsed=18.3 min
[PROGRESS] 70,000/143,360  elapsed=19.4 min
[PROGRESS] 75,000/143,360  elapsed=21.0 min
[PROGRESS] 80,000/143,360  elapsed=22.1 min
[PROGRESS] 85,000/143,360  elapsed=23.8 min
[PROGRESS] 90,000/143,360  elapsed=24.9 min
[PROGRESS] 95,000/143,360  elapsed=26.6 min
[PROGRESS] 100,000/143,360  elapsed=27.7 min
[PROGRESS] 105