<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/top50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install viennarna

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m113.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, csv, time
from itertools import product
from multiprocessing import Pool, cpu_count
import RNA
import matplotlib.pyplot as plt

# ===============================
# Fixed sequences (unchanged)
# ===============================
# Every sequence here is written as DNA (T, not U). fold_window() concatenates
# them in the order UTR + AU-rich + SD + spacer + ATG + ORF prefix and converts
# the result to RNA before folding.
UTR_DNA = "ATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCC"  # first segment of the folded window (presumably the 5' UTR; confirm)
AU_RICH_DNA = "TTAATTAA"  # inserted directly upstream of the candidate SD
AUG_DNA = "ATG"  # start codon, placed right after the spacer

# Bases that follow the start codon in the folded window
# (presumably the beginning of the downstream ORF; TODO confirm the source).
EMPTY_ORF_PREFIX = (
    "CTGCTGGGTGAGCTTTCTCCGTAAACTTAAAGGAAAAGATTCCGTTGAAAGATT"
    "CAAAGCTATCGTTCAGCGTATACAAGAGACTTCCTCCTGAGACTCGTGTTCCC"
)

# Wild-type reference pair used by evaluate() for the two cross-duplex checks
# (candidate SD vs. WT anti-SD core, and WT SD vs. candidate anti-SD).
WT_SD_DNA = "AGGAGG"
WT_ASD_CORE_DNA = "CTCCTT"

# ===============================
# Utilities
# ===============================
def dna_to_rna(seq):
    """Return ``seq`` with every uppercase ``T`` rewritten as ``U``.

    Other characters (including lowercase ``t``) are passed through unchanged.
    """
    return seq.translate(str.maketrans("T", "U"))
def revcomp_dna(seq):
    """Return the reverse complement of an uppercase A/C/G/T DNA sequence.

    Raises:
        KeyError: if ``seq`` contains any symbol other than A, C, G or T.
    """
    pairing = dict(zip("ATCG", "TAGC"))
    complemented = [pairing[base] for base in reversed(seq)]
    return "".join(complemented)

def gc_count(seq):
    """Count how many symbols of ``seq`` are an uppercase ``G`` or ``C``."""
    total = 0
    for base in seq:
        if base in "GC":
            total += 1
    return total

def duplex_dG(a, b):
    """Return the energy reported by ``RNA.duplexfold`` for strands ``a`` and ``b``.

    The ``energy`` attribute of the duplexfold result is converted to a plain
    Python float before being returned.
    """
    duplex = RNA.duplexfold(a, b)
    return float(duplex.energy)

def fold_window(sd, spacer):
    """Fold the full construct that carries the given SD and spacer.

    The window is assembled (as DNA) from UTR_DNA, AU_RICH_DNA, ``sd``,
    ``spacer``, AUG_DNA and EMPTY_ORF_PREFIX, in that order, converted to RNA
    and passed to ``RNA.fold``.

    Returns:
        A ``(structure, mfe)`` tuple holding the two values produced by
        ``RNA.fold`` for the assembled sequence.
    """
    segments = (UTR_DNA, AU_RICH_DNA, sd, spacer, AUG_DNA, EMPTY_ORF_PREFIX)
    transcript = dna_to_rna("".join(segments))
    structure, energy = RNA.fold(transcript)
    return structure, energy

# ===============================
# SD / spacer generation
# ===============================
def generate_sd():
    """Enumerate all candidate 6-nt SD sequences (DNA alphabet).

    Every 6-mer over A/T/G/C is generated in ``itertools.product`` order and
    kept only when it contains exactly 3 or 4 G/C bases.

    Returns:
        list[str]: the retained sequences (2240 of them).
    """
    alphabet = ("A", "T", "G", "C")
    candidates = ("".join(combo) for combo in product(alphabet, repeat=6))
    return [cand for cand in candidates if 3 <= gc_count(cand) <= 4]  # 2240

def generate_spacer():
    """Enumerate every 6-nt spacer built only from A and T (DNA alphabet).

    Returns:
        list[str]: all 2**6 = 64 spacers, in ``itertools.product`` order.
    """
    spacers = []
    for letters in product("AT", repeat=6):
        spacers.append("".join(letters))
    return spacers  # 64

# Materialise the search space once at module level; evaluate() indexes into
# these lists, so worker processes only need to receive an integer.
SDS = generate_sd()
SPACERS = generate_spacer()

# Size of the full SD x spacer grid; also the number of rows written to the CSV.
TOTAL = len(SDS)*len(SPACERS)
print(f"Total SD–spacer combinations: {TOTAL:,}")

# ===============================
# Worker (index-based)
# ===============================
# RNA versions of the wild-type reference pair, converted once here instead of
# on every evaluate() call.
WT_SD_RNA = dna_to_rna(WT_SD_DNA)
WT_ASD_CORE_RNA = dna_to_rna(WT_ASD_CORE_DNA)

def evaluate(i):
    """Score the ``i``-th SD/spacer combination and return it as one CSV row.

    ``i`` is a flat index into the SDS x SPACERS grid: the quotient by
    ``len(SPACERS)`` selects the SD and the remainder selects the spacer.

    Returns:
        list: ``[SD, spacer, ASD, dG(SD/ASD), dG(SD/WT-ASD), dG(WT-SD/ASD),
        pass_main, pass_strict, neg_cross, mfe, structure]`` where

        * ASD is the reverse complement of the SD (DNA),
        * ``pass_main`` is True when dG(SD/ASD) lies in [-10, -8.5],
        * ``pass_strict`` additionally requires both cross energies to be > 0,
        * ``neg_cross`` counts how many of the two cross energies are < 0,
        * ``mfe`` / ``structure`` come from fold_window(SD, spacer).
    """
    sd_idx, spacer_idx = divmod(i, len(SPACERS))
    sd = SDS[sd_idx]
    sp = SPACERS[spacer_idx]

    # The orthogonal anti-SD is the reverse complement of the candidate SD.
    asd = revcomp_dna(sd)
    osd_rna = dna_to_rna(sd)
    oasd_rna = dna_to_rna(asd)

    # Cognate pair first, then the two cross pairs against the wild type.
    dG_main = duplex_dG(osd_rna, oasd_rna)
    dG_sd_wt = duplex_dG(osd_rna, WT_ASD_CORE_RNA)
    dG_wt_sd = duplex_dG(WT_SD_RNA, oasd_rna)

    pass_main = -10 <= dG_main <= -8.5
    pass_strict = all((pass_main, dG_sd_wt > 0, dG_wt_sd > 0))
    neg_cross = sum(1 for energy in (dG_sd_wt, dG_wt_sd) if energy < 0)

    struct, mfe = fold_window(sd, sp)

    return [
        sd, sp, asd,
        dG_main, dG_sd_wt, dG_wt_sd,
        pass_main, pass_strict, neg_cross,
        mfe, struct,
    ]

# ===============================
# Run
# ===============================
# Output location for the full scan (Colab filesystem path).
OUTDIR = "/content/final_SD_design"
os.makedirs(OUTDIR, exist_ok=True)
csv_path = f"{OUTDIR}/SD_design_full.csv"

# Column names, in the same order as the list returned by evaluate().
header = [
    "SD", "Spacer", "ASD(comp)",
    "dG_SD_ASD", "dG_SD_WTASD", "dG_WTSD_ASD",
    "pass_main", "pass_strict", "neg_cross",
    "mfe", "structure",
]

t0 = time.time()
# Leave one core free, but always use at least one worker.
n_workers = max(1, cpu_count() - 1)
with open(csv_path, "w", newline="") as fh:
    writer = csv.writer(fh)
    writer.writerow(header)
    with Pool(n_workers) as pool:
        # Rows are written as they arrive, so the CSV row order is whatever
        # order the workers finish in, not index order.
        results = pool.imap_unordered(evaluate, range(TOTAL), chunksize=2000)
        for done, row in enumerate(results, start=1):
            writer.writerow(row)
            if done % 5000 != 0:
                continue
            elapsed_min = (time.time() - t0) / 60
            print(f"[PROGRESS] {done}/{TOTAL}  {elapsed_min:.1f} min")

print("Finished. CSV saved:", csv_path)


Total SD–spacer combinations: 143,360
[PROGRESS] 5000/143360  3.5 min
[PROGRESS] 10000/143360  5.7 min
[PROGRESS] 15000/143360  9.1 min
[PROGRESS] 20000/143360  11.4 min
[PROGRESS] 25000/143360  14.7 min
[PROGRESS] 30000/143360  16.9 min
[PROGRESS] 35000/143360  20.3 min
[PROGRESS] 40000/143360  22.6 min
[PROGRESS] 45000/143360  26.0 min
[PROGRESS] 50000/143360  28.2 min
[PROGRESS] 55000/143360  31.6 min
[PROGRESS] 60000/143360  33.9 min
[PROGRESS] 65000/143360  37.2 min
[PROGRESS] 70000/143360  39.4 min
[PROGRESS] 75000/143360  42.8 min
[PROGRESS] 80000/143360  45.1 min
[PROGRESS] 85000/143360  48.5 min
[PROGRESS] 90000/143360  50.7 min
[PROGRESS] 95000/143360  54.0 min
[PROGRESS] 100000/143360  56.3 min
[PROGRESS] 105000/143360  59.6 min
[PROGRESS] 110000/143360  61.8 min
[PROGRESS] 115000/143360  65.1 min
[PROGRESS] 120000/143360  67.3 min
[PROGRESS] 125000/143360  70.6 min
[PROGRESS] 130000/143360  72.8 min
[PROGRESS] 135000/143360  76.1 min
[PROGRESS] 140000/143360  78.3 min
Finis

In [3]:
# =========================================================
# POST-PROCESS: pick the TOP50 SDs (rows with pass_strict == True)
# =========================================================
import pandas as pd
import numpy as np
import os

input_csv = csv_path  # reuse the path of the result file written by the scan cell above
outdir = OUTDIR
os.makedirs(outdir, exist_ok=True)

TOP_N = 50

# Tunable thresholds
TARGET = -10.0
CROSS_MARGIN = 0.0   # pass_strict already implies > 0; raise (e.g. to 1.0) for a stricter margin

# Ranking rule, in priority order:
#   1) abs_to_minus10 (smaller is better)
#   2) cross_min      (larger is better)
#   3) mfe            (larger is better = less folded)
#   4) SD, Spacer     (pure tie-breakers)
# The scan writes rows in worker-completion order (imap_unordered), so without
# the last two keys the winner among exact ties would depend on CSV row order
# and could change from run to run.
RANK_KEYS = ["abs_to_minus10", "cross_min", "mfe", "SD", "Spacer"]
RANK_ASCENDING = [True, False, False, True, True]

# 1) Load the scan results
df = pd.read_csv(input_csv)

# 2) Keep only the strict candidates
strict = df[df["pass_strict"] == True].copy()
print(f"[INFO] strict candidates: {len(strict):,}")

if len(strict) == 0:
    raise RuntimeError("pass_strict==True 후보가 0개입니다. 조건을 완화하거나 계산 결과를 확인하세요.")

# 3) Derived metrics
# (1) Distance of the cognate duplex energy from TARGET: smaller is better
strict["abs_to_minus10"] = (strict["dG_SD_ASD"] - TARGET).abs()

# (2) Orthogonality margin: the weaker of the two cross energies must still be
#     large enough, so take the row-wise minimum (larger is better).
#     NOTE(review): the saved preview shows 100000.0 for some cross energies,
#     presumably duplexfold's "no duplex found" value; such rows rank first on
#     this key. Confirm that this is the intended interpretation.
strict["cross_min"] = strict[["dG_SD_WTASD", "dG_WTSD_ASD"]].min(axis=1)

# (3) Structural interference: a less negative (larger) mfe means less folding,
#     which is better -> mfe is sorted in descending order.

# 4) (Optional) additionally filter on the orthogonality margin
if CROSS_MARGIN > 0:
    before = len(strict)
    strict = strict[strict["cross_min"] >= CROSS_MARGIN].copy()
    print(f"[INFO] after cross_min >= {CROSS_MARGIN}: {len(strict):,} (from {before:,})")
    if len(strict) == 0:
        raise RuntimeError("CROSS_MARGIN이 너무 커서 후보가 0개가 되었습니다. 값을 낮춰주세요.")

# 5) De-duplicate at SD level (recommended)
# The same SD appears on many rows that differ only in the spacer, so keep the
# single best-ranked row per SD as its representative.
strict_sorted = strict.sort_values(by=RANK_KEYS, ascending=RANK_ASCENDING).copy()

# head(1) per group keeps the first (= best-ranked) row of each SD.
best_per_sd = strict_sorted.groupby("SD", as_index=False).head(1).copy()
print(f"[INFO] unique SD after best-per-SD: {best_per_sd['SD'].nunique():,}")

# 6) Final TOP_N selection (same ranking rule)
top50 = best_per_sd.sort_values(by=RANK_KEYS, ascending=RANK_ASCENDING).head(TOP_N).copy()

print(f"[INFO] TOP{TOP_N} selected: {len(top50):,}")

# 7) Report: summary columns plus the ranked SD sequences
show_cols = [
    "SD","Spacer","ASD(comp)",
    "dG_SD_ASD","abs_to_minus10",
    "dG_SD_WTASD","dG_WTSD_ASD","cross_min",
    "mfe"
]
print("\n=== TOP50 preview (first 10) ===")
print(top50[show_cols].head(10).to_string(index=False))

print("\n=== TOP50 SD sequences (rank order) ===")
for r, sd in enumerate(top50["SD"].tolist(), start=1):
    print(f"{r:02d}: {sd}")

# 8) Save the ranked table as CSV
top50_csv = os.path.join(outdir, "TOP50_SD_ranked.csv")
top50.to_csv(top50_csv, index=False)
print("\n[SAVE]", top50_csv)

# 9) (Extra) also save just the rank and SD sequence
sd_only_csv = os.path.join(outdir, "TOP50_SD_sequences_only.csv")
pd.DataFrame({"rank": range(1, len(top50)+1), "SD": top50["SD"].tolist()}).to_csv(sd_only_csv, index=False)
print("[SAVE]", sd_only_csv)


[INFO] strict candidates: 4,928
[INFO] unique SD after best-per-SD: 77
[INFO] TOP50 selected: 50

=== TOP50 preview (first 10) ===
    SD Spacer ASD(comp)  dG_SD_ASD  abs_to_minus10  dG_SD_WTASD  dG_WTSD_ASD  cross_min        mfe
TCCCCT AAAAAA    AGGGGA       -9.3             0.7     100000.0     100000.0   100000.0 -26.600000
TCCCCA TATAAA    TGGGGA       -9.3             0.7          4.4          3.1        3.1 -27.000000
ACCCCT AAAAAA    AGGGGT       -9.1             0.9          4.3          3.9        3.9 -26.700001
ACCCCA AAAAAA    TGGGGT       -9.1             0.9          4.3          3.1        3.1 -26.900000
TGCCCT AAAAAA    AGGGCA       -9.1             0.9          2.2          1.2        1.2 -28.100000
TGCCCA TATAAA    TGGGCA       -9.1             0.9          2.2          1.2        1.2 -28.500000
AGCCCT AAAAAA    AGGGCT       -9.1             0.9          1.7          0.7        0.7 -27.799999
AGCCCA TATAAA    TGGGCT       -9.1             0.9          1.7          0.7 