<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/143%2C360%EC%83%98%ED%94%8C_%EC%B5%9C%EC%A2%85_top_50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install viennarna


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, csv, time
from itertools import product
from multiprocessing import Pool, cpu_count
import RNA
import matplotlib.pyplot as plt

# ===============================
# Fixed sequences (unchanged)
# ===============================
# Fixed DNA segments. fold_window() concatenates them in this order:
#   UTR_DNA + AU_RICH_DNA + <SD> + <spacer> + AUG_DNA + EMPTY_ORF_PREFIX
# and folds the resulting transcript after T->U conversion.
UTR_DNA = "ATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACACAACATCC"
AU_RICH_DNA = "TTAATTAA"
AUG_DNA = "ATG"

# Sequence appended after the start codon in every folded window.
# NOTE(review): presumably the first codons of the downstream ORF — confirm.
EMPTY_ORF_PREFIX = (
    "CTGCTGGGTGAGCTTTCTCCGTAAACTTAAAGGAAAAGATTCCGTTGAAAGATT"
    "CAAAGCTATCGTTCAGCGTATACAAGAGACTTCCTCCTGAGACTCGTGTTCCC"
)

# Reference SD / anti-SD hexamers used by evaluate() for the two cross-duplex
# energies (candidate SD vs. WT anti-SD, WT SD vs. candidate anti-SD).
# NOTE(review): WT_ASD_CORE_DNA is not the reverse complement of WT_SD_DNA
# (that would be "CCTCCT") — confirm this offset window is intended.
WT_SD_DNA = "AGGAGG"
WT_ASD_CORE_DNA = "CTCCTT"

# ===============================
# Utilities
# ===============================
def dna_to_rna(seq):
    """Return `seq` with every uppercase 'T' replaced by 'U'.

    Other characters (including lowercase 't') are left untouched.
    """
    return "U".join(seq.split("T"))
def revcomp_dna(seq):
    """Return the reverse complement of an uppercase DNA string.

    Only A, T, C and G are known; any other character raises KeyError.
    """
    pairing = dict(zip("ATCG", "TAGC"))
    return "".join(pairing[base] for base in reversed(seq))

def gc_count(seq):
    """Return how many characters of the string `seq` are uppercase 'G' or 'C'."""
    return seq.count("G") + seq.count("C")

def duplex_dG(a, b):
    """Return the `energy` reported by RNA.duplexfold(a, b), as a float."""
    duplex = RNA.duplexfold(a, b)
    return float(duplex.energy)

def fold_window(sd, spacer):
    """Fold the full transcript window built around one SD / spacer pair.

    The window is UTR_DNA + AU_RICH_DNA + sd + spacer + AUG_DNA +
    EMPTY_ORF_PREFIX, converted T->U and passed to RNA.fold().

    Returns:
        (structure, mfe) exactly as produced by RNA.fold().
    """
    segments = (UTR_DNA, AU_RICH_DNA, sd, spacer, AUG_DNA, EMPTY_ORF_PREFIX)
    transcript = dna_to_rna("".join(segments))
    structure, energy = RNA.fold(transcript)
    return structure, energy

# ===============================
# SD / spacer generation
# ===============================
def generate_sd():
    """Enumerate every DNA hexamer over A/T/G/C that contains 3 or 4 G/C bases.

    Hexamers are produced in itertools.product order over ("A", "T", "G", "C").
    """
    hexamers = map("".join, product("ATGC", repeat=6))
    return [hexamer for hexamer in hexamers if 3 <= gc_count(hexamer) <= 4]  # 2240

def generate_spacer():
    """Enumerate every 6-nt spacer made only of A and T, in product order."""
    spacers = []
    for combo in product("AT", repeat=6):
        spacers.append("".join(combo))
    return spacers  # 64

# Enumerate the design space once; evaluate() indexes into these two lists.
SDS, SPACERS = generate_sd(), generate_spacer()

# Every SD is paired with every spacer.
TOTAL = len(SPACERS) * len(SDS)
print(f"Total SD–spacer combinations: {TOTAL:,}")

# ===============================
# Worker (index-based)
# ===============================
# RNA forms of the reference SD / anti-SD hexamers, converted once for evaluate().
WT_SD_RNA, WT_ASD_CORE_RNA = (dna_to_rna(dna) for dna in (WT_SD_DNA, WT_ASD_CORE_DNA))

def evaluate(i):
    """Score the SD–spacer combination addressed by the flat index `i`.

    `i` is decoded as (SD index, spacer index) = divmod(i, len(SPACERS)).

    Returns:
        A list in CSV column order:
        [SD, spacer, reverse complement of SD,
         dG(SD, own anti-SD), dG(SD, WT anti-SD core), dG(WT SD, own anti-SD),
         pass_main, pass_strict, neg_cross, mfe, structure]
        where pass_main means the own-duplex energy lies in [-10, -8.5],
        pass_strict additionally requires both cross energies to be > 0, and
        neg_cross counts how many cross energies are < 0.
    """
    sd_index, spacer_index = divmod(i, len(SPACERS))
    sd = SDS[sd_index]
    sp = SPACERS[spacer_index]

    # The candidate anti-SD is the reverse complement of the candidate SD.
    asd = revcomp_dna(sd)
    sd_rna = dna_to_rna(sd)
    asd_rna = dna_to_rna(asd)

    dG_main = duplex_dG(sd_rna, asd_rna)
    dG_sd_wt = duplex_dG(sd_rna, WT_ASD_CORE_RNA)
    dG_wt_sd = duplex_dG(WT_SD_RNA, asd_rna)
    cross_energies = (dG_sd_wt, dG_wt_sd)

    pass_main = -10 <= dG_main <= -8.5
    pass_strict = pass_main and all(energy > 0 for energy in cross_energies)
    neg_cross = sum(int(energy < 0) for energy in cross_energies)

    struct, mfe = fold_window(sd, sp)

    return [
        sd, sp, asd,
        dG_main, dG_sd_wt, dG_wt_sd,
        pass_main, pass_strict, neg_cross,
        mfe, struct,
    ]

# ===============================
# Run
# ===============================
OUTDIR="/content/final_SD_design"
os.makedirs(OUTDIR, exist_ok=True)
csv_path=f"{OUTDIR}/SD_design_full.csv"
# Rows are streamed into a temporary file that is renamed onto csv_path only
# after the whole sweep has finished. An interrupted run (kernel restart,
# Colab disconnect, exception in a worker) therefore never leaves a truncated
# CSV at csv_path for the later cells to read.
tmp_csv_path = csv_path + ".part"

header=[
    "SD","Spacer","ASD(comp)",
    "dG_SD_ASD","dG_SD_WTASD","dG_WTSD_ASD",
    "pass_main","pass_strict","neg_cross",
    "mfe","structure"
]

t0=time.time()
with open(tmp_csv_path,"w",newline="") as f:
    w=csv.writer(f); w.writerow(header)
    # Leave one core free, but always use at least one worker.
    with Pool(max(1,cpu_count()-1)) as p:
        # imap_unordered yields rows in completion order, so the row order of
        # the CSV is not guaranteed to be the index order.
        for k,row in enumerate(p.imap_unordered(evaluate, range(TOTAL), chunksize=2000),1):
            w.writerow(row)
            if k%5000==0:
                print(f"[PROGRESS] {k}/{TOTAL}  {(time.time()-t0)/60:.1f} min")

# Publish the finished file (replaces any CSV from an earlier run).
os.replace(tmp_csv_path, csv_path)
print("Finished. CSV saved:", csv_path)


Total SD–spacer combinations: 143,360
[PROGRESS] 5000/143360  3.5 min
[PROGRESS] 10000/143360  5.8 min
[PROGRESS] 15000/143360  9.3 min
[PROGRESS] 20000/143360  11.6 min
[PROGRESS] 25000/143360  15.1 min
[PROGRESS] 30000/143360  17.3 min
[PROGRESS] 35000/143360  20.7 min
[PROGRESS] 40000/143360  22.9 min
[PROGRESS] 45000/143360  26.4 min
[PROGRESS] 50000/143360  28.6 min
[PROGRESS] 55000/143360  32.0 min
[PROGRESS] 60000/143360  34.3 min
[PROGRESS] 65000/143360  37.7 min
[PROGRESS] 70000/143360  40.0 min
[PROGRESS] 75000/143360  43.4 min
[PROGRESS] 80000/143360  45.7 min
[PROGRESS] 85000/143360  49.2 min
[PROGRESS] 90000/143360  51.5 min
[PROGRESS] 95000/143360  55.0 min
[PROGRESS] 100000/143360  57.3 min
[PROGRESS] 105000/143360  60.8 min
[PROGRESS] 110000/143360  63.1 min
[PROGRESS] 115000/143360  66.5 min
[PROGRESS] 120000/143360  68.9 min
[PROGRESS] 125000/143360  72.4 min
[PROGRESS] 130000/143360  74.7 min
[PROGRESS] 135000/143360  78.1 min
[PROGRESS] 140000/143360  80.3 min
Finis

In [3]:
import pandas as pd
# Reload the full sweep from disk and show (rows, columns) of the pass_main hits.
df = pd.read_csv("/content/final_SD_design/SD_design_full.csv")
df.loc[df["pass_main"].eq(True)].shape


(14144, 11)

In [4]:
# (rows, columns) of the combinations that also pass the strict cross-duplex check.
df.loc[df["pass_strict"].eq(True)].shape


(4928, 11)

전체 조합: 143,360
기능성 결합 통과 (ΔG SD–ASD ∈ [-10, -8.5]) : 14,144개 (약 9.9%)
직교성까지 만족 (strict): 4,928개 (약 3.4%)

In [10]:
import pandas as pd
import os

# Thresholds for this filtering pass.
TARGET_CENTER = -9.2   # centre of the accepted dG_SD_ASD window
TARGET_TOL = 0.5       # accepted distance from the centre
CROSS_MARGIN = 1.0     # minimum value for both cross energies (relax to 0.5 if too strict)
q = 0.85               # MFE quantile; rows at or above it are kept
OUTDIR="/content/final_SD_design/filtered2"

df = pd.read_csv("/content/final_SD_design/SD_design_full.csv")
strict = df.loc[df["pass_strict"].eq(True)].copy()

# 1) own-duplex energy within TARGET_TOL of TARGET_CENTER
cond_target = strict["dG_SD_ASD"].sub(TARGET_CENTER).abs().le(TARGET_TOL)

# 2) both cross energies at least CROSS_MARGIN
cond_cross = strict[["dG_SD_WTASD", "dG_WTSD_ASD"]].ge(CROSS_MARGIN).all(axis=1)

tmp = strict.loc[cond_target & cond_cross].copy()
print("after target+cross:", len(tmp))

# 3) MFE quantile cut: keep rows whose mfe is at or above the q-quantile
#    (ties at the cut value are kept, so slightly more than 1-q may remain)
mfe_cut = tmp["mfe"].quantile(q)
final = tmp.loc[tmp["mfe"].ge(mfe_cut)].copy()

print("mfe_cut:", mfe_cut)
print("final count:", len(final))

os.makedirs(OUTDIR, exist_ok=True)
final_csv = f"{OUTDIR}/final_candidates.csv"
final.to_csv(final_csv, index=False)
print("saved:", final_csv)


after target+cross: 1984
mfe_cut: -27.0
final count: 319
saved: /content/final_SD_design/filtered2/final_candidates.csv


In [11]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================================================
# 0) Input / output paths
# =========================================================
# CSV written by the previous filtering cell.
INPUT_CSV = "/content/final_SD_design/filtered2/final_candidates.csv"

OUTDIR = "/content/final_SD_design/top50"
os.makedirs(OUTDIR, exist_ok=True)

# =========================================================
# 1) Design parameters (tune here only)
# =========================================================
TARGET_CENTER = -9.2   # target centre of the SD-ASD duplex energy (kcal/mol)
TARGET_TOL    = 0.5    # allowed distance from the centre (e.g. 0.3~0.7)
CROSS_MARGIN  = 1.0    # minimum cross-duplex energy (e.g. 0.5 / 1.0 / 2.0)
TOP_N         = 50     # number of candidates to keep

# Instead of an absolute MFE cut, a larger (less negative) mfe is rewarded in
# the score. An additional quantile cut can be enabled here:
MFE_Q_CUT = None  # e.g. 0.80 keeps rows at/above the 80th mfe percentile; None disables

# Score weights (kept simple so the score stays explainable)
W_TARGET = 1.0    # closeness to TARGET_CENTER
W_CROSS  = 1.0    # cross-duplex margin
W_MFE    = 0.03   # structure term; kept small so mfe does not dominate the score

# Ranking keys. "score" alone leaves ties to be broken by input row order, and
# the upstream CSV is written with imap_unordered, so that order is not
# reproducible. (SD, Spacer) identifies a row uniquely and makes the ranking
# fully deterministic.
RANK_KEYS = ["score", "SD", "Spacer"]
RANK_ASCENDING = [False, True, True]

# =========================================================
# 2) Load data + basic checks
# =========================================================
df = pd.read_csv(INPUT_CSV)

# Fail early with a readable message if an expected column is absent.
required = ["SD","Spacer","dG_SD_ASD","dG_SD_WTASD","dG_WTSD_ASD","mfe"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"필수 컬럼 누락: {missing}\n현재 컬럼: {list(df.columns)}")

print(f"[INFO] loaded rows: {len(df):,}")

# =========================================================
# 3) (Optional) re-apply the safety filters on the candidate set
#    - target window around TARGET_CENTER
#    - cross margin
# =========================================================
df = df.copy()
df["dg_target_dist"] = (df["dG_SD_ASD"] - TARGET_CENTER).abs()

cond_target = df["dg_target_dist"] <= TARGET_TOL
cond_cross  = (df["dG_SD_WTASD"] >= CROSS_MARGIN) & (df["dG_WTSD_ASD"] >= CROSS_MARGIN)

filtered = df[cond_target & cond_cross].copy()
print(f"[INFO] after (target ±{TARGET_TOL}) & (cross ≥{CROSS_MARGIN}): {len(filtered):,}")

# (Optional) extra MFE quantile cut: keep rows at or above the quantile.
if MFE_Q_CUT is not None:
    mfe_cut = filtered["mfe"].quantile(MFE_Q_CUT)
    filtered = filtered[filtered["mfe"] >= mfe_cut].copy()
    print(f"[INFO] after MFE quantile cut (q={MFE_Q_CUT}, cut={mfe_cut:.2f}): {len(filtered):,}")

if len(filtered) == 0:
    raise RuntimeError("필터가 너무 빡세서 0개가 됐어. TARGET_TOL/CROSS_MARGIN/MFE_Q_CUT를 완화해줘.")

# =========================================================
# 4) Score definition
#    - closer to the target centre is better (smaller distance)
#    - larger cross energies are better
#    - larger (less negative) mfe is better
# =========================================================
# (A) target score: 0 is the best possible value (distance 0)
filtered["score_target"] = -filtered["dg_target_dist"]

# (B) cross score: the smaller of the two cross energies, i.e. the weaker margin
filtered["cross_min"] = filtered[["dG_SD_WTASD","dG_WTSD_ASD"]].min(axis=1)

# (C) final score
filtered["score"] = (
    W_TARGET * filtered["score_target"]
    + W_CROSS  * filtered["cross_min"]
    + W_MFE    * filtered["mfe"]
)

# =========================================================
# 5) De-duplicate per SD
#    - the same SD appears once per spacer; keep only its best-scoring row
# =========================================================
best_per_sd = (
    filtered.sort_values(RANK_KEYS, ascending=RANK_ASCENDING)
            .groupby("SD", as_index=False)
            .head(1)
            .copy()
)

print(f"[INFO] unique SD count after best-per-SD: {best_per_sd['SD'].nunique():,}")

# =========================================================
# 6) Select the top TOP_N
# =========================================================
top50 = best_per_sd.sort_values(RANK_KEYS, ascending=RANK_ASCENDING).head(TOP_N).copy()
print(f"[INFO] TOP{TOP_N} selected: {len(top50):,}")
if len(top50) < TOP_N:
    # Make the shortfall explicit; the output files are still named TOP{TOP_N}.
    print(f"[WARN] only {len(top50):,} unique SD candidates available (< TOP_N={TOP_N})")

# Save
top50_path = os.path.join(OUTDIR, f"TOP{TOP_N}_SD_candidates.csv")
top50.to_csv(top50_path, index=False)
print("[SAVE]", top50_path)

# =========================================================
# 7) 시각화 (matplotlib, 색 지정 안 함)
#    - 분포 비교(전체 filtered vs top50)
#    - 산점도(결합 vs 구조, cross vs 결합)
#    - Top50 score bar
# =========================================================
def save_hist(a, title, fname, bins=40):
    """Save a histogram of `a` (NaNs dropped) as OUTDIR/fname at 200 dpi.

    `title` is used both as the figure title and as the x-axis label.
    The figure is closed after saving.
    """
    values = pd.Series(a).dropna()
    fig = plt.figure()
    ax = fig.gca()
    ax.hist(values, bins=bins)
    ax.set_title(title)
    ax.set_xlabel(title)
    ax.set_ylabel("count")
    fig.tight_layout()
    fig.savefig(os.path.join(OUTDIR, fname), dpi=200)
    plt.close(fig)

def save_scatter(x, y, title, xlabel, ylabel, fname):
    """Save a scatter plot of `y` against `x` as OUTDIR/fname at 200 dpi.

    Marker size is fixed at 12; the figure is closed after saving.
    """
    fig = plt.figure()
    ax = fig.gca()
    ax.scatter(x, y, s=12)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTDIR, fname), dpi=200)
    plt.close(fig)

# (1) Histograms: best-per-SD set first, then the selected top set
hist_jobs = [
    (best_per_sd["dG_SD_ASD"], "best_per_sd dG_SD_ASD",  "hist_best_dG_SD_ASD.png"),
    (best_per_sd["cross_min"], "best_per_sd cross_min",  "hist_best_cross_min.png"),
    (best_per_sd["mfe"],       "best_per_sd MFE",        "hist_best_mfe.png"),
    (top50["dG_SD_ASD"],       f"TOP{TOP_N} dG_SD_ASD",  f"hist_top{TOP_N}_dG_SD_ASD.png"),
    (top50["cross_min"],       f"TOP{TOP_N} cross_min",  f"hist_top{TOP_N}_cross_min.png"),
    (top50["mfe"],             f"TOP{TOP_N} MFE",        f"hist_top{TOP_N}_mfe.png"),
]
for hist_values, hist_title, hist_fname in hist_jobs:
    save_hist(hist_values, hist_title, hist_fname)

# (2) Scatter: binding energy vs structure (MFE)
# (3) Scatter: binding energy vs orthogonality margin
scatter_jobs = [
    (
        best_per_sd["dG_SD_ASD"], best_per_sd["mfe"],
        "Binding vs Structure (best_per_sd)",
        "dG_SD_ASD (kcal/mol)", "MFE (kcal/mol)",
        "scatter_best_dG_vs_MFE.png",
    ),
    (
        top50["dG_SD_ASD"], top50["mfe"],
        f"Binding vs Structure (TOP{TOP_N})",
        "dG_SD_ASD (kcal/mol)", "MFE (kcal/mol)",
        f"scatter_top{TOP_N}_dG_vs_MFE.png",
    ),
    (
        best_per_sd["dG_SD_ASD"], best_per_sd["cross_min"],
        "Binding vs Orthogonality margin (best_per_sd)",
        "dG_SD_ASD (kcal/mol)", "cross_min (kcal/mol)",
        "scatter_best_dG_vs_crossmin.png",
    ),
    (
        top50["dG_SD_ASD"], top50["cross_min"],
        f"Binding vs Orthogonality margin (TOP{TOP_N})",
        "dG_SD_ASD (kcal/mol)", "cross_min (kcal/mol)",
        f"scatter_top{TOP_N}_dG_vs_crossmin.png",
    ),
]
for scatter_args in scatter_jobs:
    save_scatter(*scatter_args)

# (4) Bar chart of the selected scores, in rank order.
# The "label" column (SD_Spacer) is prepared here but is not drawn on the axis.
top50_plot = top50.assign(label=top50["SD"] + "_" + top50["Spacer"])

bar_fig = plt.figure(figsize=(12, 6))
bar_ax = bar_fig.gca()
bar_ax.bar(range(len(top50_plot)), top50_plot["score"])
bar_ax.set_title(f"TOP{TOP_N} Scores (best SD per SD)")
bar_ax.set_xlabel("rank")
bar_ax.set_ylabel("score")
bar_fig.tight_layout()
bar_fig.savefig(os.path.join(OUTDIR, f"bar_top{TOP_N}_scores.png"), dpi=200)
plt.close(bar_fig)

# =========================================================
# 8) Save a plain-text summary of the selection
# =========================================================
# File name and headings follow TOP_N, like every other artefact of this cell
# (identical to the previous fixed "TOP50" text when TOP_N == 50).
report = os.path.join(OUTDIR, f"TOP{TOP_N}_report.txt")
with open(report, "w") as f:
    f.write(f"=== TOP{TOP_N} selection settings ===\n")
    f.write(f"INPUT_CSV     = {INPUT_CSV}\n")
    f.write(f"TARGET_CENTER = {TARGET_CENTER}\n")
    f.write(f"TARGET_TOL    = {TARGET_TOL}\n")
    f.write(f"CROSS_MARGIN  = {CROSS_MARGIN}\n")
    f.write(f"MFE_Q_CUT     = {MFE_Q_CUT}\n")
    f.write(f"Weights       = W_TARGET:{W_TARGET}, W_CROSS:{W_CROSS}, W_MFE:{W_MFE}\n\n")
    f.write("=== Counts ===\n")
    # df still holds every row loaded from INPUT_CSV (it only gained a column),
    # so the file does not need to be read a second time.
    f.write(f"loaded rows: {len(df)}\n")
    f.write(f"after target+cross (pair-level): {len(filtered)}\n")
    f.write(f"unique SD (best per SD): {best_per_sd['SD'].nunique()}\n")
    f.write(f"TOP{TOP_N}: {len(top50)}\n\n")
    f.write(f"=== TOP{TOP_N} preview ===\n")
    # Only the first 10 rows are written as a preview.
    f.write(top50[["SD","Spacer","dG_SD_ASD","dG_SD_WTASD","dG_WTSD_ASD","cross_min","mfe","score"]]
            .head(10).to_string(index=False))
    f.write("\n")

print("[SAVE]", report)
print("[DONE] All outputs saved to:", OUTDIR)

# List the output directory (IPython shell escape; shows at most 50 lines).
print("\n[FILES]")
!ls -lh {OUTDIR} | head -n 50


[INFO] loaded rows: 319
[INFO] after (target ±0.5) & (cross ≥1.0): 319
[INFO] unique SD count after best-per-SD: 15
[INFO] TOP50 selected: 15
[SAVE] /content/final_SD_design/top50/TOP50_SD_candidates.csv
[SAVE] /content/final_SD_design/top50/TOP50_report.txt
[DONE] All outputs saved to: /content/final_SD_design/top50

[FILES]
total 476K
-rw-r--r-- 1 root root  51K Dec 23 07:35 bar_top50_scores.png
-rw-r--r-- 1 root root  31K Dec 23 07:35 hist_best_cross_min.png
-rw-r--r-- 1 root root  35K Dec 23 07:35 hist_best_dG_SD_ASD.png
-rw-r--r-- 1 root root  29K Dec 23 07:35 hist_best_mfe.png
-rw-r--r-- 1 root root  30K Dec 23 07:35 hist_top50_cross_min.png
-rw-r--r-- 1 root root  33K Dec 23 07:35 hist_top50_dG_SD_ASD.png
-rw-r--r-- 1 root root  26K Dec 23 07:35 hist_top50_mfe.png
-rw-r--r-- 1 root root  56K Dec 23 07:35 scatter_best_dG_vs_crossmin.png
-rw-r--r-- 1 root root  50K Dec 23 07:35 scatter_best_dG_vs_MFE.png
-rw-r--r-- 1 root root  56K Dec 23 07:35 scatter_top50_dG_vs_crossmin.png
-rw

In [12]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# =========================
# 0) Input / output paths
# =========================
# (1) Rank directly from the full sweep:
INPUT_CSV = "/content/final_SD_design/SD_design_full.csv"
# (2) To rank only the already filtered candidates, use this instead:
# INPUT_CSV = "/content/final_SD_design/filtered2/final_candidates.csv"

OUTDIR = "/content/final_SD_design/top50_ranked"
os.makedirs(OUTDIR, exist_ok=True)

TOP_N = 50

# =========================
# 1) Which boolean column selects the candidate pool
# =========================
# - pass_strict: pass_main plus both cross energies > 0
# - pass_main  : own-duplex energy window only
FILTER_COL = "pass_strict"   # <- default
# FILTER_COL = "pass_main"   # <- alternative

# Ranking rule: closest to -10 first, then larger cross_min, then larger mfe.
# SD and Spacer are appended as final tie-breakers: rows that tie on the three
# metrics would otherwise be ordered by their position in the CSV, which is
# written with imap_unordered and therefore not reproducible between runs.
RANK_KEYS = ["abs_to_minus10", "cross_min", "mfe", "SD", "Spacer"]
RANK_ASCENDING = [True, False, False, True, True]

# =========================
# 2) Load data
# =========================
df = pd.read_csv(INPUT_CSV)

# Fail early with a readable message if an expected column is absent.
need = ["SD","Spacer","dG_SD_ASD","dG_SD_WTASD","dG_WTSD_ASD","mfe", FILTER_COL]
missing = [c for c in need if c not in df.columns]
if missing:
    raise ValueError(f"필수 컬럼 누락: {missing}\n현재 컬럼: {list(df.columns)}")

# =========================
# 3) Keep only rows where FILTER_COL is True
# =========================
cand = df[df[FILTER_COL] == True].copy()
print(f"[INFO] {FILTER_COL} True rows: {len(cand):,}")

if len(cand) == 0:
    raise RuntimeError(f"{FILTER_COL}==True 후보가 0개야. FILTER_COL을 pass_main으로 바꾸거나 이전 단계 결과를 확인해줘.")

# =========================
# 4) Derived metrics used for ranking
# =========================
# (1) Distance of the own-duplex energy from -10 (smaller is better)
cand["abs_to_minus10"] = (cand["dG_SD_ASD"] - (-10.0)).abs()

# (2) Orthogonality: the weaker of the two cross energies must be large, so use min
cand["cross_min"] = cand[["dG_SD_WTASD","dG_WTSD_ASD"]].min(axis=1)

# (3) Structure: a larger (less negative) mfe is preferred, so mfe is sorted
#     in descending order. Negative mfe values are expected.

# =========================
# 5) De-duplicate per SD
# =========================
# The same SD appears once per spacer; keep only the best row of each SD,
# using the same rule as the final ranking.
cand_sorted_for_best = cand.sort_values(by=RANK_KEYS, ascending=RANK_ASCENDING)

best_per_sd = cand_sorted_for_best.groupby("SD", as_index=False).head(1).copy()
print(f"[INFO] unique SD (best per SD): {best_per_sd['SD'].nunique():,}")

# =========================
# 6) Final top-N selection (priority sort)
# =========================
ranked = best_per_sd.sort_values(by=RANK_KEYS, ascending=RANK_ASCENDING).copy()

top50 = ranked.head(TOP_N).copy()
print(f"[INFO] TOP{TOP_N} selected: {len(top50):,}")

# Save
top50_path = os.path.join(OUTDIR, f"TOP{TOP_N}_ranked.csv")
top50.to_csv(top50_path, index=False)
print("[SAVE]", top50_path)

# =========================
# 7) Visualisation (figures are written to OUTDIR, not shown)
# =========================
def _save_png(draw, title, xlabel, ylabel, fname, figsize=None):
    """Create a figure, let `draw(ax)` plot on it, label it and save it to OUTDIR/fname."""
    fig = plt.figure(figsize=figsize)
    ax = fig.gca()
    draw(ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTDIR, fname), dpi=200)
    plt.close(fig)

# (A) Distribution of abs_to_minus10: best-per-SD pool vs selected top set
_save_png(
    lambda ax: ax.hist(best_per_sd["abs_to_minus10"], bins=40),
    f"{FILTER_COL} True (best_per_sd) | abs(dG_SD_ASD - (-10))",
    "abs_to_minus10", "count",
    "hist_abs_to_minus10_best_per_sd.png",
)
_save_png(
    lambda ax: ax.hist(top50["abs_to_minus10"], bins=30),
    f"TOP{TOP_N} | abs(dG_SD_ASD - (-10))",
    "abs_to_minus10", "count",
    f"hist_abs_to_minus10_top{TOP_N}.png",
)

# (B) Scatter: closeness to -10 vs orthogonality (cross_min)
_save_png(
    lambda ax: ax.scatter(best_per_sd["abs_to_minus10"], best_per_sd["cross_min"], s=12),
    "abs_to_minus10 vs cross_min (best_per_sd)",
    "abs_to_minus10 (smaller is better)", "cross_min (bigger is better)",
    "scatter_abs_to_minus10_vs_crossmin_best_per_sd.png",
)
_save_png(
    lambda ax: ax.scatter(top50["abs_to_minus10"], top50["cross_min"], s=20),
    f"abs_to_minus10 vs cross_min (TOP{TOP_N})",
    "abs_to_minus10 (smaller is better)", "cross_min (bigger is better)",
    f"scatter_abs_to_minus10_vs_crossmin_top{TOP_N}.png",
)

# (C) Scatter: orthogonality (cross_min) vs structure (mfe)
_save_png(
    lambda ax: ax.scatter(best_per_sd["cross_min"], best_per_sd["mfe"], s=12),
    "cross_min vs mfe (best_per_sd)",
    "cross_min (bigger is better)", "mfe (bigger/less negative is better)",
    "scatter_crossmin_vs_mfe_best_per_sd.png",
)
_save_png(
    lambda ax: ax.scatter(top50["cross_min"], top50["mfe"], s=20),
    f"cross_min vs mfe (TOP{TOP_N})",
    "cross_min (bigger is better)", "mfe (bigger/less negative is better)",
    f"scatter_crossmin_vs_mfe_top{TOP_N}.png",
)

# (D) One bar chart per ranking metric, bars in rank order.
# The index is reset so later code sees the top set indexed 0..len-1.
top50 = top50.reset_index(drop=True)

_save_png(
    lambda ax: ax.bar(range(len(top50)), top50["abs_to_minus10"]),
    f"TOP{TOP_N} | abs_to_minus10 (lower is better)",
    "rank", "abs_to_minus10",
    f"bar_top{TOP_N}_abs_to_minus10.png",
    figsize=(12,4),
)
_save_png(
    lambda ax: ax.bar(range(len(top50)), top50["cross_min"]),
    f"TOP{TOP_N} | cross_min (higher is better)",
    "rank", "cross_min",
    f"bar_top{TOP_N}_cross_min.png",
    figsize=(12,4),
)
_save_png(
    lambda ax: ax.bar(range(len(top50)), top50["mfe"]),
    f"TOP{TOP_N} | mfe (higher / less negative is better)",
    "rank", "mfe",
    f"bar_top{TOP_N}_mfe.png",
    figsize=(12,4),
)

# =========================
# 8) 요약 리포트 저장
# =========================
report_path = os.path.join(OUTDIR, "rank_report.txt")

# Columns shown in the 10-row preview at the end of the report.
preview_cols = ["SD","Spacer","dG_SD_ASD","abs_to_minus10","dG_SD_WTASD","dG_WTSD_ASD","cross_min","mfe"]
preview_text = top50[preview_cols].head(10).to_string(index=False)

# The report is assembled in memory and written in one go.
report_lines = [
    "=== Ranking rule ===\n",
    "1) abs_to_minus10 ascending  (|dG_SD_ASD - (-10)| smaller is better)\n",
    "2) cross_min descending      (min(dG_SD_WTASD, dG_WTSD_ASD) bigger is better)\n",
    "3) mfe descending            (bigger/less negative is better)\n\n",
    f"INPUT_CSV  = {INPUT_CSV}\n",
    f"FILTER_COL = {FILTER_COL}\n",
    f"TOP_N      = {TOP_N}\n\n",
    f"True rows           : {len(cand):,}\n",
    f"Unique SD (best/SD) : {best_per_sd['SD'].nunique():,}\n",
    f"TOP{TOP_N} exported : {len(top50):,}\n\n",
    "=== TOP10 preview ===\n",
    preview_text,
    "\n",
]
with open(report_path, "w") as f:
    f.writelines(report_lines)

print("[SAVE]", report_path)

# List the output directory (IPython shell escape; shows at most 50 lines).
print("\n[FILES]")
!ls -lh {OUTDIR} | head -n 50


[INFO] pass_strict True rows: 4,928
[INFO] unique SD (best per SD): 77
[INFO] TOP50 selected: 50
[SAVE] /content/final_SD_design/top50_ranked/TOP50_ranked.csv
[SAVE] /content/final_SD_design/top50_ranked/rank_report.txt

[FILES]
total 472K
-rw-r--r-- 1 root root  44K Dec 23 07:40 bar_top50_abs_to_minus10.png
-rw-r--r-- 1 root root  45K Dec 23 07:40 bar_top50_cross_min.png
-rw-r--r-- 1 root root  37K Dec 23 07:40 bar_top50_mfe.png
-rw-r--r-- 1 root root  41K Dec 23 07:40 hist_abs_to_minus10_best_per_sd.png
-rw-r--r-- 1 root root  35K Dec 23 07:40 hist_abs_to_minus10_top50.png
-rw-r--r-- 1 root root 1.4K Dec 23 07:40 rank_report.txt
-rw-r--r-- 1 root root  59K Dec 23 07:40 scatter_abs_to_minus10_vs_crossmin_best_per_sd.png
-rw-r--r-- 1 root root  59K Dec 23 07:40 scatter_abs_to_minus10_vs_crossmin_top50.png
-rw-r--r-- 1 root root  59K Dec 23 07:40 scatter_crossmin_vs_mfe_best_per_sd.png
-rw-r--r-- 1 root root  57K Dec 23 07:40 scatter_crossmin_vs_mfe_top50.png
-rw-r--r-- 1 root root  14K