<a href="https://colab.research.google.com/github/jjjung99/SD-design/blob/main/6700%EB%A7%8C%EA%B0%9C_%EC%A0%84%EC%88%98%EC%A1%B0%EC%82%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ViennaRNA
!pip install viennarna biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
import functools
import itertools
import multiprocessing as mp

import RNA
from Bio.Seq import Seq

# ===============================
# 1. Define the combinatorial search space
# ===============================
AT = list("AT")
ATGC = list("ATGC")

# Every string of the given length over the given alphabet, in
# itertools.product order (right-most position varies fastest).
UPSTREAM_8 = list(map("".join, itertools.product(AT, repeat=8)))    # 2^8 = 256
SD_6 = list(map("".join, itertools.product(ATGC, repeat=6)))        # 4^6 = 4096
SPACER_6 = list(map("".join, itertools.product(AT, repeat=6)))      # 2^6 = 64

WT_ASD_10BP = "CACCTCCTTA"
# Positions 2..7 of the 10-mer above, i.e. "CCTCCT" (6-nt core).
WT_ASD_CORE = WT_ASD_10BP[2:8]
# Reverse complement of that core, i.e. "AGGAGG".
WT_ASD_CORE_RC = str(Seq(WT_ASD_CORE).reverse_complement())

# (lower, upper) bounds; the same values evaluate_candidate checks dg_orth
# against -- presumably kcal/mol, confirm against the ViennaRNA docs.
TARGET_RANGE = (-10, -8)


# ===============================
# 2. ΔG 계산
# ===============================
def calc_dg(a, b):
    """Return the energy ViennaRNA's ``duplexfold`` reports for *a* and *b*.

    Args:
        a: First sequence string.
        b: Second sequence string.

    Returns:
        The ``energy`` attribute of the ``RNA.duplexfold(a, b)`` result.
    """
    duplex = RNA.duplexfold(a, b)
    return duplex.energy


# ===============================
# 3. O-ASD 생성
# ===============================
def make_o_asd(o_sd):
    """Build the orthogonal anti-SD (O-ASD) sequence matching *o_sd*.

    The reverse complement of *o_sd* is placed between the first two and
    the trailing (index 8 onward) characters of ``WT_ASD_10BP``.

    Args:
        o_sd: Orthogonal SD sequence string.

    Returns:
        The assembled O-ASD string.
    """
    prefix, suffix = WT_ASD_10BP[:2], WT_ASD_10BP[8:]
    core = str(Seq(o_sd).reverse_complement())
    return "".join((prefix, core, suffix))


# ===============================
# 4. 평가 함수 (멀티프로세싱 대상)
# ===============================
@functools.lru_cache(maxsize=None)
def _score_sd(o_sd):
    """Score one O-SD hexamer, independent of its upstream/spacer context.

    Both energies depend only on *o_sd*, so the result is cached: each
    worker process computes every distinct hexamer at most once instead of
    once per (upstream, spacer) combination.

    Args:
        o_sd: Orthogonal SD sequence string.

    Returns:
        ``(o_asd_full, dg_orth, dg_wt)`` if the hexamer passes both filters,
        otherwise ``None``.
    """
    o_asd_full = make_o_asd(o_sd)
    o_asd_core = o_asd_full[2:8]  # 6-nt core of the O-ASD

    # dG(O-SD : O-ASD core) must fall inside the shared target window.
    lower, upper = TARGET_RANGE
    dg_orth = calc_dg(o_sd, o_asd_core)
    if not (lower <= dg_orth <= upper):
        return None

    # NOTE(review): the partner here is WT_ASD_CORE_RC, the reverse
    # complement of the wild-type ASD core, not the ASD core itself --
    # confirm this is the intended sequence for the cross-talk check.
    # Only strictly positive energies are kept.
    dg_wt = calc_dg(o_sd, WT_ASD_CORE_RC)
    if dg_wt <= 0:
        return None

    return o_asd_full, dg_orth, dg_wt


def evaluate_candidate(item):
    """Evaluate one (upstream, O-SD, spacer) combination.

    Intended as the multiprocessing work function.

    Args:
        item: ``(upstream, o_sd, spacer)`` tuple of sequence strings.

    Returns:
        A CSV row ``"<20bp>,<O_SD>,<O_ASD>,<dg_orth>,<dg_wt>"`` without a
        line terminator, or ``None`` when the O-SD fails either filter.
    """
    upstream, o_sd, spacer = item

    scored = _score_sd(o_sd)
    if scored is None:
        return None
    o_asd_full, dg_orth, dg_wt = scored

    # Full region: upstream + O-SD + spacer.
    region20 = upstream + o_sd + spacer

    return f"{region20},{o_sd},{o_asd_full},{dg_orth},{dg_wt}"



# ===============================
# 5. CHUNK 생성기
# ===============================
def chunk_generator(chunk_size=1000000):
    """Yield the Upstream x SD x Spacer combinations in fixed-size batches.

    Combinations are produced with the spacer varying fastest, then the SD,
    then the upstream segment.

    Args:
        chunk_size: Number of ``(upstream, sd, spacer)`` tuples per batch.

    Yields:
        Lists of tuples; every list has ``chunk_size`` entries except
        possibly the last, which holds whatever remains.
    """
    pending = []
    for combo in itertools.product(UPSTREAM_8, SD_6, SPACER_6):
        pending.append(combo)
        if len(pending) == chunk_size:
            yield pending
            pending = []
    # Flush the final, partially filled batch.
    if pending:
        yield pending


# ===============================
# 6. 메인 실행
# ===============================
def run_full_scan():
    """Scan every candidate and write the passing ones to a CSV file.

    Each batch from ``chunk_generator`` is mapped over a worker pool with
    ``evaluate_candidate``; every non-empty result becomes one row of
    ``orthogonal_SD_fullscan.csv``. Row order is not deterministic because
    results are collected with ``imap_unordered``.
    """
    # Leave one core free, but never ask for zero workers: Pool(0) raises
    # ValueError on a single-core machine.
    cpu_count = max(1, mp.cpu_count() - 1)
    print(f"Using {cpu_count} CPU cores")

    # One pool for the whole run (instead of one per chunk) avoids repeated
    # process start-up. It is created before the file is opened so workers
    # do not inherit the output handle. Both are released on any exit path.
    with mp.Pool(cpu_count) as pool, \
            open("orthogonal_SD_fullscan.csv", "w") as outfile:
        outfile.write("20bp,O_SD,O_ASD,dg_orth,dg_wt\n")

        for idx, chunk in enumerate(chunk_generator()):

            print(f"Processing chunk {idx+1} ... size {len(chunk)}")

            # A large chunksize hands work to the workers in bulk; the
            # default of 1 pays one IPC round-trip per candidate.
            results = pool.imap_unordered(
                evaluate_candidate, chunk, chunksize=10_000
            )
            # Terminate each row with a real newline ("\n", not "/n").
            outfile.writelines(result + "\n" for result in results if result)

    print("Finished full 6,700만 개 전수 계산!")


# Start the scan only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_full_scan()


Using 1 CPU cores
Processing chunk 1 ... size 1000000
Processing chunk 2 ... size 1000000
Processing chunk 3 ... size 1000000
Processing chunk 4 ... size 1000000
Processing chunk 5 ... size 1000000
Processing chunk 6 ... size 1000000
Processing chunk 7 ... size 1000000
Processing chunk 8 ... size 1000000
Processing chunk 9 ... size 1000000
Processing chunk 10 ... size 1000000
Processing chunk 11 ... size 1000000
Processing chunk 12 ... size 1000000
Processing chunk 13 ... size 1000000
Processing chunk 14 ... size 1000000
Processing chunk 15 ... size 1000000
Processing chunk 16 ... size 1000000
Processing chunk 17 ... size 1000000
Processing chunk 18 ... size 1000000
Processing chunk 19 ... size 1000000
Processing chunk 20 ... size 1000000
Processing chunk 21 ... size 1000000
Processing chunk 22 ... size 1000000
Processing chunk 23 ... size 1000000
Processing chunk 24 ... size 1000000
Processing chunk 25 ... size 1000000
Processing chunk 26 ... size 1000000
Processing chunk 27 ... size 

In [13]:
# Repair the scan output: rows were terminated with the two characters "/n"
# rather than a newline, so rewrite them into a corrected copy.
input_file = "orthogonal_SD_fullscan.csv"
output_file = "orthogonal_SD_fullscan_fixed.csv"

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    # Read everything, swap each literal "/n" for a real line break, and
    # write the result out in one go.
    text = infile.read().replace("/n", "\n")
    outfile.write(text)

print("수정된 CSV 저장 완료 → orthogonal_SD_fullscan_fixed.csv")


20bp 전체 UTR-only CSV 저장 완료 → 20bp_only.csv


In [14]:
# Count the non-blank data rows of the repaired CSV (header excluded).
with open("orthogonal_SD_fullscan_fixed.csv", "r") as f:
    next(f, None)  # skip the header line, if the file has one
    count = sum(1 for row in f if row.strip())

print("최종 후보 개수:", count)


최종 후보 개수: 4390912


Upstream 256 × SD 4096 × Spacer 64 = 67,108,864개 (약 6,700만 개)

최종 후보 개수 : 4,390,912개

전체 조합 중 약 6.5%가 조건을 통과함