### 데이터 다운로드 (GRCh38 인간 참조 유전체)

In [None]:
# Create the target directory, then download the gzipped GRCh38.p13 genomic
# FASTA from the NCBI FTP server into genome/human/GRCh38.fna.gz.
!mkdir -p genome/human
!wget -O genome/human/GRCh38.fna.gz \
 ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz

In [None]:
# Decompress the downloaded archive; the preprocessing step reads the
# resulting genome/human/GRCh38.fna.
!gunzip genome/human/GRCh38.fna.gz

### 사전 학습용 데이터 전처리 (슬라이딩 윈도우 → CSV)

In [None]:
import csv
from Bio import SeqIO
from tqdm import tqdm

# --- Parameters ---
# Input FASTA file and output CSV file.
INPUT_FILE = "/content/genome/human/GRCh38.fna"
OUTPUT_FILE = "/content/genome/human/grch38_windows_seq.csv"

# Window length.
SEQ_LEN = 512
# Sliding step between consecutive window starts.
STRIDE = 256
# Maximum allowed fraction of 'N' in a window.
MAX_N_RATIO = 0.05

# Set to False for a test run that processes only some of the records.
USE_ALL_CHROMOSOMES = True
# Only meaningful when USE_ALL_CHROMOSOMES is False.
MAX_CHROMOSOMES = 1


def preprocess_genome_to_seq_csv(input_path, output_path):
    """Cut every FASTA record into fixed-length windows and write them to a CSV.

    Each record's sequence is upper-cased and scanned with a window of
    ``SEQ_LEN`` characters that advances by ``STRIDE``. Records shorter than
    ``SEQ_LEN`` are skipped, and a window is dropped when its fraction of
    ``N`` characters exceeds ``MAX_N_RATIO``. A trailing remainder shorter
    than one full window is not emitted.

    When ``USE_ALL_CHROMOSOMES`` is false, processing stops once
    ``MAX_CHROMOSOMES`` records have been processed; records skipped for
    being too short do not count towards that limit.

    Args:
        input_path: Path of the FASTA file to read.
        output_path: Path of the CSV file to write (opened in ``'w'`` mode).
            Columns are ``id`` (``<record id>_<start>``), ``chrom`` (the
            record id), ``start_pos`` (0-based window offset) and ``seq``.
    """
    print(f"Processing to CSV (seq only): {output_path}")

    # None means "no limit"; resolved once instead of on every iteration.
    limit = None if USE_ALL_CHROMOSOMES else MAX_CHROMOSOMES
    chrom_count = 0

    with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.writer(f_out)

        # CSV header
        writer.writerow(["id", "chrom", "start_pos", "seq"])

        # Parse the FASTA file record by record
        for record in tqdm(SeqIO.parse(input_path, "fasta"), desc="Chromosomes"):
            # Only reachable with a non-positive limit; the normal cut-off is
            # the check at the bottom of the loop.
            if limit is not None and chrom_count >= limit:
                break

            full_seq = str(record.seq).upper()
            chrom_id = record.id

            if len(full_seq) < SEQ_LEN:
                continue

            # Slide the window across the sequence
            for start in range(0, len(full_seq) - SEQ_LEN + 1, STRIDE):
                chunk = full_seq[start:start + SEQ_LEN]

                # Drop windows with too many 'N' characters
                if chunk.count('N') / SEQ_LEN > MAX_N_RATIO:
                    continue

                writer.writerow([f"{chrom_id}_{start}", chrom_id, start, chunk])

            chrom_count += 1

            # Stop before the iterator is advanced again: fetching the next
            # item would make the parser read a whole extra record only for
            # it to be discarded by the check at the top of the loop.
            if limit is not None and chrom_count >= limit:
                break

    print("\n완료! CSV 저장됨:", output_path)


# Run the preprocessing on the configured input and output paths
preprocess_genome_to_seq_csv(INPUT_FILE, OUTPUT_FILE)

In [None]:
# Preview the first lines of the generated CSV.
csv_path = "/content/genome/human/grch38_windows_seq.csv"

# The file is written as UTF-8, so read it back with the same explicit
# encoding instead of the platform default.
with open(csv_path, "r", encoding="utf-8") as f:
    # zip() stops after 5 lines or at end of file, whichever comes first,
    # so a short file does not print trailing empty lines.
    for _, line in zip(range(5), f):
        print(line.strip())