# 사이트
- ncbi virus
- virushostdb

# k-mer
- 단일염기 A, C, G, T는 정보량이 너무 적음
- k-mer는 DNA에서 의미 단위의 토큰 역할을 함 -> 단어처럼 작동
- 3-mer: 문맥 포함, 의미 있는 단위
- 6-mer: 더 긴 문맥 가능, 희소성 증가

In [None]:
!pip install transformers
!pip install sentencepiece

# 기본 버전

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

# Input data; both names are expected to be defined before this cell runs.
x  # gene-sequence column
y  # ground-truth labels (0 or 1)

# Hold out 20% for testing, then 20% of the remainder for validation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=123)

# Use the same 6-mer DNABERT checkpoint id as the MIL cell at the end of this
# notebook ("zhihan1996/DNA_bert_6"); the id previously written here did not
# match it.
# NOTE(review): if the checkpoint only ships PyTorch weights, from_pretrained
# may additionally need from_pt=True - confirm against the model repository.
model_name = "zhihan1996/DNA_bert_6"
# model_name = "nucleotide-transformer/dna-bert-500k"  # another checkpoint to try
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def kmer_tokenizer(seq, k=6):
    """Return the overlapping k-mers of ``seq`` joined by single spaces.

    Every length-``k`` slice taken with stride 1 becomes one token. A sequence
    shorter than ``k`` produces an empty string.
    """
    kmer_count = len(seq) - k + 1
    return ' '.join(seq[start:start + k] for start in range(kmer_count))
    
k = 6
# Turn every raw sequence into a space-separated string of overlapping k-mers.
x_train = [kmer_tokenizer(seq, k) for seq in x_train]
x_val = [kmer_tokenizer(seq, k) for seq in x_val]
x_test = [kmer_tokenizer(seq, k) for seq in x_test]

# Encode to token ids; inputs longer than 512 tokens are truncated.
x_train = tokenizer(x_train, return_tensors="tf", padding=True, truncation=True, max_length=512)
x_val = tokenizer(x_val, return_tensors="tf", padding=True, truncation=True, max_length=512)
x_test = tokenizer(x_test, return_tensors="tf", padding=True, truncation=True, max_length=512)

y_train = tf.convert_to_tensor(np.array(y_train), dtype=tf.int32)
y_val = tf.convert_to_tensor(np.array(y_val), dtype=tf.int32)
y_test = tf.convert_to_tensor(np.array(y_test), dtype=tf.int32)


class PositiveClassAUC(tf.keras.metrics.AUC):
    """AUC over P(class 1) derived from two-class logits.

    The stock AUC metric expects scores shaped like the labels, so it cannot
    consume (batch, 2) logits together with sparse integer labels.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        positive_prob = tf.nn.softmax(y_pred, axis=-1)[:, 1]
        return super().update_state(tf.reshape(y_true, [-1]), positive_prob, sample_weight)


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # or BinaryCrossentropy
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc"), PositiveClassAUC(name="auc")],
)

# Callbacks
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # watch validation loss; val_auc is also possible
    patience=3,          # stop after 3 epochs without improvement
    restore_best_weights=True)

model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='DNA_BERT.keras',  # file name to save to
    monitor='val_loss',
    save_best_only=True)

train_dataset = tf.data.Dataset.from_tensor_slices((dict(x_train), y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(x_val), y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(x_test), y_test))

# Only the training stream is shuffled; evaluation sets keep their order.
train_dataset = train_dataset.shuffle(100).batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[early_stop, model_checkpoint]
)

# Save
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

# Evaluate on the held-out test split; the validation split already drove
# early stopping and checkpoint selection above.
loss, acc, auc = model.evaluate(test_dataset)

#예측
def predict_bert(model, tokenizer, texts, k=6):
    """Predict a class index for one raw sequence or a list of raw sequences.

    A single string is treated as a one-element batch. Each sequence is
    converted to k-mers, encoded (truncated to 512 tokens) and passed through
    ``model``; the arg-max over the softmax of the logits is returned as a
    NumPy array with one entry per input sequence.
    """
    sequences = [texts] if isinstance(texts, str) else texts
    kmer_texts = [kmer_tokenizer(sequence, k) for sequence in sequences]
    encoded = tokenizer(
        kmer_texts,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=512,
    )
    class_probs = tf.nn.softmax(model(encoded).logits, axis=-1)
    return tf.argmax(class_probs, axis=-1).numpy()

# Example call. NOTE(review): `texts` is not assigned anywhere in the visible
# part of this notebook; bind it to a raw sequence string (or a list of them)
# before running this line.
predict_bert(model,tokenizer,texts)

In [None]:
# roc_curve그리기
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import numpy as np

# 1) Collect class-1 probabilities and true labels over val_dataset
y_true = []
y_scores = []

for features, targets in val_dataset:
    batch_logits = model(features).logits
    # softmax column 1 is the probability of class 1
    positive_probs = tf.nn.softmax(batch_logits, axis=1)[:, 1]
    y_true.extend(targets.numpy())
    y_scores.extend(positive_probs.numpy())

y_scores = np.array(y_scores)
y_true = np.array(y_true)

# 2) Compute the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# 3) Plot the ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'ROC curve (AUC = {roc_auc:.3f})',
)
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# 슬라이딩 윈도우 하기

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# 1. k-mer tokenizer
def kmer_tokenizer(seq, k=6):
    """Convert a sequence into space-separated overlapping k-mers.

    Slides a length-``k`` window one position at a time over ``seq``; when
    ``seq`` is shorter than ``k`` the result is an empty string.
    """
    tokens = []
    for offset in range(len(seq) - k + 1):
        tokens.append(seq[offset:offset + k])
    return ' '.join(tokens)

# 2. 슬라이딩 윈도우 함수
def sliding_window(seq, window_size=512 + 6 - 1, step=50):
    """Cut ``seq`` into overlapping windows of ``window_size`` characters.

    Windows start every ``step`` characters. When the last regular window does
    not end exactly at the end of the sequence, one extra window anchored at
    the end is appended so the tail is always covered. A sequence no longer
    than ``window_size`` is returned whole as a single (shorter) window.

    Args:
        seq: sequence to split (anything supporting ``len`` and slicing).
        window_size: window length in characters.
        step: distance between consecutive window starts.

    Returns:
        List of windows; it always has at least one element.

    Raises:
        ValueError: if ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive")

    # Short input: keep it as one window. Previously the result was [] whenever
    # (len(seq) - window_size) was a negative multiple of step, which silently
    # dropped the whole sequence.
    if len(seq) <= window_size:
        return [seq]

    last_start = len(seq) - window_size
    windows = [seq[start:start + window_size] for start in range(0, last_start + 1, step)]
    if last_start % step != 0:
        # The regular grid stopped short of the end; add the end-anchored window.
        windows.append(seq[last_start:])
    return windows

# 3. 긴 시퀀스 리스트를 윈도우 분할 후 k-mer 변환, 레이블 복제까지
def prepare_dataset(sequences, labels, k=6, window_size=512, step=50):
    """Expand labelled sequences into labelled k-mer windows.

    Each sequence is split with ``sliding_window`` (window length
    ``window_size + k - 1`` characters, i.e. ``window_size`` k-mers), every
    window is converted with ``kmer_tokenizer``, and the sequence's label is
    repeated once per window.

    Returns:
        ``(kmer_windows, window_labels)`` - two parallel lists.
    """
    kmer_texts = []
    window_labels = []
    for sequence, label in zip(sequences, labels):
        chunks = sliding_window(sequence, window_size=window_size + k - 1, step=step)
        kmer_texts += [kmer_tokenizer(chunk, k) for chunk in chunks]
        # every window inherits the label of the sequence it came from
        window_labels += [label] * len(chunks)
    return kmer_texts, window_labels

# 4. Load and split the data (x: list of gene sequences, y: list of labels)
# e.g. x = ['ACGT...', 'TGCA...', ...], y = [0, 1, ...]
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
x_train, x_val, y_train, y_val = train_test_split(x_train_full, y_train_full, test_size=0.2, random_state=123)

# 5. Window splitting, k-mer conversion and label expansion
k = 6
window_size = 512
step = 50

x_train_windows, y_train_windows = prepare_dataset(x_train, y_train, k, window_size, step)
x_val_windows, y_val_windows = prepare_dataset(x_val, y_val, k, window_size, step)
x_test_windows, y_test_windows = prepare_dataset(x_test, y_test, k, window_size, step)

# 6. Load the tokenizer and the model.
# Same 6-mer DNABERT checkpoint id as the MIL cell at the end of this notebook;
# the id previously written here did not match it.
model_name = "zhihan1996/DNA_bert_6"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 7. Tokenize
x_train_enc = tokenizer(x_train_windows, return_tensors="tf", padding=True, truncation=True, max_length=window_size)
x_val_enc = tokenizer(x_val_windows, return_tensors="tf", padding=True, truncation=True, max_length=window_size)
x_test_enc = tokenizer(x_test_windows, return_tensors="tf", padding=True, truncation=True, max_length=window_size)

# 8. Convert labels to tensors
y_train_tensor = tf.convert_to_tensor(np.array(y_train_windows), dtype=tf.int32)
y_val_tensor = tf.convert_to_tensor(np.array(y_val_windows), dtype=tf.int32)
y_test_tensor = tf.convert_to_tensor(np.array(y_test_windows), dtype=tf.int32)

# 9. Build the datasets and batch them (only the training stream is shuffled)
train_dataset = tf.data.Dataset.from_tensor_slices((dict(x_train_enc), y_train_tensor)).shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(x_val_enc), y_val_tensor)).batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(x_test_enc), y_test_tensor)).batch(32).prefetch(tf.data.AUTOTUNE)


# 10. Compile the model
class PositiveClassAUC(tf.keras.metrics.AUC):
    """AUC over P(class 1) derived from two-class logits.

    The stock AUC metric expects scores shaped like the labels, so it cannot
    consume (batch, 2) logits together with sparse integer labels.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        positive_prob = tf.nn.softmax(y_pred, axis=-1)[:, 1]
        return super().update_state(tf.reshape(y_true, [-1]), positive_prob, sample_weight)


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc"), PositiveClassAUC(name="auc")]
)

# 11. Callbacks
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='DNA_BERT.keras', monitor='val_loss', save_best_only=True)

# 12. Train
model.fit(train_dataset, validation_data=val_dataset, epochs=3, callbacks=[early_stop, model_checkpoint])

# 13. Evaluate (results follow the compile order: loss, acc, auc)
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]:.4f}, Accuracy: {results[1]:.4f}, AUC: {results[2]:.4f}")

# 14. 예측
def predict_long_sequence(model, tokenizer, sequence, k=6, window_size=512, step=50):
    """Score a long sequence by averaging per-window class-1 probabilities.

    The sequence is split with ``sliding_window`` into windows of
    ``window_size + k - 1`` characters, each window is k-mer tokenized, encoded
    and passed through ``model`` on its own, and the softmax probability of
    class 1 is collected.

    Returns:
        ``(avg_prob, probs)`` - the mean class-1 probability and the list of
        per-window probabilities.

    Raises:
        ValueError: if the sequence yields no window (previously this surfaced
            as a ZeroDivisionError when computing the mean).
    """
    windows = sliding_window(sequence, window_size=window_size + k - 1, step=step)
    if not windows:
        raise ValueError("sequence produced no windows; nothing to predict")

    probs = []
    for win_seq in windows:
        kmer_seq = kmer_tokenizer(win_seq, k)
        enc = tokenizer(kmer_seq, return_tensors="tf", padding=True, truncation=True, max_length=window_size)
        logits = model(enc).logits
        # row 0 is the only sample in the batch; column 1 is class 1
        probs.append(tf.nn.softmax(logits, axis=-1)[0, 1].numpy())
    avg_prob = sum(probs) / len(probs)
    return avg_prob, probs
# Example call. NOTE(review): `long_seq` is not assigned in this cell; in the
# visible notebook it is only assigned in the later "final" cell, so define it
# before running this line.
avg_prob, window_probs = predict_long_sequence(model, tokenizer, long_seq)

# 내부에서 각 샘플별 윈도우들 하나로 합쳐 분류층에 넣기
- (샘플, 윈도우, 임베딩) 형태의 입력을 (샘플×윈도우, 임베딩)으로 펼쳐 BERT에 넣고, 출력인 (샘플×윈도우, CLS)를 다시 (샘플, 윈도우, CLS)로 되돌린 뒤, 같은 샘플에 속한 윈도우들의 CLS를 평균내어 (샘플, 평균 CLS)로 만든 다음 Dense 층에 넣음
- mean + Dense 대신 max, attention, LSTM 등으로도 윈도우를 집계할 수 있음

In [None]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
import numpy as np

# Data: both names are expected to be defined before this cell runs.
x # gene sequences
y # label values (0, 1)

# k-mer tokenizer
def kmer_tokenizer(seq, k=6):
    """Join all overlapping length-``k`` substrings of ``seq`` with spaces.

    Substrings are taken at every position (stride 1). If ``seq`` has fewer
    than ``k`` characters the result is an empty string.
    """
    # Enumerate k-mers by their end index; end - k is the matching start.
    ends = range(k, len(seq) + 1)
    return ' '.join([seq[end - k:end] for end in ends])

# sliding window
def sliding_window(seq, window_size=512 + 6 - 1, step=50):
    """Split ``seq`` into overlapping fixed-length windows.

    Window starts advance by ``step``. If that grid does not reach the end of
    the sequence, a final window anchored at the end is added. A sequence that
    is not longer than ``window_size`` comes back as one window holding the
    whole sequence.

    Args:
        seq: sequence to split (anything supporting ``len`` and slicing).
        window_size: window length in characters.
        step: distance between consecutive window starts.

    Returns:
        List of windows; it always has at least one element.

    Raises:
        ValueError: if ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive")

    last_start = len(seq) - window_size
    if last_start <= 0:
        # Short input: the old modulo test returned [] when the (negative)
        # length difference was a multiple of step, dropping the sequence.
        return [seq]

    windows = []
    start = 0
    while start <= last_start:
        windows.append(seq[start:start + window_size])
        start += step
    if last_start % step != 0:
        # cover the tail that the regular grid missed
        windows.append(seq[last_start:])
    return windows

# 모델 클래스 정의
class WindowAggregateClassifier(tf.keras.Model):
    """Classify a sequence from the mean of its per-window BERT embeddings.

    Each sample is a stack of tokenized windows. All windows of all samples are
    run through BERT in one flattened batch, the pooled outputs are averaged
    per sample over the non-padding windows, and a dense layer turns the
    average into logits.
    """

    def __init__(self, pretrained_model_name, num_labels=2):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(pretrained_model_name)
        self.classifier = tf.keras.layers.Dense(num_labels)  # classifier head, outputs logits

    def call(self, inputs, training=False):
        """
        inputs: dict with keys:
            - input_ids: shape (batch_size, windows, seq_len)
            - attention_mask: shape (batch_size, windows, seq_len)

        Returns logits of shape (batch_size, num_labels).
        """
        input_ids = inputs['input_ids']            # (batch, windows, seq_len)
        attention_mask = inputs['attention_mask']  # (batch, windows, seq_len)

        batch_size = tf.shape(input_ids)[0]
        windows = tf.shape(input_ids)[1]
        seq_len = tf.shape(input_ids)[2]
        # Python int, so the feature axis stays statically known after the
        # reshape below (the Dense head needs a defined last dimension).
        hidden_size = self.bert.config.hidden_size

        # Merge the batch and window axes so BERT sees every window at once.
        input_ids_reshaped = tf.reshape(input_ids, (-1, seq_len))            # (batch*windows, seq_len)
        attention_mask_reshaped = tf.reshape(attention_mask, (-1, seq_len))  # (batch*windows, seq_len)

        bert_outputs = self.bert(input_ids_reshaped, attention_mask=attention_mask_reshaped, training=training)
        pooled_output = bert_outputs.pooler_output  # (batch*windows, hidden_size)

        # Back to (batch, windows, hidden_size).
        pooled_output = tf.reshape(pooled_output, (batch_size, windows, hidden_size))

        # A window counts only if at least one of its tokens is attended to.
        # All-padding windows (attention mask entirely 0) are excluded so they
        # cannot dilute the average.
        window_is_real = tf.reduce_any(tf.not_equal(attention_mask, 0), axis=-1)  # (batch, windows)
        window_weight = tf.cast(window_is_real, pooled_output.dtype)[:, :, tf.newaxis]
        real_count = tf.maximum(tf.reduce_sum(window_weight, axis=1), 1.0)  # avoid 0/0
        pooled_mean = tf.reduce_sum(pooled_output * window_weight, axis=1) / real_count  # (batch, hidden_size)

        logits = self.classifier(pooled_mean)  # (batch, num_labels)

        return logits

# Tokenizer and hyperparameters.
# Same 6-mer DNABERT checkpoint id as the MIL cell at the end of this notebook;
# the id previously written here did not match it.
model_name = "zhihan1996/DNA_bert_6"
tokenizer = BertTokenizer.from_pretrained(model_name)
num_labels = 2

# Example input data (batch of 2). Both strings are far shorter than one
# window (window_size + k - 1 = 517 characters), so sliding_window can return
# at most one window per sequence here, not three.
example_sequences = [
    "ACGTGCTAGCTAGCTAGCTGATCGATCGTACGATCGATGCTAGCTAGCTAGCATCGATCGATGCTAGCTAGCTAGCATCGATCGATGC",
    "TGCTAGCTAGCTAGCTAGCTAGCATCGATGCTAGCTAGCTAGCATCGATCGTAGCTAGCTAGCATCGTAGCTAGCTAGCTAGCATCGA"
]

# 1. Split each sequence into windows and k-mer tokenize them
k = 6
window_size = 512
step = 50

all_windows = []
window_counts = []  # number of windows contributed by each sequence, in order
for seq in example_sequences:
    windows = sliding_window(seq, window_size=window_size + k - 1, step=step)
    window_counts.append(len(windows))
    windows_kmer = [kmer_tokenizer(w, k) for w in windows]
    all_windows.extend(windows_kmer)

# 2. Tokenize every window in one call
encodings = tokenizer(all_windows, padding='max_length', truncation=True, max_length=window_size, return_tensors='tf')

# 3. Values needed to regroup the flat window list per sequence
max_windows = max(window_counts)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# 윈도우 개수가 다르면 패딩 필요 → 짧은 시퀀스는 빈 윈도우로 패딩
def pad_windows(input_ids, attention_mask, window_counts, max_windows):
    """Regroup flat per-window encodings by sample and pad the window axis.

    ``input_ids`` and ``attention_mask`` hold the windows of all samples
    back-to-back along axis 0; ``window_counts[i]`` is the number of rows
    belonging to sample ``i``. Samples with fewer than ``max_windows`` windows
    are extended with all-zero rows (ids and mask alike).

    Returns:
        ``(input_ids, attention_mask)`` stacked along a new leading sample
        axis, each with ``max_windows`` rows per sample.
    """
    tokens_per_window = input_ids.shape[1]
    ids_per_sample = []
    masks_per_sample = []
    offset = 0
    for n_windows in window_counts:
        end = offset + n_windows
        sample_ids = input_ids[offset:end]
        sample_mask = attention_mask[offset:end]

        missing = max_windows - n_windows
        if missing > 0:
            # zero rows stand in for the windows this sample does not have
            filler_ids = tf.zeros((missing, tokens_per_window), dtype=input_ids.dtype)
            filler_mask = tf.zeros((missing, tokens_per_window), dtype=attention_mask.dtype)
            sample_ids = tf.concat([sample_ids, filler_ids], axis=0)
            sample_mask = tf.concat([sample_mask, filler_mask], axis=0)

        ids_per_sample.append(sample_ids)
        masks_per_sample.append(sample_mask)
        offset = end
    return tf.stack(ids_per_sample), tf.stack(masks_per_sample)

input_ids_padded, attention_mask_padded = pad_windows(input_ids, attention_mask, window_counts, max_windows)

# 4. Create and compile the model
model = WindowAggregateClassifier(pretrained_model_name=model_name, num_labels=num_labels)


class PositiveClassAUC(tf.keras.metrics.AUC):
    """AUC over P(class 1) derived from two-class logits.

    The stock AUC metric expects scores shaped like the labels, so it cannot
    consume (batch, 2) logits together with sparse integer labels.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        positive_prob = tf.nn.softmax(y_pred, axis=-1)[:, 1]
        return super().update_state(tf.reshape(y_true, [-1]), positive_prob, sample_weight)


optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='acc'), PositiveClassAUC(name='auc')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# 5. Labels (example)
labels = tf.constant([0,1])  # batch size 2

# 6. Training example (here the model is simply called once)
logits = model({'input_ids': input_ids_padded, 'attention_mask': attention_mask_padded})
print(logits)

# 7. To use fit(), build a dataset whose inputs follow the same dict format.

In [None]:
# 예측
def predict_long_sequence_class(model, tokenizer, sequence, k=6, window_size=512, step=50):
    """Classify one long sequence with a window-aggregating model.

    The sequence is windowed, k-mer tokenized and encoded (every window padded
    or truncated to ``window_size`` tokens), then passed to ``model`` as a
    single sample containing all windows.

    Returns:
        ``(pred_class, pred_prob)`` - the arg-max class and the softmax
        probability of class 1.
    """
    # 1. Sliding windows -> k-mer strings
    chunks = sliding_window(sequence, window_size + k - 1, step)
    kmer_texts = [kmer_tokenizer(chunk, k) for chunk in chunks]

    # 2. Tokenize
    encoded = tokenizer(
        kmer_texts,
        padding='max_length',
        truncation=True,
        max_length=window_size,
        return_tensors='tf',
    )

    # 3. Shape as one sample: (1, windows, seq_len). With a single sample no
    #    window padding is ever required, so adding the axis is sufficient.
    batch_ids = tf.expand_dims(encoded['input_ids'], axis=0)
    batch_mask = tf.expand_dims(encoded['attention_mask'], axis=0)

    # 4. Predict
    class_probs = tf.nn.softmax(
        model({'input_ids': batch_ids, 'attention_mask': batch_mask}),
        axis=-1,
    )
    pred_class = tf.argmax(class_probs, axis=-1).numpy()[0]  # predicted class
    pred_prob = class_probs.numpy()[0][1]                     # probability of class 1

    return pred_class, pred_prob

# 최종

In [None]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# ============================
# 1. 데이터
# ============================
# Placeholders: replace the literal `[...]` with the real data before running.
x = [...]  # list of gene sequences (strings)
y = [...]  # ground-truth labels (0 or 1)

# ============================
# 2. 전처리 함수
# ============================
def kmer_tokenizer(seq, k=6):
    """Tokenize ``seq`` into overlapping k-mers separated by spaces.

    One k-mer is emitted per start position (stride 1); sequences shorter than
    ``k`` yield an empty string.
    """
    starts = range(len(seq) - k + 1)
    return ' '.join(map(lambda start: seq[start:start + k], starts))

def sliding_window(seq, window_size=512+6-1, step=300):
    """Cut ``seq`` into overlapping windows of ``window_size`` characters.

    Windows start every ``step`` characters; if the last regular window stops
    short of the end of the sequence, an extra end-anchored window is appended.
    A sequence no longer than ``window_size`` is returned whole as a single
    window.

    Args:
        seq: sequence to split (anything supporting ``len`` and slicing).
        window_size: window length in characters.
        step: distance between consecutive window starts.

    Returns:
        List of windows; it always has at least one element.

    Raises:
        ValueError: if ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive")

    # Short input: keep it as one window. The previous modulo check returned []
    # when (len(seq) - window_size) was a negative multiple of step.
    if len(seq) <= window_size:
        return [seq]

    last_start = len(seq) - window_size
    windows = [seq[begin:begin + window_size] for begin in range(0, last_start + 1, step)]
    if last_start % step:
        windows.append(seq[last_start:])  # tail not reached by the regular grid
    return windows

def encode_and_pad(sequences, labels, tokenizer, k=6, window_size=512, step=300):
    """Encode sequences as per-sample window stacks padded to a common size.

    Each sequence is windowed with ``sliding_window`` (``window_size + k - 1``
    characters per window), k-mer tokenized and encoded with every window
    padded or truncated to ``window_size`` tokens. Samples with fewer windows
    than the largest sample are extended with padding windows whose attention
    mask is all zeros.

    The padding target is the largest window count actually produced. The
    previous closed-form estimate evaluated to 0 (or less) when every sequence
    was shorter than one window, which truncated all windows away.

    Args:
        sequences: iterable of raw sequences.
        labels: iterable of labels, paired with ``sequences`` by position.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` arrays.
        k: k-mer length.
        window_size: number of tokens per window.
        step: window start spacing in characters.

    Returns:
        ``(input_ids, attention_mask, labels)`` as int32 tensors; the first two
        have shape (samples, max_windows, window_size).

    Raises:
        ValueError: if ``sequences`` is empty.
    """
    sequences = list(sequences)
    if not sequences:
        raise ValueError("encode_and_pad() needs at least one sequence")

    # Window everything first so the real maximum window count is known.
    windows_per_seq = []
    for seq in sequences:
        windows = sliding_window(seq, window_size + k - 1, step)
        # Never let a sample end up with zero windows.
        windows_per_seq.append(windows if windows else [seq])
    max_windows = max(len(windows) for windows in windows_per_seq)

    # Pad ids with the tokenizer's pad id when it exposes one (0 otherwise).
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for windows, label in zip(windows_per_seq, labels):
        kmer_windows = [kmer_tokenizer(w, k) for w in windows]
        enc = tokenizer(kmer_windows, padding='max_length', truncation=True,
                        max_length=window_size, return_tensors='np')

        input_ids = enc['input_ids']
        attention_mask = enc['attention_mask']

        pad_len = max_windows - input_ids.shape[0]
        if pad_len > 0:
            input_ids = np.pad(input_ids, ((0, pad_len), (0, 0)), constant_values=pad_id)
            attention_mask = np.pad(attention_mask, ((0, pad_len), (0, 0)), constant_values=0)

        all_input_ids.append(input_ids)
        all_attention_masks.append(attention_mask)
        all_labels.append(label)

    return (
        tf.convert_to_tensor(np.stack(all_input_ids).astype(np.int32), dtype=tf.int32),
        tf.convert_to_tensor(np.stack(all_attention_masks).astype(np.int32), dtype=tf.int32),
        tf.convert_to_tensor(all_labels, dtype=tf.int32)
    )

# ============================
# 3. 모델 클래스 정의
# ============================
class WindowAggregateClassifier(tf.keras.Model):
    """Classify a sequence from the mean of its per-window BERT embeddings.

    All windows of all samples go through BERT as one flattened batch; the
    pooled outputs are averaged per sample over the non-padding windows and a
    dense layer maps the average to logits.
    """

    def __init__(self, pretrained_model_name, num_labels=2):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(pretrained_model_name)
        self.classifier = tf.keras.layers.Dense(num_labels)

    def call(self, inputs, training=False):
        """Return logits of shape (batch, num_labels).

        ``inputs`` is a dict with ``input_ids`` and ``attention_mask``, both
        indexed as (batch, windows, seq_len).
        """
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        batch_size = tf.shape(input_ids)[0]
        windows = tf.shape(input_ids)[1]
        seq_len = tf.shape(input_ids)[2]
        # Python int, so the feature axis stays statically known after the
        # reshape below (the Dense head needs a defined last dimension).
        hidden_size = self.bert.config.hidden_size

        input_ids_flat = tf.reshape(input_ids, (-1, seq_len))
        attn_flat = tf.reshape(attention_mask, (-1, seq_len))

        bert_output = self.bert(input_ids_flat, attention_mask=attn_flat, training=training)
        pooled = bert_output.pooler_output  # (batch * windows, hidden)

        pooled = tf.reshape(pooled, (batch_size, windows, hidden_size))

        # Average only over real windows: a padding window has an all-zero
        # attention mask, and including it would dilute the mean.
        window_is_real = tf.reduce_any(tf.not_equal(attention_mask, 0), axis=-1)  # (batch, windows)
        window_weight = tf.cast(window_is_real, pooled.dtype)[:, :, tf.newaxis]
        real_count = tf.maximum(tf.reduce_sum(window_weight, axis=1), 1.0)  # avoid 0/0
        mean_pooled = tf.reduce_sum(pooled * window_weight, axis=1) / real_count  # (batch, hidden)

        logits = self.classifier(mean_pooled)  # (batch, num_labels)
        return logits

# ============================
# 4. 학습/검증/테스트 분할
# ============================
# 70% train, then the remaining 30% split evenly into validation and test.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

# ============================
# 5. Model and tokenizer
# ============================
# Same 6-mer DNABERT checkpoint id as the MIL cell at the end of this notebook;
# the id previously written here did not match it.
model_name = "zhihan1996/DNA_bert_6"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = WindowAggregateClassifier(pretrained_model_name=model_name, num_labels=2)


class PositiveClassAUC(tf.keras.metrics.AUC):
    """AUC over P(class 1) derived from two-class logits.

    The stock AUC metric expects scores shaped like the labels, so it cannot
    consume (batch, 2) logits together with sparse integer labels.
    """

    def update_state(self, y_true, y_pred, sample_weight=None):
        positive_prob = tf.nn.softmax(y_pred, axis=-1)[:, 1]
        return super().update_state(tf.reshape(y_true, [-1]), positive_prob, sample_weight)


model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc"),
             PositiveClassAUC(name="auc")]
)

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='DNA_BERT.keras', monitor='val_loss', save_best_only=True)

# ============================
# 6. Dataset 생성
# ============================
def make_dataset(x_data, y_data, tokenizer, batch_size=32):
    """Build a batched, prefetching ``tf.data.Dataset`` from raw sequences.

    Sequences and labels are encoded with ``encode_and_pad`` (default
    windowing parameters); each dataset element is
    ``({'input_ids', 'attention_mask'}, label)``. No shuffling is applied.
    """
    input_ids, attn_mask, labels = encode_and_pad(x_data, y_data, tokenizer)
    features = {'input_ids': input_ids, 'attention_mask': attn_mask}
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(tf.data.AUTOTUNE)

# Encode each split into a batched dataset (make_dataset's default batch size).
train_ds = make_dataset(x_train, y_train, tokenizer)
val_ds   = make_dataset(x_val, y_val, tokenizer)
test_ds  = make_dataset(x_test, y_test, tokenizer)

# ============================
# 7. Training
# ============================
# Up to 10 epochs; the callbacks stop early and checkpoint on val_loss.
model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=[early_stop, model_checkpoint])

# ============================
# 8. Saving
# ============================
# Only the weights are saved here; the tokenizer files go to their own folder.
model.save_weights("dna_bert_classifier.weights.h5")
tokenizer.save_pretrained("./saved_tokenizer")

# ============================
# 9. Evaluation
# ============================
model.evaluate(test_ds)

# ============================
# 10. 예측 함수
# ============================
def predict_sequence(model, tokenizer, sequence, k=6, window_size=512, step=300, max_windows=120):
    """Classify one sequence with a window-aggregating model.

    The sequence is windowed (``window_size + k - 1`` characters per window,
    starts every ``step`` characters), capped at ``max_windows`` windows, k-mer
    tokenized, encoded and passed to ``model`` as a single sample.

    The windows are no longer padded up to ``max_windows`` with all-zero
    windows: a batch of one needs no common window count, and the padding
    windows were being fed into the model's aggregation.

    Args:
        model: callable taking ``{'input_ids', 'attention_mask'}`` tensors
            indexed as (batch, windows, seq_len) and returning logits.
        tokenizer: tokenizer used to encode the k-mer windows.
        sequence: raw sequence to classify.
        k: k-mer length.
        window_size: number of tokens per window.
        step: window start spacing in characters.
        max_windows: upper bound on the number of windows used.

    Returns:
        ``(pred, probs)`` - the predicted class as ``int`` and the softmax
        distribution over classes as a NumPy array.

    Raises:
        ValueError: if no window is available for prediction.
    """
    # 1. Sliding windows, capped at max_windows
    windows = sliding_window(sequence, window_size + k - 1, step)[:max_windows]
    if not windows:
        raise ValueError("sequence produced no windows; nothing to predict")

    # 2. k-mer tokenization
    kmer_windows = [kmer_tokenizer(w, k) for w in windows]

    # 3. Encode; every window is padded or truncated to window_size tokens
    enc = tokenizer(kmer_windows, padding='max_length', truncation=True,
                    max_length=window_size, return_tensors='tf')

    # 4. Add the batch axis: (1, windows, seq_len)
    input_ids = tf.expand_dims(enc['input_ids'], axis=0)
    attn_mask = tf.expand_dims(enc['attention_mask'], axis=0)

    # 5. Predict
    logits = model({'input_ids': input_ids, 'attention_mask': attn_mask}, training=False)
    probs = tf.nn.softmax(logits, axis=-1)
    pred = tf.argmax(probs, axis=-1).numpy()[0]

    return int(pred), probs.numpy()[0]

## 예측하기
long_seq = "ACGT" * 10000  # gene sequence of length 40000
# predict_sequence returns the predicted class and the class probability distribution.
pred, prob = predict_sequence(model, tokenizer, long_seq)
print(f"예측 클래스: {pred}, 확률 분포: {prob}")

### attention

In [None]:
# attention버전
class WindowAttentionClassifier(tf.keras.Model):
    """Classify a sequence from an attention-weighted sum of window embeddings.

    Every window is embedded with BERT (pooled output); a dense layer scores
    each window, the scores are soft-maxed across the windows of a sample, and
    the weighted sum of window embeddings is fed to the classifier head.
    Padding windows (all-zero attention mask) get zero attention weight.
    """

    def __init__(self, pretrained_model_name, num_labels=2, hidden_size=768):
        # hidden_size is kept for interface compatibility; the embedding width
        # is read from the loaded BERT config.
        super().__init__()
        self.bert = TFBertModel.from_pretrained(pretrained_model_name)
        self.attn_layer = tf.keras.layers.Dense(1)  # one attention score per window
        self.classifier = tf.keras.layers.Dense(num_labels)

    def call(self, inputs, training=False):
        """Return logits of shape (batch, num_labels).

        ``inputs`` is a dict with ``input_ids`` and ``attention_mask``, both
        indexed as (batch, windows, seq_len).
        """
        input_ids = inputs['input_ids']         # (batch, windows, seq_len)
        attention_mask = inputs['attention_mask']

        batch_size = tf.shape(input_ids)[0]
        num_windows = tf.shape(input_ids)[1]
        seq_len = tf.shape(input_ids)[2]
        # Python int, so the feature axis stays statically known after the
        # reshape below (the Dense layers need a defined last dimension).
        bert_hidden = self.bert.config.hidden_size

        # Real windows have at least one attended token. Computed before the
        # mask is flattened.
        window_is_real = tf.reduce_any(tf.not_equal(attention_mask, 0), axis=-1)  # (batch, windows)

        input_ids = tf.reshape(input_ids, (-1, seq_len))            # (batch * windows, seq_len)
        attention_mask = tf.reshape(attention_mask, (-1, seq_len))  # (batch * windows, seq_len)

        bert_outputs = self.bert(input_ids, attention_mask=attention_mask, training=training)
        pooled_output = bert_outputs.pooler_output  # (batch * windows, hidden_size)

        window_embeddings = tf.reshape(pooled_output, (batch_size, num_windows, bert_hidden))  # (batch, windows, hidden)

        # Attention weights: (batch, windows, 1). Padding windows receive the
        # most negative finite score so the softmax assigns them ~0 weight.
        attn_logits = self.attn_layer(window_embeddings)
        lowest = tf.ones_like(attn_logits) * attn_logits.dtype.min
        attn_logits = tf.where(window_is_real[:, :, tf.newaxis], attn_logits, lowest)
        attn_weights = tf.nn.softmax(attn_logits, axis=1)

        # Weighted sum: (batch, hidden)
        weighted_sum = tf.reduce_sum(attn_weights * window_embeddings, axis=1)

        logits = self.classifier(weighted_sum)  # (batch, num_labels)
        return logits
'''
window_embeddings는 이미 BERT를 통과한 벡터들 → 즉, V (value) 역할

self.attn_layer(Dense(1))는 각 윈도우 벡터에 대해 스칼라 score 계산 → attention 점수

softmax는 이 score를 확률처럼 정규화 → attention weight

attn_weights * window_embeddings는 V에 대한 가중치 곱

reduce_sum은 attention-weighted sum → 최종 attention pooled 벡터
'''

### LSTM

In [None]:
class WindowLSTMClassifier(tf.keras.Model):
    """Classify a sequence by running a BiLSTM over its window embeddings.

    Every window is embedded with BERT (pooled output); the per-sample series
    of window embeddings is summarized by a bidirectional LSTM and mapped to
    logits. Padding windows (all-zero attention mask) are masked out of the
    LSTM.
    """

    def __init__(self, pretrained_model_name, num_labels=2, hidden_size=768):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(pretrained_model_name)
        # hidden_size is the number of LSTM units per direction.
        self.lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_size, return_sequences=False)
        )
        self.classifier = tf.keras.layers.Dense(num_labels)

    def call(self, inputs, training=False):
        """Return logits of shape (batch, num_labels).

        ``inputs`` is a dict with ``input_ids`` and ``attention_mask``, both
        indexed as (batch, windows, seq_len).
        """
        input_ids = inputs['input_ids']         # (batch, windows, seq_len)
        attention_mask = inputs['attention_mask']

        batch_size = tf.shape(input_ids)[0]
        num_windows = tf.shape(input_ids)[1]
        seq_len = tf.shape(input_ids)[2]
        # Python int, so the LSTM sees a statically known feature dimension.
        bert_hidden = self.bert.config.hidden_size

        # Real windows have at least one attended token. Computed before the
        # mask is flattened.
        window_is_real = tf.reduce_any(tf.not_equal(attention_mask, 0), axis=-1)  # (batch, windows)

        input_ids = tf.reshape(input_ids, (-1, seq_len))
        attention_mask = tf.reshape(attention_mask, (-1, seq_len))

        bert_outputs = self.bert(input_ids, attention_mask=attention_mask, training=training)
        pooled_output = bert_outputs.pooler_output  # (batch * windows, hidden_size)

        window_embeddings = tf.reshape(pooled_output, (batch_size, num_windows, bert_hidden))  # (batch, windows, hidden)

        # The mask makes the LSTM skip padding windows in both directions.
        lstm_output = self.lstm(window_embeddings, mask=window_is_real, training=training)  # (batch, hidden*2)

        logits = self.classifier(lstm_output)  # (batch, num_labels)
        return logits

# DNA_BERT+MIL: 위의 attention과 거의 유사한 방식

In [None]:
from transformers import TFBertModel, BertTokenizerFast
import tensorflow as tf
import numpy as np

# 1. Load the DNA-BERT model and tokenizer
# NOTE(review): if this checkpoint only ships PyTorch weights, the TF loader
# may need from_pt=True - confirm against the model repository.
bert_model_name = "zhihan1996/DNA_bert_6"
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
bert = TFBertModel.from_pretrained(bert_model_name)

# 2. Sliding window 함수 (k-mer=6 기준)
def sliding_windows(sequence, k=6, window_size=512, stride=128):
    """Split ``sequence`` into windows of ``window_size`` space-joined k-mers.

    The sequence is first converted to overlapping k-mers (stride 1); windows
    of ``window_size`` consecutive k-mers are then taken every ``stride``
    k-mers. Previously a sequence with fewer than ``window_size`` k-mers
    produced no window at all and any tail not reached by the stride grid was
    dropped. Now a short sequence yields one (shorter) window and an
    end-anchored window is appended when the grid stops short of the end.

    Args:
        sequence: raw sequence string.
        k: k-mer length.
        window_size: number of k-mers per window.
        stride: distance, in k-mers, between consecutive window starts.

    Returns:
        List of window strings; empty only when ``sequence`` is shorter than
        ``k``.

    Raises:
        ValueError: if ``k``, ``window_size`` or ``stride`` is not positive.
    """
    if k <= 0 or window_size <= 0 or stride <= 0:
        raise ValueError("k, window_size and stride must be positive")

    kmers = [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
    if not kmers:
        return []
    if len(kmers) <= window_size:
        return [" ".join(kmers)]

    last_start = len(kmers) - window_size
    windows = [
        " ".join(kmers[start:start + window_size])
        for start in range(0, last_start + 1, stride)
    ]
    if last_start % stride != 0:
        # cover the tail that the regular grid missed
        windows.append(" ".join(kmers[last_start:]))
    return windows

# 3. Attention Pooling Layer (MIL에서 optional)
class AttentionPooling(tf.keras.layers.Layer):
    """Collapse the window axis with learned softmax attention weights."""

    def __init__(self, hidden_dim):
        # hidden_dim is accepted but not used by this layer.
        super().__init__()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Pool ``inputs`` of shape [batch, num_windows, hidden_dim] over axis 1.

        Returns a tensor of shape [batch, hidden_dim].
        """
        window_scores = self.dense(inputs)
        # normalize across the window axis to get one attention weight per window
        window_weights = tf.nn.softmax(window_scores, axis=1)
        return tf.reduce_sum(inputs * window_weights, axis=1)

# 4. MIL 모델 빌드 함수
class _WindowCLSEncoder(tf.keras.layers.Layer):
    """Run BERT on every window and return one [CLS] vector per window."""

    def __init__(self, bert, hidden_dim, **kwargs):
        super().__init__(**kwargs)
        self.bert = bert
        self.hidden_dim = hidden_dim

    def call(self, inputs, training=False):
        input_ids, attention_mask = inputs  # each indexed as [batch, num_windows, seq_len]
        dims = tf.shape(input_ids)

        # Merge batch and window axes so BERT processes all windows in one pass.
        flat_ids = tf.reshape(input_ids, (-1, dims[2]))
        flat_mask = tf.reshape(attention_mask, (-1, dims[2]))
        outputs = self.bert(flat_ids, attention_mask=flat_mask, training=training)
        cls = outputs.last_hidden_state[:, 0]  # [CLS] token vector per window

        # hidden_dim is a Python int, so the feature axis stays statically known.
        return tf.reshape(cls, (dims[0], dims[1], self.hidden_dim))


def build_dnabert_mil_classifier(bert, hidden_dim=768, pooling="max"):
    """Build a multiple-instance (bag-of-windows) binary classifier on BERT.

    The previous implementation drove ``bert`` through ``tf.map_fn`` with a
    Python closure over the symbolic Keras inputs. The per-window encoding now
    lives in a small Keras layer that flattens the window axis into the batch
    axis, which is the same approach the window classifiers earlier in this
    notebook use.

    Args:
        bert: BERT encoder whose output exposes ``last_hidden_state``.
        hidden_dim: width of the BERT hidden state.
        pooling: how window embeddings are combined - "max", "mean" or
            "attention".

    Returns:
        A ``tf.keras.Model`` taking ``[input_ids, attention_mask]``, each of
        shape (batch, num_windows, 512), and producing a sigmoid probability
        of shape (batch, 1).

    Raises:
        ValueError: if ``pooling`` is not one of the supported names.
    """
    # The number of windows per sample may vary (None).
    input_ids = tf.keras.Input(shape=(None, 512), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(None, 512), dtype=tf.int32, name="attention_mask")

    # [batch, num_windows, hidden_dim]
    cls_embeddings = _WindowCLSEncoder(bert, hidden_dim)([input_ids, attention_mask])

    # MIL pooling over the window axis
    if pooling == "max":
        pooled = tf.reduce_max(cls_embeddings, axis=1)
    elif pooling == "mean":
        pooled = tf.reduce_mean(cls_embeddings, axis=1)
    elif pooling == "attention":
        pooled = AttentionPooling(hidden_dim)(cls_embeddings)
    else:
        raise ValueError("pooling must be 'max', 'mean', or 'attention'")

    # Final classification layer (binary)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(pooled)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    return model

# 5. Usage example

# A long DNA sequence (example)
sequence = "ATGCGTACGTTAGCTAGCTAGCTGATCGTACGATCGTAGCTAGCTAGCTAGCTA" * 1000  # over 50,000 bp

# Build the windows
windows = sliding_windows(sequence, k=6, window_size=512, stride=128)
print(f"윈도우 개수: {len(windows)}")

# Tokenize (assumes batch size 1; padding/truncation required)
tokens = tokenizer(windows, return_tensors="tf", padding="max_length", truncation=True, max_length=512)

# Add the batch axis so the inputs match the model's expected rank
input_ids = tf.expand_dims(tokens["input_ids"], axis=0)       # [batch=1, num_windows, seq_len]
attention_mask = tf.expand_dims(tokens["attention_mask"], axis=0)

# Build the model
model = build_dnabert_mil_classifier(bert, pooling="attention")  # pooling: choose max/mean/attention
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Dummy label (positive=1, batch size=1)
y = np.array([1])

# Training example (batching is recommended for a real dataset)
model.fit([input_ids, attention_mask], y, epochs=1)
