In [10]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers==4.41.0
!pip install biopython
!pip install tqdm
!pip install pandas
!pip install numpy
!pip install scikit-learn


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting transformers==4.41.0
  Using cached transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.0)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers==4.41.0)
  Downloading pyyaml-6.0.3-cp310-cp310-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers==4.41.0)
  Using cached regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers==4.41.0)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0)
  Downloading tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers==4.41.0)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting tqdm>=4.27 (from transformers==4.41.0)
  Using cached 

In [11]:
import torch
print("CUDA:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0))

CUDA: True
Device: NVIDIA GeForce RTX 4060


In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import random
import numpy as np
import pandas as pd
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM


# =====================================================
# 0) Í≤ΩÎ°ú ÏÑ§Ï†ï (üî• ÏßÄÏòàÎãò PC Í∏∞Ï§Ä)
# =====================================================
BASE = r"C:\Users\USER\Desktop\Ï†ÑÏßÄÏòà"

GRCH38_CSV = os.path.join(BASE, "grch38_windows_seq.csv")
CHM13_CSV  = os.path.join(BASE, "chm13_windows_seq.csv")

TEST_PATH        = os.path.join(BASE, "test.csv")
SAMPLE_SUB_PATH  = os.path.join(BASE, "sample_submission.csv")
OUT_PATH         = os.path.join(BASE, "submission_infonce_v4.csv")


# =====================================================
# 1) InfoNCE ÌïôÏäµ ÏÑ§Ï†ï
# =====================================================
SEED = 42
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

MAX_EXT_SEQ = 120000     # Ïô∏Î∂Ä Îç∞Ïù¥ÌÑ∞ ÏÉòÌîåÎßÅ ÏµúÎåÄ Í∞úÏàò
NUM_PAIRS   = 30000      # InfoNCE pair Í∞úÏàò

MAX_LENGTH = 512
LAST_N_LAYERS = 4
OUTPUT_DIM = 512

TRAIN_EPOCHS = 3
BATCH_SIZE_TR = 32
BATCH_SIZE_INFER = 32

LR_HEAD = 1e-4
WEIGHT_DECAY = 1e-4
TEMPERATURE = 0.07
USE_FP16 = True


# =====================================================
# 2) Utility
# =====================================================
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def l2_normalize(x, eps=1e-12):
    return x / x.norm(p=2, dim=-1, keepdim=True).clamp(min=eps)


_rc_map = str.maketrans("ACGT", "TGCA")
def reverse_complement(seq):
    return seq.translate(_rc_map)[::-1]


def apply_snv(seq, k=1):
    bases = ["A", "C", "G", "T"]
    s = list(seq)
    idxs = random.sample(range(len(s)), k)
    for idx in idxs:
        orig = s[idx]
        s[idx] = random.choice([b for b in bases if b != orig])
    return "".join(s)


def generate_pairs(seqs: List[str], num_pairs: int):
    pairs = []
    for _ in range(num_pairs):
        anchor = random.choice(seqs)

        # Í∏∏Î©¥ MAX_LENGTHÎ°ú ÎûúÎç§ Ïä¨ÎùºÏù¥Ïä§
        if len(anchor) > MAX_LENGTH:
            st = random.randint(0, len(anchor) - MAX_LENGTH)
            anchor = anchor[st:st + MAX_LENGTH]

        r = random.random()
        if r < 0.4:
            pos = anchor
        elif r < 0.8:
            pos = apply_snv(anchor, random.randint(1, 2))
        else:
            pos = reverse_complement(apply_snv(anchor))

        pairs.append([anchor, pos])

    return pairs


# =====================================================
# 3) RobustModel
# =====================================================
class RobustModel(nn.Module):
    def __init__(self, hidden, last_n, out_dim):
        super().__init__()
        self.last_n = last_n
        self.layer_weights = nn.Parameter(torch.zeros(last_n))

        self.proj = nn.Sequential(
            nn.Linear(hidden, hidden * 2),
            nn.LayerNorm(hidden * 2),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(hidden * 2, out_dim)
        )

    def forward(self, hidden_states, mask):
        stack = torch.stack(hidden_states[-self.last_n:], 0)
        w = F.softmax(self.layer_weights, dim=0).view(-1, 1, 1, 1)

        feat = (stack * w).sum(0)
        mask = mask.unsqueeze(-1).float()

        summed = (feat * mask).sum(1)
        denom = mask.sum(1).clamp(min=1e-9)

        mean_emb = summed / denom
        return l2_normalize(self.proj(mean_emb))


# =====================================================
# 4) Main
# =====================================================
def main():

    print("üî• GPU:", torch.cuda.get_device_name(0))
    set_seed(SEED)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # -------------------------------------------------
    # 1) Ïù¥ÎØ∏ Ï†ÑÏ≤òÎ¶¨Îêú CSV Î∂àÎü¨Ïò§Í∏∞ (GRCh38 + CHM13)
    # -------------------------------------------------
    print("\nüìå Ïô∏Î∂ÄÎç∞Ïù¥ÌÑ∞ Î°úÎìú (GRCh38 + CHM13)")
    df1 = pd.read_csv(GRCH38_CSV)
    df2 = pd.read_csv(CHM13_CSV)

    combined = pd.concat([df1, df2], ignore_index=True)
    print("üëâ Ï†ÑÏ≤¥ Ïô∏Î∂Ä Îç∞Ïù¥ÌÑ∞ Í∞úÏàò:", len(combined))

    ext_sequences = combined["seq"].astype(str).tolist()

    # ÏÉòÌîåÎßÅ
    if len(ext_sequences) > MAX_EXT_SEQ:
        idx = np.random.choice(len(ext_sequences), MAX_EXT_SEQ, replace=False)
        ext_sequences = [ext_sequences[i] for i in idx]
        print("üîª ÏÉòÌîåÎßÅ:", len(ext_sequences))

    # InfoNCE Îç∞Ïù¥ÌÑ∞ ÏÉùÏÑ±
    print("\nüìå InfoNCE pair ÏÉùÏÑ±")
    pair_data = generate_pairs(ext_sequences, NUM_PAIRS)

    # -------------------------------------------------
    # 2) test.csv Î°úÎìú
    # -------------------------------------------------
    test_df = pd.read_csv(TEST_PATH)
    test_sequences = test_df["seq"].astype(str).tolist()
    print("üëâ test ÏãúÌÄÄÏä§ Í∞úÏàò:", len(test_sequences))

    # -------------------------------------------------
    # 3) Î∞±Î≥∏ Î™®Îç∏ Î°úÎìú
    # -------------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    backbone = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
    backbone = backbone.to(device).eval()

    for p in backbone.parameters():
        p.requires_grad = False

    # -------------------------------------------------
    # 4) Ìó§Îìú Î™®Îç∏
    # -------------------------------------------------
    model = RobustModel(backbone.config.hidden_size, LAST_N_LAYERS, OUTPUT_DIM).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR_HEAD, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler(enabled=USE_FP16)

    # -------------------------------------------------
    # 5) InfoNCE ÌïôÏäµ
    # -------------------------------------------------
    print("\nüöÄ InfoNCE ÌïôÏäµ ÏãúÏûë")

    for epoch in range(TRAIN_EPOCHS):
        random.shuffle(pair_data)
        epoch_losses = []

        for i in tqdm(range(0, len(pair_data), BATCH_SIZE_TR), desc=f"EPOCH {epoch+1}"):
            batch = pair_data[i:i+BATCH_SIZE_TR]
            anchors, positives = zip(*batch)

            seqs = list(anchors) + list(positives)

            enc = tokenizer(
                seqs, padding=True, truncation=True,
                max_length=MAX_LENGTH, return_tensors="pt"
            ).to(device)

            with torch.cuda.amp.autocast(enabled=USE_FP16):
                with torch.no_grad():
                    out = backbone(**enc, output_hidden_states=True)

                emb = model(out.hidden_states, enc["attention_mask"])
                B = len(batch)

                ea, ep = emb[:B], emb[B:]
                sim = (ea @ ep.T) / TEMPERATURE
                labels = torch.arange(B, device=device)

                loss = (F.cross_entropy(sim, labels) +
                        F.cross_entropy(sim.T, labels)) / 2

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            epoch_losses.append(loss.item())

        print(f"üìâ Epoch {epoch+1} Loss = {np.mean(epoch_losses):.4f}")

    # -------------------------------------------------
    # 6) Ï∂îÎ°†
    # -------------------------------------------------
    print("\nüîç test ÏûÑÎ≤†Îî© ÏÉùÏÑ±")

    outputs = []

    for i in tqdm(range(0, len(test_sequences), BATCH_SIZE_INFER)):
        batch = test_sequences[i:i+BATCH_SIZE_INFER]

        view1 = batch
        view2 = [reverse_complement(x) for x in batch]

        seqs = view1 + view2

        enc = tokenizer(
            seqs, padding=True, truncation=True,
            max_length=MAX_LENGTH, return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            out = backbone(**enc, output_hidden_states=True)
            emb_all = model(out.hidden_states, enc["attention_mask"])

            B = len(batch)
            emb_mean = (emb_all[:B] + emb_all[B:]) / 2.0
            outputs.append(emb_mean.cpu().numpy())

    embeddings = np.concatenate(outputs, axis=0)

    # -------------------------------------------------
    # 7) Ï†ÄÏû•
    # -------------------------------------------------
    print("\nüíæ Í≤∞Í≥º Ï†ÄÏû•")

    col_names = [f"emb_{i:04d}" for i in range(OUTPUT_DIM)]
    df_emb = pd.DataFrame(embeddings, columns=col_names)

    final = pd.concat([test_df[["ID"]], df_emb], axis=1)
    final.to_csv(OUT_PATH, index=False)

    print("üéâ ÏôÑÎ£å! ÌååÏùº Ï†ÄÏû•Îê® ‚Üí", OUT_PATH)


if __name__ == "__main__":
    main()


üî• GPU: NVIDIA GeForce RTX 4060

üìå Ïô∏Î∂ÄÎç∞Ïù¥ÌÑ∞ Î°úÎìú (GRCh38 + CHM13)
üëâ Ï†ÑÏ≤¥ Ïô∏Î∂Ä Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: 9502199
üîª ÏÉòÌîåÎßÅ: 120000

üìå InfoNCE pair ÏÉùÏÑ±
üëâ test ÏãúÌÄÄÏä§ Í∞úÏàò: 13711


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For bette


üöÄ InfoNCE ÌïôÏäµ ÏãúÏûë


  with torch.cuda.amp.autocast(enabled=USE_FP16):
EPOCH 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [05:44<00:00,  2.72it/s]


üìâ Epoch 1 Loss = 0.1248


EPOCH 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [05:53<00:00,  2.65it/s]


üìâ Epoch 2 Loss = 0.0080


EPOCH 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 938/938 [06:00<00:00,  2.60it/s]


üìâ Epoch 3 Loss = 0.0050

üîç test ÏûÑÎ≤†Îî© ÏÉùÏÑ±


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 429/429 [13:27<00:00,  1.88s/it]



üíæ Í≤∞Í≥º Ï†ÄÏû•
üéâ ÏôÑÎ£å! ÌååÏùº Ï†ÄÏû•Îê® ‚Üí C:\Users\USER\Desktop\Ï†ÑÏßÄÏòà\submission_infonce_v4.csv
