In [9]:
!nvidia-smi

Wed Nov 26 22:38:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.80                 Driver Version: 576.80         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   42C    P0             36W /  170W |    1152MiB /  12288MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from Bio import SeqIO
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ==========================================
# 1. SYSTEM & GPU CONFIGURATION (RTX 3060 SETUP)
# ==========================================
# Report whether TensorFlow can see a GPU before any model is built.
print("=== KI·ªÇM TRA GPU ===")
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    # No GPU visible: print the warning that suggests checking the TensorFlow version.
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng t√¨m th·∫•y GPU! H√£y ki·ªÉm tra version TensorFlow (N√™n d√πng TF <= 2.10 tr√™n Windows Native)")
else:
    try:
        # Important (author's note): allocate GPU memory on demand to avoid
        # out-of-memory errors on Windows.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"‚úÖ ƒê√É K√çCH HO·∫†T GPU: {gpus}")
        print("üöÄ Code s·∫Ω ch·∫°y tr√™n NVIDIA RTX 3060 (12GB VRAM)")
    except RuntimeError as err:
        # Report the failure and continue rather than aborting the notebook.
        print(err)

=== KI·ªÇM TRA GPU ===
‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng t√¨m th·∫•y GPU! H√£y ki·ªÉm tra version TensorFlow (N√™n d√πng TF <= 2.10 tr√™n Windows Native)


In [11]:


# Path configuration. BASE_DIR defaults to the author's local checkout; set the
# CAFA_BASE_DIR environment variable to run on another machine without editing code.
BASE_DIR = os.environ.get(
    'CAFA_BASE_DIR',
    r'E:\LMVH\PROJECT\CAFA-6-Protein-Function-Prediction-Kaggle',
)
DATA_DIR = os.path.join(BASE_DIR, 'data', 'Train')          # training FASTA + terms TSV
TEST_DIR = os.path.join(BASE_DIR, 'data', 'Test')           # test FASTA
MODEL_DIR = os.path.join(BASE_DIR, 'models')                # saved model + label list
SUBMISSION_PATH = os.path.join(BASE_DIR, 'submission.tsv')  # prediction output file

# Create the models directory if it does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)

# Hyperparameters (author's note: tuned for a 12GB VRAM GPU)
MAX_SEQ_LEN = 512       # sequences are padded/truncated to this many tokens
NUM_CLASSES = 1500      # number of most frequent terms kept as labels / output units
BATCH_SIZE  = 64        # author's note: 12GB VRAM handles this batch size well
EPOCHS      = 10        # training epochs passed to model.fit
EMBED_DIM   = 64        # token/position embedding width
NUM_HEADS   = 4         # attention heads in the transformer block
FF_DIM      = 128       # hidden width of the transformer feed-forward net

# ==========================================
# 2. DATA PROCESSING (PREPROCESSING)
# ==========================================
def load_data():
    """Load the training sequences and their term annotations into one table.

    Reads ``train_sequences.fasta`` and ``train_terms.tsv`` from ``DATA_DIR``.

    Returns:
        pandas.DataFrame with columns ``EntryID``, ``sequence``, ``PE`` and
        ``term`` (the list of terms of that entry). Only entries present in
        both files are kept (inner join on ``EntryID``).
    """
    print("\n[1/5] ƒêang ƒë·ªçc d·ªØ li·ªáu Train...")
    fasta_path = os.path.join(DATA_DIR, 'train_sequences.fasta')
    terms_path = os.path.join(DATA_DIR, 'train_terms.tsv')

    pe_pattern = re.compile(r'PE=(\d+)')
    rows = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        # The entry id is the second '|'-separated field of the record id when
        # there is one; otherwise the record id is used unchanged.
        id_fields = record.id.split('|')
        entry_id = record.id if len(id_fields) < 2 else id_fields[1]

        # PE is the integer following "PE=" in the header; 0 when absent.
        found = pe_pattern.search(record.description)
        pe_level = 0 if found is None else int(found.group(1))

        rows.append((entry_id, str(record.seq), pe_level))

    df_seq = pd.DataFrame(rows, columns=['EntryID', 'sequence', 'PE'])

    # Collapse the (EntryID, term) rows into one list of terms per entry.
    df_terms = pd.read_csv(terms_path, sep="\t", usecols=['EntryID', 'term'])
    terms_per_entry = df_terms.groupby('EntryID')['term'].apply(list)
    df_labels = terms_per_entry.reset_index()

    df_final = df_seq.merge(df_labels, on='EntryID', how='inner')

    print(f"   -> ƒê√£ load {len(df_final)} m·∫´u protein.")
    return df_final

# Amino-acid letter -> integer token (1..20), in the order of the alphabet string.
# 0 is deliberately not assigned: the tokenizing code in this file falls back
# to 0 for letters missing from this map.
AA_MAP = {
    residue: token
    for token, residue in enumerate("ACDEFGHIKLMNPQRSTVWY", start=1)
}

def preprocess_input(df):
    """Tokenize the ``sequence`` column and pad/truncate to ``MAX_SEQ_LEN``.

    Each residue letter is mapped through ``AA_MAP``; letters not in the map
    become 0. Sequences are padded and truncated at the end ('post').

    Args:
        df: table with a ``sequence`` column of amino-acid strings.

    Returns:
        The array produced by ``pad_sequences``, one row per sequence and
        ``MAX_SEQ_LEN`` columns.
    """
    print("[2/5] Tokenizing Sequences & Padding...")
    # With truncating='post' only the first MAX_SEQ_LEN tokens are kept, so
    # slicing first yields the same result without tokenizing discarded residues.
    X_list = [
        [AA_MAP.get(aa, 0) for aa in seq[:MAX_SEQ_LEN]]
        for seq in df['sequence']
    ]
    X = pad_sequences(X_list, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    return X

def preprocess_labels(df):
    """Build a multi-hot label matrix over the most frequent terms.

    Args:
        df: table with a ``term`` column holding a list of terms per row.

    Returns:
        (Y, top_terms): ``Y`` is a float32 array of shape
        ``(len(df), NUM_CLASSES)`` with 1.0 where a row has the term of that
        column; ``top_terms`` lists the (at most ``NUM_CLASSES``) most common
        terms, most frequent first, and column ``i`` of ``Y`` is ``top_terms[i]``.
    """
    print("[3/5] Encoding Labels (Multi-hot)...")
    term_counts = Counter(term for terms in df['term'] for term in terms)
    top_terms = [term for term, _ in term_counts.most_common(NUM_CLASSES)]
    term_to_idx = dict(zip(top_terms, range(len(top_terms))))

    Y = np.zeros((len(df), NUM_CLASSES), dtype='float32')
    for row, terms in enumerate(df['term']):
        # Terms outside the kept vocabulary look up as None and are skipped.
        for col in map(term_to_idx.get, terms):
            if col is not None:
                Y[row, col] = 1.0

    return Y, top_terms

# ==========================================
# 3. TRANSFORMER MODEL (CUSTOM LAYERS)
# ==========================================
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization.

    Args:
        embed_dim: passed as ``key_dim`` to the attention layer and used as the
            output width of the feed-forward net.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward net.
        rate: dropout rate applied after attention and after the feed-forward net.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim, self.num_heads = embed_dim, num_heads
        self.ff_dim, self.rate = ff_dim, rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = models.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Apply the block; ``training`` toggles the two dropout layers."""
        # Attention sub-layer: the inputs attend to themselves.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        normed = self.layernorm1(inputs + attended)
        # Feed-forward sub-layer with its own residual connection.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        }

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position embedding table.
        vocab_size: number of rows in the token embedding table.
        embed_dim: width of both embeddings.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen, self.vocab_size, self.embed_dim = maxlen, vocab_size, embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions 0..L-1, where L is the size of the last axis of x.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        return self.token_emb(x) + position_vectors

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        }

def create_model():
    """Build the (uncompiled) sequence-to-multi-label classifier.

    Architecture: token + position embedding -> one TransformerBlock ->
    global average pooling -> dropout(0.1) -> Dense(64, relu) ->
    Dense(NUM_CLASSES, sigmoid).

    Returns:
        A Keras ``Model`` taking integer token ids of shape
        ``(batch, MAX_SEQ_LEN)`` and producing ``NUM_CLASSES`` sigmoid outputs.
    """
    # Token ids are the AA_MAP values plus the fallback id 0, so the embedding
    # table needs (largest id + 1) rows. Derived rather than hard-coded so it
    # cannot drift out of sync with AA_MAP.
    vocab_size = max(AA_MAP.values()) + 1

    inputs = layers.Input(shape=(MAX_SEQ_LEN,))
    x = TokenAndPositionEmbedding(MAX_SEQ_LEN, vocab_size, EMBED_DIM)(inputs)
    x = TransformerBlock(EMBED_DIM, NUM_HEADS, FF_DIM)(x)

    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(64, activation="relu")(x)
    # Called positionally, like every other layer in this function.
    outputs = layers.Dense(NUM_CLASSES, activation="sigmoid")(x)

    return models.Model(inputs=inputs, outputs=outputs)

# ==========================================
# 4. MAIN PIPELINE
# ==========================================
if __name__ == "__main__":
    # End-to-end run: load and encode the training data, train the model,
    # save it with its label list, then write predictions for the test set.

    # --- A. TRAINING PHASE ---
    df = load_data()
    X = preprocess_input(df)
    Y, top_terms = preprocess_labels(df)

    # Hold out 10% of the samples for validation (fixed seed for a repeatable split).
    X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

    print("\n[4/5] Kh·ªüi t·∫°o Model...")
    # Prefer GPU 0, but name the CPU explicitly when TensorFlow sees no GPU
    # instead of requesting a device that does not exist.
    device_name = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
    with tf.device(device_name):
        model = create_model()
        model.compile(optimizer="adam",
                      loss="binary_crossentropy",
                      metrics=["binary_accuracy", tf.keras.metrics.AUC(multi_label=True, name='auc')])

    model.summary()

    print("\n--- B·∫ÆT ƒê·∫¶U TRAIN TR√äN GPU ---")
    history = model.fit(
        X_train, y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val)
    )

    # Persist the model together with the column -> term list needed to decode it.
    print("\nƒêang l∆∞u model v√† labels...")
    model.save(os.path.join(MODEL_DIR, 'transformer_model.keras'))
    with open(os.path.join(MODEL_DIR, 'labels_map.pkl'), 'wb') as f:
        pickle.dump(top_terms, f)
    print("‚úÖ ƒê√£ l∆∞u xong!")

    # --- B. INFERENCE PHASE (CREATE THE SUBMISSION FILE) ---
    print("\n[5/5] B·∫Øt ƒë·∫ßu d·ª± ƒëo√°n t·∫≠p Test...")
    test_fasta = os.path.join(TEST_DIR, 'testsuperset.fasta')

    print("ƒêang ƒë·ªçc file test...")
    test_ids, test_seqs = [], []
    for record in SeqIO.parse(test_fasta, "fasta"):
        test_ids.append(record.id)
        test_seqs.append(str(record.seq))

    print(f"T·ªïng s·ªë m·∫´u test: {len(test_ids)}")

    CHUNK_SIZE = 5000   # proteins predicted per chunk, to keep memory use bounded
    TOP_K = 30          # at most this many terms are written per protein
    MIN_SCORE = 0.01    # predictions at or below this score are not written
    # Only output columns that have a term attached can be reported; when the
    # training data held fewer than NUM_CLASSES distinct terms the remaining
    # columns have no label and indexing top_terms with them would fail.
    n_labels = len(top_terms)

    with open(SUBMISSION_PATH, 'w') as f:
        # Process in chunks to avoid running out of RAM.
        for i in range(0, len(test_ids), CHUNK_SIZE):
            end_idx = min(i + CHUNK_SIZE, len(test_ids))
            print(f"Processing chunk {i} to {end_idx}...")

            chunk_seqs = test_seqs[i:end_idx]
            chunk_ids = test_ids[i:end_idx]

            # Same tokenization and padding as used for the training data.
            X_chunk_raw = [[AA_MAP.get(aa, 0) for aa in seq] for seq in chunk_seqs]
            X_chunk = pad_sequences(X_chunk_raw, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

            preds = model.predict(X_chunk, batch_size=BATCH_SIZE, verbose=0)

            # One "<id>\t<term>\t<score>" line per kept prediction; within a
            # protein the lines are in ascending score order.
            for j, pid in enumerate(chunk_ids):
                probs = preds[j, :n_labels]
                top_indices = np.argsort(probs)[-TOP_K:]
                for idx in top_indices:
                    score = probs[idx]
                    if score > MIN_SCORE:
                        term = top_terms[idx]
                        f.write(f"{pid}\t{term}\t{score:.3f}\n")

    print(f"\nüéâ HO√ÄN T·∫§T! File submission t·∫°i: {SUBMISSION_PATH}")


[1/5] ƒêang ƒë·ªçc d·ªØ li·ªáu Train...
   -> ƒê√£ load 82404 m·∫´u protein.
[2/5] Tokenizing Sequences & Padding...
[3/5] Encoding Labels (Multi-hot)...


KeyboardInterrupt: 