In [2]:
# ================= CELL TRAINING: FINE-TUNING SBERT =================
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import pandas as pd
import os
import math

# --- 1. CONFIGURATION ---
# Pick one of the models from your local folder.
# Change this path every time a different model should be trained.
BASE_MODEL_PATH = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\hf_models\LazarusNLP__all-indo-e5-small-v4"

# The output is saved to a new folder.
OUTPUT_PATH = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\Trained_SBERT\finetuned_indo_e5_v1"

# Hyperparameters (standard SBERT settings)
BATCH_SIZE = 32 # On a GPU out-of-memory error, lower this to 16
EPOCHS = 4
LEARNING_RATE = 2e-5

# --- 2. PREFIX DETECTION (IMPORTANT FOR E5) ---
def get_prefix(model_name_or_path):
    """Return the text prefix to prepend to training texts for a model.

    A model is treated as E5 when the substring ``e5`` occurs anywhere in
    its name or path (case-insensitive).

    Args:
        model_name_or_path: Model identifier or filesystem path as a string.

    Returns:
        ``"query: "`` when ``e5`` is found (a notice is printed as well),
        otherwise the empty string.
    """
    looks_like_e5 = model_name_or_path.lower().find("e5") != -1
    if not looks_like_e5:
        return ""

    print("⚠️ Model E5 Terdeteksi! Menambahkan prefix 'query: ' ke data training.")
    return "query: "

# Prefix applied to every text pair: "query: " when get_prefix() detects
# an E5 model in the path, otherwise an empty string.
prefix = get_prefix(BASE_MODEL_PATH)

# --- 3. PREPARE DATALOADER ---
print("Membaca data CSV...")
# Both files are looked up relative to the current working directory.
# The 'utf-8-sig' codec also consumes a leading BOM if the file has one.
train_df = pd.read_csv("train_sbert.csv", encoding='utf-8-sig')
val_df = pd.read_csv("val_sbert.csv", encoding='utf-8-sig')

# Convert a DataFrame into SBERT InputExample objects
def convert_to_examples(df, prefix_str=""):
    """Turn a sentence-pair DataFrame into a list of ``InputExample`` objects.

    Args:
        df: DataFrame holding the columns ``text_1``, ``text_2`` and
            ``score_norm``. The score is converted with ``float()``; the
            original author's note says it must lie in 0.0 - 1.0, which is
            not checked here.
        prefix_str: String prepended to both texts of every pair.

    Returns:
        A list with one ``InputExample`` per usable row. ``texts`` holds the
        two prefixed strings and ``label`` holds the score as a float.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    required_columns = ['text_1', 'text_2', 'score_norm']

    # A missing text would otherwise be trained on as the literal string
    # "nan", and a missing score would become a NaN label that turns the
    # loss into NaN. Drop such rows and report how many were skipped.
    usable = df.dropna(subset=required_columns)
    skipped = len(df) - len(usable)
    if skipped:
        print(f"⚠️ {skipped} baris dilewati karena text/score kosong.")

    # Walking the columns directly avoids building one Series per row (as
    # iterrows() does) and keeps each cell's own dtype before str().
    return [
        InputExample(
            texts=[prefix_str + str(first), prefix_str + str(second)],
            label=float(score),
        )
        for first, second, score in zip(
            usable['text_1'], usable['text_2'], usable['score_norm']
        )
    ]

print("Mengkonversi data ke format InputExample...")
# The same prefix is applied to the training and the validation split.
train_examples = convert_to_examples(train_df, prefix)
val_examples = convert_to_examples(val_df, prefix)

# DataLoader over the training examples only; shuffle=True and
# batch_size=BATCH_SIZE are passed through to torch's DataLoader.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# --- 4. SETUP MODEL & LOSS ---
print(f"Loading Base Model: {BASE_MODEL_PATH}")
model = SentenceTransformer(BASE_MODEL_PATH)

# Loss function for STS (cosine similarity loss).
# It fits data whose labels are float scores in 0.0 - 1.0.
train_loss = losses.CosineSimilarityLoss(model)

# --- 5. SETUP EVALUATOR ---
# Built from the validation examples and handed to model.fit() below, which
# also sets evaluation_steps.
# NOTE(review): the original note says it runs after every epoch and that
# Spearman correlation is the metric of interest (per Chapter 2 of the
# thesis) - confirm against the evaluator's reported scores.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, 
    name='sts-val',
    write_csv=True
)

# --- 6. START TRAINING ---
print(f"Mulai Training selama {EPOCHS} epoch...")
print(f"Model akan disimpan di: {OUTPUT_PATH}")

# Warmup steps: 10% of the total step count (batches per epoch * epochs),
# rounded up to a whole step.
warmup_steps = math.ceil(len(train_dataloader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=1000,     # Evaluate every 1000 batches (optional)
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    save_best_model=True,      # Keep the best model according to the validation score
    optimizer_params={'lr': LEARNING_RATE} 
)

print("\n✅ TRAINING SELESAI!")
print(f"Model terbaik tersimpan di: {OUTPUT_PATH}")

⚠️ Model E5 Terdeteksi! Menambahkan prefix 'query: ' ke data training.
Membaca data CSV...
Mengkonversi data ke format InputExample...
Loading Base Model: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\hf_models\LazarusNLP__all-indo-e5-small-v4
Mulai Training selama 4 epoch...
Model akan disimpan di: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\Trained_SBERT\finetuned_indo_e5_v1


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss


KeyboardInterrupt: 