In [2]:
from sentence_transformers import SentenceTransformer, util
import torch

# ==== 1. Model locations (adjust these if your folders differ) ====
# Checkpoint used for the "BASE" scores.
BASE_MODEL_PATH = "D:\\Media\\Kuliah\\Skripsi\\ScripTI\\Versi 4\\Model\\hf_models\\LazarusNLP__all-nusabert-large-v4"
# Checkpoint used for the "FT" scores.
FT_MODEL_PATH = "D:\\Media\\Kuliah\\Skripsi\\ScripTI\\Versi 4\\Model\\Trained_SBERT\\finetuned_nusabert_large_v1"

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

print("\n[LOAD MODEL]")
# Both models are placed on the same device so their scores are produced
# under the same conditions.
base_model = SentenceTransformer(BASE_MODEL_PATH, device=device)
ft_model = SentenceTransformer(FT_MODEL_PATH, device=device)

# ==== 2. A few test sentence pairs ====
pairs = [
    # In-domain, very similar
    (
        "Sistem ini bertujuan untuk mendeteksi kemiripan proposal tugas akhir berbasis Sentence-BERT.",
        "Penelitian ini mengembangkan sistem pendeteksi kemiripan proposal skripsi menggunakan model SBERT."
    ),
    # In-domain, different topic (moderately similar)
    (
        "Dataset proposal diambil dari platform ScripTI yang berisi ribuan judul tugas akhir.",
        "Data penelitian dikumpulkan dari jurnal ilmiah nasional dan internasional."
    ),
    # Out of domain, should score low
    (
        "Sistem rekomendasi film menggunakan pendekatan collaborative filtering berbasis rating pengguna.",
        "Harga saham perusahaan teknologi mengalami kenaikan tajam akibat sentimen pasar global."
    ),
    # Identical (control)
    (
        "Metode yang digunakan adalah fine-tuning Sentence-BERT dengan objective Semantic Textual Similarity.",
        "Metode yang digunakan adalah fine-tuning Sentence-BERT dengan objective Semantic Textual Similarity."
    ),
]


# ==== 3. Compute & display the scores ====
def _pair_similarities(model, sentence_pairs):
    """Return one cosine similarity (float) per ``(sentence_a, sentence_b)`` pair.

    All sentences are encoded in a single batched ``model.encode`` call
    instead of one call per sentence, and the similarity is then computed
    row-wise between the first and second halves of the embedding matrix.

    Args:
        model: Object whose ``encode(list_of_str, convert_to_tensor=True)``
            returns a 2-D tensor with one row per input sentence.
        sentence_pairs: Sequence of ``(sentence_a, sentence_b)`` string tuples.

    Returns:
        list[float]: Similarities in the same order as ``sentence_pairs``;
        an empty list when no pairs are given.
    """
    if not sentence_pairs:
        return []

    first_sentences = [first for first, _ in sentence_pairs]
    second_sentences = [second for _, second in sentence_pairs]

    # One forward pass for every sentence of every pair.
    # NOTE(review): batching pads sentences to a common length, so values may
    # differ from per-sentence encoding at floating-point rounding level.
    embeddings = model.encode(first_sentences + second_sentences, convert_to_tensor=True)
    split = len(first_sentences)

    scores = torch.nn.functional.cosine_similarity(
        embeddings[:split], embeddings[split:], dim=1
    )
    return scores.tolist()


base_scores = _pair_similarities(base_model, pairs)
ft_scores = _pair_similarities(ft_model, pairs)

for i, ((s1, s2), sim_base, sim_ft) in enumerate(zip(pairs, base_scores, ft_scores), start=1):
    print(f"\n=== Pair {i} ===")
    print("Kalimat A :", s1)
    print("Kalimat B :", s2)
    print(f"Hasil BASE (sebelum FT) : {sim_base*100:.2f}%")
    print(f"Hasil FT   (sesudah FT) : {sim_ft*100:.2f}%")


Device: cuda

[LOAD MODEL]

=== Pair 1 ===
Kalimat A : Sistem ini bertujuan untuk mendeteksi kemiripan proposal tugas akhir berbasis Sentence-BERT.
Kalimat B : Penelitian ini mengembangkan sistem pendeteksi kemiripan proposal skripsi menggunakan model SBERT.
Hasil BASE (sebelum FT) : 81.80%
Hasil FT   (sesudah FT) : 86.25%

=== Pair 2 ===
Kalimat A : Dataset proposal diambil dari platform ScripTI yang berisi ribuan judul tugas akhir.
Kalimat B : Data penelitian dikumpulkan dari jurnal ilmiah nasional dan internasional.
Hasil BASE (sebelum FT) : 45.35%
Hasil FT   (sesudah FT) : 55.73%

=== Pair 3 ===
Kalimat A : Sistem rekomendasi film menggunakan pendekatan collaborative filtering berbasis rating pengguna.
Kalimat B : Harga saham perusahaan teknologi mengalami kenaikan tajam akibat sentimen pasar global.
Hasil BASE (sebelum FT) : 11.01%
Hasil FT   (sesudah FT) : 13.09%

=== Pair 4 ===
Kalimat A : Metode yang digunakan adalah fine-tuning Sentence-BERT dengan objective Semantic Textual S

In [4]:
# ================= SYSTEM DIAGNOSTIC TOOL =================
import sys
import platform

def check_version(package_name, alias=None):
    """Return the version string of an importable module.

    The module is imported with ``importlib.import_module`` so that a dotted
    name such as ``"pkg.sub"`` resolves to the submodule itself. The builtin
    ``__import__`` would return the top-level package ``pkg`` instead, and the
    version of the wrong module would be reported.

    Args:
        package_name: Import name of the module to inspect.
        alias: Optional alternative import name; when truthy it is imported
            instead of ``package_name``.

    Returns:
        str: The module's ``__version__`` attribute, the text
        ``'Terinstall (Versi tidak diketahui)'`` when the module has no such
        attribute, or ``"TIDAK TERINSTALL ❌"`` when the import fails.
    """
    # Local import: this cell only imports ``sys`` and ``platform`` at its top.
    import importlib

    target = alias if alias else package_name
    try:
        module = importlib.import_module(target)
    except ImportError:
        return "TIDAK TERINSTALL ❌"

    return getattr(module, '__version__', 'Terinstall (Versi tidak diketahui)')

# Banner: operating system and Python interpreter version.
print("=" * 50)
os_name, os_release = platform.system(), platform.release()
print(f"SISTEM OPERASI: {os_name} {os_release}")
python_version = sys.version.split()[0]
print(f"PYTHON VERSION: {python_version}")
print("=" * 50)
print("{:<25} | {:<20}".format('LIBRARY / FRAMEWORK', 'VERSI SAAT INI'))
print("-" * 50)

# One table row per library: (label shown in the table, import name).
for label, package in (
    # --- 1. Core AI ---
    ('PyTorch', 'torch'),
    ('Sentence-Transformers', 'sentence_transformers'),
    ('FAISS', 'faiss'),
    # --- 2. Data processing ---
    ('Pandas', 'pandas'),
    ('NumPy', 'numpy'),
    # --- 3. Backend API ---
    ('FastAPI', 'fastapi'),
    ('Uvicorn', 'uvicorn'),
):
    print(f"{label:<25} | {check_version(package)}")

# --- 4. LLM API ---
# The Google Generative AI SDK is imported a bit differently (dotted module name).
try:
    import google.generativeai as genai
except Exception:
    # Best effort: any failure while importing is reported as "not installed".
    # `Exception` (not a bare `except:`) lets KeyboardInterrupt / SystemExit
    # propagate instead of being silently swallowed.
    ver = "TIDAK TERINSTALL ❌"
else:
    # An importable SDK without `__version__` is still installed, so report
    # it as such rather than as missing.
    ver = getattr(genai, "__version__", "Terinstall (Versi tidak diketahui)")
print(f"{'Google GenAI SDK':<25} | {ver}")

print("="*50)

# --- GPU-specific check ---
import torch
print("\n[GPU STATUS CHECK]")
if not torch.cuda.is_available():
    # No usable CUDA device: warn and point to the PyTorch install page.
    print("❌ CUDA Tidak Aktif (Training akan lambat pakai CPU!)")
    print("   Saran: Install PyTorch versi CUDA di https://pytorch.org/")
else:
    print("✅ CUDA Aktif: Ya")
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU Terdeteksi: {gpu_name}")
    # total_memory is divided by 1e9 before being printed with a "GB" suffix.
    total_memory = torch.cuda.get_device_properties(0).total_memory
    print(f"✅ VRAM Total: {total_memory / 1e9:.2f} GB")
    cuda_version = torch.version.cuda
    print(f"✅ CUDA Version: {cuda_version}")
print("=" * 50)

SISTEM OPERASI: Windows 10
PYTHON VERSION: 3.11.9
LIBRARY / FRAMEWORK       | VERSI SAAT INI      
--------------------------------------------------
PyTorch                   | 2.6.0+cu124
Sentence-Transformers     | 3.4.1
FAISS                     | 1.13.0
Pandas                    | 2.3.3
NumPy                     | 2.2.6
FastAPI                   | 0.121.2
Uvicorn                   | 0.38.0
Google GenAI SDK          | 0.8.5

[GPU STATUS CHECK]
✅ CUDA Aktif: Ya
✅ GPU Terdeteksi: NVIDIA GeForce RTX 3050 Ti Laptop GPU
✅ VRAM Total: 4.29 GB
✅ CUDA Version: 12.4


In [5]:
# ================= SYSTEM DIAGNOSTIC TOOL (LENGKAP) =================
import sys
import platform
import importlib

def check_library(package_name, display_name=None):
    """Build one table row reporting whether a module imports and its version.

    Args:
        package_name: Import name passed to ``importlib.import_module``.
        display_name: Label shown in the first column; defaults to
            ``package_name`` when ``None``.

    Returns:
        str: ``"<label padded to 25> | <version>"``. The version is the
        module's ``__version__`` when truthy; otherwise its ``version``
        attribute, but only when that attribute is a non-empty string;
        otherwise ``'Terinstall (Versi n/a)'``. When the import fails the
        second column is ``"❌ TIDAK TERINSTALL"``.
    """
    if display_name is None:
        display_name = package_name

    try:
        module = importlib.import_module(package_name)
    except ImportError:
        return f"{display_name:<25} | ❌ TIDAK TERINSTALL"

    version = getattr(module, '__version__', None)
    if not version:
        # Some modules expose `version` as a function or submodule rather than
        # a string; printing that would show an object repr, not a version.
        candidate = getattr(module, 'version', None)
        if isinstance(candidate, str) and candidate:
            version = candidate
        else:
            version = 'Terinstall (Versi n/a)'
    return f"{display_name:<25} | {version}"

# Banner: operating system and Python interpreter version.
print("=" * 50)
os_name, os_release = platform.system(), platform.release()
print(f"SISTEM OPERASI: {os_name} {os_release}")
python_version = sys.version.split()[0]
print(f"PYTHON VERSION: {python_version}")
print("=" * 50)
print("{:<25} | {:<20}".format('LIBRARY / FRAMEWORK', 'VERSI SAAT INI'))
print("-" * 50)

# One table row per library: (import name, label shown in the table).
for package, label in (
    # --- 1. Core deep learning ---
    ('torch', 'PyTorch'),
    ('transformers', 'HuggingFace Transformers'),
    ('accelerate', 'HuggingFace Accelerate'),  # important for GPU
    ('sentence_transformers', 'Sentence-Transformers'),
    # --- 2. Vector search & clustering ---
    ('faiss', 'FAISS'),  # or faiss-gpu / faiss-cpu
    ('sklearn', 'Scikit-Learn'),  # important for clustering & split
    # --- 3. Data processing ---
    ('pandas', 'Pandas'),
    ('numpy', 'NumPy'),
    ('tqdm', 'Tqdm (Progress Bar)'),
    # --- 4. Backend API ---
    ('fastapi', 'FastAPI'),
    ('uvicorn', 'Uvicorn (Server)'),
    ('python_multipart', 'Python-Multipart'),  # for file uploads
    # --- 5. LLM API ---
    ('google.generativeai', 'Google GenAI SDK'),
):
    print(check_library(package, label))

print("=" * 50)
# Note about regex
import re
print("{:<25} | Built-in Python Lib (Std)".format('Regex (re)'))
print("=" * 50)

SISTEM OPERASI: Windows 10
PYTHON VERSION: 3.11.9
LIBRARY / FRAMEWORK       | VERSI SAAT INI      
--------------------------------------------------
PyTorch                   | 2.6.0+cu124
HuggingFace Transformers  | 4.57.1
HuggingFace Accelerate    | 1.11.0
Sentence-Transformers     | 3.4.1
FAISS                     | 1.13.0
Scikit-Learn              | 1.7.2
Pandas                    | 2.3.3
NumPy                     | 2.2.6
Tqdm (Progress Bar)       | 4.67.1
FastAPI                   | 0.121.2
Uvicorn (Server)          | 0.38.0
Python-Multipart          | ❌ TIDAK TERINSTALL
Google GenAI SDK          | 0.8.5
Regex (re)                | Built-in Python Lib (Std)


In [6]:
import pandas as pd
import os
import json
from transformers import AutoConfig, AutoTokenizer
from sentence_transformers import SentenceTransformer

# --- CONFIGURATION ---
# Local folder of each model to inspect (or a Hugging Face model name when online).
_HF_MODELS_DIR = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\hf_models"
MODEL_PATHS = [
    # Explicit backslash join so the resulting strings do not depend on the
    # operating system's path separator.
    _HF_MODELS_DIR + "\\" + folder_name
    for folder_name in (
        "LazarusNLP__simcse-indobert-base",
        "LazarusNLP__all-nusabert-base-v4",
        "LazarusNLP__all-nusabert-large-v4",
        "LazarusNLP__all-indo-e5-small-v4",
    )
]

def _read_json_file(json_path):
    """Return the parsed JSON content of ``json_path``, or ``None`` when the file is missing, unreadable or invalid."""
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except (OSError, ValueError):
        # OSError: missing file / permissions / path is a directory.
        # ValueError: json.JSONDecodeError and text-decoding errors.
        return None


def _pooling_label(pooling_conf):
    """Map a pooling config dict to its display label, or ``None`` when no known flag is set."""
    if not isinstance(pooling_conf, dict):
        return None
    # Checked in this order; the first truthy flag wins.
    for flag, label in (
        ("pooling_mode_mean_tokens", "Mean Pooling"),
        ("pooling_mode_cls_token", "CLS Pooling"),
        ("pooling_mode_max_tokens", "Max Pooling"),
    ):
        if pooling_conf.get(flag):
            return label
    return None


def get_pooling_mode(model_path):
    """Try to detect the pooling mode from a model folder's config files.

    Lookup order:
      1. ``<model_path>/1_Pooling/config.json``.
      2. ``<model_path>/modules.json``: for every entry whose ``type`` is
         ``'sentence_transformers.models.Pooling'``, the ``config.json``
         inside that entry's ``path`` folder.

    Missing, unreadable or malformed files are skipped rather than raising,
    and entries lacking ``type``/``path`` no longer abort the scan.

    Args:
        model_path: Folder of the model (``str`` or ``os.PathLike``).

    Returns:
        str: ``"Mean Pooling"``, ``"CLS Pooling"`` or ``"Max Pooling"`` for the
        first config with one of those flags set, otherwise
        ``"Default (Cek Manual)"``.
    """
    default_label = "Default (Cek Manual)"
    if not isinstance(model_path, (str, os.PathLike)):
        return default_label

    # 1. Pooling config in the "1_Pooling" subfolder.
    label = _pooling_label(
        _read_json_file(os.path.join(model_path, "1_Pooling", "config.json"))
    )
    if label:
        return label

    # 2. Fallback: follow the Pooling entries listed in modules.json.
    modules = _read_json_file(os.path.join(model_path, "modules.json"))
    if not isinstance(modules, list):
        return default_label

    for entry in modules:
        if not isinstance(entry, dict):
            continue
        if entry.get('type') != 'sentence_transformers.models.Pooling':
            continue
        sub_path = entry.get('path', '')
        if not isinstance(sub_path, str):
            continue
        label = _pooling_label(
            _read_json_file(os.path.join(model_path, sub_path, "config.json"))
        )
        if label:
            return label

    return default_label

def check_instruction_tuned(model_path):
    """Guess from the model's name whether it expects prompt prefixes.

    This is a name-based heuristic only; no model file is opened. Only the
    last path component (the folder / model name) is inspected, so a parent
    directory that happens to contain "e5" or "instruction" does not flag
    every model stored beneath it.

    Args:
        model_path: Local folder path or model identifier. Both ``\\`` and
            ``/`` are treated as separators; trailing separators are ignored.

    Returns:
        str: ``"Ya (query: / passage:)"`` when the lower-cased name contains
        ``"e5"`` or ``"instruction"``, otherwise ``"Tidak"``.
    """
    # Normalise separators by hand so Windows-style paths are split correctly
    # regardless of the operating system running the notebook.
    normalized = str(model_path).lower().replace("\\", "/").rstrip("/")
    model_name = normalized.rsplit("/", 1)[-1]

    if "e5" in model_name or "instruction" in model_name:
        return "Ya (query: / passage:)"
    return "Tidak"

# --- EXECUTION ---
data = []  # one dict (table row) per model that was processed successfully

print("Sedang menganalisis model...")

for path in MODEL_PATHS:
    try:
        # Hugging Face config and tokenizer for this model
        config = AutoConfig.from_pretrained(path)
        tokenizer = AutoTokenizer.from_pretrained(path)

        # Last path component is the model name; fall back to the full path
        # when that component is empty.
        model_name = os.path.basename(path) or path

        data.append({
            "Nama Model": model_name,
            "Arsitektur": config.model_type.upper(),  # BERT, ROBERTA, etc.
            "Dimensi (Hidden Size)": config.hidden_size,
            "Jumlah Layer": config.num_hidden_layers,
            "Attention Heads": config.num_attention_heads,
            "Vocab Size": format(config.vocab_size, ","),  # thousands separator
            "Max Position": config.max_position_embeddings,
            "Tokenizer": type(tokenizer).__name__,
            "Pooling Strategy": get_pooling_mode(path),
            "Instruction-Tuned": check_instruction_tuned(path),
        })
        print(f"✅ {model_name} - OK")

    except Exception as err:
        # A model that fails is reported and skipped; the loop moves on.
        print(f"❌ Gagal memproses {path}: {err}")

# Collect the rows into a DataFrame
df = pd.DataFrame(data)

# Show the full table as plain text
print("\n=== TABEL SPESIFIKASI MODEL ===")
table_text = df.to_string(index=False)
print(table_text)

# Save as CSV so it is easy to copy into Word/Excel
df.to_csv("tabel_spesifikasi_model.csv", index=False)
print("\nFile tersimpan: tabel_spesifikasi_model.csv")

Sedang menganalisis model...
✅ LazarusNLP__simcse-indobert-base - OK
✅ LazarusNLP__all-nusabert-base-v4 - OK
✅ LazarusNLP__all-nusabert-large-v4 - OK
✅ LazarusNLP__all-indo-e5-small-v4 - OK

=== TABEL SPESIFIKASI MODEL ===
                       Nama Model Arsitektur  Dimensi (Hidden Size)  Jumlah Layer  Attention Heads Vocab Size  Max Position               Tokenizer     Pooling Strategy      Instruction-Tuned
 LazarusNLP__simcse-indobert-base       BERT                    768            12               12     50,000           512       BertTokenizerFast Default (Cek Manual)                  Tidak
 LazarusNLP__all-nusabert-base-v4       BERT                    768            12               12     32,032           512       BertTokenizerFast Default (Cek Manual)                  Tidak
LazarusNLP__all-nusabert-large-v4       BERT                   1024            24               16     32,032           512       BertTokenizerFast Default (Cek Manual)                  Tidak
 LazarusN