In [1]:
# ================= CELL FINAL: INDEXING FAISS & SENTENCE-LEVEL CLUSTERING =================
import pandas as pd
import numpy as np
import faiss
import pickle
import os
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from tqdm import tqdm

# --- 1. KONFIGURASI PATH (SESUAIKAN DISINI) ---
# Path Model Terbaik (Epoch 4)
MODEL_PATH = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\Trained_SBERT\finetuned_all-nusabert-base_v1"

# File Data Mentah (Kalimat)
INPUT_CSV = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\dataTuning.csv"

# Output Folder (Tempat simpan index & metadata untuk Sistem)
OUTPUT_DIR = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Nama File Output
INDEX_FILE = os.path.join(OUTPUT_DIR, "proposal_vectors.index")
META_FILE  = os.path.join(OUTPUT_DIR, "proposal_metadata.pkl")
CLUSTER_MODEL_FILE = os.path.join(OUTPUT_DIR, "topic_cluster_model.pkl")
CLUSTER_MAP_FILE   = os.path.join(OUTPUT_DIR, "proposal_clusters.csv")

# Konfigurasi Clustering
NUM_CLUSTERS = 20  # Jumlah topik besar yang ingin dipetakan

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Model dari: {MODEL_PATH} ...")
# Cek apakah perlu prefix (untuk E5)
device = 'cuda' if faiss.get_num_gpus() > 0 else 'cpu' # Cek GPU via FAISS/Torch logic standar
model = SentenceTransformer(MODEL_PATH)

print(f"Membaca data: {INPUT_CSV} ...")
df = pd.read_csv(INPUT_CSV, encoding='utf-8-sig')

# Pastikan kolom text string dan proposal_id konsisten
df['text'] = df['text'].astype(str)
df['proposal_id'] = df['proposal_id'].astype(str)

sentences = df['text'].tolist()
proposal_ids = df['proposal_id'].tolist()

print(f"Total Kalimat: {len(sentences)}")
print(f"Total Proposal Unik: {df['proposal_id'].nunique()}")

# --- 3. ENCODING (KALIMAT -> VEKTOR) ---
print("\n[1/4] Melakukan Encoding Kalimat...")
# Tambahkan prefix 'query: ' jika modelnya E5 (Deteksi otomatis dari nama path)
if "e5" in MODEL_PATH.lower():
    print("   -> Terdeteksi model E5, menambahkan prefix 'query: ' ...")
    sentences_to_encode = ["query: " + s for s in sentences]
else:
    sentences_to_encode = sentences

# Batch size 64 agar cepat di GPU
embeddings = model.encode(sentences_to_encode, batch_size=64, show_progress_bar=True, convert_to_tensor=False)

# Konversi ke Numpy Float32 (Wajib buat FAISS)
embeddings = np.array(embeddings).astype('float32')

# --- 4. FAISS INDEXING (FLAT IP) ---
print("\n[2/4] Membangun Index FAISS (Exact Search)...")

# PENTING: Normalisasi L2 agar Inner Product (IP) = Cosine Similarity
faiss.normalize_L2(embeddings)

# Dimensi vektor
d = embeddings.shape[1] 
print(f"   Dimensi Vektor: {d}")

# Menggunakan IndexFlatIP (Brute Force/Exact) sesuai Bab 3.9
index = faiss.IndexFlatIP(d)
index.add(embeddings)

print(f"   Total vektor terindeks: {index.ntotal}")

# Simpan Index
faiss.write_index(index, INDEX_FILE)
print(f"‚úÖ Index FAISS tersimpan di: {INDEX_FILE}")

# Simpan Metadata (Mapping Index FAISS -> ID Proposal & Teks Asli)
# Backend butuh ini untuk tahu "Hasil pencarian baris ke-X itu kalimat apa?"
df.to_pickle(META_FILE) 
print(f"‚úÖ Metadata tersimpan di: {META_FILE}")

# --- 5. CLUSTERING (SENTENCE-LEVEL VOTING) ---
print("\n[3/4] Melatih Clustering (K-Means) pada Level Kalimat...")

# Latih K-Means pada SEMUA vektor kalimat
# Ini memungkinkan model membedakan nuansa topik per kalimat
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
kmeans.fit(embeddings)

# Simpan Model K-Means (Untuk memprediksi proposal baru mahasiswa nanti)
with open(CLUSTER_MODEL_FILE, 'wb') as f:
    pickle.dump(kmeans, f)
print(f"‚úÖ Model K-Means tersimpan di: {CLUSTER_MODEL_FILE}")

# Dapatkan label cluster untuk setiap kalimat di database
sentence_cluster_labels = kmeans.labels_

print("\n[4/4] Menghitung Profil Topik per Proposal (Voting)...")

# Gabungkan ID Proposal dengan Label Cluster Kalimatnya
df_voting = pd.DataFrame({
    'proposal_id': proposal_ids,
    'cluster_label': sentence_cluster_labels
})

# Agregasi: Hitung persentase dominasi cluster per proposal
proposal_profiles = []
grouped = df_voting.groupby('proposal_id')

for pid, group in tqdm(grouped, desc="Voting Profile"):
    # Hitung frekuensi cluster di proposal ini
    counts = group['cluster_label'].value_counts(normalize=True) # normalize=True jadi % (0.0 - 1.0)
    
    # Ambil Top 3 Cluster dominan
    # (Misal: 60% Cluster 5, 30% Cluster 2, 10% Cluster 8)
    top_clusters = counts.head(3)
    
    record = {'proposal_id': pid}
    
    # Isi Cluster 1 (Utama)
    record['primary_cluster'] = top_clusters.index[0]
    record['primary_ratio']   = top_clusters.iloc[0]
    
    # Isi Cluster 2 (Jika ada)
    if len(top_clusters) > 1:
        record['secondary_cluster'] = top_clusters.index[1]
        record['secondary_ratio']   = top_clusters.iloc[1]
    else:
        record['secondary_cluster'] = -1
        record['secondary_ratio']   = 0.0
        
    # Isi Cluster 3 (Jika ada)
    if len(top_clusters) > 2:
        record['tertiary_cluster']  = top_clusters.index[2]
        record['tertiary_ratio']    = top_clusters.iloc[2]
    else:
        record['tertiary_cluster']  = -1
        record['tertiary_ratio']    = 0.0
    
    proposal_profiles.append(record)

# Simpan Hasil Profiling
df_prop_clusters = pd.DataFrame(proposal_profiles)
df_prop_clusters.to_csv(CLUSTER_MAP_FILE, index=False)

print(f"‚úÖ Peta Distribusi Cluster Proposal tersimpan di: {CLUSTER_MAP_FILE}")
print("\n=== CONTOH DATA CLUSTERING ===")
print(df_prop_clusters.head(3))

print("\n=== SYSTEM ASSETS GENERATED SUCCESSFULLY ===")

Loading Model dari: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\Trained_SBERT\finetuned_all-nusabert-base_v1 ...
Membaca data: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\dataTuning.csv ...
Total Kalimat: 24509
Total Proposal Unik: 1279

[1/4] Melakukan Encoding Kalimat...


Batches:   0%|          | 0/383 [00:00<?, ?it/s]


[2/4] Membangun Index FAISS (Exact Search)...
   Dimensi Vektor: 768
   Total vektor terindeks: 24509
‚úÖ Index FAISS tersimpan di: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\proposal_vectors.index
‚úÖ Metadata tersimpan di: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\proposal_metadata.pkl

[3/4] Melatih Clustering (K-Means) pada Level Kalimat...
‚úÖ Model K-Means tersimpan di: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\topic_cluster_model.pkl

[4/4] Menghitung Profil Topik per Proposal (Voting)...


Voting Profile: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1279/1279 [00:00<00:00, 3253.94it/s]


‚úÖ Peta Distribusi Cluster Proposal tersimpan di: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\proposal_clusters.csv

=== CONTOH DATA CLUSTERING ===
  proposal_id  primary_cluster  primary_ratio  secondary_cluster  \
0         102                3       0.583333                 15   
1        1057               17       0.458333                 11   
2        1063                4       0.956522                  9   

   secondary_ratio  tertiary_cluster  tertiary_ratio  
0         0.250000                 8        0.083333  
1         0.291667                 4        0.250000  
2         0.043478                -1        0.000000  

=== SYSTEM ASSETS GENERATED SUCCESSFULLY ===


In [1]:
# ================= CELL: GENERATE CLUSTER -> DOSEN ID MAPPING =================
import pandas as pd
import json
import os
import numpy as np

# --- KONFIGURASI PATH ---
BASE_DIR = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters"

# Input 1: Hasil Clustering (Dari script sebelumnya)
CLUSTER_FILE = os.path.join(BASE_DIR, "proposal_clusters.csv")

# Input 2: Data ID Dosen (Dari Export SQL baru kamu)
DOSEN_IDS_FILE = os.path.join(BASE_DIR, "proposal_dosen_ids.csv")

# Output: Mapping Cluster -> List[DosenID]
OUTPUT_MAP_JSON = os.path.join(BASE_DIR, "cluster_dosen_ids.json")

# --- PROSES ---
print("Membaca data...")
df_clusters = pd.read_csv(CLUSTER_FILE) # [proposal_id, primary_cluster, ...]
df_dosen = pd.read_csv(DOSEN_IDS_FILE)  # [proposal_id, id_dosen_1, id_dosen_2]

# Gabungkan
df_merged = pd.merge(df_clusters, df_dosen, on='proposal_id', how='inner')
print(f"Data tergabung: {len(df_merged)} proposal.")

# Dictionary hasil: { cluster_id: [dosen_id_a, dosen_id_b] }
cluster_dosen_map = {}

# Group by Cluster Utama
grouped = df_merged.groupby('primary_cluster')

for cluster_id, group in grouped:
    # Kumpulkan semua ID dosen di cluster ini
    dosen_list = []
    
    # Ambil Dosen 1
    d1 = group['id_dosen_1'].dropna().tolist()
    dosen_list.extend(d1)
    
    # Ambil Dosen 2 (jika ada)
    d2 = group['id_dosen_2'].dropna().tolist()
    dosen_list.extend(d2)
    
    # Hitung Frekuensi (Siapa dosen paling sering di topik ini?)
    if not dosen_list:
        cluster_dosen_map[int(cluster_id)] = []
        continue
        
    # Pakai numpy/pandas value_counts biar cepat
    counts = pd.Series(dosen_list).value_counts()
    
    # Ambil Top 3 Dosen ID terbanyak
    # Convert ke int (JSON standard)
    top_ids = [int(x) for x in counts.head(3).index.tolist()]
    
    cluster_dosen_map[int(cluster_id)] = top_ids

# Simpan
with open(OUTPUT_MAP_JSON, 'w') as f:
    json.dump(cluster_dosen_map, f)

print(f"‚úÖ Mapping Cluster ke Dosen ID tersimpan: {OUTPUT_MAP_JSON}")
# Contoh isi: { "0": [55, 12], "1": [60] }

Membaca data...
Data tergabung: 1276 proposal.
‚úÖ Mapping Cluster ke Dosen ID tersimpan: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\cluster_dosen_ids.json


In [3]:
# ================= CELL FINAL: RE-INDEXING & SMART CLUSTERING V3 =================
import pandas as pd
import numpy as np
import faiss
import pickle
import os
import json
import re
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from tqdm.notebook import tqdm

# --- 1. KONFIGURASI PATH (SESUAIKAN DISINI) ---
BASE_DIR = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters"
MODEL_PATH = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Model\Trained_SBERT\finetuned_all-nusabert-base_v1"
INPUT_CSV = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Preprocessing\dataTuning.csv"
DOSEN_IDS_FILE = r"D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\proposal_dosen_ids.csv" # Pastikan file ini ada!

# Output Files
INDEX_FILE = os.path.join(BASE_DIR, "proposal_vectors.index")
META_FILE = os.path.join(BASE_DIR, "proposal_metadata.pkl")
CLUSTER_MODEL_FILE = os.path.join(BASE_DIR, "topic_cluster_model.pkl")
CLUSTER_INFO_FILE = os.path.join(BASE_DIR, "cluster_info_complete.json")

NUM_CLUSTERS = 20

# --- 2. LOAD & PREPARE DATA ---
print("üì• Memuat data...")
df_texts = pd.read_csv(INPUT_CSV, encoding='utf-8-sig')
df_dosen = pd.read_csv(DOSEN_IDS_FILE)

# Pastikan tipe data string
df_texts['text'] = df_texts['text'].astype(str)
df_texts['proposal_id'] = df_texts['proposal_id'].astype(str)
df_dosen['proposal_id'] = df_dosen['proposal_id'].astype(str)

print(f"   -> Total Kalimat: {len(df_texts)}")

# --- 3. ENCODING (SBERT) ---
print("‚ö° Encoding vectors (SBERT)...")
model = SentenceTransformer(MODEL_PATH)

sentences_list = df_texts['text'].tolist()
if "e5" in MODEL_PATH.lower():
    sentences_input = ["query: " + s for s in sentences_list]
else:
    sentences_input = sentences_list

embeddings = model.encode(sentences_input, batch_size=64, show_progress_bar=True, convert_to_tensor=False)
embeddings = np.array(embeddings).astype('float32')
faiss.normalize_L2(embeddings)

# --- 4. FAISS INDEXING ---
print("üóÇÔ∏è  Membangun Index FAISS...")
d = embeddings.shape[1]
new_index = faiss.IndexFlatIP(d)
new_index.add(embeddings)

faiss.write_index(new_index, INDEX_FILE)
df_texts.to_pickle(META_FILE)
print(f"   -> Index tersimpan: {INDEX_FILE}")

# --- 5. CLUSTERING (K-MEANS) ---
print("üß© Melakukan Clustering...")
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
kmeans.fit(embeddings)

with open(CLUSTER_MODEL_FILE, 'wb') as f:
    pickle.dump(kmeans, f)

df_texts['cluster_label'] = kmeans.labels_

# --- 6. TOPIC NAMING (SMART HYBRID) ---
print("üè∑Ô∏è  Generating Topic Names...")

# Setup Stopwords & Normalization
factory = StopWordRemoverFactory()
stopwords_sastrawi = factory.get_stop_words()

academic_stopwords = [
                'banyak', 'perkembangan', 'mengalami', 'terjadi', 'sering', 'kerap', 
                'selama', 'masa', 'kini', 'saat', 'ini', 'itu', 'tersebut', 'berbagai',
                'merupakan', 'adalah', 'yaitu', 'yakni', 'antara', 'lain', 'sangat',
                'serta', 'ataupun', 'bagai', 'bagaimana', 'sebagai', 'banyak', 'sedikit',
                'berdasarkan', 'melalui', 'secara', 'suatu', 'sebuah', 'jadi', 'lintas', 
                'pula', 'pun', 'dapat', 'bisa', 'mampu', 'guna', 'agar', 'semakin', 'beberapa',
                
                'menggunakan', 'digunakan', 'dilakukan', 'melakukan', 'membuat', 'membangun', 'kehidupan',
                'merancang', 'mengimplementasikan', 'penerapan', 'implementasi', 'pembuatan', 'masyarakat',
                'pembangunan', 'perancangan', 'pengembangan', 'analisis', 'menganalisis', 'orang',
                'menentukan', 'menghitung', 'mencari', 'diterapkan', 'dibuat', 'dikembangkan', 'satunama',
                'membantu', 'mengetahui', 'meningkatkan', 'diharapkan', 'perlu', 'mengenai',
                'menyelesaikan', 'permasalahan', 'solusi', 'hasil', 'proses', 'lalu', 'pesat',
                'menjadi', 'lebih', 'memiliki',  'satu', 'salah', 'benar', 'lama', 'manual',
                'baik', 'besar', 'tinggi', 'cepat', 'kuat', 'tepat', 'efektif', 'efisien', 'bidang',
                'mengatur', 'memastikan', 'menjamin', 'mengecek', 'mengevaluasi', 'manusia',
                'mengawasi', 'mengontrol', 'mengoptimalkan', 'mengumpulkan', 'mengolah', 'barang',
                'menyimpan', 'menampilkan', 'memberikan', 'menyiapkan', 'menyesuaikan', 'jenis',
                'mensinkronkan', 'menggabungkan', 'mengganti', 'menghapus', 'menambah', 'jalan',
                'satu', 'dua', 'tiga', 'utama', 'empat', 'lima', 'enam', 'tujuh', 'delapan', 'sembilan', 'sepuluh', 'puluh',
                
                'sistem', 'aplikasi', 'website', 'web', 'berbasis', 'program', 'fitur', 'perusahaan', 'organisasi',
                'metode', 'metodologi', 'data', 'informasi', 'teknologi', 'komputer', 'siswa', 'algoritma',
                'penelitian', 'skripsi', 'tugas', 'akhir', 'proposal', 'penulis', 'pengguna', 'yayasan',
                'latar', 'belakang', 'masalah', 'tujuan', 'manfaat', 'rumusan', 'batasan', 'kampus',
                'universitas', 'kristen', 'duta', 'wacana', 'ukdw', 'yogyakarta', 'fakultas', 'prodi',
                'indonesia', 'tahun', 'waktu', 'jumlah', 'studi', 'kasus', 'dosen', 'mahasiswa'
            ]
final_stopwords = list(set(stopwords_sastrawi + academic_stopwords))

normalization_map = {
                "mengenali": "pengenalan","mengenal": "pengenalan", "dikenali": "pengenalan", "identifikasi": "identifikasi",
                "mengidentifikasi": "identifikasi", "klasifikasi": "klasifikasi", "mengklasifikasikan": "klasifikasi",
                "deteksi": "deteksi", "mendeteksi": "deteksi", "pendeteksian": "deteksi",
                "diagnosa": "diagnosa", "mendiagnosa": "diagnosa", "prediksi": "prediksi",
                "memprediksi": "prediksi", "rekomendasi": "rekomendasi", "merekomendasikan": "rekomendasi",
                "belajar": "pembelajaran", "ajar": "pembelajaran", "edukasi": "pembelajaran",
                "citra": "citra", "image": "citra", "mobile": "mobile", "android": "android",
                "aman": "keamanan", "mengamankan": "keamanan", "pengamanan": "keamanan",
                "jaring": "jaringan", "terhubung": "koneksi"
            }

cluster_names = {}
for i in tqdm(range(NUM_CLUSTERS), desc="Generating Topics"):
    c_df = df_texts[df_texts['cluster_label'] == i]
    if len(c_df) > 1000: c_df = c_df.sample(1000, random_state=42)
    
    if c_df.empty:
        cluster_names[str(i)] = f"Topik {i}"
        continue
    
    raw_text = " ".join(c_df['text'].tolist())
    clean_text = re.sub(r'[^a-zA-Z\s]', ' ', raw_text).lower()
    
    words = clean_text.split()
    normalized_words = [normalization_map.get(w, w) for w in words]
    final_text = " ".join(normalized_words)
    
    try:
        tfidf = TfidfVectorizer(
            max_features=5,
            stop_words=final_stopwords,
            ngram_range=(1, 2)
        )
        tfidf.fit_transform([final_text])
        feature_names = tfidf.get_feature_names_out()
        keys = [k.title() for k in feature_names if len(k) > 3]
        
        if not keys:
            cluster_names[str(i)] = f"Topik {i}"
        else:
            cluster_names[str(i)] = ", ".join(keys[:3])
            
    except Exception as e:
        cluster_names[str(i)] = f"Topik {i}"

# --- 7. MAPPING DOSEN (SPLIT ROLE) ---
print("üë• Mapping Dosen...")
prop_cluster_map = df_texts.groupby('proposal_id')['cluster_label'].agg(
    lambda x: x.value_counts().index[0]
).reset_index()

df_merged = pd.merge(prop_cluster_map, df_dosen, on='proposal_id')

final_cluster_info = {}
for i in range(NUM_CLUSTERS):
    g = df_merged[df_merged['cluster_label'] == i]
    count_proposals = len(g)
    
    d1 = []
    if not g.empty and 'id_dosen_1' in g:
        vc1 = g['id_dosen_1'].value_counts()
        if not vc1.empty: d1 = [int(x) for x in vc1.head(3).index.tolist()]
    
    d2 = []
    if not g.empty and 'id_dosen_2' in g:
        vc2 = g['id_dosen_2'].value_counts()
        if not vc2.empty: d2 = [int(x) for x in vc2.head(3).index.tolist()]
    
    final_cluster_info[str(i)] = {
        "name": cluster_names.get(str(i), ""),
        "count": int(count_proposals),
        "dosen1": d1, 
        "dosen2": d2
    }

with open(CLUSTER_INFO_FILE, 'w') as f: 
    json.dump(final_cluster_info, f)

print(f"‚úÖ Selesai! File {CLUSTER_INFO_FILE} telah diperbarui.")
print("Contoh Hasil:", list(final_cluster_info.items())[:2])

üì• Memuat data...
   -> Total Kalimat: 24509
‚ö° Encoding vectors (SBERT)...


Batches:   0%|          | 0/383 [00:00<?, ?it/s]

üóÇÔ∏è  Membangun Index FAISS...
   -> Index tersimpan: D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\proposal_vectors.index
üß© Melakukan Clustering...
üè∑Ô∏è  Generating Topic Names...


Generating Topics:   0%|          | 0/20 [00:00<?, ?it/s]

üë• Mapping Dosen...
‚úÖ Selesai! File D:\Media\Kuliah\Skripsi\ScripTI\Versi 4\Index_and_Clusters\cluster_info_complete.json telah diperbarui.
Contoh Hasil: [('0', {'name': 'Batik, Citra, Klasifikasi', 'count': 152, 'dosen1': [6, 18, 7], 'dosen2': [7, 20, 50]}), ('1', {'name': 'Akademik, Learning, Media', 'count': 53, 'dosen1': [50, 12, 15], 'dosen2': [17, 2, 14]})]
