In [2]:
import json
from pyserini.search.lucene import LuceneSearcher
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Make sure the NLTK stopword corpus is available locally; download it on first use.
try:
    stopwords.words('indonesian')
except LookupError:
    print("Mengunduh stopwords untuk Bahasa Indonesia...")
    # nltk.download() reports failure through its boolean return value rather
    # than by raising, so only announce success when it actually succeeded.
    if nltk.download('stopwords'):
        print("Selesai.")
    else:
        print("Gagal mengunduh stopwords. Periksa koneksi internet lalu jalankan ulang sel ini.")

In [4]:
# Paths to the Lucene index directory and the document collection file
INDEX_DIR = "my_index"
JSON_FILE = "json-file/docs.jsonl"  # change to match your file name

# Open the index at INDEX_DIR for searching
searcher = LuceneSearcher(INDEX_DIR)

In [5]:
# NLTK's Indonesian stopword list, stored as a set for membership tests
# in preprocess_text().
stop_words = set(stopwords.words('indonesian'))
# Single Sastrawi stemmer instance shared by stem_tokens().
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_tokens(tokens):
    """Return a new list holding the stemmed form of every token in ``tokens``."""
    stemmed = []
    for tok in tokens:
        stemmed.append(stemmer.stem(tok))
    return stemmed

def preprocess_text(text):
    """Normalize ``text`` for searching.

    Steps, in order: lower-case the text, split it on whitespace, drop every
    token found in ``stop_words``, stem what remains, and join the stems back
    together with single spaces.
    """
    lowered = text.lower()

    # Keep only the tokens that are not stopwords
    kept = []
    for word in lowered.split():
        if word in stop_words:
            continue
        kept.append(word)

    return " ".join(stem_tokens(kept))

In [6]:
def load_docs(path):
    """Load documents from a JSON array file or an NDJSON file.

    The format is detected from the first non-whitespace character: ``[``
    means the whole file is a single JSON array, anything else is treated as
    NDJSON (one JSON value per line).

    Args:
        path: Path of the file to read (UTF-8, with or without a BOM).

    Returns:
        The parsed JSON array, or a list with one parsed value per valid
        NDJSON line. Blank lines are skipped; NDJSON lines that are not valid
        JSON are reported on stdout and skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If an array-format file is not valid JSON.
    """
    # utf-8-sig drops a leading BOM, which would otherwise make the first
    # record unparseable; files without a BOM are read exactly as with utf-8.
    with open(path, "r", encoding="utf-8-sig") as f:
        # Skip leading whitespace so an array preceded by a blank line or
        # indentation is still recognised as an array.
        first_char = f.read(1)
        while first_char and first_char.isspace():
            first_char = f.read(1)
        f.seek(0)

        if first_char == "[":  # JSON array format
            return json.load(f)

        # NDJSON format: parse line by line, tolerating individual bad lines
        docs = []
        for line in f:
            if not line.strip():
                continue
            try:
                docs.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"⚠️ Gagal parse baris: {line[:50]}... ({e})")
        return docs

In [7]:
# Evaluation queries; build_ground_truth() derives the set of relevant
# documents for each of these.
TECH_QUERIES = [
    "gemini ai",
    "laptop gaming",
    "hack",
    "teknologi",
    "komputer",
    "mobile legends"
]

In [8]:
def build_ground_truth():
    """Build a keyword-match ground truth for every query in ``TECH_QUERIES``.

    A document is considered relevant to a query when every
    whitespace-separated word of the lower-cased query occurs as a substring
    of the document's lower-cased ``"<title> <content>"`` text.

    Returns:
        tuple: ``(ground_truth, docs)``. ``ground_truth`` maps each query that
        matched at least one document to the list of matching document ids
        (as strings); queries without a match are omitted. ``docs`` is the
        list returned by ``load_docs(JSON_FILE)``.
    """
    docs = load_docs(JSON_FILE)
    ground_truth = {}

    print("Membangun Ground Truth")

    # Build the searchable text once per document instead of once per
    # (query, document) pair.
    searchable = []
    for doc in docs:
        doc_id = doc.get("id")
        if doc_id is None:
            # Without an id the document can never equal a retrieved docid;
            # counting it would only inflate the relevant total.
            continue
        # `or ""` also covers keys that are present but null in the JSON
        title = doc.get("title") or ""
        content = doc.get("content") or ""
        searchable.append((str(doc_id), f"{title} {content}".lower()))

    for query in TECH_QUERIES:
        query_words = query.lower().split()

        # NOTE(review): this is substring matching, so a short word such as
        # "ai" also matches inside longer words (e.g. "pakai"). Consider
        # whole-word matching if the relevant sets look too large.
        relevant_docs = [
            doc_id
            for doc_id, doc_text in searchable
            if all(word in doc_text for word in query_words)
        ]

        if relevant_docs:
            ground_truth[query] = relevant_docs
            print(f"Query '{query}' menemukan {len(relevant_docs)} dokumen relevan.")
        else:
            print(f"Query '{query}' tidak menemukan dokumen relevan.")

    print("-" * 30)
    return ground_truth, docs

In [9]:
# [Cell 8, file: evaluate.ipynb]
# This cell replaces the earlier version of the evaluation cell.

def _precision_recall_f1(hit_count, retrieved_count, relevant_count):
    """Return ``(precision, recall, f1)`` from raw counts; a zero denominator yields 0."""
    precision = hit_count / retrieved_count if retrieved_count else 0
    recall = hit_count / relevant_count if relevant_count else 0
    total = precision + recall
    f1 = (2 * precision * recall / total) if total > 0 else 0
    return precision, recall, f1


def evaluate_ir(k=10):
    """Evaluate the searcher against the keyword-match ground truth.

    For every query in the ground truth, the preprocessed query is searched
    with a cutoff of ``k`` hits, and precision, recall and F1 are computed
    from the overlap between the retrieved ids and the relevant ids. A
    per-query table (when pandas is installed) and the averages over all
    queries are written to the notebook output.

    Args:
        k: Maximum number of hits requested per query.

    Returns:
        None. Results are only printed/displayed.
    """
    # Only the query -> relevant ids mapping is needed here
    ground_truth, _ = build_ground_truth()
    all_precisions, all_recalls, all_f1s = [], [], []

    print("=== Hasil Evaluasi IR (Format Laporan) ===")
    print("Metode: BM25 Title+Content")
    print("-" * 40)

    # Column header follows the actual cutoff instead of a hard-coded 10
    top_k_label = f"Relevan (Top-{k})"

    # One dict per query; turned into a table after the loop
    results_data = []

    for query, relevant_docs in ground_truth.items():
        processed_query = preprocess_text(query)
        hits = searcher.search(processed_query, k=k)
        retrieved = [h.docid for h in hits]

        # Set lookup keeps the overlap test cheap; ranking order is preserved
        relevant_set = set(relevant_docs)
        retrieved_relevant = [d for d in retrieved if d in relevant_set]

        retrieved_count = len(retrieved)
        retrieved_relevant_count = len(retrieved_relevant)
        total_relevant_count = len(relevant_docs)

        precision, recall, f1 = _precision_recall_f1(
            retrieved_relevant_count, retrieved_count, total_relevant_count
        )

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

        # Relevant retrieved ids rendered as "#170, #277, ..."
        doc_id_str = ", ".join(f"#{docid}" for docid in retrieved_relevant)
        if not doc_id_str:
            doc_id_str = "Tidak ada"

        results_data.append({
            "Query": query,
            top_k_label: f"{retrieved_relevant_count}/{retrieved_count} (dok. {doc_id_str})",
            "Precision": f"{retrieved_relevant_count}/{retrieved_count} = {precision:.2f}",
            "Recall": f"{retrieved_relevant_count}/{total_relevant_count} = {recall:.2f}",
            "F1-Score": f"{f1:.2f}"
        })

    # Show the per-query results as a table; pandas is optional
    try:
        import pandas as pd
    except ImportError:
        print("Install pandas (pip install pandas) untuk melihat tabel.")
    else:
        # Explicit column order so F1-Score comes last
        df_results = pd.DataFrame(
            results_data,
            columns=["Query", top_k_label, "Precision", "Recall", "F1-Score"],
        )
        try:
            display(df_results)
        except NameError:
            # `display` only exists when running under IPython/Jupyter
            print(df_results.to_string())

    print("\n=== Rata-rata Evaluasi === (dari semua query)")
    if all_precisions:
        print(f"Precision : {sum(all_precisions)/len(all_precisions):.2f}")
        print(f"Recall    : {sum(all_recalls)/len(all_recalls):.2f}")
        print(f"F1-score  : {sum(all_f1s)/len(all_f1s):.2f}")
    else:
        print("Tidak ada hasil evaluasi (cek ground truth atau query).")


if __name__ == "__main__":
    # Warn up front when pandas is missing, since evaluate_ir() needs it to
    # show the per-query results as a table.
    try:
        import pandas as pd
    except ImportError:
        print("Peringatan: 'pip install pandas' agar hasil evaluasi bisa tampil sebagai tabel.")
        
    evaluate_ir(k=10)

Membangun Ground Truth
Query 'gemini ai' menemukan 25 dokumen relevan.
Query 'laptop gaming' menemukan 3 dokumen relevan.
Query 'hack' menemukan 3 dokumen relevan.
Query 'teknologi' menemukan 5 dokumen relevan.
Query 'komputer' menemukan 3 dokumen relevan.
Query 'mobile legends' menemukan 4 dokumen relevan.
------------------------------
=== Hasil Evaluasi IR (Format Laporan) ===
Metode: BM25 Title+Content
----------------------------------------


Unnamed: 0,Query,Relevan (Top-10),Precision,Recall,F1-Score
0,gemini ai,"8/10 (dok. #170, #277, #496, #316, #168, #25, ...",8/10 = 0.80,8/25 = 0.32,0.46
1,laptop gaming,"3/10 (dok. #78, #352, #687)",3/10 = 0.30,3/3 = 1.00,0.46
2,hack,"2/4 (dok. #99, #454)",2/4 = 0.50,2/3 = 0.67,0.57
3,teknologi,"2/10 (dok. #332, #85)",2/10 = 0.20,2/5 = 0.40,0.27
4,komputer,"2/10 (dok. #437, #284)",2/10 = 0.20,2/3 = 0.67,0.31
5,mobile legends,"4/10 (dok. #29, #487, #450, #162)",4/10 = 0.40,4/4 = 1.00,0.57



=== Rata-rata Evaluasi === (dari semua query)
Precision : 0.40
Recall    : 0.68
F1-score  : 0.44
