In [None]:
import os
import json
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from pyserini.search.lucene import LuceneSearcher, querybuilder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#download nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

#PREPROCESSING FUNCTION
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    processed_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(processed_tokens)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ilham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
documents = [
    {"id": "d1", "contents": "The cat chased a small mouse into the garden."},
    {"id": "d2", "contents": "A friendly dog played fetch by the river."},
    {"id": "d3", "contents": "BM25 is a ranking function widely used in search engines."},
    {"id": "d4", "contents": "Boolean retrieval uses logical operators like AND and OR."},
    {"id": "d5", "contents": "TF-IDF weights terms by frequency and rarity."},
    {"id": "d6", "contents": "Neural retrieval uses dense embeddings for semantic search."},
    {"id": "d7", "contents": "The dog and the cat slept on the same couch."},
    {"id": "d8", "contents": "The library hosts a workshop on information retrieval."},
    {"id": "d9", "contents": "Students implemented BM25 and compared it with TF-IDF."},
    {"id": "d10", "contents": "The chef roasted chicken with rosemary and garlic."},
    {"id": "d11", "contents": "A black cat crossed the old stone bridge at night."},
    {"id": "d12", "contents": "Dogs are loyal companions during long hikes."},
    {"id": "d13", "contents": "The dataset contains fifteen short sentences for testing."},
    {"id": "d14", "contents": "Reranking models reorder BM25 candidates using transformers."},
    {"id": "d15", "contents": "The dog sniffed a cat but ignored the mouse."}
]

In [4]:
preprocessed_documents = []
for doc in documents:
    cleaned_contents = preprocess_text(doc['contents'])
    preprocessed_documents.append({"id": doc['id'], "contents": cleaned_contents})

# Simpan ke JSONL agar bisa diindeks Pyserini
os.makedirs('dataset_jsonlgg', exist_ok=True)
file_path = 'dataset_jsonlgg/documents_preprocessed.jsonl'

with open(file_path, 'w') as f:
    for doc in preprocessed_documents:
        f.write(json.dumps(doc) + '\n')

print(f"Dataset berhasil diproses dan disimpan di '{file_path}'\n")

Dataset berhasil diproses dan disimpan di 'dataset_jsonlgg/documents_preprocessed.jsonl'



In [None]:
# 5. TF-IDF COSINE SIMILARITY RETRIEVAL
doc_texts = [doc['contents'] for doc in preprocessed_documents]
doc_ids = [doc['id'] for doc in preprocessed_documents]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doc_texts)

def cosine_similarity_search(query, vectorizer, tfidf_matrix):
    query = preprocess_text(query)
    q_vec = vectorizer.transform([query])
    sim = np.dot(tfidf_matrix, q_vec.T).toarray().flatten()
    ranked = np.argsort(sim)[::-1]
    return [(doc_ids[i], sim[i]) for i in ranked if sim[i] > 0]

In [8]:
import sys
import subprocess
cmd = [
    sys.executable, "-m", "pyserini.index",
    "--collection", "JsonCollection",
    "--input", "dataset_jsonlgg",
    "--index", "my_index",
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "1",
    "--storePositions",
    "--storeDocvectors",
    "--storeRaw"
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

pyserini.index is deprecated, please use pyserini.index.lucene.
2025-10-28 13:52:18,697 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-10-28 13:52:18,699 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-10-28 13:52:18,700 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: dataset_jsonlgg
2025-10-28 13:52:18,700 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-10-28 13:52:18,701 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: my_index
2025-10-28 13:52:18,701 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 1
2025-10-28 13:52:18,701 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
2025-10-28 13:52:18,735 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyze

In [None]:
# 7. LOAD SEARCHER UNTUK BOOLEAN QUERY
from pyserini.search.lucene import querybuilder, LuceneSearcher

try:
    searcher = LuceneSearcher('my_index')
except Exception as e:
    print(f"Error: {e}")
    raise e

# Operator Boolean
MUST = querybuilder.JBooleanClauseOccur['must'].value
SHOULD = querybuilder.JBooleanClauseOccur['should'].value
MUST_NOT = querybuilder.JBooleanClauseOccur['must_not'].value

# Helper untuk membuat query Boolean
def make_query(terms, logic):
    builder = querybuilder.get_boolean_query_builder()
    for term, op in zip(terms, logic):
        builder.add(querybuilder.get_term_query(term), op)
    return builder.build()

# 8. DEFINISI QUERY
queries = {}

queries["dog AND cat"] = make_query(["dog", "cat"], [MUST, MUST])
queries["dog OR cat"] = make_query(["dog", "cat"], [SHOULD, SHOULD])
queries["dog AND NOT cat"] = make_query(["dog", "cat"], [MUST, MUST_NOT])

# (bm25 OR tf-idf) AND retrieval
inner4 = make_query(["bm25", "tf-idf"], [SHOULD, SHOULD])
outer4 = querybuilder.get_boolean_query_builder()
outer4.add(inner4, MUST)
outer4.add(querybuilder.get_term_query("retrieval"), MUST)
queries["(bm25 OR tf-idf) AND retrieval"] = outer4.build()

# model AND (retrieval OR bm25)
inner5 = make_query(["retrieval", "bm25"], [SHOULD, SHOULD])
outer5 = querybuilder.get_boolean_query_builder()
outer5.add(inner5, MUST)
outer5.add(querybuilder.get_term_query("model"), MUST)
queries["model AND (retrieval OR bm25)"] = outer5.build()

queries["mouse AND garden"] = make_query(["mouse", "garden"], [MUST, MUST])
queries["engine OR mouse"] = make_query(["engine", "mouse"], [SHOULD, SHOULD])
queries["sniff OR mouse"] = make_query(["sniff", "mouse"], [SHOULD, SHOULD])


In [None]:
for query_name, query_obj in queries.items():
    print("\n" + "="*80)
    print(f"QUERY: {query_name}")
    print("="*80)

    # BOOLEAN RETRIEVAL
    hits = searcher.search(query_obj)
    print("=== BOOLEAN RETRIEVAL RESULTS ===")
    if hits:
        for hit in hits:
            doc_data = next((d for d in documents if d["id"] == hit.docid), None)
            if doc_data:
                content = doc_data["contents"]
                docid_str = doc_data["id"]
            else:
                content = "N/A"
                docid_str = hit.docid
            print(f" {docid_str} | score={hit.score:.4f} | contents={content}")
    else:
        print(" (tidak ada hasil)")

    # COSINE SIMILARITY (TF-IDF)
    print("\n=== COSINE SIMILARITY (TF-IDF) RESULTS ===")
    cos_results = cosine_similarity_search(query_name, vectorizer, tfidf_matrix)
    if cos_results:
        for docid, score in cos_results[:len(hits) if hits else 5]:
            doc_data = next((d for d in documents if d["id"] == docid), None)
            if doc_data:
                content = doc_data["contents"]
                docid_str = doc_data["id"]
            else:
                content = "N/A"
                docid_str = docid
            print(f" {docid_str} | score={score:.4f} | contents={content}")
    else:
        print(" (tidak ada hasil)")

print("\nSelesai menampilkan hasil untuk semua query.")



QUERY: dog AND cat
=== BOOLEAN RETRIEVAL RESULTS ===
 d7 | score=1.3990 | contents=The dog and the cat slept on the same couch.
 d15 | score=1.3482 | contents=The dog sniffed a cat but ignored the mouse.

=== COSINE SIMILARITY (TF-IDF) RESULTS ===
 d7 | score=0.5946 | contents=The dog and the cat slept on the same couch.
 d15 | score=0.5332 | contents=The dog sniffed a cat but ignored the mouse.

QUERY: dog OR cat
=== BOOLEAN RETRIEVAL RESULTS ===
 d7 | score=1.3990 | contents=The dog and the cat slept on the same couch.
 d15 | score=1.3482 | contents=The dog sniffed a cat but ignored the mouse.
 d1 | score=0.6741 | contents=The cat chased a small mouse into the garden.
 d12 | score=0.6741 | contents=Dogs are loyal companions during long hikes.
 d2 | score=0.6741 | contents=A friendly dog played fetch by the river.
 d11 | score=0.6284 | contents=A black cat crossed the old stone bridge at night.

=== COSINE SIMILARITY (TF-IDF) RESULTS ===
 d7 | score=0.5946 | contents=The dog and the 