In [10]:
import json
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from pyserini.search.lucene import LuceneSearcher
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Query preprocessing setup.
# Indonesian stop-word list from NLTK, held as a set because preprocess_query()
# tests membership once per query word.
stop_words = set(stopwords.words('indonesian'))

# Single Sastrawi stemmer instance, shared by stem_tokens() below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_tokens(tokens):
    """Stem every token with the module-level ``stemmer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in the
        same order as the input.
    """
    return list(map(stemmer.stem, tokens))

def preprocess_query(text):
    """Normalize a raw query string before it is sent to the searcher.

    The text is lowercased and split on whitespace, words found in the
    module-level ``stop_words`` set are dropped, the remaining words are
    stemmed with ``stem_tokens``, and the result is re-joined with single
    spaces.

    Args:
        text: Raw query string.

    Returns:
        The preprocessed query as a single space-separated string.
    """
    kept_words = []
    for word in text.lower().split():
        if word in stop_words:
            continue
        kept_words.append(word)
    return " ".join(stem_tokens(kept_words))

In [12]:
def display_results(query, hits, df):
    """Print a ranked summary of search hits and return the matching rows.

    Each hit is resolved to a document id, looked up in ``df`` by comparing
    against the string form of the ``"id"`` column, and printed as one line
    with its title and score.

    Args:
        query: Query text; printed in the header and stored in the
            ``"query"`` column of the returned frame.
        hits: Sequence of hit objects exposing ``raw`` (JSON string of the
            stored document), ``docid`` and ``score``.
        df: DataFrame with at least the columns ``"id"`` and ``"title"``.

    Returns:
        A DataFrame with the rows of ``df`` that matched a hit, extended with
        ``"score"``, ``"rank"`` (1-based position in ``hits``) and ``"query"``
        columns. An empty DataFrame is returned when ``hits`` is empty or no
        hit could be found in ``df``.
    """
    print(f"\nQuery: {query}")

    if not hits:
        print("Tidak ada dokumen yang sesuai.")
        return pd.DataFrame()

    # The string view of the id column does not depend on the hit, so build
    # it once instead of re-converting the whole column for every hit.
    ids_as_str = df["id"].astype(str)

    results_with_scores = []
    for i, hit in enumerate(hits, start=1):
        # Parse the original document JSON (per the original note, the index
        # stores it because it was built with --storeRaw).
        try:
            raw_id = json.loads(hit.raw).get("id")
        except Exception:
            # Best effort: missing or malformed raw payloads fall back to the
            # searcher's own docid below.
            raw_id = None

        # Fall back to the internal docid when the raw document has no usable
        # id. Always stringify, because the lookup compares against strings.
        docid = str(hit.docid if raw_id is None else raw_id)

        # Look the document up in the DataFrame.
        matching_row = df[ids_as_str == docid].copy()

        if not matching_row.empty:
            matching_row["score"] = hit.score
            matching_row["rank"] = i
            matching_row["query"] = query
            results_with_scores.append(matching_row)

            # Short one-line summary.
            print(f"{i}. {matching_row.iloc[0]['title']} (Score: {hit.score:.4f})")
        else:
            print(f"{i}. [ID {docid}] tidak ditemukan di DataFrame (Score: {hit.score:.4f})")

    if results_with_scores:
        return pd.concat(results_with_scores, ignore_index=True)

    print("No matching documents found in DataFrame.")
    return pd.DataFrame()

In [13]:
# Load the document collection (JSON Lines, one document per line); the path
# is relative to the notebook's working directory.
df = pd.read_json("json-file/docs.jsonl", lines=True)

# Use the index produced by pyserini.index
searcher = LuceneSearcher("my_index")

# List of queries to run
queries = [
    "gemini ai",
    "laptop gaming wajib dibeli",
    "cara agar tidak di hack",
    "teknologi canggih sekarang",
    "komputer terbaik",
    "mobile legend"
]

# One result DataFrame per query that produced at least one matching row.
all_results = []

for q in queries:
    processed_q = preprocess_query(q)   # preprocess the query first
    hits = searcher.search(processed_q, k=10)
    result_df = display_results(q, hits, df)  # still display the original query text
    if not result_df.empty:
        all_results.append(result_df)


Query: gemini ai
1. viral foto polaroid gemini ai bareng medsos bikin contoh prompt (Score: 3.1467)
2. google limit hari gemini ai gratis ai pro ai ultra (Score: 3.1460)
3. gemini ai suntik google drive gambar (Score: 3.1302)
4. viral miniatur ai foto medsos buat via gemini (Score: 3.1248)
5. viral foto polaroid gemini ai orang tua tiada pakai prompt (Score: 3.1212)
6. google sulap chrome browser gemini ai 10 fitur canggih (Score: 3.1140)
7. 8 prompt foto polaroid gemini ai idol kpop tinggal pilih copas (Score: 3.0935)
8. fitur google docs teks ubah audio ai (Score: 3.0927)
9. foto polaroid gemini ai peluk idol kpop viral prompt buat (Score: 3.0848)
10. bikin miniatur ai gerak contoh prompt tarik coba (Score: 3.0809)

Query: laptop gaming wajib dibeli
1. 6 timbang beli laptop chromebook (Score: 4.9612)
2. 10 beda laptop chromebook windows pertimbangankan beli (Score: 4.7194)
3. 6 bikin chromebook batas (Score: 4.6599)
4. laptop chromebook beda laptop windows (Score: 4.6577)
5. anak sa

In [None]:
# Combine the per-query result frames, show them, and export them to Excel.
if all_results:
    combined = pd.concat(all_results, ignore_index=True)

    # Keep only the requested columns plus the originating query.
    output_df = combined[["id", "title", "date", "score", "rank", "query"]]

    print("\n=== Semua Hasil Gabungan ===")
    display(output_df)

    # Optional: save to Excel. to_excel() does not create missing parent
    # directories, so make sure the output folder exists before writing.
    output_path = Path("hasil-query/search_results.xlsx")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_df.to_excel(output_path, engine="openpyxl")
else:
    print("Tidak ada hasil yang ditemukan untuk semua query.")


=== Semua Hasil Gabungan ===


Unnamed: 0,id,title,date,score,rank,query
0,170,viral foto polaroid gemini ai bareng medsos bi...,"Minggu, 14 September 2025",3.1467,1,gemini ai
1,277,google limit hari gemini ai gratis ai pro ai u...,"Selasa, 9 September 2025",3.146,2,gemini ai
2,496,gemini ai suntik google drive gambar,"Jumat, 29 Agustus 2025",3.1302,3,gemini ai
3,316,viral miniatur ai foto medsos buat via gemini,"Senin, 8 September 2025",3.1248,4,gemini ai
4,168,viral foto polaroid gemini ai orang tua tiada ...,"Minggu, 14 September 2025",3.1212,5,gemini ai
5,25,google sulap chrome browser gemini ai 10 fitur...,"Sabtu, 20 September 2025",3.114,6,gemini ai
6,229,8 prompt foto polaroid gemini ai idol kpop tin...,"Kamis, 11 September 2025",3.0935,7,gemini ai
7,490,fitur google docs teks ubah audio ai,"Sabtu, 30 Agustus 2025",3.0927,8,gemini ai
8,236,foto polaroid gemini ai peluk idol kpop viral ...,"Kamis, 11 September 2025",3.0848,9,gemini ai
9,268,bikin miniatur ai gerak contoh prompt tarik coba,"Selasa, 9 September 2025",3.0809,10,gemini ai
