In [2]:
#%%
import pandas as pd
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk
from nltk.corpus import stopwords
import sys
import subprocess
import json

In [3]:
# Download stopwords NLTK
nltk.download('stopwords')

# Buat stopwords bahasa Indonesia
stop_words = set(stopwords.words('indonesian'))

# Buat stemmer Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ilham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


###preprocesing

In [None]:
#preprocessing
# 1. Baca file Excel hasil scraping
df = pd.read_excel("artikel-cleaning-final/data_gabungan_normalized.xlsx")   # ganti nama file sesuai hasil scraping

# 2. Hapus duplikat (berdasarkan judul)
df = df.drop_duplicates(subset="title")

# 3. Hapus baris yang tidak ada content-nya
df = df.dropna(subset=["content"])

# 4. Hapus kolom tag (karena kosong)
if "tag" in df.columns:
    df = df.drop(columns=["tag"])

# 5.hapus kolom description(karena kosong)
if "description" in df.columns:
    df = df.drop(columns=["description"])

In [5]:
# 6. Preprocessing teks
def clean_text(text):
    # Case folding
    text = text.lower()

    # Hapus URL/link
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Hapus tanda baca
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Hapus karakter non-ascii (noise)
    text = re.sub(r"[^\x00-\x7f]", " ", text)

    return text

def remove_stopwords(text):
    tokens = text.split()
    filtered = [word for word in tokens if word not in stop_words]
    return " ".join(filtered)

def stemming(text):
    return stemmer.stem(text)

In [6]:
# Terapkan ke kolom title, description, dan content
for col in ["title", "content"]:
    df[col] = df[col].astype(str)  # pastikan string
    df[col] = df[col].apply(clean_text)
    df[col] = df[col].apply(remove_stopwords)
    df[col] = df[col].apply(stemming)

with open("json-file/docs.jsonl", "w", encoding="utf-8") as f:
    for i, row in df.iterrows():
        doc = {
            "id": str(i+1),  
            "contents": f"{row['title']} {row['content']}",  
            "title": row["title"],       
            "link": row["link"],         
            "date": str(row["date"])     
        }
        f.write(json.dumps(doc, ensure_ascii=False) + "\n")

In [7]:
cmd = [
    sys.executable, "-m", "pyserini.index",
    "--collection", "JsonCollection",
    "--input", "json-file",
    "--index", "my_index",
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "1",
    "--storePositions",
    "--storeDocvectors",
    "--storeRaw"
]

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

pyserini.index is deprecated, please use pyserini.index.lucene.
2025-09-23 19:21:30,858 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:205) - Setting log level to INFO
2025-09-23 19:21:30,860 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) - AbstractIndexer settings:
2025-09-23 19:21:30,861 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + DocumentCollection path: json-file
2025-09-23 19:21:30,861 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + CollectionClass: JsonCollection
2025-09-23 19:21:30,862 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Index path: my_index
2025-09-23 19:21:30,862 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Threads: 1
2025-09-23 19:21:30,862 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:214) -  + Optimize (merge segments)? false
2025-09-23 19:21:30,887 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyzer
2025