In [1]:
import pandas as pd
from pyserini.search.lucene import LuceneSearcher

  from .autonotebook import tqdm as notebook_tqdm


# Membaca DataFrame

In [3]:
# Load the article metadata and discard the two long text columns,
# which are not needed when browsing retrieval results.
df = (
    pd.read_csv('../artikel/all_health_articles.csv')
    .drop(columns=['description', 'content'])
)
df.head()

Unnamed: 0,article_index,title,tag,link,date,source
0,1,5 rawat ginjal sehat cegah sakit ginjal,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,3 September 2025,Biofarma
1,2,7 tips jaga sehat mata,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,29 Agustus 2025,Biofarma
2,3,sepele 10 sebab campak abai,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,28 Agustus 2025,Biofarma
3,4,campak sebab gejala cegah,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,25 Agustus 2025,Biofarma
4,5,mudah jaga sehat gigi mulut hindar dari sakit,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,20 Agustus 2025,Biofarma


# BM25 Retrieval (Title)

In [5]:
# Open the prebuilt Lucene index stored under the title_jsonl directory.
searcher = LuceneSearcher('../indexing/indexes/title_jsonl')

# Sample queries (in Indonesian) used to exercise the searcher.
queries = [
    'gejala diabetes',
    'makanan sehat untuk jantung',
    'cara mencegah hipertensi',
    'efek samping vaksin covid-19',
    'manfaat olahraga yoga',
]

In [6]:
# Helper that prints search hits and returns them as a DataFrame
def display_results(query, hits, articles=None):
    """Print the hits for ``query`` and return the matching rows with scores.

    Each hit is looked up in ``articles`` by comparing ``int(hit.docid)``
    against the ``article_index`` column. Hits without a matching row are
    skipped, but they still consume a rank number, so ``rank`` always
    reflects the position of the hit in ``hits``.

    Args:
        query: Query text, printed as the heading of the report.
        hits: Sequence of objects exposing ``docid`` (convertible to ``int``)
            and ``score`` attributes.
        articles: DataFrame with at least ``article_index`` and ``title``
            columns. Defaults to the notebook-level ``df``.

    Returns:
        A DataFrame holding the matched rows plus ``score`` and ``rank``
        columns. An empty DataFrame is returned when ``hits`` is empty or
        when none of the hits is present in ``articles``, so callers always
        receive the same type.

    Raises:
        ValueError: If a ``docid`` cannot be converted to ``int``.
    """
    # Fall back to the notebook-level frame only when no frame was supplied.
    if articles is None:
        articles = df

    print(f'\n{query}:')

    if not hits:
        print("Tidak ada dokumen yang sesuai.")
        # Same return type as the "no match" path below (previously None).
        return pd.DataFrame()

    # Per-hit slices of `articles`, each annotated with score and rank
    matched_rows = []

    print('Results:')
    for rank, hit in enumerate(hits, start=1):
        # Copy so that adding columns never touches `articles` itself.
        matching_row = articles[articles['article_index'] == int(hit.docid)].copy()

        if matching_row.empty:
            continue

        matching_row['score'] = hit.score
        matching_row['rank'] = rank
        matched_rows.append(matching_row)

        print(f'{rank:2} ID: {hit.docid:4} Score: {hit.score:.5f}')
        print(f'   Title: {matching_row.iloc[0]["title"]}')
        print()

    if not matched_rows:
        print("No matching documents found in DataFrame.")
        return pd.DataFrame()

    # Combine all annotated rows into a single DataFrame
    result_df = pd.concat(matched_rows, ignore_index=True)
    print("Complete Results DataFrame:")
    print(result_df[['rank', 'article_index', 'title', 'score']])
    print("-"*100)
    return result_df

In [8]:
# Try the helper on the first sample query ('gejala diabetes')
query = queries[0]
# Ask the searcher for the top 5 results
hits = searcher.search(query, k=5)

# Print the ranked hits and keep the scored rows for later use
result_df = display_results(query, hits)

# Show the returned frame as this cell's output
print("\nResult DataFrame with scores:")
result_df


gejala diabetes:
Results:
 1 ID: 86   Score: 2.32060
   Title: gejala diabetes melitus tahu

 2 ID: 49   Score: 2.24300
   Title: 5 faktor risiko diabetes tahu

 3 ID: 201  Score: 1.93580
   Title: hari diabetes dunia diabetasol gelar edukasi sehat

 4 ID: 345  Score: 1.90090
   Title: 5 efek samping minum rebus daun ciplukan waspada

 5 ID: 353  Score: 1.90090
   Title: 5 efek samping minum rebus daun ciplukan waspada

Complete Results DataFrame:
   rank  article_index                                              title  \
0     1             86                       gejala diabetes melitus tahu   
1     2             49                      5 faktor risiko diabetes tahu   
2     3            201  hari diabetes dunia diabetasol gelar edukasi s...   
3     4            345   5 efek samping minum rebus daun ciplukan waspada   
4     5            353   5 efek samping minum rebus daun ciplukan waspada   

      score  
0  2.320600  
1  2.243000  
2  1.935800  
3  1.900900  
4  1.900899  


Unnamed: 0,article_index,title,tag,link,date,source,score,rank
0,86,gejala diabetes melitus tahu,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,11 Desember 2024,Biofarma,2.3206,1
1,49,5 faktor risiko diabetes tahu,Kesehatan Umum,https://www.biofarma.co.id/id/announcement/det...,25 Maret 2025,Biofarma,2.243,2
2,201,hari diabetes dunia diabetasol gelar edukasi s...,Berita Kesehatan,https://www.klikdokter.com/info-sehat/berita-k...,Oktober November 2022,Klikdokter,1.9358,3
3,345,5 efek samping minum rebus daun ciplukan waspada,hidup sehat,https://www.alodokter.com/5-efek-samping-minum...,10 September 2025,alodokter,1.9009,4
4,353,5 efek samping minum rebus daun ciplukan waspada,hidup sehat,https://www.alodokter.com/5-efek-samping-minum...,10 September 2025,alodokter,1.900899,5
