In [35]:
import json
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

In [36]:
pd.set_option('display.max_colwidth', 500)

In [37]:
istilah_kesehatan = {
    # Tekanan Darah
    "hipertensi": "tekanan darah tinggi",
    "hipotensi": "tekanan darah rendah",

    # Gula Darah
    "hiperglikemia": "kadar gula darah tinggi",
    "hipoglikemia": "kadar gula darah rendah",
    "diabetes melitus": "penyakit kencing manis",

    # Pernapasan
    "dispnea": "sesak napas",
    "rinorea": "pilek atau ingusan",
    "epistaksis": "mimisan",
    "faringitis": "radang tenggorokan",
    "laringitis": "radang pita suara",
    "asma": "bengek",

    # Pencernaan
    "dispepsia": "mag atau gangguan pencernaan",
    "konstipasi": "sembelit atau susah buang air besar",
    "diare": "mencret",
    "hemoroid": "wasir atau ambeien",
    "apendisitis": "radang usus buntu",
    "gastritis": "radang lambung",

    # Kulit dan Alergi
    "urtikaria": "biduran atau kaligata",
    "dermatitis": "eksim atau radang kulit",
    "varisela": "cacar air",
    "morbili": "campak atau tampek",
    "veruka": "kutil",
    "tinea pedis": "kutu air",
    "lotion": "losion",

    # Kepala dan Saraf
    "sefalgia": "sakit kepala atau pusing",
    "insomnia": "susah tidur",
    "sinkop": "pingsan",
    "konvulsi": "kejang",

    # Umum
    "pireksia": "demam",
    "hipertermia": "suhu tubuh sangat tinggi",
    "hipotermia": "suhu tubuh sangat rendah",
    "mialgia": "nyeri otot",
    "artralgia": "nyeri sendi",
    "fatik": "kelelahan atau rasa capai",
    "edema": "bengkak",
    "pruritus": "gatal-gatal",
    "anemia": "kurang darah",
    "karsinoma": "kanker",
    "neoplasma": "tumor",
    "halitosis": "bau mulut",
    "kalkulus renal": "batu ginjal",
    "moisturizing": "moisturizer"
}

In [38]:
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Fungsi untuk stemming
def stem_tokens(tokens):
    lemmas = [stemmer.stem(token) for token in tokens]
    return lemmas

In [39]:
def normalize_text(text, dictionary):
    tokens = text.split()

    normalized_tokens = []
    for token in tokens:
        normalized_token = dictionary.get(token, token)
        normalized_tokens.append(normalized_token)

    return " ".join(normalized_tokens)

In [56]:
def preprocess_query(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join(stem_tokens(text.split()))
    text = normalize_text(text, istilah_kesehatan)

    return text

In [61]:
def get_ground_truth_ids(jsonl_path, queries, field='contents'):
    # Load all documents
    docs = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            docs.append(json.loads(line))
    
    # For each query, find matching doc ids (no duplicates)
    ground_truth = {}
    for query in queries:
        matches = set()
        q_lower = query.lower()
        q_lower = preprocess_query(q_lower)
        q_lower = q_lower.split()
        print(f"Processed query: {q_lower}")
        for doc in docs:
            for q in q_lower:
                if field in doc and q in doc[field].lower():
                    matches.add(doc['id'])  # Use set to avoid duplicates
        ground_truth[query] = list(matches)
    return ground_truth

In [57]:
queries = [
    'gejala diabetes',
    'makanan sehat untuk jantung',
    'kesehatan mental',
    'efek samping vaksin covid-19',
    'manfaat buah'
]

In [66]:
def pretty_print_ground_truth(ground_truth):
    df = pd.DataFrame([
        {"Query": query, "Ground Truth Doc IDs": ", ".join(map(str, doc_ids)) if doc_ids else "-"}
        for query, doc_ids in ground_truth.items()
    ])
    df['Total'] = df['Ground Truth Doc IDs'].apply(lambda x: len(x.split(', ')) if x != '-' else 0)
    return df

# Title

In [67]:
jsonl_path = '../indexing/title/index_title.jsonl'
ground_truth = get_ground_truth_ids(jsonl_path, queries, field='contents')
pretty_print_ground_truth(ground_truth)

Processed query: ['gejala', 'diabetes']
Processed query: ['makan', 'sehat', 'jantung']
Processed query: ['sehat', 'mental']
Processed query: ['efek', 'samping', 'vaksin', 'covid-19']
Processed query: ['manfaat', 'buah']


Unnamed: 0,Query,Ground Truth Doc IDs,Total
0,gejala diabetes,"4, 6, 9, 400, 21, 410, 414, 417, 164, 40, 41, 42, 49, 51, 325, 72, 201, 74, 76, 77, 78, 80, 84, 86, 91, 221, 350, 96, 353, 103, 104, 105, 106, 107, 367",35
1,makanan sehat untuk jantung,"1, 2, 5, 7, 8, 13, 15, 16, 17, 18, 19, 20, 23, 25, 27, 28, 29, 30, 34, 35, 36, 38, 39, 43, 44, 46, 48, 50, 52, 54, 55, 56, 58, 59, 60, 61, 63, 64, 65, 68, 69, 70, 71, 73, 75, 79, 82, 87, 88, 89, 90, 92, 93, 94, 97, 98, 100, 101, 102, 108, 119, 127, 129, 138, 144, 146, 149, 150, 151, 155, 156, 157, 165, 166, 172, 173, 174, 180, 184, 186, 188, 189, 201, 204, 207, 209, 232, 234, 249, 250, 251, 253, 258, 260, 261, 263, 264, 271, 277, 283, 292, 306, 307, 313, 328, 334, 339, 354, 356, 366, 369, 37...",119
2,kesehatan mental,"1, 2, 5, 7, 8, 13, 15, 16, 17, 18, 19, 20, 23, 25, 27, 28, 29, 30, 34, 35, 36, 38, 39, 43, 44, 46, 48, 50, 52, 54, 55, 56, 58, 59, 60, 61, 63, 64, 65, 68, 69, 70, 71, 73, 75, 79, 82, 87, 88, 89, 90, 92, 93, 94, 97, 98, 100, 101, 102, 108, 127, 129, 138, 144, 146, 149, 150, 151, 155, 157, 165, 166, 172, 173, 174, 180, 184, 186, 188, 189, 201, 204, 207, 208, 209, 234, 250, 251, 260, 261, 263, 264, 271, 277, 283, 292, 306, 307, 313, 328, 334, 339, 354, 366, 370, 382, 401, 411, 419",109
3,efek samping vaksin covid-19,"131, 12, 404, 31, 163, 296, 311, 317, 68, 325, 329, 203, 335, 336, 337, 83, 342, 343, 344, 89, 90, 94, 353, 247, 121",25
4,manfaat buah,"10, 11, 13, 15, 18, 19, 20, 23, 24, 26, 27, 28, 29, 30, 34, 35, 36, 39, 44, 52, 53, 54, 56, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 75, 79, 81, 82, 88, 92, 94, 97, 98, 99, 100, 101, 102, 108, 146, 163, 173, 189, 219, 222, 234, 241, 243, 260, 263, 264, 269, 271, 278, 280, 283, 284, 292, 298, 299, 301, 306, 323, 328, 341, 346, 347, 366, 378",80


# Content

In [68]:
jsonl_path = '../indexing/title_content/index_title_content.jsonl'
ground_truth = get_ground_truth_ids(jsonl_path, queries, field='contents')
pretty_print_ground_truth(ground_truth)

Processed query: ['gejala', 'diabetes']
Processed query: ['makan', 'sehat', 'jantung']
Processed query: ['sehat', 'mental']
Processed query: ['efek', 'samping', 'vaksin', 'covid-19']
Processed query: ['manfaat', 'buah']


Unnamed: 0,Query,Ground Truth Doc IDs,Total
0,gejala diabetes,"1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 19, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 51, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 96, 97, 99, 100, 102, 103, 104, 105, 106, 107, 109, 110, 111, 112, 117, 118, 122, 132, 134, 144, 146, 149, 150, 156, 163, 164, 167, 173, 176, 179, 183, 189, 191, 199, 200, 201, 205, 213, 217, 218, 219, 220, 221, 222, 224, 226...",228
1,makanan sehat untuk jantung,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125...",388
2,kesehatan mental,"1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 1...",360
3,efek samping vaksin covid-19,"1, 3, 4, 9, 11, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 27, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 51, 53, 54, 55, 56, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 77, 79, 83, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 95, 97, 100, 101, 103, 104, 105, 106, 107, 109, 110, 111, 114, 115, 116, 118, 120, 121, 123, 125, 127, 131, 134, 139, 140, 144, 146, 149, 150, 151, 152, 162, 163, 164, 165, 169, 175, 179, 183, 185, 189, 190, 194, 200, 203, 212, 214, 215...",221
4,manfaat buah,"2, 8, 10, 11, 13, 15, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 43, 44, 45, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 75, 79, 81, 82, 87, 88, 89, 90, 91, 92, 93, 94, 97, 98, 99, 100, 101, 102, 103, 108, 110, 111, 112, 115, 117, 118, 119, 121, 124, 127, 128, 129, 130, 131, 135, 137, 138, 139, 140, 141, 142, 145, 146, 147, 149, 150, 151, 154, 156, 158, 159, 160, 161, 162, 163, 164, 165, 167, 171, 173, 179, 180, 181...",232


# Desc

In [69]:
jsonl_path = '../indexing/title_desc_content/index_title_desc_content.jsonl'
ground_truth = get_ground_truth_ids(jsonl_path, queries, field='contents')
pretty_print_ground_truth(ground_truth)

Processed query: ['gejala', 'diabetes']
Processed query: ['makan', 'sehat', 'jantung']
Processed query: ['sehat', 'mental']
Processed query: ['efek', 'samping', 'vaksin', 'covid-19']
Processed query: ['manfaat', 'buah']


Unnamed: 0,Query,Ground Truth Doc IDs,Total
0,gejala diabetes,"1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 16, 19, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 51, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 96, 97, 99, 100, 102, 103, 104, 105, 106, 107, 109, 110, 111, 112, 117, 118, 122, 132, 134, 144, 146, 149, 150, 156, 163, 164, 167, 173, 176, 179, 183, 189, 191, 199, 200, 201, 205, 213, 217, 218, 219, 220, 221, 222, 224, 226...",230
1,makanan sehat untuk jantung,"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125...",389
2,kesehatan mental,"1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 1...",361
3,efek samping vaksin covid-19,"1, 3, 4, 9, 11, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 27, 31, 32, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 51, 53, 54, 55, 56, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 77, 79, 83, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94, 95, 97, 100, 101, 103, 104, 105, 106, 107, 109, 110, 111, 114, 115, 116, 118, 120, 121, 123, 125, 127, 131, 134, 139, 140, 144, 146, 149, 150, 151, 152, 162, 163, 164, 165, 169, 175, 179, 183, 185, 189, 190, 194, 200, 203, 212, 214, 215...",221
4,manfaat buah,"2, 8, 10, 11, 13, 15, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 43, 44, 45, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 75, 79, 81, 82, 87, 88, 89, 90, 91, 92, 93, 94, 97, 98, 99, 100, 101, 102, 103, 108, 110, 111, 112, 115, 117, 118, 119, 121, 124, 127, 128, 129, 130, 131, 135, 137, 138, 139, 140, 141, 142, 145, 146, 147, 149, 150, 151, 154, 156, 158, 159, 160, 161, 162, 163, 164, 165, 167, 171, 173, 179, 180, 181...",235
