All libraries and NLTK resources have been imported/downloaded.
1.12.1.post200
True


[nltk_data] Downloading package wordnet to /home/hanif/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hanif/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
# For data manipulation and analysis
import pandas as pd
import numpy as np
import os
import re

# For Text Preprocessing (NLTK)
import nltk
from nltk.tokenize import word_tokenize # Kita akan memodifikasi cara ini digunakan
from nltk.corpus import stopwords # Ini hanya untuk NLTK stopwords, tapi kita pakai Sastrawi
from nltk.stem import PorterStemmer, WordNetLemmatizer # Mungkin tidak terpakai untuk Indo, tapi tetap diimpor

# For Text Preprocessing (Sastrawi for Indonesian language stemming)
# Make sure you've installed it: pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# For Transformer Models (MedicalBERT/XLM-RoBERTa)
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# For Keyword Extraction (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# For Word Embeddings (Gensim)
from gensim.models import Word2Vec

# --- Download NLTK Resources ---
# Kita tetap perlu download beberapa NLTK resources, terutama 'punkt'
# agar word_tokenize (walaupun tidak spesifik Indo) dapat berjalan tanpa LookupError
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

try:
    nltk.data.find('corpora/omw-1.4') # Open Multilingual Wordnet
except LookupError:
    nltk.download('omw-1.4')

print("All libraries and NLTK resources have been imported/downloaded.")
print(f"PyTorch Version: {torch.__version__}"); print(f"CUDA Available: {torch.cuda.is_available()}")

All libraries and NLTK resources have been imported/downloaded.
PyTorch Version: 1.12.1.post200
CUDA Available: True


[nltk_data] Downloading package wordnet to /home/hanif/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hanif/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Mendapatkan current working directory
current_directory = os.getcwd()

print(f"Notebook saat ini berjalan di direktori: {current_directory}")

# Anda juga bisa mencantumkan isi direktori tersebut
print("\nIsi dari direktori saat ini:")
for item in os.listdir(current_directory):
    print(f"- {item}")

Notebook saat ini berjalan di direktori: /home/hanif/ds-ml-projects/medical_chatbot/notebooks

Isi dari direktori saat ini:
- medical_entity_detection.ipynb
- .ipynb_checkpoints


In [3]:
dataset_path = '../data/gejala_penyakit/data_penyakit.csv'

# Verify file existence before attempting to load it
if os.path.exists(dataset_path):
    try:
        df_penyakit = pd.read_csv(dataset_path)
        print("Dataset loaded successfully!")
        print(f"Number of rows and columns: {df_penyakit.shape}")
        print("\nFirst 5 rows of the dataset:")
        print(df_penyakit.head())
        print("\nDataset column information:")
        df_penyakit.info()
        print("\nDataset column names:")
        print(df_penyakit.columns)
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")
else:
    print(f"Error: File not found at {dataset_path}. Please ensure the path and filename are correct.")
    print(f"Your Current Working Directory: {os.getcwd()}")

Dataset loaded successfully!
Number of rows and columns: (100, 2)

First 5 rows of the dataset:
                                            penyakit  \
0                     menggigil, demam, sakit kepala   
1  Kaku kuduk, penurunan kesadaran, muntah proyek...   
2  Mata lengket, mata berair, pandangan sedikit k...   
3   Pipi bengkak, nyeri saat mengunyah, nyeri testis   
4        Gusi bengkak, gusi kemerahan, gusi berdarah   

                              diagnosis  
0              Malaria (bentuk benigma)  
1  Meningitis + perdarahan subarachnoid  
2                                   NaN  
3                             Parotitis  
4                                   NaN  

Dataset column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   penyakit   100 non-null    object
 1   diagnosis  93 non-null     object
dtypes: object(2)
memory us

In [4]:
# Define the model name to load from Hugging Face Model Hub
huggingface_model_name = 'indobenchmark/indobert-base-p1'

# Define the local path where the model and tokenizer will be saved
# This aligns with our project structure: models/medical_bert/model/
local_model_path = '../models/medical_bert/model/'

print(f"Checking for existing model at: {local_model_path}")

try:
    # Check if the model already exists locally
    if os.path.exists(local_model_path) and os.listdir(local_model_path):
        print(f"Loading tokenizer from local path: {local_model_path}")
        tokenizer = AutoTokenizer.from_pretrained(local_model_path)
        print(f"Loading model from local path: {local_model_path}")
        model = AutoModelForTokenClassification.from_pretrained(local_model_path)
        print("Model and tokenizer loaded successfully from local path!")
    else:
        print(f"Local model not found. Downloading from Hugging Face: {huggingface_model_name}")
        # Create the directory if it doesn't exist
        os.makedirs(local_model_path, exist_ok=True)

        # Load the tokenizer from Hugging Face
        tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)
        # Load the model from Hugging Face
        model = AutoModelForTokenClassification.from_pretrained(huggingface_model_name)

        print(f"Saving tokenizer to local path: {local_model_path}")
        tokenizer.save_pretrained(local_model_path)
        print(f"Saving model to local path: {local_model_path}")
        model.save_pretrained(local_model_path)
        print("Model and tokenizer downloaded and saved locally!")


    # Set model to evaluation mode
    model.eval()
    print("Model set to evaluation mode.")

    # Check if GPU is available and move model to GPU
    if torch.cuda.is_available():
        model.to('cuda')
        print("Model moved to GPU.")
    else:
        print("GPU not available, model running on CPU.")

except Exception as e:
    print(f"An error occurred while loading or saving the model/tokenizer: {e}")
    print("Please ensure you have an internet connection if downloading for the first first time.")

Checking for existing model at: ../models/medical_bert/model/
Loading tokenizer from local path: ../models/medical_bert/model/
Loading model from local path: ../models/medical_bert/model/


2025-05-23 10:13:47.698660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model and tokenizer loaded successfully from local path!
Model set to evaluation mode.
Model moved to GPU.


In [5]:
# --- Step 1: Ensure 'data' Folder Exists and Create/Update stopwords_id.txt ---
# This part runs once to prepare the stopwords file on disk.
print("Creating or updating stopwords_id.txt file...")

# Initialize StopWordRemoverFactory to get the default stopwords list
stopword_factory_for_file = StopWordRemoverFactory()
stopwords_list_for_file = stopword_factory_for_file.get_stop_words() # Assign the list here

# Define the path for the data folder (assuming notebook is in project root)
data_folder_path = '../data'
stopwords_filepath = os.path.join(data_folder_path, 'stopwords_id.txt')

# Create the 'data' folder if it doesn't exist
os.makedirs(data_folder_path, exist_ok=True)

# Save the stopwords list to the file
with open(stopwords_filepath, 'w', encoding='utf-8') as f:
    for word in stopwords_list_for_file:
        f.write(f"{word}\n")

print(f"File stopwords_id.txt successfully created/updated at: {stopwords_filepath}")

# --- Step 2: Initialize Sastrawi's Stemmer and Load Stopwords for Preprocessing ---
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Load the stopwords from the file we just created/updated.
# This ensures consistency, even if the notebook is restarted.
with open(stopwords_filepath, 'r', encoding='utf-8') as f:
    stopwords_id_for_preprocessing = f.read().splitlines()

def preprocess_text(text):
    """
    Function to preprocess text:
    1. Lowercasing
    2. Remove non-alphanumeric characters (keep spaces)
    3. Tokenization
    4. Stopword removal (using the loaded list)
    5. Stemming (using Sastrawi)
    6. Filter out single-character or purely digit tokens
    7. Join tokens back into a string
    """
    if not isinstance(text, str):
        return "" # Handle non-string input, e.g., NaN values

    text = text.lower()  # Lowercasing
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters, keep alphanumeric and spaces

    tokens = text.split() # Split the string by spaces

    # Apply stopword removal and stemming in one go
    processed_tokens = [
        stemmer.stem(token) for token in tokens
        if token not in stopwords_id_for_preprocessing # Filter out stop words
        and len(token) > 1 and not token.isdigit() # Filter single-character and purely digit tokens
    ]
    
    return " ".join(processed_tokens) # Join tokens back to a string

print("\nStarting text preprocessing for 'df_penyakit' using Sastrawi...")

# --- Step 3: Apply Preprocessing to DataFrame Columns ---
processed_columns_info = []

# Process 'gejala' column if it exists
if 'penyakit' in df_penyakit.columns: # Using 'gejala' as the original input column name
    df_penyakit['gejala_processed'] = df_penyakit['penyakit'].apply(preprocess_text)
    processed_columns_info.append("'gejala_processed' (from 'penyakit' column)")
else:
    print("Warning: 'penyakit' column not found. Please verify your DataFrame column names.")

# Process 'diagnosis' column if it exists
if 'diagnosis' in df_penyakit.columns:
    df_penyakit['diagnosis_processed'] = df_penyakit['diagnosis'].apply(preprocess_text)
    processed_columns_info.append("'diagnosis_processed' (from 'diagnosis' column)")
else:
    print("Warning: 'diagnosis' column not found. Please verify your DataFrame column names.")

if processed_columns_info:
    print(f"Columns {', '.join(processed_columns_info)} created.")
else:
    print("No relevant columns processed. Please check your DataFrame column names.")

print("\nPreprocessing complete! Displaying first 5 rows with new processed columns:")
print(df_penyakit.head())
print("\nChecking info of processed columns:")
for col_name in ['gejala_processed', 'diagnosis_processed']:
    if col_name in df_penyakit.columns:
        print(f"\nInfo for '{col_name}':")
        df_penyakit[col_name].info()


# --- Step 4: Save the Processed DataFrame (Crucial for Streamlit) ---
# Save the processed DataFrame to a CSV file.
# This output path must be consistent with the path that will be read in your Streamlit app.
output_path = '../data/gejala_penyakit/data_penyakit_processed.csv' # Ensure this path matches what Streamlit reads
df_penyakit.to_csv(output_path, index=False)
print(f"\nProcessed DataFrame has been saved to: {output_path}")

Creating or updating stopwords_id.txt file...
File stopwords_id.txt successfully created/updated at: ../data/stopwords_id.txt

Starting text preprocessing for 'df_penyakit' using Sastrawi...
Columns 'gejala_processed' (from 'penyakit' column), 'diagnosis_processed' (from 'diagnosis' column) created.

Preprocessing complete! Displaying first 5 rows with new processed columns:
                                            penyakit  \
0                     menggigil, demam, sakit kepala   
1  Kaku kuduk, penurunan kesadaran, muntah proyek...   
2  Mata lengket, mata berair, pandangan sedikit k...   
3   Pipi bengkak, nyeri saat mengunyah, nyeri testis   
4        Gusi bengkak, gusi kemerahan, gusi berdarah   

                              diagnosis  \
0              Malaria (bentuk benigma)   
1  Meningitis + perdarahan subarachnoid   
2                                   NaN   
3                             Parotitis   
4                                   NaN   

                          

In [6]:
# Create a list of unique processed symptoms
# Mengambil semua teks gejala yang sudah diproses, lalu memisahkannya menjadi kata-kata unik.
# Flattening the list of lists into a single list of words.
all_symptoms_words = []
for symptoms_text in df_penyakit['gejala_processed'].dropna(): # DropNa to handle potential NaN values
    all_symptoms_words.extend(symptoms_text.split()) # Use split() as we did in preprocessing

# Convert to a set to get unique words, then back to a sorted list
unique_symptoms = sorted(list(set(all_symptoms_words)))

print(f"Total unique symptom words extracted: {len(unique_symptoms)}")
print("First 10 unique symptom words:")
print(unique_symptoms[:10])

# Create a list of unique processed diagnoses/diseases
# Mirip dengan gejala, kita ambil diagnosis yang sudah diproses.
all_diagnosis_words = []
for diagnosis_text in df_penyakit['diagnosis_processed'].dropna(): # DropNa to handle potential NaN values
    all_diagnosis_words.extend(diagnosis_text.split())

unique_diagnosis = sorted(list(set(all_diagnosis_words)))

print(f"\nTotal unique diagnosis words extracted: {len(unique_diagnosis)}")
print("First 10 unique diagnosis words:")
print(unique_diagnosis[:10])

# --- (Opsional) Menggabungkan menjadi satu daftar entitas jika diinginkan ---
# Ini berguna jika Anda ingin membuat satu daftar besar dari semua entitas medis.
# all_medical_entities = sorted(list(set(unique_symptoms + unique_diagnosis)))
# print(f"\nTotal unique medical entities (symptoms + diagnoses): {len(all_medical_entities)}")
# print("First 10 unique medical entities:")
# print(all_medical_entities[:10])

Total unique symptom words extracted: 389
First 10 unique symptom words:
['abdomen', 'air', 'akibat', 'aksila', 'alami', 'alis', 'amandel', 'amis', 'ampas', 'anak']

Total unique diagnosis words extracted: 122
First 10 unique diagnosis words:
['abses', 'akuisita', 'akut', 'alergi', 'anak', 'antrax', 'appendicitis', 'bacterial', 'batuk', 'benigma']


In [7]:
# Initialize TfidfVectorizers for symptoms and diagnoses.
# min_df=1 ensures terms appearing in at least one document are considered.
# max_df=1.0 ensures no upper limit on document frequency (stopwords already handled).
tfidf_vectorizer_gejala = TfidfVectorizer(min_df=1, max_df=1.0)
tfidf_vectorizer_diagnosis = TfidfVectorizer(min_df=1, max_df=1.0)

print("Starting TF-IDF calculation for symptoms and diagnoses...")

# Fit and transform 'gejala_processed' column to TF-IDF matrix.
# .fillna('') handles potential NaN values by treating them as empty strings.
tfidf_matrix_gejala = tfidf_vectorizer_gejala.fit_transform(df_penyakit['gejala_processed'].fillna(''))
print(f"Shape of TF-IDF matrix for symptoms: {tfidf_matrix_gejala.shape}")

# Fit and transform 'diagnosis_processed' column to TF-IDF matrix.
tfidf_matrix_diagnosis = tfidf_vectorizer_diagnosis.fit_transform(df_penyakit['diagnosis_processed'].fillna(''))
print(f"Shape of TF-IDF matrix for diagnoses: {tfidf_matrix_diagnosis.shape}")

# Optional: Save vectorizers and matrices if needed for later use/deployment
# import joblib
# joblib.dump(tfidf_vectorizer_gejala, 'models/word_embeddings/tfidf_vectorizer_gejala.pkl')
# joblib.dump(tfidf_matrix_gejala, 'models/word_embeddings/tfidf_matrix_gejala.pkl')

print("\nTF-IDF calculation completed.")

Starting TF-IDF calculation for symptoms and diagnoses...
Shape of TF-IDF matrix for symptoms: (100, 389)
Shape of TF-IDF matrix for diagnoses: (100, 122)

TF-IDF calculation completed.


In [8]:
# Prepare data for Word2Vec: list of lists of words (tokens)
sentences_for_w2v = []
for text in df_penyakit['gejala_processed'].dropna():
    sentences_for_w2v.append(text.split())
for text in df_penyakit['diagnosis_processed'].dropna():
    sentences_for_w2v.append(text.split())

print(f"Total sentences for Word2Vec training: {len(sentences_for_w2v)}")

# Train the Word2Vec model
# Parameters: vector_size (embedding dimension), window (context window),
# min_count (ignore infrequent words), sg (0: CBOW, 1: Skip-gram), epochs.
print("\nStarting Word2Vec model training...")
word2vec_model = Word2Vec(
    sentences=sentences_for_w2v,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    epochs=100
)
print("Word2Vec model training completed.")

# Save the Word2Vec model to the designated models folder
model_save_path = '../models/word_embeddings/word2vec_medical.model'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
word2vec_model.save(model_save_path)
print(f"Word2Vec model saved to: {model_save_path}")

Total sentences for Word2Vec training: 200

Starting Word2Vec model training...
Word2Vec model training completed.
Word2Vec model saved to: ../models/word_embeddings/word2vec_medical.model


In [9]:
# Optional: Test the Word2Vec model (uncomment to run)
print("\nTesting Word2Vec model:")
try:
    similar_words = word2vec_model.wv.most_similar('demam', topn=5)
    print(f"Words similar to 'demam': {similar_words}")
except KeyError:
    print("Word 'demam' not in model vocabulary.")


Testing Word2Vec model:
Words similar to 'demam': [('serviks', 0.9985533356666565), ('sakit', 0.9983733296394348), ('ruam', 0.9982883334159851), ('capai', 0.9981731176376343), ('hari', 0.9981451034545898)]


In [10]:
def detect_medical_entities(text_input, symptom_list, diagnosis_list):
    """
    Detects known medical symptoms and diagnoses in a given text input.
    The input text is preprocessed for better matching.

    Args:
        text_input (str): The raw text input from the user.
        symptom_list (list): A list of unique preprocessed symptom words.
        diagnosis_list (list): A list of unique preprocessed diagnosis words.

    Returns:
        tuple: A tuple containing two lists:
               - detected_symptoms (list of str): List of detected symptom words.
               - detected_diagnoses (list of str): List of detected diagnosis words.
    """
    if not isinstance(text_input, str):
        return [], []

    # Preprocess the input text using the same logic as df_penyakit
    # Lowercasing and removing non-alphanumeric characters
    processed_input = text_input.lower()
    processed_input = re.sub(r'[^a-z0-9\s]', '', processed_input)

    # Tokenize the processed input
    # Using simple split as done for df_penyakit preprocessing
    input_tokens = processed_input.split()

    detected_symptoms = []
    detected_diagnoses = []

    # Check for symptom matches
    for token in input_tokens:
        if token in symptom_list:
            detected_symptoms.append(token)
    
    # Check for diagnosis matches
    for token in input_tokens:
        if token in diagnosis_list:
            detected_diagnoses.append(token)
            
    # Remove duplicates and return sorted lists
    detected_symptoms = sorted(list(set(detected_symptoms)))
    detected_diagnoses = sorted(list(set(detected_diagnoses)))

    return detected_symptoms, detected_diagnoses

print("Fungsi deteksi entitas medis `detect_medical_entities` telah didefinisikan.")

# --- Contoh Penggunaan Fungsi Deteksi Entitas ---
print("\n--- Contoh Deteksi Entitas ---")
contoh_input = "saya mengalami demam tinggi dan sakit kepala yang hebat sekali"
symptoms, diagnoses = detect_medical_entities(contoh_input, unique_symptoms, unique_diagnosis)

print(f"Input: '{contoh_input}'")
print(f"Gejala terdeteksi: {symptoms}")
print(f"Diagnosis terdeteksi: {diagnoses}")

contoh_input_2 = "kaku kuduk dan penurunan kesadaran itu adalah gejala meningitis"
symptoms_2, diagnoses_2 = detect_medical_entities(contoh_input_2, unique_symptoms, unique_diagnosis)

print(f"\nInput: '{contoh_input_2}'")
print(f"Gejala terdeteksi: {symptoms_2}")
print(f"Diagnosis terdeteksi: {diagnoses_2}")

Fungsi deteksi entitas medis `detect_medical_entities` telah didefinisikan.

--- Contoh Deteksi Entitas ---
Input: 'saya mengalami demam tinggi dan sakit kepala yang hebat sekali'
Gejala terdeteksi: ['demam', 'hebat', 'kepala', 'sakit', 'tinggi']
Diagnosis terdeteksi: ['sakit']

Input: 'kaku kuduk dan penurunan kesadaran itu adalah gejala meningitis'
Gejala terdeteksi: ['gejala', 'kaku', 'kuduk']
Diagnosis terdeteksi: ['meningitis']


In [11]:
# Load pre-trained tokenizer and model (assuming they are saved locally)
# Adjust 'path_to_medical_bert_model' if you saved it elsewhere
path_to_medical_bert_model = "../models/medical_bert/model/"

print(f"Loading tokenizer from: {path_to_medical_bert_model}")
try:
    tokenizer = AutoTokenizer.from_pretrained(path_to_medical_bert_model)
    print(f"Loading model from: {path_to_medical_bert_model}")
    model = AutoModelForTokenClassification.from_pretrained(path_to_medical_bert_model)
    model.eval() # Set model to evaluation mode
    print("MedicalBERT tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading MedicalBERT model or tokenizer: {e}")
    print("Please ensure the model files are correctly located at the specified path.")
    # Exit or handle error gracefully if loading fails
    tokenizer = None
    model = None


def bert_ner(text, tokenizer, model):
    """
    Performs Named Entity Recognition (NER) using a pre-trained BERT model.

    Args:
        text (str): The input text to analyze.
        tokenizer: The pre-trained tokenizer (e.g., AutoTokenizer).
        model: The pre-trained model for token classification (e.g., AutoModelForTokenClassification).

    Returns:
        list: A list of dictionaries, where each dictionary represents a detected entity
              with 'word', 'entity_type', and 'score'.
              The entity_type here will be based on the model's internal labels.
    """
    if tokenizer is None or model is None:
        return [] # Return empty if model/tokenizer failed to load

    if not isinstance(text, str) or not text.strip():
        return []

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predictions (logits) and convert to probabilities
    predictions = torch.softmax(outputs.logits, dim=2).squeeze(0)
    predicted_labels = torch.argmax(predictions, dim=1)

    # Get original tokens and map predicted labels to actual entity tags
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze(0))
    
    # Map label IDs to their string names (e.g., 0 -> 'O', 1 -> 'B-SYMPTOM')
    # This relies on the model's config.id2label dictionary
    id2label = model.config.id2label if hasattr(model.config, 'id2label') else {i: f"LABEL_{i}" for i in range(model.config.num_labels)}

    # Process tokens and labels to extract entities
    detected_entities = []
    
    # Iterate through tokens and their predicted labels
    for i, (token_id, label_id) in enumerate(zip(inputs["input_ids"].squeeze(0), predicted_labels)):
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        label = id2label.get(label_id.item(), "UNKNOWN") # Get label string

        # Skip special tokens ([CLS], [SEP], [PAD])
        if token in tokenizer.all_special_tokens:
            continue
        
        # Reconstruct original words from WordPiece tokens
        if token.startswith("##"):
            # If the previous token was part of an entity, append this subword
            if detected_entities and detected_entities[-1]['end_idx'] == i - 1: # Check if contiguous
                detected_entities[-1]['word'] += token[2:]
                detected_entities[-1]['end_idx'] = i # Update end index
            else: # Should not happen if it's a continuation, but as fallback
                detected_entities.append({"word": token[2:], "entity_type": label, "start_idx": i, "end_idx": i})
        else:
            detected_entities.append({"word": token, "entity_type": label, "start_idx": i, "end_idx": i})
            
    # Refine and filter entities based on common NER patterns (B-I-O)
    # This part depends heavily on the specific NER labels of your MedicalBERT model.
    # For a general BERT model not fine-tuned for medical NER, most labels might be 'O'.
    # If it *is* fine-tuned, labels might be 'B-SYMPTOM', 'I-SYMPTOM', 'B-DISEASE', etc.
    
    # For this exercise, we will just return tokens that are not 'O' (Outside)
    # This assumes 'O' is label ID 0, which is common. Check model.config.id2label.
    final_entities = []
    current_entity_word = ""
    current_entity_type = ""

    for item in detected_entities:
        word = item['word']
        entity_type = item['entity_type']
        
        # If the label is 'O' or UNKNOWN, it's not a recognized entity part
        if entity_type == 'O' or entity_type.startswith('LABEL_0') or entity_type == 'UNKNOWN': # Assuming 'O' is label 0
            # If we had a current entity, add it to final_entities and reset
            if current_entity_word:
                final_entities.append({'word': current_entity_word.strip(), 'entity_type': current_entity_type})
                current_entity_word = ""
                current_entity_type = ""
            continue
            
        # Handle B- (Beginning) and I- (Inside) tags for multi-word entities
        if entity_type.startswith('B-'):
            if current_entity_word: # If there was a previous entity being built, finalize it
                final_entities.append({'word': current_entity_word.strip(), 'entity_type': current_entity_type})
            current_entity_word = word
            current_entity_type = entity_type[2:] # Remove 'B-' prefix
        elif entity_type.startswith('I-'):
            # Ensure it's a continuation of the same type
            if current_entity_word and entity_type[2:] == current_entity_type:
                current_entity_word += " " + word
            else: # Malformed sequence or new single-word I-tag (treat as B- for simplicity here)
                if current_entity_word: # Add previous entity if it exists
                    final_entities.append({'word': current_entity_word.strip(), 'entity_type': current_entity_type})
                current_entity_word = word
                current_entity_type = entity_type[2:]
        else: # Single-word entities or unexpected tags (treat as individual entity)
            if current_entity_word: # Add previous entity if it exists
                final_entities.append({'word': current_entity_word.strip(), 'entity_type': current_entity_type})
            current_entity_word = word
            current_entity_type = entity_type # Use the label as is

    # Add any remaining entity
    if current_entity_word:
        final_entities.append({'word': current_entity_word.strip(), 'entity_type': current_entity_type})

    return final_entities


print("Fungsi `bert_ner` untuk deteksi entitas menggunakan MedicalBERT telah didefinisikan.")

# --- Contoh Penggunaan Fungsi Deteksi Entitas dengan MedicalBERT ---
print("\n--- Contoh Deteksi Entitas dengan MedicalBERT ---")
contoh_input_bert = "pasien mengalami demam tinggi dan batuk berdahak"
bert_output = bert_ner(contoh_input_bert, tokenizer, model)

print(f"Input: '{contoh_input_bert}'")
print("Output BERT (Detected Entities):")
for item in bert_output:
    print(item)

# Contoh lain
contoh_input_bert_2 = "diagnosis adalah meningitis dan terjadi perdarahan subarachnoid"
bert_output_2 = bert_ner(contoh_input_bert_2, tokenizer, model)

print(f"\nInput: '{contoh_input_bert_2}'")
print("Output BERT (Detected Entities):")
for item in bert_output_2:
    print(item)

Loading tokenizer from: ../models/medical_bert/model/
Loading model from: ../models/medical_bert/model/


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


MedicalBERT tokenizer and model loaded successfully.
Fungsi `bert_ner` untuk deteksi entitas menggunakan MedicalBERT telah didefinisikan.

--- Contoh Deteksi Entitas dengan MedicalBERT ---
Input: 'pasien mengalami demam tinggi dan batuk berdahak'
Output BERT (Detected Entities):
{'word': 'pasien', 'entity_type': 'LABEL_2'}
{'word': 'demam', 'entity_type': 'LABEL_2'}
{'word': 'tinggi', 'entity_type': 'LABEL_2'}
{'word': 'dan', 'entity_type': 'LABEL_2'}
{'word': 'batuk', 'entity_type': 'LABEL_2'}
{'word': 'berdahak', 'entity_type': 'LABEL_1'}

Input: 'diagnosis adalah meningitis dan terjadi perdarahan subarachnoid'
Output BERT (Detected Entities):
{'word': 'diagnosis', 'entity_type': 'LABEL_2'}
{'word': 'adalah', 'entity_type': 'LABEL_1'}
{'word': 'meningitis', 'entity_type': 'LABEL_2'}
{'word': 'dan', 'entity_type': 'LABEL_2'}
{'word': 'perdarahan', 'entity_type': 'LABEL_2'}
{'word': 'subarachnoid', 'entity_type': 'LABEL_1'}


In [12]:
# Ensure df_penyakit, unique_symptoms, unique_diagnosis, tokenizer, and model (MedicalBERT)
# are loaded and defined from previous cells.

def get_response(user_input, df, unique_symptoms, unique_diagnosis, bert_tokenizer, bert_model):
    """
    Generates a chatbot response based on user input, leveraging both keyword and BERT-based entity detection.

    Args:
        user_input (str): Raw user query.
        df (pd.DataFrame): DataFrame with 'gejala_processed' and 'diagnosis_processed' columns.
        unique_symptoms (list): List of unique preprocessed symptom words.
        unique_diagnosis (list): List of unique preprocessed diagnosis words.
        bert_tokenizer: Pre-trained BERT tokenizer.
        bert_model: Pre-trained BERT model for token classification.

    Returns:
        str: The chatbot's response.
    """
    if not user_input.strip():
        return "Halo! Ada yang bisa saya bantu terkait kesehatan Anda?"

    # --- Step 1: Keyword-based Entity Detection ---
    # Detect entities using the lookup table method.
    detected_symptoms_kw, detected_diagnoses_kw = detect_medical_entities(
        user_input, unique_symptoms, unique_diagnosis
    )
    
    # --- Step 2: BERT-based Entity Detection ---
    # Detect entities using the MedicalBERT model for contextual understanding.
    bert_entities = bert_ner(user_input, bert_tokenizer, bert_model)
    
    # Combine results. Prioritize keyword matches and add relevant BERT detections.
    # Collect all detected entities from both methods.
    all_detected_keywords = set(detected_symptoms_kw + detected_diagnoses_kw)
    
    for entity in bert_entities:
        # Check if the BERT-detected entity type is not 'O' (Outside) or a default unassigned label.
        # This assumes LABEL_1, LABEL_2, etc., represent actual medical entities in your BERT model.
        if entity['entity_type'] != 'O' and not entity['entity_type'].startswith('LABEL_0'):
            # Add BERT-detected words if they are meaningful (more than 1 character)
            # and not already present from the keyword lookup.
            if len(entity['word']) > 1 and entity['word'] not in all_detected_keywords:
                all_detected_keywords.add(entity['word'])

    all_detected_keywords = list(all_detected_keywords)
    
    response_parts = []

    if detected_symptoms_kw:
        response_parts.append(f"Saya mendeteksi gejala: {', '.join(detected_symptoms_kw)}.")
    
    if detected_diagnoses_kw:
        response_parts.append(f"Saya mendeteksi diagnosis: {', '.join(detected_diagnoses_kw)}.")

    if all_detected_keywords:
        # Attempt to find the most relevant row in the DataFrame.
        # This part could use TF-IDF or BERT Embeddings for more advanced similarity,
        # but for a quick demo, we'll use keyword matching on 'gejala_processed' and 'diagnosis_processed' columns.

        relevant_rows = []
        for index, row in df.iterrows():
            gejala_text = row['gejala_processed']
            diagnosis_text = row['diagnosis_processed']
            
            # Count how many keywords match in the processed symptom or diagnosis texts.
            match_count = 0
            for keyword in all_detected_keywords:
                if keyword in gejala_text.split() or keyword in diagnosis_text.split():
                    match_count += 1
            
            # If there's any match, store the row along with the match count.
            if match_count > 0:
                relevant_rows.append({'row': row, 'matches': match_count})
        
        # Sort by the highest number of matches.
        relevant_rows = sorted(relevant_rows, key=lambda x: x['matches'], reverse=True)

        if relevant_rows:
            top_match_row = relevant_rows[0]['row']
            response_parts.append(f"Berdasarkan informasi yang Anda berikan, saya menduga ini terkait dengan: **{top_match_row['diagnosis']}**.")
            response_parts.append(f"Beberapa gejala terkait penyakit ini: {top_match_row['penyakit']}.")
        else:
            response_parts.append("Saya tidak dapat menemukan diagnosis spesifik berdasarkan gejala yang Anda sebutkan.")
    else:
        response_parts.append("Mohon berikan detail lebih lanjut tentang gejala atau kondisi Anda.")

    final_response = " ".join(response_parts)
    return final_response if final_response else "Maaf, saya tidak memahami pertanyaan Anda. Bisakah Anda mengulanginya dengan lebih jelas?"

print("Fungsi respons chatbot `get_response` telah didefinisikan.")

# --- Contoh Interaksi Chatbot ---
print("\n--- Contoh Interaksi Chatbot ---")
chat_history = []

def chat_with_bot(message):
    print(f"\nAnda: {message}")
    response = get_response(message, df_penyakit, unique_symptoms, unique_diagnosis, tokenizer, model)
    print(f"Bot: {response}")
    chat_history.append({"user": message, "bot": response})
    
# Contoh 1
chat_with_bot("Saya demam dan batuk berdahak.")

# Contoh 2
chat_with_bot("Gejala saya adalah sakit kepala dan muntah.")

# Contoh 3
chat_with_bot("Apakah itu meningitis?")

# Contoh 4
chat_with_bot("Saya tidak tahu.")

# Example 5 (testing BERT's ability for words not in the lookup table, but possibly related)
# This depends on how well your BERT model is trained for medical NER.
# If your BERT is not specifically trained for medical NER, it may not make much difference.
chat_with_bot("Kepala saya terasa pusing sekali dan mual.")

Fungsi respons chatbot `get_response` telah didefinisikan.

--- Contoh Interaksi Chatbot ---

Anda: Saya demam dan batuk berdahak.
Bot: Saya mendeteksi gejala: batuk, demam. Saya mendeteksi diagnosis: batuk. Berdasarkan informasi yang Anda berikan, saya menduga ini terkait dengan: **nan**. Beberapa gejala terkait penyakit ini: Demam, menggigil, suhu tubuh meningkat, batuk berdahak kadang disertai darah, sesak nafas, nyeri dada.

Anda: Gejala saya adalah sakit kepala dan muntah.
Bot: Saya mendeteksi gejala: gejala, kepala, muntah, sakit. Saya mendeteksi diagnosis: sakit. Berdasarkan informasi yang Anda berikan, saya menduga ini terkait dengan: **Meningitis + perdarahan subarachnoid**. Beberapa gejala terkait penyakit ini: Kaku kuduk, penurunan kesadaran, muntah proyektil, sakit kepala.

Anda: Apakah itu meningitis?
Bot: Saya mendeteksi diagnosis: meningitis. Berdasarkan informasi yang Anda berikan, saya menduga ini terkait dengan: **Meningitis + perdarahan subarachnoid**. Beberapa gejal