# **Eksperimen 2**: *Medical Named Entity Recognition* (NER) Menggunakan DistilBERT
**Arsitektur Model:** DistilBERT Base Cased (*Lightweight Model*)

**Tujuan Eksperimen:** Menguji efisiensi komputasi dan akurasi model terkompresi (DistilBERT) dibandingkan dengan model standar (BERT) dalam tugas ekstraksi entitas medis. Dibandingkan BERT, DistilBERT memiliki sekitar 40% lebih sedikit parameter dan diklaim sekitar 60% lebih cepat.

### **Tahap 1: Inisialisasi Pustaka dan Dataset**
Sel ini mengimpor pustaka, memuat data mentah dari folder `bc5cdr` (satu objek JSON per baris), dan menyiapkan pemetaan label kategori.

In [1]:
# --- TAHAP 1: SETUP DATA (MODEL RINGAN) ---
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.optim import AdamW
from tqdm.auto import tqdm

# Pick the compute device: use the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"‚öôÔ∏è Mode Ringan dimulai di: {device}")

# 1. Dataset location (the bc5cdr folder must sit in the working directory).
dataset_path = 'bc5cdr'

def load_json_file(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example an empty line at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate lazily over the file; only non-blank lines are decoded.
        return [json.loads(line) for line in f if line.strip()]

# Read the training and validation splits (one JSON record per line).
train_data = load_json_file(os.path.join(dataset_path, 'train.json'))
valid_data = load_json_file(os.path.join(dataset_path, 'valid.json'))

# 2. Load the label vocabulary.
# The file is opened explicitly as UTF-8 (matching load_json_file) so the
# result does not depend on the platform's default encoding.
with open(os.path.join(dataset_path, 'label.json'), 'r', encoding='utf-8') as f:
    label_map = json.load(f)
# label_map is used directly as label -> id; id2label is its inverse.
id2label = {v: k for k, v in label_map.items()}
label2id = label_map

print(f"‚úÖ Data Siap. Jumlah Training: {len(train_data)}")

  from .autonotebook import tqdm as notebook_tqdm


‚öôÔ∏è Mode Ringan dimulai di: cuda
‚úÖ Data Siap. Jumlah Training: 5228


### **Tahap 2: Konfigurasi Model & Tokenisasi**
Bagian ini mengganti arsitektur menjadi DistilBERT dan melakukan pra-pemrosesan data.

In [2]:
# Checkpoint name shared by the tokenizer and the model weights.
MODEL_CHECKPOINT = "distilbert-base-cased"

print(f"üöÄ Menyiapkan Model Ringan: {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Size the token-classification head from the label maps loaded earlier.
label_config = dict(
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    **label_config,
)
# Move the model parameters onto the selected device.
model.to(device)

# Tokenization helpers (same alignment scheme as the previous experiment).
def _align_word_labels(word_ids, word_labels):
    """Map one sentence's word-level labels onto its sub-token positions.

    Positions without a source word (``None``), continuation pieces of a
    word already labelled, and words that have no entry in ``word_labels``
    all receive ``-100``.
    """
    aligned = []
    prev_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == prev_idx:
            # Special/padding position, or a later piece of the same word.
            aligned.append(-100)
        elif word_idx < len(word_labels):
            aligned.append(word_labels[word_idx])
        else:
            # Fewer tags than words: mask the position rather than fail.
            aligned.append(-100)
        prev_idx = word_idx
    return aligned


def tokenize_and_align_labels(dataset_list, max_length=128, tok=None):
    """Tokenize pre-split sentences and align their tags to sub-tokens.

    Each sentence is truncated/padded to ``max_length``. Only the first
    sub-token of every word keeps the word's tag; every other position is
    set to ``-100`` (the default ``ignore_index`` of PyTorch's
    cross-entropy loss), so it is presumably excluded from the training
    loss.

    Args:
        dataset_list: Sequence of records, each with a ``"tokens"`` list
            (the words) and a ``"tags"`` list (one label id per word).
        max_length: Sequence length passed to the tokenizer.
        tok: Tokenizer to use. It is called with the word lists and must
            return an object that supports ``word_ids(batch_index=...)``
            and item assignment. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per input record.
    """
    if tok is None:
        # Fall back to the tokenizer created at notebook level.
        tok = tokenizer

    all_tokens = [item["tokens"] for item in dataset_list]
    all_tags = [item["tags"] for item in dataset_list]

    tokenized_inputs = tok(
        all_tokens, truncation=True, is_split_into_words=True,
        max_length=max_length, padding="max_length"
    )
    tokenized_inputs["labels"] = [
        _align_word_labels(tokenized_inputs.word_ids(batch_index=i), word_labels)
        for i, word_labels in enumerate(all_tags)
    ]
    return tokenized_inputs

# Encode both splits up front: training data first, then validation data.
print("‚è≥ Tokenisasi Data untuk DistilBERT...")
tokenized_train = tokenize_and_align_labels(dataset_list=train_data)
tokenized_valid = tokenize_and_align_labels(dataset_list=valid_data)

class NERDataset(Dataset):
    """Map-style dataset over a mapping of per-example feature sequences.

    ``encodings`` maps each feature name to an indexable collection with
    one entry per example; it must contain an ``"input_ids"`` entry, which
    determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The example count is taken from the ``input_ids`` feature.
        return len(self.encodings["input_ids"])

    def __getitem__(self, i):
        # Build one example: the i-th row of every feature, as a tensor.
        sample = {}
        for name, rows in self.encodings.items():
            sample[name] = torch.tensor(rows[i])
        return sample

# DataLoaders (batch size raised to 16 because this model is lightweight).
BATCH_SIZE = 16

# Training batches are reshuffled; validation batches keep dataset order.
train_dataset = NERDataset(tokenized_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataset = NERDataset(tokenized_valid)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

print(f"‚úÖ Siap Latih! Batch Size diperbesar ke {BATCH_SIZE} (Lebih ngebut).")

üöÄ Menyiapkan Model Ringan: distilbert-base-cased...


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚è≥ Tokenisasi Data untuk DistilBERT...
‚úÖ Siap Latih! Batch Size diperbesar ke 16 (Lebih ngebut).


### **Tahap 3: Proses Pelatihan (Training Loop)**

In [3]:
# --- STAGE 3: DISTILBERT TRAINING (FAIR COMPARISON MODE) ---
from transformers import get_linear_schedule_with_warmup

# Settings are kept in line with the earlier BERT run.
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=5e-5)  # Same (or slightly adjusted) learning rate

# Scheduler (optional, but the BERT run used one, so this keeps it fair).
# It is stepped once per batch, hence batches-per-epoch * epochs steps.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

print(f"üî• Mulai Training Adil: {EPOCHS} Epochs (Sama dengan BERT)...")

# Switch the model to training mode before the optimisation loop.
model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for batch in pbar:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        # The batch includes "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Guard against exploding gradients (same as the BERT run).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # Explicit None check: set scheduler = None to disable scheduling.
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"‚úÖ Epoch {epoch+1} Selesai. Rata-rata Loss: {avg_loss:.4f}")

# Report the configured epoch count instead of a hard-coded "10".
print(f"üéâ Training DistilBERT ({EPOCHS} Epochs) Selesai!")

üî• Mulai Training Adil: 10 Epochs (Sama dengan BERT)...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1/10: 100%|██████████| 327/327 [00:35<00:00,  9.12it/s, loss=0.141]


‚úÖ Epoch 1 Selesai. Rata-rata Loss: 0.1339


Epoch 2/10: 100%|██████████| 327/327 [00:42<00:00,  7.65it/s, loss=0.253]


‚úÖ Epoch 2 Selesai. Rata-rata Loss: 0.0433


Epoch 3/10: 100%|██████████| 327/327 [00:41<00:00,  7.83it/s, loss=0.00548]


‚úÖ Epoch 3 Selesai. Rata-rata Loss: 0.0193


Epoch 4/10: 100%|██████████| 327/327 [00:42<00:00,  7.66it/s, loss=0.0216]


‚úÖ Epoch 4 Selesai. Rata-rata Loss: 0.0107


Epoch 5/10: 100%|██████████| 327/327 [00:45<00:00,  7.16it/s, loss=0.000298]


‚úÖ Epoch 5 Selesai. Rata-rata Loss: 0.0049


Epoch 6/10: 100%|██████████| 327/327 [00:43<00:00,  7.51it/s, loss=0.00656]


‚úÖ Epoch 6 Selesai. Rata-rata Loss: 0.0026


Epoch 7/10: 100%|██████████| 327/327 [00:42<00:00,  7.73it/s, loss=0.000275]


‚úÖ Epoch 7 Selesai. Rata-rata Loss: 0.0018


Epoch 8/10: 100%|██████████| 327/327 [00:40<00:00,  8.11it/s, loss=0.000103]


‚úÖ Epoch 8 Selesai. Rata-rata Loss: 0.0008


Epoch 9/10: 100%|██████████| 327/327 [00:38<00:00,  8.43it/s, loss=4.99e-5]


‚úÖ Epoch 9 Selesai. Rata-rata Loss: 0.0006


Epoch 10/10: 100%|██████████| 327/327 [00:37<00:00,  8.62it/s, loss=0.000263]

‚úÖ Epoch 10 Selesai. Rata-rata Loss: 0.0008
üéâ Training DistilBERT (10 Epochs) Selesai!





### **Tahap 4: Evaluasi Kinerja**

In [4]:
# --- STAGE 4: COLLECT COMPARISON DATA ---
import time
from seqeval.metrics import classification_report

print("üìä Menghitung Akurasi Model Ringan...")
# Switch the model to evaluation mode before running inference.
model.eval()
pred_list, label_list = [], []

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass without labels; the arg-max over the last axis is
        # taken as the predicted label id of each position.
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        labels = labels.cpu().numpy()

        for pred_row, label_row in zip(predictions, labels):
            # Keep only positions whose gold label is not the -100 mask.
            kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
            pred_list.append([id2label[p] for p, _ in kept])
            label_list.append([id2label[l] for _, l in kept])

# Per-entity precision / recall / F1 computed by seqeval.
print(classification_report(label_list, pred_list))

# --- SPEED TEST (LATENCY) ---
print("\n‚è±Ô∏è Tes Kecepatan (DistilBERT)...")
kalimat_tes = "The patient was prescribed Aspirin for severe hypertension."
inputs = tokenizer(kalimat_tes, return_tensors="pt").to(device)

# Benchmark in evaluation mode so the timing reflects inference behaviour.
model.eval()
loop = 100

with torch.no_grad():
    # Warm-up: run inside no_grad as well so no autograd graphs are built.
    for _ in range(10):
        _ = model(**inputs)
    # CUDA kernels are launched asynchronously; wait for the warm-up work
    # to finish so it is not counted in the measured interval.
    if device.type == 'cuda':
        torch.cuda.synchronize()

    # Benchmark: perf_counter is a monotonic, high-resolution clock.
    start_time = time.perf_counter()
    for _ in range(loop):
        _ = model(**inputs)
    # Wait for all queued GPU work before stopping the clock; otherwise
    # only the kernel-launch overhead would be measured.
    if device.type == 'cuda':
        torch.cuda.synchronize()
    end_time = time.perf_counter()

# Average wall-clock time per forward pass, in milliseconds.
waktu_ms = ((end_time - start_time) / loop) * 1000
print("="*40)
print(f"KECEPATAN DISTILBERT: {waktu_ms:.2f} ms / kalimat")
print("="*40)

üìä Menghitung Akurasi Model Ringan...
              precision    recall  f1-score   support

    Chemical       0.91      0.91      0.91      5325
     Disease       0.78      0.82      0.80      4223

   micro avg       0.85      0.87      0.86      9548
   macro avg       0.84      0.87      0.86      9548
weighted avg       0.85      0.87      0.86      9548


‚è±Ô∏è Tes Kecepatan (DistilBERT)...
KECEPATAN DISTILBERT: 5.13 ms / kalimat


In [5]:
# --- STAGE 5: SAVE MODEL ARTIFACTS ---
output_dir = "./model_distilbert_light_final"

# Save the model first and the tokenizer second, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"‚úÖ Model berhasil disimpan pada direktori: {output_dir}")

‚úÖ Model berhasil disimpan pada direktori: ./model_distilbert_light_final
