In [6]:
import sys
sys.path.append('..') 

# EDA

# Preprocessing

## Retrieval Khusus Mr. TyDi

Isi title dan text dari split dev dan test

In [1]:
from datasets import load_dataset

mr_tydi = load_dataset("castorini/mr-tydi", "indonesian")
corpus = load_dataset("castorini/mr-tydi-corpus", "indonesian", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# SAMPLING - HAPUS NANTI
mr_tydi['dev'] = mr_tydi['dev'].select(range(5))
mr_tydi["test"] = mr_tydi["test"].select(range(5))

In [3]:
corpus_dict = {row["docid"]: (row["title"], row["text"]) for row in corpus["train"]}

def fill_passage_info(example):
    for passage in example["positive_passages"]:
        docid = passage["docid"]
        if docid in corpus_dict:  # Cek apakah docid ada di corpus
            passage["title"], passage["text"] = corpus_dict[docid]
    
    return example

# Terapkan fungsi untuk melengkapi positive_passages di split 'dev' dan 'test'
mr_tydi["dev"] = mr_tydi["dev"].map(fill_passage_info)
mr_tydi["test"] = mr_tydi["test"].map(fill_passage_info)

In [4]:
# SAMPLING - HAPUS NANTI
corpus['train'] = corpus['train'].shuffle(seed=42).select(range(500))
corpus_dict = {row["docid"]: (row["title"], row["text"]) for row in corpus['train']}

Ambil 2 passage untuk negative_passages khusus split dev dan test

In [7]:
import faiss
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from tqdm import tqdm
import gc
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from all_utils import average_pool


model_name = "intfloat/multilingual-e5-small"
embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to("cuda:0")

os.makedirs("./output", exist_ok=True)

# Buat dictionary {docid: (title, text)} untuk lookup cepat dari corpus
corpus_dict = {row["docid"]: (row["title"], row["text"]) for row in corpus["train"]}

# Ambil semua dokumen text dari corpus untuk dijadikan embedding
corpus_docids = list(corpus_dict.keys())
corpus_texts = [f"passage: {corpus_dict[docid][0]} | {corpus_dict[docid][1]}" for docid in corpus_docids]

batch_size = 128  # Sesuaikan dengan VRAM yang tersedia
corpus_embeddings = []

for start_idx in tqdm(range(0, len(corpus_texts), batch_size), desc="Encoding Corpus"):
    end_idx = min(start_idx + batch_size, len(corpus_texts))
    batch_texts = corpus_texts[start_idx:end_idx]

    batch_dict = embedding_tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict = {k: v.to("cuda:0") for k, v in batch_dict.items()}

    with torch.no_grad():
        outputs = embedding_model(**batch_dict)

    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)
    corpus_embeddings.append(embeddings.to(torch.float32).cpu())

corpus_embeddings = torch.cat(corpus_embeddings, dim=0).numpy().astype(np.float32)  # Konversi ke NumPy

# Buat FAISS index untuk pencarian similarity
index = faiss.IndexFlatIP(corpus_embeddings.shape[1])  # IP = Inner Product (Cosine Similarity)
index.add(corpus_embeddings)  # Tambahkan corpus embeddings ke FAISS

# # Simpan index faiss
# faiss.write_index(index, "../generated_data/raw/faiss_index.idx")

Encoding Corpus: 100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


In [8]:
from functools import partial
from preprocess_utils import add_negative_passages
add_neg_psgs = partial(add_negative_passages, embedding_tokenizer=embedding_tokenizer, embedding_model=embedding_model, index=index, corpus_docids=corpus_docids, corpus_dict=corpus_dict)

In [9]:
mr_tydi["dev"] = mr_tydi["dev"].map(add_neg_psgs, with_indices=True, batched=True, batch_size=16)
mr_tydi["test"] = mr_tydi["test"].map(add_neg_psgs, with_indices=True, batched=True, batch_size=16)

## Data Cleaning

In [None]:
from datasets import load_from_disk, load_dataset
from datasets import DatasetDict
import re

# Drop kolom
tydiqa_gold = load_dataset("khalidalt/tydiqa-goldp", 'indonesian', trust_remote_code=True)

# Rename kolom
tydiqa_gold = tydiqa_gold.remove_columns(["language", "document_title", "passage_text"])
tydiqa_gold = tydiqa_gold.rename_column("id", "tydiqa_id")
tydiqa_gold = tydiqa_gold.rename_column("question_text", "query")

# Restrukturisasi kolom
def extract_text(example):
    example["answers"] = example["answers"]["text"] 
    return example

tydiqa_gold = DatasetDict({
    split: dataset.map(extract_text)
    for split, dataset in tydiqa_gold.items()
})

# Fungsi untuk membersihkan teks: hapus newline & whitespace berlebih
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

# Fungsi untuk membersihkan dan memilih jawaban terpendek
def clean_tydiqa(example):
    # Bersihkan query
    example["query"] = clean_text(example["query"])
    
    # Bersihkan answers dan pilih jawaban terpendek jika ada lebih dari satu
    cleaned_answers = [clean_text(ans) for ans in example["answers"]]
    example["answers"] = min(cleaned_answers, key=len) if cleaned_answers else ""  # Pilih jawaban terpendek

    return example

def clean_mr_tydi(example):
    example["query"] = clean_text(example["query"])
    return example

# Terapkan pembersihan pada dataset
tydiqa_gold_cleaned = DatasetDict({
    split: dataset.map(clean_tydiqa)
    for split, dataset in tydiqa_gold.items()
})

mr_tydi_cleaned = DatasetDict({
    split: dataset.map(clean_mr_tydi)
    for split, dataset in mr_tydi.items()
})

for split in tydiqa_gold_cleaned.keys():
    tydiqa_gold_cleaned[split] = tydiqa_gold_cleaned[split].rename_column("answers", "answer")

## Data Integration

In [None]:
from datasets import DatasetDict, concatenate_datasets

tydiqa_gold_combined = concatenate_datasets([tydiqa_gold_cleaned["train"], tydiqa_gold_cleaned["validation"]])

# Buat struktur baru mengikuti split dari mr_tydi_cleaned
joined_datasets = {}

for split, mr_tydi_split in mr_tydi_cleaned.items():
    # Buat dictionary {query: row} dari tydiqa_gold_cleaned untuk lookup cepat
    tydiqa_gold_dict = {row["query"]: row for row in tydiqa_gold_combined}
    
    # Buat daftar baru dengan menggabungkan informasi dari mr_tydi_cleaned dan tydiqa_gold_cleaned
    new_split_data = []
    
    for row in mr_tydi_split:
        query = row["query"]
        tydiqa_data = tydiqa_gold_dict.get(query, None)  # Ambil data dari tydiqa_gold jika ada
        
        # Gabungkan data (jika tidak ada di tydiqa_gold, biarkan bagian tersebut kosong)
        merged_row = {
            **row,  # Data dari mr_tydi_cleaned
            "tydiqa_id": tydiqa_data["tydiqa_id"] if tydiqa_data else None,
            "answer": tydiqa_data["answer"] if tydiqa_data else None
        }
        
        new_split_data.append(merged_row)

    joined_datasets[split] = mr_tydi_split.from_list(new_split_data)

merged_dataset = DatasetDict(joined_datasets)

print("✅ Dataset berhasil digabungkan berdasarkan `query` dengan struktur mengikuti `mr_tydi_cleaned`.")

✅ Dataset berhasil digabungkan berdasarkan `query` dengan struktur mengikuti `mr_tydi_cleaned`.


In [None]:
from datasets import DatasetDict

def remove_none_answers(dataset):
    return dataset.filter(lambda row: row["answer"] is not None)

merged_dataset = DatasetDict({
    "train": remove_none_answers(merged_dataset["train"]),
    "dev": remove_none_answers(merged_dataset["dev"]),
    "test": remove_none_answers(merged_dataset["test"])
})

print("✅ Semua row dengan 'answers = None' telah dihapus dari dataset baru `merged_dataset`.")

Filter: 100%|██████████| 4902/4902 [00:02<00:00, 2440.14 examples/s]
Filter: 100%|██████████| 5/5 [00:00<00:00, 821.09 examples/s]
Filter: 100%|██████████| 5/5 [00:00<00:00, 952.99 examples/s]

✅ Semua row dengan 'answers = None' telah dihapus dari dataset baru `merged_dataset`.





In [18]:
# SAMPLING - HAPUS NANTI
merged_dataset['train'] = merged_dataset['train'].select(range(5))

## Ambil Top 2 Negative Passages pada Split Train

In [19]:
from preprocess_utils import select_top2_negative_passages
select_top2_neg_psgs = partial(select_top2_negative_passages, embedding_tokenizer=embedding_tokenizer, embedding_model = embedding_model)
merged_dataset["train"] = merged_dataset["train"].map(select_top2_neg_psgs)

Map: 100%|██████████| 5/5 [00:02<00:00,  1.86 examples/s]


In [20]:
from preprocess_utils import create_top_3_passages

# Terapkan fungsi ke split train, dev, dan test
merged_dataset["train"] = merged_dataset["train"].map(create_top_3_passages)
merged_dataset["dev"] = merged_dataset["dev"].map(create_top_3_passages)
merged_dataset["test"] = merged_dataset["test"].map(create_top_3_passages)

Map: 100%|██████████| 5/5 [00:00<00:00, 408.09 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 291.96 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 289.63 examples/s]


In [21]:
# Memformat passage agar mengandung string "Judul:...\nTeks:...""
def format_passages(example, psgs_col='top_3_passages',  title_col='title', text_col='text'):
    psgs = example[psgs_col]
    formatted_psgs = []
    for psg in psgs:
        formatted_psgs.append(f"{psg[title_col]} | {psg[text_col]}")

    example['passages'] = formatted_psgs

    return example

for split in merged_dataset.keys():
    merged_dataset[split] = merged_dataset[split].map(format_passages)

Map: 100%|██████████| 5/5 [00:00<00:00, 465.14 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 566.23 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 297.17 examples/s]


## Truncate passages agar panjangnya seragam

In [22]:
t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

def truncate_passages(examples):
    # Tokenisasi setiap passage dalam kolom 'passages' dan batasi panjangnya menjadi 512 token
    truncated_passages = []
    for passage in examples['passages']:
        # Tokenize each passage and truncate it to 512 tokens
        tokenized = t5_tokenizer(passage, padding='max_length', truncation=True, max_length=512, add_special_tokens=False)
        
        # Decode input_ids menjadi string dan tambahkan ke list truncated_passages
        truncated_passages.append(t5_tokenizer.decode(tokenized['input_ids'], skip_special_tokens=True))
    
    examples['trunc_passages'] = truncated_passages
    return examples

# Terapkan fungsi ke dataset
for split in merged_dataset.keys():
    merged_dataset[split] = merged_dataset[split].map(truncate_passages)

Map: 100%|██████████| 5/5 [00:00<00:00, 174.24 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 146.17 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 137.54 examples/s]


## Mengurutkan dan Melabeli Passages

In [23]:
from all_utils import apply_similarity_ranking_to_dataset
# Me-rangking passages berdasarkan skor similarity
for split in merged_dataset.keys():
    merged_dataset[split] = apply_similarity_ranking_to_dataset(
        merged_dataset[split], 
        text_col="trunc_passages",
        output_col="ranked_truncPassages_with_labels", 
        tokenizer=embedding_tokenizer, 
        model=embedding_model, 
        device = embedding_model.device
)

Processing ranked_truncPassages_with_labels: 100%|██████████| 5/5 [00:00<00:00, 33.94it/s]
Processing ranked_truncPassages_with_labels: 100%|██████████| 5/5 [00:00<00:00, 45.82it/s]
Processing ranked_truncPassages_with_labels: 100%|██████████| 4/4 [00:00<00:00, 47.47it/s]


## Distribusi ulang split

In [None]:
import pandas as pd
from datasets import Dataset

# Tentukan jumlah baris yang ingin dipindahkan
num_rows_to_move = 578

# Pilih 577 baris acak dari split 'dev'
dev_dataset = merged_dataset['dev']

selected_rows = dev_dataset.select(range(num_rows_to_move))  # Ambil 578 baris pertama setelah shuffle

# Hapus 577 baris yang sudah dipilih dari 'dev'
remaining_dev = dev_dataset.select(range(num_rows_to_move, len(dev_dataset)))

# Konversi ke DataFrame pandas untuk dapat menggunakan concat
train_df = merged_dataset['train'].to_pandas()
selected_rows_df = selected_rows.to_pandas()

# Gabungkan keduanya dengan pandas.concat
new_train_df = pd.concat([train_df, selected_rows_df], ignore_index=True)

# Kembali ke dataset HuggingFace dari DataFrame
new_train = Dataset.from_pandas(new_train_df)

# Perbarui split train dan dev
merged_dataset['train'] = new_train
merged_dataset['dev'] = remaining_dev

In [None]:
def extract_sorted_passages(row):
    passages = row['ranked_truncPassages_with_labels']
    sorted_texts = [passage['text'] for passage in passages]
    return sorted_texts

final_dataset = merged_dataset.map(lambda row: {'sorted_truncPassages': extract_sorted_passages(row)}, batched=False)

print(final_dataset.column_names)
final_dataset = final_dataset.rename_column('answer', 'label')

Map: 100%|██████████| 5/5 [00:00<00:00, 293.00 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 385.87 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 246.00 examples/s]

{'train': ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answer', 'top_3_passages', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'], 'dev': ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answer', 'top_3_passages', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'], 'test': ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answer', 'top_3_passages', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages']}





In [35]:
final_dataset.save_to_disk("./output/raw_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 238.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 370.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 278.99 examples/s]


# Memperbaiki Label dari Split Test
Labelnya bisa beragam

In [None]:
from datasets import load_dataset

# 1) Load split validation
ds = load_dataset("khalidalt/tydiqa-goldp", "indonesian", split="validation")

def dedup_keep_order(seq):
    seen = set()
    result = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            result.append(x)
    return result

def select_answers(example):
    # ambil answers.text (maks 3 item di dataset ini), simpan versi asli (strip, kapital dipertahankan)
    orig = [t.strip() for t in example["answers"]["text"]]
    if not orig:  # jaga-jaga
        example["selected_answer"] = []
        return example

    # versi normalisasi untuk pengecekan substring
    norm = [t.lower().strip() for t in orig]

    # urutkan indeks berdasarkan panjang normalized (terpendek -> terpanjang)
    order = sorted(range(len(norm)), key=lambda i: len(norm[i]))
    # terpendek
    s1_idx = order[0]
    s1_norm, s1_orig = norm[s1_idx], orig[s1_idx]

    # --- aturan awal: jika s1 ada di semua elemen lain -> hanya s1
    contained_in_all = all(
        (s1_norm in nt) for i, nt in enumerate(norm) if i != s1_idx
    )
    if contained_in_all:
        selected = [s1_orig]
    else:
        selected = None
        if len(norm) >= 2:
            s2_idx = order[1]
            s2_norm, s2_orig = norm[s2_idx], orig[s2_idx]

            s2_not_superset = (s1_norm not in s2_norm)

            if s2_not_superset:
                exists_longer_that_contains_s2 = any(
                    (i != s2_idx) and (len(nt) > len(s2_norm)) and (s2_norm in nt)
                    for i, nt in enumerate(norm)
                )
                if exists_longer_that_contains_s2:
                    selected = [s1_orig, s2_orig]
                else:
                    selected = orig[:]

        if selected is None:
            others = [o for o, nt in zip(orig, norm) if nt is not s1_norm and (s1_norm not in nt)]
            selected = [s1_orig] + others

    # hapus duplikat dengan menjaga urutan
    example["selected_answer"] = dedup_keep_order(selected)
    return example

# 2) Tambahkan kolom selected_answer
ds = ds.map(select_answers)

from pathlib import Path
import json
# 3) Load file JSON berisi pembaruan
json_path = Path("labels_to_be_updated.json")
with open(json_path, "r", encoding="utf-8") as f:
    updates = json.load(f)

# Ubah jadi dict agar lebih cepat diakses
update_map = {item["query"]: item["new_answer"] for item in updates if "query" in item}

import re
def apply_manual_updates(example):
    # Ambil teks pertanyaan dan hilangkan spasi berlebih (di awal, akhir, dan tengah)
    q = example["question_text"]
    q = re.sub(r"\s+", " ", q.strip())  # ganti semua whitespace berturut jadi satu spasi

    # Lakukan hal yang sama juga untuk key di update_map agar konsisten
    normalized_update_map = {re.sub(r"\s+", " ", k.strip()): v for k, v in update_map.items()}

    if q in normalized_update_map:
        example["selected_answer"] = normalized_update_map[q]
    return example

# 5) Terapkan ke dataset
ds = ds.map(apply_manual_updates)

# 6) (Opsional) Simpan dataset hasil update
# ds.save_to_disk("tydiqa_indonesian_updated")

# 7) Contoh cek
print(ds)
print(ds[0]["question_text"])
print(ds[0]["selected_answer"])

from datasets import load_dataset

# dataset test_raw
test_raw = load_dataset("khalidrizki/postretrieve-raw-dataset-v2", split="test")

# buat dictionary mapping id -> selected_answer
id2selected = {row["id"]: row["selected_answer"] for row in ds}
id2selected

def add_label_list(example):
    tid = example["tydiqa_id"]
    if tid in id2selected:
        example["label_list"] = id2selected[tid]
    else:
        example["label_list"] = [example["label"].strip()]
    return example

# tambahkan kolom baru
test_raw = test_raw.map(add_label_list)
test_raw = test_raw.remove_columns(["label"])
# cek hasil
print(test_raw)
print(test_raw[0]["tydiqa_id"], test_raw[0]["label_list"])