# Generate Data Mr. TyDi

## 1. Isi title dan text dari split dev dan test

In [None]:
from datasets import load_dataset

# Download the Mr. TyDi retrieval dataset (Indonesian config); it has 3 splits: train, dev, test
mr_tydi = load_dataset("castorini/mr-tydi", "indonesian")

# Load the matching corpus, which is the source of each passage's title and text.
# NOTE(review): trust_remote_code=True runs the dataset's own loading script — confirm the source is trusted.
corpus = load_dataset("castorini/mr-tydi-corpus", "indonesian", trust_remote_code=True)

# Build an in-memory lookup {docid: (title, text)} from the corpus "train" split for fast access
corpus_dict = {row["docid"]: (row["title"], row["text"]) for row in corpus["train"]}

# Complete the positive_passages of one mr_tydi example using the corpus lookup
def fill_passage_info(example):
    """Fill in title and text for every positive passage of ``example``.

    Each entry of ``example["positive_passages"]`` whose ``docid`` exists in the
    module-level ``corpus_dict`` has its ``"title"`` and ``"text"`` overwritten
    in place. Entries with an unknown ``docid`` are left unchanged.

    Returns:
        The same ``example`` object, so the function can be used with ``Dataset.map``.
    """
    for entry in example["positive_passages"]:
        key = entry["docid"]
        if key not in corpus_dict:
            continue  # docid missing from the corpus: keep the passage as it is
        title, text = corpus_dict[key]
        entry["title"] = title
        entry["text"] = text

    return example

# Apply the fill-in function to the positive_passages of the 'dev' and 'test' splits
mr_tydi["dev"] = mr_tydi["dev"].map(fill_passage_info)
mr_tydi["test"] = mr_tydi["test"].map(fill_passage_info)

# Inspect the first dev example to check the result
mr_tydi["dev"][0]
# print(mr_tydi["test"][0])  # Example after filling

{'query_id': '3',
 'query': 'Dimana James Hepburn meninggal?',
 'positive_passages': [{'docid': '2386357#15',
   'text': 'Dia dipenjarakan di Puri Dragsholm, 75 kilometer Kopenhagen. Dia ditahan dalam apa yang dikatakan sebagai kondisi yang mengerikan. Dia meninggal pada bulan April 1578.[8][10]',
   'title': 'James Hepburn'}],
 'negative_passages': []}

## 2. Ambil 2 passage untuk negative_passages khusus split dev dan test

In [1]:
import faiss
import numpy as np
import torch.nn.functional as F
import torch
from torch import Tensor
from tqdm import tqdm
import gc
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer & model for Multilingual-E5-Small.
# The model is moved to the first CUDA device, so this cell requires a GPU.
model_name = "intfloat/multilingual-e5-small"
embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to("cuda:0")

# Masked average pooling over the token axis
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Average the hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is 0 are zeroed before summing, and the
    sum is divided by the number of unmasked positions per row.

    Args:
        last_hidden_states: Hidden states; assumed (batch, seq_len, hidden) as
            produced by the encoder — TODO confirm.
        attention_mask: Mask with 1 for real tokens and 0 for padding; assumed
            (batch, seq_len).

    Returns:
        One pooled vector per row.
    """
    keep = attention_mask[..., None].bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    token_counts = attention_mask.sum(dim=1)[..., None]
    return zeroed.sum(dim=1) / token_counts

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Make sure the '../generated_data/raw' folder exists; intermediate artifacts are saved there
os.makedirs("../generated_data/raw", exist_ok=True)

# Build the lookup {docid: (title, text)} from the corpus
corpus_dict = {row["docid"]: (row["title"], row["text"]) for row in corpus["train"]}

# Collect every corpus document to embed. corpus_docids[i] is the docid of corpus_texts[i],
# and later of row i in the FAISS index.
corpus_docids = list(corpus_dict.keys())
# corpus_texts = [corpus_dict[docid][1] for docid in corpus_docids]  # Text only
corpus_texts = [f"passage: {corpus_dict[docid][0]} | {corpus_dict[docid][1]}" for docid in corpus_docids]

# Tokenize and embed the whole corpus (done once, for efficiency)
batch_size = 128  # Adjust to the available VRAM
corpus_embeddings = []

for start_idx in tqdm(range(0, len(corpus_texts), batch_size), desc="Encoding Corpus"):
    end_idx = min(start_idx + batch_size, len(corpus_texts))
    batch_texts = corpus_texts[start_idx:end_idx]

    batch_dict = embedding_tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict = {k: v.to("cuda:0") for k, v in batch_dict.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = embedding_model(**batch_dict)

    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)  # L2-normalize so inner product equals cosine similarity
    corpus_embeddings.append(embeddings.to(torch.float32).cpu())  # Cast to float32 and move to CPU

# Concatenate all batches into a single float32 NumPy array
corpus_embeddings = torch.cat(corpus_embeddings, dim=0).numpy().astype(np.float32)  # Convert to NumPy

# Build the FAISS index used for similarity search
index = faiss.IndexFlatIP(corpus_embeddings.shape[1])  # IP = inner product (cosine similarity on the normalized vectors)
index.add(corpus_embeddings)  # Add the corpus embeddings to the index

Encoding Corpus: 100%|██████████| 11480/11480 [1:52:06<00:00,  1.71it/s] 


In [6]:
# Save the FAISS index to disk
faiss.write_index(index, "../generated_data/raw/faiss_index.idx")

In [None]:
import gc
import torch
import torch.nn.functional as F

def add_negative_passages(batch, indices):
    """Attach retrieved negative passages to every query in a batch.

    Meant for ``Dataset.map(..., batched=True, with_indices=True)``. Each query
    is embedded with the module-level ``embedding_tokenizer``/``embedding_model``,
    the module-level FAISS ``index`` is searched for the 7 nearest corpus
    passages, and hits that are not positives of that query are kept as
    negatives, in retrieval order.

    The number of negatives depends on the number of distinct positive docids:
    1 positive -> 2 negatives, 2 positives -> 1 negative, any other count -> none.

    Args:
        batch: Column-oriented batch. Reads ``"query"`` and
            ``"positive_passages"``; ``"negative_passages"`` is overwritten.
        indices: Row indices passed by ``with_indices=True``. Accepted for
            signature compatibility only; not used.

    Returns:
        The same ``batch`` with ``"negative_passages"`` set to one list of
        ``{"docid", "title", "text"}`` dicts per query.
    """
    # E5 models expect the "query: " prefix on the query side
    batch_queries = [f"query: {query}" for query in batch["query"]]

    batch_dict = embedding_tokenizer(batch_queries, max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict = {k: v.to("cuda:0") for k, v in batch_dict.items()}

    with torch.no_grad():
        outputs = embedding_model(**batch_dict)

    query_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    query_embeddings = F.normalize(query_embeddings, p=2, dim=1).cpu().numpy().astype(np.float32)  # (batch, dim)

    # One FAISS search for the whole batch: 7 candidates per query
    _, neighbor_ids = index.search(query_embeddings, 7)

    negative_passages_batch = []
    for row, positives in enumerate(batch["positive_passages"]):
        positive_docids = set(p["docid"] for p in positives)
        num_positive = len(positive_docids)

        # Number of negatives to keep for this query
        if num_positive == 1:
            max_negatives = 2
        elif num_positive == 2:
            max_negatives = 1
        else:  # 3 or more (or none)
            max_negatives = 0

        selected_negative_passages = []
        if max_negatives > 0:
            for doc_idx in neighbor_ids[row]:
                if doc_idx < 0:
                    # FAISS pads missing results with -1; indexing with it would
                    # silently pick the LAST corpus document.
                    continue
                docid = corpus_docids[doc_idx]
                if docid in positive_docids:
                    continue  # a retrieved positive is not a negative
                title, text = corpus_dict[docid]
                selected_negative_passages.append({"docid": docid, "title": title, "text": text})
                if len(selected_negative_passages) == max_negatives:
                    break

        negative_passages_batch.append(selected_negative_passages)

    batch["negative_passages"] = negative_passages_batch

    # Release GPU cache and Python garbage after every batch
    del batch_dict, outputs, query_embeddings
    torch.cuda.empty_cache()
    gc.collect()

    return batch

# Retrieve negatives for the 'dev' and 'test' splits, 16 queries per batch.
# with_indices=True is what supplies the second argument of add_negative_passages.
mr_tydi["dev"] = mr_tydi["dev"].map(add_negative_passages, with_indices=True, batched=True, batch_size=16)
mr_tydi["test"] = mr_tydi["test"].map(add_negative_passages, with_indices=True, batched=True, batch_size=16)

Map: 100%|██████████| 1224/1224 [01:26<00:00, 14.23 examples/s]
Map:  46%|████▋     | 384/829 [00:32<00:26, 16.51 examples/s]

In [None]:
# Persist the filled dataset (all splits) so the cleaning stage can reload it from disk
mr_tydi.save_to_disk("../generated_data/raw/mr_tydi_filled")
print("✅ Dataset berhasil disimpan!")

Saving the dataset (1/1 shards): 100%|██████████| 4902/4902 [00:00<00:00, 72832.02 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1224/1224 [00:00<00:00, 66556.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 829/829 [00:00<00:00, 67840.13 examples/s]

✅ Dataset berhasil disimpan dengan `save_to_disk`!





# Data Cleaning

In [7]:
from datasets import load_from_disk, load_dataset
# Reload the filled Mr. TyDi dataset saved by the generation stage
mr_tydi = load_from_disk("../generated_data/raw/mr_tydi_filled")
# Load the Indonesian TyDi QA gold-passage data, which provides the answers.
# NOTE(review): trust_remote_code=True runs the dataset's own loading script — confirm the source is trusted.
tydiqa_gold = load_dataset("khalidalt/tydiqa-goldp", 'indonesian', trust_remote_code=True)

1. Membuang kolom yang tidak penting (language, document_title, dan passage_text)
2. Mengganti nama kolom (dari yang awalnya id menjadi tydiqa_id, dari yang awalnya question_text ke query)

In [8]:
tydiqa_gold = tydiqa_gold.remove_columns(["language", "document_title", "passage_text"])
# language is redundant (every entry is Indonesian)
# document_title and passage_text are not needed: the positive/negative passages of mr_tydi already carry that information
tydiqa_gold = tydiqa_gold.rename_column("id", "tydiqa_id")
# Rename question_text to query so it matches the mr_tydi column used for joining later
tydiqa_gold = tydiqa_gold.rename_column("question_text", "query")
tydiqa_gold

DatasetDict({
    train: Dataset({
        features: ['tydiqa_id', 'query', 'answers'],
        num_rows: 5702
    })
    validation: Dataset({
        features: ['tydiqa_id', 'query', 'answers'],
        num_rows: 565
    })
})

Untuk mengambil jawaban, maka kita perlu mengekstrak elemen text dari kolom answers (dan meninggalkan elemen start_byte dan limit_byte)

In [9]:
from datasets import DatasetDict

def extract_text(example):
    """Replace the ``answers`` struct of ``example`` with only its ``text`` field.

    The other fields of the struct (start_byte, limit_byte) are dropped.
    The example is modified in place and returned.
    """
    answer_struct = example["answers"]
    example["answers"] = answer_struct["text"]
    return example

# Reduce the answers column to plain text in every split
tydiqa_gold = DatasetDict(
    {name: ds.map(extract_text) for name, ds in tydiqa_gold.items()}
)

1. Membuang karakter whitespace yang berlebihan
2. Khusus untuk TyDi QA, mengambil jawaban yang paling pendek (dalam kasus terdapat beberapa jawaban)

In [10]:
# Two hand-picked ids used to show the data before and after cleaning:
# example1 has a newline inside its answer, example2 has more than one answer.
example1_id = "-2253919563477221294-3"
example2_id = "8601389648636013237-1"
print("BEFORE")
answer_with_unnecessary_whitespace = tydiqa_gold['train'].filter(lambda x: x['tydiqa_id']==example1_id)
print(answer_with_unnecessary_whitespace['query'], ":", answer_with_unnecessary_whitespace[0]['answers'])

instance_with_multiple_answers = tydiqa_gold['validation'].filter(lambda x: x['tydiqa_id']==example2_id)
print(instance_with_multiple_answers['query'], ":", instance_with_multiple_answers[0])

BEFORE
['Siapa yang menciptakan Emosikon?'] : ['Nicolas\nLoufrani']
['siapakah ketua Perum LKBN pertama?'] : {'tydiqa_id': '8601389648636013237-1', 'query': 'siapakah ketua Perum LKBN pertama?', 'answers': ['Mr. Soemanang', 'Soemanang danAdam Malik']}


In [11]:
import re
from datasets import DatasetDict

# Text normalization: remove newlines and redundant whitespace
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
    return " ".join(text.split())

# Clean a TyDi QA example and keep only its shortest answer
def clean_tydiqa(example):
    """Normalize whitespace in the query and reduce ``answers`` to a single string.

    Every answer is cleaned with ``clean_text``; the shortest cleaned answer is
    kept (the first one wins on ties). An example without answers gets ``""``.
    The example is modified in place and returned.
    """
    example["query"] = clean_text(example["query"])

    candidates = list(map(clean_text, example["answers"]))
    if candidates:
        example["answers"] = min(candidates, key=len)  # shortest answer
    else:
        example["answers"] = ""

    return example

# Clean the query of a Mr. TyDi example
def clean_mr_tydi(example):
    """Normalize whitespace in ``example["query"]`` in place and return the example."""
    raw_query = example["query"]
    example["query"] = clean_text(raw_query)
    return example

# Apply the cleaning functions to every split of both datasets
tydiqa_gold_cleaned = DatasetDict({
    split: dataset.map(clean_tydiqa)
    for split, dataset in tydiqa_gold.items()
})

mr_tydi_cleaned = DatasetDict({
    split: dataset.map(clean_mr_tydi)
    for split, dataset in mr_tydi.items()
})

# Re-check the two examples previewed before cleaning:
# the newline inside the answer should be gone ...
check_if_answer_is_cleaned = tydiqa_gold_cleaned['train'].filter(lambda x: x['tydiqa_id']==example1_id)
print(check_if_answer_is_cleaned[0]['answers'])

# ... and the multi-answer example should now hold a single (shortest) answer
check_if_answer_more_than_1 = tydiqa_gold_cleaned['validation'].filter(lambda x: x['tydiqa_id']==example2_id)
print(check_if_answer_more_than_1[0])

Map: 100%|██████████| 4902/4902 [00:00<00:00, 8071.08 examples/s]
Map: 100%|██████████| 1224/1224 [00:00<00:00, 9484.83 examples/s] 
Map: 100%|██████████| 829/829 [00:00<00:00, 8532.12 examples/s]

Nicolas Loufrani
{'tydiqa_id': '8601389648636013237-1', 'query': 'siapakah ketua Perum LKBN pertama?', 'answers': 'Mr. Soemanang'}





# Data Integration

In [12]:
from datasets import DatasetDict, concatenate_datasets

# Combine the train & validation splits of tydiqa_gold_cleaned
tydiqa_gold_combined = concatenate_datasets([tydiqa_gold_cleaned["train"], tydiqa_gold_cleaned["validation"]])

# Lookup {query: row} over the combined TyDi QA data.
# It is the same for every split, so it is built once instead of once per split.
# If a query text occurs more than once, the last row wins.
tydiqa_gold_dict = {row["query"]: row for row in tydiqa_gold_combined}

# New structure that follows the splits of mr_tydi_cleaned
joined_datasets = {}

for split, mr_tydi_split in mr_tydi_cleaned.items():
    # Rows of this split, enriched with the TyDi QA id and answer
    new_split_data = []

    for row in mr_tydi_split:
        query = row["query"]
        tydiqa_data = tydiqa_gold_dict.get(query, None)  # None when the query has no TyDi QA match

        # Unmatched rows keep None for the TyDi QA fields
        merged_row = {
            **row,  # Data from mr_tydi_cleaned
            "tydiqa_id": tydiqa_data["tydiqa_id"] if tydiqa_data else None,
            "answers": tydiqa_data["answers"] if tydiqa_data else None
        }

        new_split_data.append(merged_row)

    # Convert the list of rows back into a Dataset
    joined_datasets[split] = mr_tydi_split.from_list(new_split_data)

# Store the result as a DatasetDict
merged_dataset = DatasetDict(joined_datasets)

print("✅ Dataset berhasil digabungkan berdasarkan `query` dengan struktur mengikuti `mr_tydi_cleaned`.")
# print("✅ Dataset telah disimpan di 'generated_data/mr_tydi_tydiqa_joined'.")

✅ Dataset berhasil digabungkan berdasarkan `query` dengan struktur mengikuti `mr_tydi_cleaned`.


In [13]:
from datasets import DatasetDict

# Drop the rows that have no answer
def remove_none_answers(dataset):
    """Return ``dataset`` filtered down to the rows whose ``answers`` is not None."""
    def has_answer(row):
        return row["answers"] is not None

    return dataset.filter(has_answer)

# Rebuild merged_dataset without the rows whose answers is None
# (i.e. the Mr. TyDi queries that found no match in TyDi QA)
merged_dataset = DatasetDict({
    split_name: remove_none_answers(merged_dataset[split_name])
    for split_name in ("train", "dev", "test")
})

# The message names the variable that actually holds the result
print("✅ Semua row dengan 'answers = None' telah dihapus dari dataset `merged_dataset`.")

Filter: 100%|██████████| 4902/4902 [00:02<00:00, 2162.69 examples/s]
Filter: 100%|██████████| 1224/1224 [00:00<00:00, 14877.13 examples/s]
Filter: 100%|██████████| 829/829 [00:00<00:00, 16170.95 examples/s]

✅ Semua row dengan 'answers = None' telah dihapus dari dataset baru `finished_dataset`.





# Cutting Down Negative Passages dari split `Train` agar hanya 2 Passages saja

In [4]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer & model for Multilingual-E5-Small.
# NOTE(review): this repeats the loading cell of section 2 with the same model name;
# presumably kept so this section can run in a fresh kernel — confirm before removing.
model_name = "intfloat/multilingual-e5-small"
embedding_tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name).to("cuda:0")

In [15]:
import torch.nn.functional as F
import torch
from torch import Tensor
from tqdm import tqdm
import gc
from datasets import load_from_disk


# Masked average pooling over the token axis
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean of the hidden states along dim 1 that skips masked-out positions.

    Entries where ``attention_mask`` is 0 are replaced by 0.0 before the sum;
    the sum is then divided by the per-row count of unmasked positions.
    """
    padding_positions = ~attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(padding_positions, 0.0).sum(dim=1)
    lengths = attention_mask.sum(dim=1)[..., None]
    return summed / lengths

# Keep only the 2 negative passages most similar to the query
# (the train split initially has about 29-30 negative passages per query)
def select_top2_negative_passages(example):
    """Trim ``example["negative_passages"]`` to the 2 most query-similar passages.

    The query and all negative passages are embedded in one forward pass of the
    module-level ``embedding_model``; negatives are ranked by cosine similarity
    to the query and the best two are kept, highest score first. Examples that
    already have 2 or fewer negatives are returned untouched and never reach
    the model.

    Args:
        example: Row with ``"query"`` (str) and ``"negative_passages"``
            (list of dicts with ``"title"`` and ``"text"``).

    Returns:
        The same ``example`` object, with ``"negative_passages"`` possibly shortened.
    """
    negative_passages = example["negative_passages"]

    # Nothing to trim when there are already 2 or fewer
    if len(negative_passages) <= 2:
        return example

    query_text = f'query: {example["query"]}'

    # Double quotes inside the replacement fields: reusing the outer single
    # quote there is a SyntaxError on Python versions before 3.12.
    neg_texts = [f'passage: {neg["title"]} | {neg["text"]}' for neg in negative_passages]

    # Tokenize and embed the query together with its negatives
    batch_dict = embedding_tokenizer([query_text] + neg_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
    batch_dict = {k: v.to("cuda:0") for k, v in batch_dict.items()}

    with torch.no_grad():
        outputs = embedding_model(**batch_dict)

    # Pool and L2-normalize so that the dot product is the cosine similarity
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Similarity of the query (row 0) against every negative (rows 1..)
    query_embedding = embeddings[0].unsqueeze(0)
    neg_embeddings = embeddings[1:]
    scores = (query_embedding @ neg_embeddings.T).squeeze(0)

    # Positions of the 2 highest scores, as plain Python ints for list indexing
    top_indices = torch.argsort(scores, descending=True)[:2].tolist()

    example["negative_passages"] = [negative_passages[i] for i in top_indices]

    # Free GPU cache after each example
    del batch_dict, outputs, embeddings, scores
    torch.cuda.empty_cache()
    gc.collect()

    return example

# Apply the function to the train split only
merged_dataset["train"] = merged_dataset["train"].map(select_top2_negative_passages)

Map: 100%|██████████| 4542/4542 [32:12<00:00,  2.35 examples/s]


## Hanya mengambil top 3 passages
Hal ini bisa dilakukan dengan:
1. Jika positive_passages ada 3, maka ambil ketiga-tiganya
2. Jika positive_passages ada 2, maka ambil 2 positive_passages dan 1 negative_passages
3. Jika positive_passages ada 1, maka ambil 1 positive_passages dan 2 negative_passages

In [None]:
# post_neg_subset = merged_dataset.select_columns(["query_id", "positive_passages", "negative_passages"]) 
# post_neg_subset.save_to_disk("../generated_data/raw/positive_negative_subset")

Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 222839.27 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 36016.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 25952.58 examples/s]


In [16]:
from datasets import DatasetDict
import random

# Build the 'top_3_passages' column
def create_top_3_passages(example):
    """Add ``top_3_passages``: the positives, topped up with negatives to three.

    Rules:
      * 3 positives -> the three positives
      * 2 positives -> both positives + the first negative
      * 1 positive  -> the positive + the first two negatives
      * any other number of positives -> empty list

    When fewer negatives are available than needed, the list simply comes out
    shorter than three instead of raising ``IndexError``; such rows can then be
    reported by a length check downstream.

    Args:
        example: Row with list-valued ``"positive_passages"`` and
            ``"negative_passages"``.

    Returns:
        The same ``example`` with ``"top_3_passages"`` set.
    """
    positive_passages = example["positive_passages"]
    negative_passages = example["negative_passages"]

    missing = 3 - len(positive_passages)
    if missing == 0:
        top_3_passages = positive_passages
    elif missing in (1, 2):
        # Slicing (unlike indexing) tolerates a too-short negative list
        top_3_passages = positive_passages + negative_passages[:missing]
    else:
        top_3_passages = []  # Default when the rules above do not apply

    example["top_3_passages"] = top_3_passages
    return example

# Add the 'top_3_passages' column to the train, dev and test splits
for split_name in ("train", "dev", "test"):
    merged_dataset[split_name] = merged_dataset[split_name].map(create_top_3_passages)

Map: 100%|██████████| 4542/4542 [00:00<00:00, 5382.76 examples/s]
Map: 100%|██████████| 1143/1143 [00:00<00:00, 4348.12 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 4706.00 examples/s]


In [18]:
# Rename the 'answers' column to 'answer' in every split.
# NOTE(review): not re-runnable as-is — after the first run there is no 'answers' column left to rename.
for split in merged_dataset.keys():
    merged_dataset[split] = merged_dataset[split].rename_column("answers", "answer")

In [None]:
# Format every passage as a single "<title> | <text>" string
def format_passages(example, psgs_col='top_3_passages',  title_col='title', text_col='text'):
    """Add a ``passages`` column holding one formatted string per passage.

    Args:
        example: Row containing a list of passage dicts under ``psgs_col``.
        psgs_col: Name of the column with the passage dicts.
        title_col: Key of the title inside each passage dict.
        text_col: Key of the body text inside each passage dict.

    Returns:
        The same ``example`` with ``example['passages']`` set to a list of
        ``"<title> | <text>"`` strings, in the original passage order.
    """
    example['passages'] = [
        f"{psg[title_col]} | {psg[text_col]}" for psg in example[psgs_col]
    ]
    return example

# Add the formatted 'passages' column to every split (default column names are used)
for split in merged_dataset.keys():
    merged_dataset[split] = merged_dataset[split].map(format_passages)

In [None]:
def check_top_3_passages(example):
    """Report whether ``example["top_3_passages"]`` holds exactly three passages.

    Prints a warning containing the example's ``query_id`` when the list is
    empty, missing, or not of length 3.

    Returns:
        True for a valid example, False otherwise.
    """
    passages = example["top_3_passages"]
    is_complete = bool(passages) and len(passages) == 3

    if is_complete:
        return True

    # Incomplete: warn and mark the row as invalid
    print(f"Peringatan: top_3_passages untuk query_id {example['query_id']} tidak lengkap.")
    return False

# Run the check on every row of the train, dev and test splits
train_valid = [check_top_3_passages(example) for example in merged_dataset["train"]]
dev_valid = [check_top_3_passages(example) for example in merged_dataset["dev"]]
test_valid = [check_top_3_passages(example) for example in merged_dataset["test"]]

# Save only when every row of every split is valid; otherwise just report it
if not all(train_valid) or not all(dev_valid) or not all(test_valid):
    print("Terdapat data yang tidak valid di salah satu split. Proses dihentikan.")
else:
    print("Semua data valid. Melanjutkan penyimpanan dataset.")
    # Each column name must be its own list element; a single
    # "negative_passages, top_3_passages" string names a column that does not exist.
    final_dataset = merged_dataset.remove_columns(["positive_passages", "negative_passages", "top_3_passages"])
    final_dataset.save_to_disk("../generated_data/raw/raw_dataset")

Semua data valid. Melanjutkan penyimpanan dataset.


Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 182151.81 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 93957.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 58312.99 examples/s]


# Mengurutkan dan melabeli passages

In [15]:
from datasets import load_dataset
# Load the raw dataset from the Hugging Face Hub.
# NOTE(review): presumably the hosted copy of the dataset saved to ../generated_data/raw/raw_dataset — confirm.
final_dataset = load_dataset('khalidrizki/post-retrieval-research_raw-dataset')
# Tokenizer used below to cap passage length. AutoTokenizer comes from an earlier import cell.
t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [16]:
def truncate_passages(examples):
    """Add ``trunc_passages``: every passage cut to at most 512 T5 tokens.

    Each string in ``examples['passages']`` is tokenized with the module-level
    ``t5_tokenizer`` (truncated to 512 tokens, no special tokens added) and
    decoded back to text with special tokens skipped.

    NOTE(review): ``padding='max_length'`` presumably has no effect on the
    decoded text because padding tokens are skipped on decode — confirm before
    removing it.

    Returns:
        The same ``examples`` object with ``examples['trunc_passages']`` set.
    """
    def _roundtrip(passage):
        # Tokenize with truncation, then turn the ids back into a string
        encoded = t5_tokenizer(passage, padding='max_length', truncation=True, max_length=512, add_special_tokens=False)
        return t5_tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)

    examples['trunc_passages'] = [_roundtrip(passage) for passage in examples['passages']]
    return examples

# Apply the truncation to every split (one example per call, since map is not batched here)
for split in final_dataset.keys():
    final_dataset[split] = final_dataset[split].map(truncate_passages)

Map: 100%|██████████| 4542/4542 [00:13<00:00, 338.61 examples/s]
Map: 100%|██████████| 1143/1143 [00:03<00:00, 370.69 examples/s]
Map: 100%|██████████| 565/565 [00:01<00:00, 383.04 examples/s]


In [17]:
from preprocessing import apply_similarity_ranking_to_dataset
# Rank the passages of every split by similarity score.
# apply_similarity_ranking_to_dataset comes from the local `preprocessing` module; it reads
# "trunc_passages" and writes "ranked_truncPassages_with_labels" (per the arguments below).
# The embedding tokenizer/model loaded earlier are reused, on the device the model already lives on.
for split in final_dataset.keys():
    final_dataset[split] = apply_similarity_ranking_to_dataset(
        final_dataset[split], 
        text_col="trunc_passages",
        output_col="ranked_truncPassages_with_labels", 
        tokenizer=embedding_tokenizer, 
        model=embedding_model, 
        device = embedding_model.device
)

Processing ranked_truncPassages_with_labels: 100%|██████████| 4542/4542 [01:20<00:00, 56.71it/s]
Processing ranked_truncPassages_with_labels: 100%|██████████| 1143/1143 [00:19<00:00, 57.89it/s]
Processing ranked_truncPassages_with_labels: 100%|██████████| 565/565 [00:09<00:00, 57.01it/s]


Memindahkan 578 baris dari split dev ke train

In [18]:
import pandas as pd
from datasets import Dataset

# Number of rows to move from 'dev' to 'train'
num_rows_to_move = 578

# Source split
dev_dataset = final_dataset['dev']

# Take the FIRST num_rows_to_move rows of 'dev'.
# No shuffle is performed, so the selection is deterministic rather than random.
selected_rows = dev_dataset.select(range(num_rows_to_move))

# The remaining rows (from index num_rows_to_move onward) stay in 'dev'
remaining_dev = dev_dataset.select(range(num_rows_to_move, len(dev_dataset)))

# Convert to pandas DataFrames so the two can be concatenated
train_df = final_dataset['train'].to_pandas()
selected_rows_df = selected_rows.to_pandas()

# Append the moved rows after the existing train rows
new_train_df = pd.concat([train_df, selected_rows_df], ignore_index=True)

# Back to a Hugging Face Dataset.
# NOTE(review): the pandas round-trip re-infers the schema; verify the train features still match dev/test.
new_train = Dataset.from_pandas(new_train_df)

# Update the train and dev splits
final_dataset['train'] = new_train
final_dataset['dev'] = remaining_dev

In [16]:
def extract_sorted_passages(row):
    """Return the ``text`` of each entry in ``row['ranked_truncPassages_with_labels']``.

    The order of that column is preserved.
    """
    texts = []
    for entry in row['ranked_truncPassages_with_labels']:
        texts.append(entry['text'])
    return texts

# Use map to add the 'sorted_truncPassages' column to every row of every split
final_dataset = final_dataset.map(lambda row: {'sorted_truncPassages': extract_sorted_passages(row)}, batched=False)

# Check the result: 'sorted_truncPassages' should now be listed
print(final_dataset.column_names)  # Printed BEFORE the rename below, so it still shows 'answer'
# Rename the target column from 'answer' to 'label'
final_dataset = final_dataset.rename_column('answer', 'label')

Map: 100%|██████████| 5120/5120 [00:00<00:00, 5630.19 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 3618.95 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 3861.37 examples/s]

{'train': ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'], 'dev': ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'], 'test': ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages']}





# Push to Huggingface Hub

In [None]:
# Upload every split of final_dataset to this Hub repository.
# NOTE(review): presumably requires being logged in to the Hub with write access — confirm.
final_dataset.push_to_hub("khalidrizki/postretrieve-raw-dataset-v2")

Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 83.72ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.94s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 103.86ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 111.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.48s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/khalidrizki/post-retrieval-research_raw-dataset/commit/2521dd4ed7a0466c17b6e0661c6db86efab7bd61', commit_message='Upload dataset', commit_description='', oid='2521dd4ed7a0466c17b6e0661c6db86efab7bd61', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/khalidrizki/post-retrieval-research_raw-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khalidrizki/post-retrieval-research_raw-dataset'), pr_revision=None, pr_num=None)