# Load Model, Tokenizer, dan Dataset

In [3]:
from datasets import load_dataset, load_from_disk
dataset = load_dataset("khalidrizki/postretrieve-raw-dataset-v2")
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'label', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'],
        num_rows: 5120
    })
    dev: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'label', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'],
        num_rows: 565
    })
    test: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'label', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels', 'sorted_truncPassages'],
        num_rows: 565
    })
})

In [2]:
import random
from copy import deepcopy
import pandas as pd
from datasets import Dataset

# Membuat salinan dataset
ds = deepcopy(dataset)

# Tentukan jumlah baris yang ingin dipindahkan
num_rows_to_move = 578

# Pilih 577 baris acak dari split 'dev'
dev_dataset = ds['dev']

selected_rows = dev_dataset.select(range(num_rows_to_move))  # Ambil 577 baris pertama setelah shuffle

# Hapus 577 baris yang sudah dipilih dari 'dev'
remaining_dev = dev_dataset.select(range(num_rows_to_move, len(dev_dataset)))

# Konversi ke DataFrame pandas untuk dapat menggunakan concat
train_df = ds['train'].to_pandas()
selected_rows_df = selected_rows.to_pandas()

# Gabungkan keduanya dengan pandas.concat
new_train_df = pd.concat([train_df, selected_rows_df], ignore_index=True)

# Kembali ke dataset HuggingFace dari DataFrame
new_train = Dataset.from_pandas(new_train_df)

# Perbarui split train dan dev
ds['train'] = new_train
ds['dev'] = remaining_dev

# Sekarang, finetuning_dataset['train'] sudah berisi 577 baris tambahan, dan finetuning_dataset['dev'] sudah dikurangi.
ds

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 5120
    })
    dev: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 565
    })
    test: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 565
    })
})

In [3]:
ds.push_to_hub('khalidrizki/postretrieve-raw-dataset')

Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 32.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 31.94ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 43.60ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/khalidrizki/postretrieve-raw-dataset/commit/79184ccb3d078a7810a0ad1282ffd266a9a64719', commit_message='Upload dataset', commit_description='', oid='79184ccb3d078a7810a0ad1282ffd266a9a64719', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/khalidrizki/postretrieve-raw-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khalidrizki/postretrieve-raw-dataset'), pr_revision=None, pr_num=None)

In [1]:
from general_utils import load_model_and_tokenizer

model_name = "Qwen/Qwen3-1.7B" 
model, tokenizer, config = load_model_and_tokenizer(model_name)

print("BERHASIL MELOAD MODEL DAN DATASET\n")

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.01it/s]


BERHASIL MELOAD MODEL DAN DATASET



In [3]:
# for split in dataset.keys():
#     dataset[split] = dataset[split].select(range(5))

dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 4542
    })
    dev: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 1143
    })
    test: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 565
    })
})

# RECOMP

## Membuat dataset finetuning untuk melatih kompresor RECOMP

In [4]:
from copy import deepcopy
recomp_draft_dataset = deepcopy(dataset)

Mengekstrak text & is_positive dari ranked passages

In [5]:
from functools import partial
from general_utils import extract_topk_texts
extract_fn = partial(
    extract_topk_texts, 
    ranked_units='ranked_truncPassages_with_labels', 
    returned_units_col='joined_passages', 
    returned_labels_col='passages_label'
)
for split in recomp_draft_dataset.keys():
    recomp_draft_dataset[split] = recomp_draft_dataset[split].map(extract_fn)

In [None]:
from general_utils import generate_per_row
from tqdm import tqdm
model.eval()
instruction = '{context}\n\nRingkaslah teks di atas menjadi tepat 2 kalimat (40 kata) agar menjawab pertanyaan secara mendetail. TANPA PENGANTAR. Pertanyaan: "{query}' 
for split in recomp_draft_dataset.keys():
    current_dataset = recomp_draft_dataset[split]
    decoded_outputs = []
    summaries = []

    for row in tqdm(current_dataset, desc=f"Summarizing split: {split}"):
        try:
            answer = generate_per_row(  # decoded_output, 
                row, 
                'query', 
                'joined_passages', 
                tokenizer, 
                model, 
                config.device_type, 
                instruction
            )
            # decoded_outputs.append(decoded_output)
            summaries.append(answer)
        except Exception as e:
            raise RuntimeError(
                f"Error during summarization for row:\n"
                f"Query: {row['query']}\n"
                f"Context: {row['joined_passages'][:200]}...\n"
                f"Error message: {str(e)}"
            )

    # dataset[split] = current_dataset.add_column("decoded_output", decoded_outputs)
    recomp_draft_dataset[split] = recomp_draft_dataset[split].add_column("summary", summaries)

In [7]:
recomp_draft_dataset.save_to_disk('generated_data/RECOMP_draft')

Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 87498.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 65990.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 38314.36 examples/s]


In [7]:
from datasets import load_from_disk
recomp_draft_dataset = load_from_disk('generated_data/RECOMP_draft')

In [None]:
from metrics import evaluate_substringmatch_f1
from datasets import Dataset, DatasetDict
from tqdm import tqdm
model.eval()

finetuning_dict = {}

instruction_w_summary = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam dua kalimat. Pertanyaan: {query}"
instruction_wo_summary= "Jawab pertanyaan berikut dalam dua kalimat. Pertanyaan: {query}"
for split in recomp_draft_dataset.keys():
    current_dataset = recomp_draft_dataset[split]
    results = []
    for row in tqdm(current_dataset, desc=f"Generating responses (w/ & wo/ summary) on split: {split}"):
        label = row['answer']
        completion_w_summary = generate_per_row( # decoded_summ_output, 
            row=row, 
            query_col='query', 
            ctx_col='summary', 
            tokenizer=tokenizer, 
            model=model, 
            device_type=config.device_type, 
            instruction=instruction_w_summary
        )
        completion_wo_summary = generate_per_row(  #decoded_wo_summ_output, 
            row=row, 
            query_col='query', 
            ctx_col=None, 
            tokenizer=tokenizer, 
            model=model, 
            device_type=config.device_type, 
            instruction=instruction_wo_summary
        )

        sm_w, f1_w = evaluate_substringmatch_f1(completion_w_summary.strip(), label.strip())
        sm_wo, f1_wo=evaluate_substringmatch_f1(completion_wo_summary.strip(), label.strip())

        if (sm_wo == 1 and sm_wo > sm_w) or (f1_wo >= f1_w):
            final_summary = ""
        else:
            final_summary = row['summary']
        
        results.append(
            {
                "query": row['query'], 
                "passages":row['ranked_truncPassages_with_labels'], 
                "summary": row['summary'], 
                "final_summary": final_summary, 
                "label": label, 
                "model_outputs": {
                    "w_summary": {
                        "completion": completion_w_summary,
                        "em": sm_w,
                        "f1": f1_w, 
                        # "decoded_output": decoded_summ_output
                    },
                    "wo_summary": {
                        "completion": completion_wo_summary,
                        "em": sm_wo,
                        "f1": f1_wo, 
                        # "decoded_output": decoded_wo_summ_output
                    }
                }
            }
        )
    finetuning_dict[split] = Dataset.from_list(results)
finetuning_dataset = DatasetDict(finetuning_dict)


Generating responses (w/ & wo/ summary) on split: train: 100%|██████████| 4542/4542 [5:54:29<00:00,  4.68s/it]  
Generating responses (w/ & wo/ summary) on split: dev: 100%|██████████| 1143/1143 [1:30:02<00:00,  4.73s/it]
Generating responses (w/ & wo/ summary) on split: test: 100%|██████████| 565/565 [45:39<00:00,  4.85s/it] 


In [20]:
import random
from copy import deepcopy
import pandas as pd

# Membuat salinan dataset
ds = deepcopy(finetuning_dataset)

# Tentukan jumlah baris yang ingin dipindahkan
num_rows_to_move = 578

# Pilih 577 baris acak dari split 'dev'
dev_dataset = ds['dev']

selected_rows = dev_dataset.select(range(num_rows_to_move))  # Ambil 577 baris pertama setelah shuffle

# Hapus 577 baris yang sudah dipilih dari 'dev'
remaining_dev = dev_dataset.select(range(num_rows_to_move, len(dev_dataset)))

# Konversi ke DataFrame pandas untuk dapat menggunakan concat
train_df = ds['train'].to_pandas()
selected_rows_df = selected_rows.to_pandas()

# Gabungkan keduanya dengan pandas.concat
new_train_df = pd.concat([train_df, selected_rows_df], ignore_index=True)

# Kembali ke dataset HuggingFace dari DataFrame
new_train = Dataset.from_pandas(new_train_df)

# Perbarui split train dan dev
ds['train'] = new_train
ds['dev'] = remaining_dev

# Sekarang, finetuning_dataset['train'] sudah berisi 577 baris tambahan, dan finetuning_dataset['dev'] sudah dikurangi.
ds

DatasetDict({
    train: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 5120
    })
    dev: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 565
    })
    test: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 565
    })
})

In [23]:
ds = ds.rename_column("generated_answer", "model_outputs")

In [None]:
from functools import partial
import sys
sys.path.append('..')
from general_utils import extract_topk_texts
from datasets import load_dataset

recomp_draft_dataset = load_dataset("khalidrizki/postretrieve-raw-dataset-v2")

extract_fn = partial(
    extract_topk_texts, 
    ranked_units='ranked_truncPassages_with_labels', 
    returned_units_col='joined_passages', 
    returned_labels_col='passages_label'
)
for split in recomp_draft_dataset.keys():
    recomp_draft_dataset[split] = recomp_draft_dataset[split].map(extract_fn)
    
dataset = load_dataset('khalidrizki/RECOMP-finetuning-final')

from datasets import DatasetDict

# Buat dictionary baru dengan split yang sudah dimodifikasi
new_dataset = DatasetDict({
    "train": dataset["train"].add_column("joined_passages", recomp_draft_dataset["train"]["joined_passages"]),
    "dev": dataset["dev"].add_column("joined_passages", recomp_draft_dataset["dev"]["joined_passages"]),
    "test": dataset["test"].add_column("joined_passages", recomp_draft_dataset["test"]["joined_passages"]),
})

repo_ds = "khalidrizki/RECOMP-finetuning-final-fix"
new_dataset.push_to_hub(repo_ds)

## Latihan

In [None]:
# tanpa kemampuan selektif
!python train_summarizer.py \
  --dataset_name khalidrizki/RECOMP-finetuning-final \
  --text_column joined_passages \
  --query_column query \
  --summary_column summary \
  --model_name_or_path google/flan-t5-base \
  --seed 42 \
  --num_train_epochs 3 \
  --per_device_train_batch_size=1 \
  --gradient_accumulation_steps=8 \
  --per_device_eval_batch_size=16 \
  --learning_rate 1e-5 \
  --max_source_length 1620 
  --resize_position_embeddings True
  --max_target_length 52 \
  --output_dir ./models/ \
  --logging_first_step True \
  --do_train \
  --do_eval \
  --predict_with_generate \
  --save_total_limit 3

## Testing

In [1]:
from datasets import load_dataset 
dataset = load_dataset("khalidrizki/RECOMP-finetuning-final-fix")
test_data = dataset["test"]

from general_utils import load_model_and_tokenizer
model_name='Qwen/Qwen3-1.7B'
model, tokenizer, config = load_model_and_tokenizer(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 14.49it/s]


### Selective

In [8]:
from copy import deepcopy
test_data_selective = deepcopy(test_data)
test_data_selective = test_data_selective.rename_column('summary', 'summary_generated_by_qwen_during_training')
test_data_selective = test_data_selective.rename_column('final_summary', 'selected_summary_generated_by_qwen_during_training')

In [None]:
# Muat baris-baris teks dari file
with open("./RECOMP/outputs/RECOMP-selective-fix-2025-06-27_15-12-35/generated_predictions.txt", "r", encoding="utf-8", errors='replace') as f:
    predictions = [line.strip() for line in f.readlines()]

# Menambahkan try-except di bagian assert
try:
    # Pastikan jumlah prediksi sama dengan jumlah data
    assert len(predictions) == len(test_data_selective), f"Jumlah prediksi ({len(predictions)}) tidak sama dengan jumlah data ({len(test_data_selective)})"

except AssertionError:
    # Jika terjadi error, tambahkan string kosong pada predictions
    while len(predictions) < len(test_data_selective):
        predictions.append("")  # Menambahkan string kosong ke predictions


In [10]:
gen_summary_col = "selective_summary_for_testing"
test_data_selective = test_data_selective.add_column(gen_summary_col, predictions)

In [17]:
from metrics import evaluate_substringmatch_f1
from general_utils import generate_per_row
from tqdm import tqdm
model.eval()

results = []
for row in tqdm(test_data_selective, desc=f"TESTING: Generating responses using summary from selective model..."):
    label = row['label']
    instruction = ''
    if test_data_selective[gen_summary_col] == '':
        instruction = "Jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"
    else:
        instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

    completion = generate_per_row( # decoded_summ_output, 
        row=row, 
        query_col='query', 
        ctx_col=gen_summary_col, 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )

    sm, f1 = evaluate_substringmatch_f1(completion.strip(), label.strip())
    
    results.append(
        {
            "query": row['query'], 
            "passages":row['joined_passages'], 
            "summary_used_for_ctx": row[gen_summary_col], 
            "label": label, 
            "completion": completion,
            "em": sm,
            "f1": f1
        }
    )

TESTING: Generating responses using summary from selective model...: 100%|██████████| 565/565 [24:49<00:00,  2.64s/it]


In [18]:
import os
import json

# Tentukan path file output JSON
output_dir = "./outputs/RECOMP"
os.makedirs(output_dir, exist_ok=True)  # Membuat direktori jika belum ada
output_file = os.path.join(output_dir, "SELECTIVE_test_results.json")

# Simpan results ke dalam file JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print(f"Results telah disimpan di {output_file}")

Results telah disimpan di ./outputs/RECOMP\SELECTIVE_test_results.json


In [24]:
from datasets import Dataset
selective_recomp = Dataset.from_list(results)
print("HASIL RECOMP dengan kemampuan selektif")
print("rerata substring match:", sum(selective_recomp['em'])/len(selective_recomp))
print("rerata F1:", sum(selective_recomp['f1'])/len(selective_recomp))

HASIL RECOMP dengan kemampuan selektif
rerata substring match: 0.11327433628318584
rerata F1: 0.11271414083351855


### Unselective

In [19]:
from copy import deepcopy
test_data_unselective = deepcopy(test_data)
test_data_unselective = test_data_unselective.rename_column('summary', 'summary_generated_by_qwen_during_training')
test_data_unselective.remove_columns('final_summary')
# test_data_unselective = test_data_unselective.rename_column('final_summary', 'selected_summary_generated_by_qwen_during_training')

# Muat baris-baris teks dari file
with open("./RECOMP/outputs/RECOMP-unselective-fix-2025-06-27_15-17-40/generated_predictions.txt", "r", encoding="utf-8", errors='replace') as f:
    predictions = [line.strip() for line in f.readlines()]

# Menambahkan try-except di bagian assert
try:
    # Pastikan jumlah prediksi sama dengan jumlah data
    assert len(predictions) == len(test_data_unselective), f"Jumlah prediksi ({len(predictions)}) tidak sama dengan jumlah data ({len(test_data_unselective)})"

except AssertionError:
    # Jika terjadi error, tambahkan string kosong pada predictions
    while len(predictions) < len(test_data_unselective):
        predictions.append("")  # Menambahkan string kosong ke predictions


gen_summary_col = "unselective_summary_for_testing"
test_data_unselective = test_data_unselective.add_column(gen_summary_col, predictions)

In [20]:
from metrics import evaluate_substringmatch_f1
from general_utils import generate_per_row
from tqdm import tqdm
model.eval()

results_un = []
for row in tqdm(test_data_unselective, desc=f"TESTING: Generating responses using summary from unselective model..."):
    label = row['label']
    instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

    completion = generate_per_row( # decoded_summ_output, 
        row=row, 
        query_col='query', 
        ctx_col=gen_summary_col, 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )

    sm, f1 = evaluate_substringmatch_f1(completion.strip(), label.strip())
    
    results_un.append(
        {
            "query": row['query'], 
            "passages":row['joined_passages'], 
            "summary_used_for_ctx": row[gen_summary_col], 
            "label": label, 
            "completion": completion,
            "em": sm,
            "f1": f1
        }
    )

TESTING: Generating responses using summary from unselective model...: 100%|██████████| 565/565 [22:58<00:00,  2.44s/it]


In [25]:
from datasets import Dataset
unselective_recomp = Dataset.from_list(results_un)
print("HASIL RECOMP tanpa kemampuan selektif")
print("rerata substring match:", sum(unselective_recomp['em'])/len(unselective_recomp))
print("rerata F1:", sum(unselective_recomp['f1'])/len(unselective_recomp))

HASIL RECOMP tanpa kemampuan selektif
rerata substring match: 0.2743362831858407
rerata F1: 0.220610480602904


In [21]:
import os
import json

# Tentukan path file output JSON
output_dir = "./outputs/RECOMP"
os.makedirs(output_dir, exist_ok=True)  # Membuat direktori jika belum ada
output_file = os.path.join(output_dir, "UNSelective_test_results.json")

# Simpan results_un ke dalam file JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results_un, f, ensure_ascii=False, indent=4)

print(f"Results telah disimpan di {output_file}")

Results telah disimpan di ./outputs/RECOMP\UNSelective_test_results.json


# CRAG

In [4]:
from datasets import load_dataset
crag_dataset = load_dataset('khalidrizki/postretrieve-raw-dataset')

Downloading readme: 100%|██████████| 838/838 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 15.1M/15.1M [00:02<00:00, 6.60MB/s]
Downloading data: 100%|██████████| 1.56M/1.56M [00:00<00:00, 2.21MB/s]
Downloading data: 100%|██████████| 1.52M/1.52M [00:00<00:00, 1.96MB/s]
Generating train split: 100%|██████████| 5120/5120 [00:00<00:00, 75710.26 examples/s]
Generating dev split: 100%|██████████| 565/565 [00:00<00:00, 28432.04 examples/s]
Generating test split: 100%|██████████| 565/565 [00:00<00:00, 233338.10 examples/s]


In [6]:
import re
from typing import List, Dict, Any, Tuple
from nltk.tokenize import sent_tokenize

def split_between_title_and_text(text: str) -> Tuple[str, str]:
    """Memisahkan bagian Judul dan Teks dari input lengkap."""
    title, content = text.split("|", 1)
    return title.strip(), content.strip()

def split_sentences(text: str) -> List[str]:
    """Hilangkan sitasi dan pecah teks menjadi kalimat-kalimat."""
    cleaned_text = re.sub(r'\[\d+\]', '', text)
    return [s.strip() for s in sent_tokenize(cleaned_text) if s.strip()]

def create_rolling_segments(title: str, sentences: List[str], window_size: int = 3, stride: int = 2) -> List[str]:
    """Buat rolling window segment dari kalimat-kalimat dengan format 'Title | Kalimat…'."""
    segments = []
    if len(sentences) < window_size:
        segments.append(f"{title} | {' '.join(sentences)}")
    else:
        for i in range(0, len(sentences) - window_size + 1, stride):
            group = sentences[i:i + window_size]
            combined = f"{title} | {' '.join(group)}"
            segments.append(combined)
    return segments

def prepare_context_chunks(text: str, is_positive: bool) -> List[Dict[str, Any]]:
    """Proses satu passage menjadi rolling segments dan labelnya."""
    title, content = split_between_title_and_text(text)
    sentences = split_sentences(content)
    segments = create_rolling_segments(title, sentences, window_size=3)
    return [{"text": seg, "is_positive": is_positive} for seg in segments]

def split_each_passages(example):
    ranked_passages = example["ranked_truncPassages_with_labels"]
    all_chunks = []
    all_labels = []

    for psg in ranked_passages:
        chunks = prepare_context_chunks(psg["text"], psg["is_positive"])
        for c in chunks:
            all_chunks.append(c["text"])
            all_labels.append(c["is_positive"])

    return {
        "context_chunks": all_chunks,
        "chunk_labels": all_labels
    }

for split in crag_dataset.keys():
    crag_dataset[split] = crag_dataset[split].map(split_each_passages)
    total_chunks = sum(len(row["context_chunks"]) for row in crag_dataset[split])
    average_chunks = total_chunks / len(crag_dataset[split])
    print(f"Rerata jumlah elemen di 'context_chunks' split {split}: {average_chunks:.2f}")

Map: 100%|██████████| 5120/5120 [00:03<00:00, 1635.50 examples/s]


Rerata jumlah elemen di 'context_chunks' split train: 4.52


Map: 100%|██████████| 565/565 [00:00<00:00, 1898.76 examples/s]


Rerata jumlah elemen di 'context_chunks' split dev: 4.34


Map: 100%|██████████| 565/565 [00:00<00:00, 1795.29 examples/s]

Rerata jumlah elemen di 'context_chunks' split test: 4.41





In [7]:
from transformers import AutoTokenizer, AutoModel
from datasets import DatasetDict
from general_utils import apply_similarity_ranking_to_dataset
embedding_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
embedding_model = AutoModel.from_pretrained('intfloat/multilingual-e5-small')
embedding_model.eval()

for split in crag_dataset.keys():
    crag_dataset[split] = apply_similarity_ranking_to_dataset(
        crag_dataset[split], 
        text_col="context_chunks", 
        label_col="chunk_labels", 
        output_col="ranked_chunks_with_labels", 
        tokenizer=embedding_tokenizer,
        model=embedding_model
    )

Processing ranked_chunks_with_labels: 100%|██████████| 5120/5120 [13:13<00:00,  6.46it/s]
Processing ranked_chunks_with_labels: 100%|██████████| 565/565 [01:24<00:00,  6.71it/s]
Processing ranked_chunks_with_labels: 100%|██████████| 565/565 [01:23<00:00,  6.75it/s]


In [8]:
crag_dataset.push_to_hub('khalidrizki/CRAG-3sentences-chunks')

Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 18.58ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.85s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 24.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 28.33ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.83s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/khalidrizki/CRAG-3sentences-chunks/commit/d3970033c61ef586434db7037c81a274eb0cffcb', commit_message='Upload dataset', commit_description='', oid='d3970033c61ef586434db7037c81a274eb0cffcb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/khalidrizki/CRAG-3sentences-chunks', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khalidrizki/CRAG-3sentences-chunks'), pr_revision=None, pr_num=None)

## Membuat Jawaban
dengan variasi chunks

In [9]:
from general_utils import load_model_and_tokenizer
model_name='Qwen/Qwen3-1.7B'
model, tokenizer, config = load_model_and_tokenizer(model_name)

Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 10.07it/s]


In [24]:
dataset = load_dataset('khalidrizki/CRAG-3sentences-chunks')
test_dataset = dataset['test']

### Jumlah chunks = 1

In [27]:
from general_utils import extract_topk_texts
from functools import partial
k = 1
extract_fn = partial(
    extract_topk_texts, 
    k=k, 
    ranked_units='ranked_chunks_with_labels', 
    returned_units_col="top1_chunks",
    returned_labels_col="top1_labels"
)
test_dataset = test_dataset.map(extract_fn)

Map: 100%|██████████| 565/565 [00:00<00:00, 2571.87 examples/s]


In [37]:
from general_utils import generate_per_row
from metrics import evaluate_substringmatch_f1
from tqdm import tqdm
from datasets import Dataset

top1_generations = []
instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

for row in tqdm(test_dataset, desc="Generating answer using the 1 chunk with most similarity"):
    label = row['answer']
    generated_answer = generate_per_row(
        row=row, 
        query_col='query', 
        ctx_col='top1_chunks', 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )
    sm, f1 = evaluate_substringmatch_f1(generated_answer.strip(), label.strip())
    top1_generations.append({
        'query': row['query'], 
        'passages': row['ranked_chunks_with_labels'], 
        'filtered_chunks': row['top1_chunks'], 
        'label': label, 
        'generated_answer': generated_answer, 
        'em': sm, 
        'f1': f1
        }
    )

CRAG_top1 = Dataset.from_list(top1_generations)
print("HASIL CRAG dengan Top 1")
print("rerata substring match:", sum(CRAG_top1['em'])/len(CRAG_top1))
print("rerata F1:", sum(CRAG_top1['f1'])/len(CRAG_top1))

Generating answer using the 1 chunk with most similarity: 100%|██████████| 565/565 [26:20<00:00,  2.80s/it]

HASIL CRAG dengan Top 1
rerata substring match: 0.4690265486725664
rerata F1: 0.3026155217667255





In [41]:
CRAG_top1.save_to_disk('./outputs/CRAG/top1')

Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 38930.57 examples/s]


### Jumlah chunks = 2

In [38]:
from general_utils import extract_topk_texts
from functools import partial
k = 2
extract2_fn = partial(
    extract_topk_texts, 
    k=k, 
    ranked_units='ranked_chunks_with_labels', 
    returned_units_col="top2_chunks",
    returned_labels_col="top2_labels"
)
test_dataset = test_dataset.map(extract2_fn)

from general_utils import generate_per_row
from metrics import evaluate_substringmatch_f1
from tqdm import tqdm
from datasets import Dataset

top2_generations = []
instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

for row in tqdm(test_dataset, desc="Generating answer using the 2 chunks with most similarity"):
    label = row['answer']
    generated_answer = generate_per_row(
        row=row, 
        query_col='query', 
        ctx_col='top2_chunks', 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )
    sm, f1 = evaluate_substringmatch_f1(generated_answer.strip(), label.strip())
    top2_generations.append({
        'query': row['query'], 
        'passages': row['ranked_chunks_with_labels'], 
        'filtered_chunks': row['top2_chunks'], 
        'label': label, 
        'generated_answer': generated_answer, 
        'em': sm, 
        'f1': f1
        }
    )

CRAG_top2 = Dataset.from_list(top2_generations)
print("HASIL CRAG dengan Top 2")
print("rerata substring match:", sum(CRAG_top2['em'])/len(CRAG_top2))
print("rerata F1:", sum(CRAG_top2['f1'])/len(CRAG_top2))

Map: 100%|██████████| 565/565 [00:00<00:00, 2567.49 examples/s]
Generating answer using the 2 chunks with most similarity: 100%|██████████| 565/565 [27:49<00:00,  2.95s/it]

HASIL CRAG dengan Top 2
rerata substring match: 0.5469026548672566
rerata F1: 0.33058291391801087





In [42]:
CRAG_top2.save_to_disk('./outputs/CRAG/top2')

Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 35326.11 examples/s]


### Jumlah chunks = 3

In [39]:
from general_utils import extract_topk_texts
from functools import partial
k = 3
extract3_fn = partial(
    extract_topk_texts, 
    k=k, 
    ranked_units='ranked_chunks_with_labels', 
    returned_units_col="top3_chunks",
    returned_labels_col="top3_labels"
)
test_dataset = test_dataset.map(extract3_fn)

from general_utils import generate_per_row
from metrics import evaluate_substringmatch_f1
from tqdm import tqdm
from datasets import Dataset

top3_generations = []
instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

for row in tqdm(test_dataset, desc="Generating answer using the 3 chunks with most similarity"):
    label = row['answer']
    generated_answer = generate_per_row(
        row=row, 
        query_col='query', 
        ctx_col='top3_chunks', 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )
    sm, f1 = evaluate_substringmatch_f1(generated_answer.strip(), label.strip())
    top3_generations.append({
        'query': row['query'], 
        'passages': row['ranked_chunks_with_labels'], 
        'filtered_chunks': row['top3_chunks'], 
        'label': label, 
        'generated_answer': generated_answer, 
        'em': sm, 
        'f1': f1
        }
    )

CRAG_top3 = Dataset.from_list(top3_generations)
print("HASIL CRAG dengan Top 3")
print("rerata substring match:", sum(CRAG_top3['em'])/len(CRAG_top3))
print("rerata F1:", sum(CRAG_top3['f1'])/len(CRAG_top3))

Map: 100%|██████████| 565/565 [00:00<00:00, 3272.12 examples/s]
Generating answer using the 3 chunks with most similarity: 100%|██████████| 565/565 [26:16<00:00,  2.79s/it]

HASIL CRAG dengan Top 3
rerata substring match: 0.584070796460177
rerata F1: 0.35286040653377126





In [43]:
CRAG_top3.save_to_disk('./outputs/CRAG/top3')

Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 86907.06 examples/s]


### RAG Normal

In [40]:
from general_utils import extract_topk_texts
from functools import partial
k = 3
extract_all_fn = partial(
    extract_topk_texts, 
    k=k, 
    ranked_units='ranked_truncPassages_with_labels', 
    returned_units_col="sorted_passages",
    returned_labels_col="sortedPassages_labels"
)
test_dataset = test_dataset.map(extract_all_fn)

from general_utils import generate_per_row
from metrics import evaluate_substringmatch_f1
from tqdm import tqdm
from datasets import Dataset

ori_passages_generations = []
instruction = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam satu kalimat. Pertanyaan: {query}"

for row in tqdm(test_dataset, desc="Generating answer using the original passages (not splitted into chunks)"):
    label = row['answer']
    generated_answer = generate_per_row(
        row=row, 
        query_col='query', 
        ctx_col='sorted_passages', 
        tokenizer=tokenizer, 
        model=model, 
        device_type=config.device_type, 
        instruction=instruction
    )
    sm, f1 = evaluate_substringmatch_f1(generated_answer.strip(), label.strip())
    ori_passages_generations.append({
        'query': row['query'], 
        'passages': row['ranked_truncPassages_with_labels'], 
        'filtered_chunks': row['sorted_passages'], 
        'label': label, 
        'generated_answer': generated_answer, 
        'em': sm, 
        'f1': f1
        }
    )

normal_RAG = Dataset.from_list(ori_passages_generations)
print("HASIL RAG normal")
print("rerata substring match:", sum(normal_RAG['em'])/len(normal_RAG))
print("rerata F1:", sum(normal_RAG['f1'])/len(normal_RAG))

Map: 100%|██████████| 565/565 [00:00<00:00, 2399.06 examples/s]
Generating answer using the original passages (not splitted into chunks): 100%|██████████| 565/565 [29:42<00:00,  3.16s/it]

HASIL RAG normal
rerata substring match: 0.6035398230088496
rerata F1: 0.35978795140582226





In [44]:
normal_RAG.save_to_disk('./outputs/CRAG/normal-rag')

Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 75441.93 examples/s]


# Playground

In [14]:
ds = load_dataset('khalidrizki/RECOMP-finetuning-final')
ds

DatasetDict({
    train: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'model_outputs'],
        num_rows: 5120
    })
    dev: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'model_outputs'],
        num_rows: 565
    })
    test: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'model_outputs'],
        num_rows: 565
    })
})