# Load Model, Tokenizer, dan Dataset

In [1]:
# Load ===================================================================================
from datasets import load_dataset
from utils import load_model_and_tokenizer
from transformers import AutoTokenizer

# Model id that is loaded here and later passed as `model=` to the summary / answer generation helpers.
model_name = "Qwen/Qwen3-1.7B" 
# Helper imported from `utils`; `config` is not used by any other cell visible in this notebook.
model, tokenizer, config = load_model_and_tokenizer(model_name)
# Tokenizer of google/flan-t5-base, later passed as `student_tokenizer` to generate_summary_dataset.
student_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# Raw dataset loaded by repo id; its splits are iterated over in the formatting cell.
dataset = load_dataset("khalidrizki/post-retrieval-research_raw-dataset")
print("BERHASIL MELOAD MODEL DAN DATASET\n")

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 17.35it/s]


BERHASIL MELOAD MODEL DAN DATASET



# Formatting

In [2]:

# Formatting ============================================================================================================
import datasets
from functools import partial

def format_passages(example, psgs_col):
    r"""Merge one example's passages into a single text field.

    The strings stored under ``example[psgs_col]`` are concatenated with a
    blank line (``"\n\n"``) between them, and the result is written to the
    ``'formatted_passages'`` key. Any number of passages is accepted.

    Args:
        example: Mutable mapping for one dataset row; it is modified in place.
        psgs_col: Key whose value is the iterable of passage strings.

    Returns:
        The same ``example`` object, now carrying ``'formatted_passages'``.
    """
    separator = "\n\n"
    passages = example[psgs_col]
    example['formatted_passages'] = separator.join(passages)
    return example

# Bind the passages column once so the mapper only needs the example itself.
_format_psgs = partial(format_passages, psgs_col='passages')

# For every split: add `formatted_passages`, then drop the raw `passages` column.
fin_dataset = datasets.DatasetDict()
for split in dataset.keys():
    formatted_split = dataset[split].map(_format_psgs)
    fin_dataset[split] = formatted_split.remove_columns('passages')

print("BERHASIL MEMFORMAT PASSAGES\n")


BERHASIL MEMFORMAT PASSAGES



# Generate Summaries

In [3]:
# Generate draft summary =============================================================================================
from transformers import set_seed
from summarize import generate_summary_dataset
import time
from datasets import DatasetDict

# Fixed seed (42) so repeated runs of this cell are comparable.
set_seed(42)

# Keyword arguments shared by every split; only `dataset` changes per iteration.
summary_kwargs = dict(
    query_col="query",
    psgs_col="formatted_passages",
    model=model,
    teacher_tokenizer=tokenizer,
    student_tokenizer=student_tokenizer,
    batch_size=1,
    temperature=0,  # NOTE(review): presumably deterministic decoding — confirm in summarize.py
    create_truncated_psg_column=True,
)

# Results per split, keyed by split name.
processed_splits = {}

# Summarize each split (train, dev, test) on its own and report the wall-clock time.
for split in fin_dataset.keys():
    print(f"🔄 Memproses split: {split}")

    start = time.time()
    processed_splits[split] = generate_summary_dataset(
        dataset=fin_dataset[split],
        **summary_kwargs,
    )
    duration = time.time() - start

    print(f"✅ Split {split} selesai dalam {duration:.2f} detik")

# Reassemble the per-split results into a single DatasetDict.
dataset_with_draft_summary = DatasetDict(processed_splits)
print("BERHASIL MEMBUAT DRAFT SUMMARY\n")

🔄 Memproses split: train


Summarizing dataset:   0%|          | 0/4542 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Summarizing dataset: 100%|██████████| 4542/4542 [4:27:19<00:00,  3.53s/it]  


✅ Split train selesai dalam 16040.18 detik
🔄 Memproses split: dev


Summarizing dataset: 100%|██████████| 1143/1143 [1:07:38<00:00,  3.55s/it]


✅ Split dev selesai dalam 4059.40 detik
🔄 Memproses split: test


Summarizing dataset: 100%|██████████| 565/565 [33:12<00:00,  3.53s/it]


✅ Split test selesai dalam 1993.46 detik
BERHASIL MEMBUAT DRAFT SUMMARY



In [4]:
# Output directory for the draft-summary dataset
save_path = "./generated_data/draft_summary_dataset"

# Persist the recombined DatasetDict (all splits) to disk
dataset_with_draft_summary.save_to_disk(save_path)

Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 69823.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 47283.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 28881.46 examples/s]


# Answer Generation with Summary

Membuat dataset latihan (tuning) dengan aturan berikut untuk setiap baris:

1. Jika jawaban dengan rangkuman lebih baik (EM lebih besar atau F1 lebih besar), maka summary disimpan pada kolom `final_summary`.
2. Jika jawaban tanpa rangkuman lebih baik, maka string kosong (`""`) disimpan pada kolom `final_summary`.

Baris yang memenuhi kondisi 2 dapat dilihat melalui `EDA-tydiqa.ipynb` (sel-sel terakhir).

In [7]:
# Generate final summary for Fine-Tuning ==================================================================================
from generate_answer import generate_answers_and_compare_between_with_and_without_summary
from datasets import DatasetDict, Dataset
import time

# Keyword arguments shared by every split; only `dataset` changes per iteration.
comparison_kwargs = dict(
    passages_column='truncated_passages',
    query_column='query',
    label_column='answer',
    summary_column='summary',
    model=model,
    tokenizer=tokenizer,
)

# Results per split, keyed by split name.
processed_splits = {}

# Answer every query with and without its summary, one split at a time.
for split in dataset_with_draft_summary.keys():
    print(f"🔄 Memproses split: {split}")

    start_time = time.time()
    compared_rows = generate_answers_and_compare_between_with_and_without_summary(
        dataset=dataset_with_draft_summary[split],
        **comparison_kwargs,
    )
    elapsed = time.time() - start_time

    # Report the wall-clock time of this split.
    print(f"✅ Proses selesai dalam {elapsed:.2f} detik")
    # The helper's result is handed to Dataset.from_list to rebuild a Dataset.
    processed_splits[split] = Dataset.from_list(compared_rows)

# Recombine the splits and persist the tuning dataset.
fin_dataset = DatasetDict(processed_splits)
save_path = "./generated_data/RECOMP-tuning-seeded-truncated"
fin_dataset.save_to_disk(save_path)

print(f"📄 Hasil telah disimpan dalam {save_path}")

🔄 Memproses split: train


Generating responses (w/ & wo/ summary): 100%|██████████| 4542/4542 [7:53:33<00:00,  6.26s/it]  


✅ Proses selesai dalam 28413.48 detik
🔄 Memproses split: dev


Generating responses (w/ & wo/ summary): 100%|██████████| 1143/1143 [1:59:56<00:00,  6.30s/it] 


✅ Proses selesai dalam 7196.87 detik
🔄 Memproses split: test


Generating responses (w/ & wo/ summary): 100%|██████████| 565/565 [1:00:55<00:00,  6.47s/it]


✅ Proses selesai dalam 3655.68 detik


Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 164068.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 111167.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 71692.08 examples/s]

📄 Hasil telah disimpan dalam ./generated_data/RECOMP-tuning-seeded-truncated





In [1]:
# Train the summarizer WITHOUT selection ability: targets come from the `summary` column
# (--summary_column summary), fine-tuning google/flan-t5-base for 3 epochs with seed 42.
# NOTE(review): the other training cell also passes --output_dir ./models/ — confirm the two runs
# do not overwrite or resume from each other's checkpoints.
!python train_summarizer.py --dataset_name khalidrizki/RECOMP-tuning-truncated --text_column passages --query_column query --summary_column summary --model_name_or_path google/flan-t5-base --seed 42 --num_train_epochs 3 --per_device_train_batch_size=4 --gradient_accumulation_steps=2 --per_device_eval_batch_size=32 --learning_rate 1e-5 --max_target_length 52 --output_dir ./models/ --logging_first_step True --do_train --do_eval --predict_with_generate  --save_total_limit 3

06/12/2025 22:39:05 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_s

Overwrite dataset info from restored data version if exists.
Loading Dataset info from C:\Users\LENOVO\.cache\huggingface\datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
Found cached dataset recomp-tuning-truncated (C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05)
Loading Dataset info from C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
[INFO|configuration_utils.py:693] 2025-06-12 22:39:16,213 >> loading configuration file config.json from cache at C:\Users\LENOVO\.cache\huggingface\hub\models--google--flan-t5-base\snapshots\7bcac572ce56db69c1ea7c8af255c5d7c9672fc2\config.json
[INFO|configuration_utils.py:765] 2025-06-12 22:39:16,223 >> Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2

In [2]:
# Train the summarizer WITH selection ability: targets come from the `final_summary` column
# (--summary_column final_summary); every other flag matches the unselective training cell.
# NOTE(review): shares --output_dir ./models/ with the unselective training cell — confirm the two
# runs do not overwrite or resume from each other's checkpoints.
!python train_summarizer.py --dataset_name khalidrizki/RECOMP-tuning-truncated --text_column passages --query_column query --summary_column final_summary --model_name_or_path google/flan-t5-base --seed 42 --num_train_epochs 3 --per_device_train_batch_size=4 --gradient_accumulation_steps=2 --per_device_eval_batch_size=32 --learning_rate 1e-5 --max_target_length 52 --output_dir ./models/ --logging_first_step True --do_train --do_eval --predict_with_generate  --save_total_limit 3

06/13/2025 04:33:53 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_s

Overwrite dataset info from restored data version if exists.
Loading Dataset info from C:\Users\LENOVO\.cache\huggingface\datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
Found cached dataset recomp-tuning-truncated (C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05)
Loading Dataset info from C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
[INFO|configuration_utils.py:693] 2025-06-13 04:34:07,105 >> loading configuration file config.json from cache at C:\Users\LENOVO\.cache\huggingface\hub\models--google--flan-t5-base\snapshots\7bcac572ce56db69c1ea7c8af255c5d7c9672fc2\config.json
[INFO|configuration_utils.py:765] 2025-06-13 04:34:07,113 >> Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2

In [5]:
# Predict-only run (--do_predict) of the khalidrizki/RECOMP-unselective-final checkpoint,
# with --summary_column final_summary.
# NOTE(review): the selective prediction cell passes the same --output_dir
# ./outputs/FINAL-seeded-truncated — confirm this run's outputs are not overwritten.
!python train_summarizer.py --model_name_or_path khalidrizki/RECOMP-unselective-final --do_predict --dataset_name khalidrizki/RECOMP-tuning-truncated --max_target_length 52 --output_dir ./outputs/FINAL-seeded-truncated --per_device_eval_batch_size=32 --predict_with_generate --text_column passages --query_column query --summary_column final_summary

06/13/2025 10:51:32 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=True,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_

Overwrite dataset info from restored data version if exists.
Loading Dataset info from C:\Users\LENOVO\.cache\huggingface\datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
Found cached dataset recomp-tuning-truncated (C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05)
Loading Dataset info from C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
[INFO|configuration_utils.py:693] 2025-06-13 10:51:46,108 >> loading configuration file config.json from cache at C:\Users\LENOVO\.cache\huggingface\hub\models--khalidrizki--RECOMP-unselective-final\snapshots\2d73052058b035241c6fcb4d323b1a61444b17b4\config.json
[INFO|configuration_utils.py:765] 2025-06-13 10:51:46,116 >> Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout":

In [7]:
# Predict-only run (--do_predict) of the khalidrizki/RECOMP-selective-final checkpoint,
# with --summary_column final_summary.
# NOTE(review): the unselective prediction cell passes the same --output_dir
# ./outputs/FINAL-seeded-truncated — confirm that run's outputs were saved elsewhere first.
!python train_summarizer.py --model_name_or_path khalidrizki/RECOMP-selective-final --do_predict --dataset_name khalidrizki/RECOMP-tuning-truncated --max_target_length 52 --output_dir ./outputs/FINAL-seeded-truncated --per_device_eval_batch_size=32 --predict_with_generate --text_column passages --query_column query --summary_column final_summary

06/13/2025 11:23:00 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=True,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_

Overwrite dataset info from restored data version if exists.
Loading Dataset info from C:\Users\LENOVO\.cache\huggingface\datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
Found cached dataset recomp-tuning-truncated (C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05)
Loading Dataset info from C:/Users/LENOVO/.cache/huggingface/datasets/khalidrizki___recomp-tuning-truncated/default/0.0.0/0af0dd211ef16e1ecae2ee0d4b857744a1ad1e05
[INFO|configuration_utils.py:693] 2025-06-13 11:23:11,859 >> loading configuration file config.json from cache at C:\Users\LENOVO\.cache\huggingface\hub\models--khalidrizki--RECOMP-selective-final\snapshots\08bd03c085b373fc01cc01cc7b768d558c4e982b\config.json
[INFO|configuration_utils.py:765] 2025-06-13 11:23:11,867 >> Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0