# Load Model, Tokenizer, dan Dataset

In [1]:
from datasets import load_from_disk

# Location of the raw dataset, resolved against the current working directory.
raw_dataset_path = "../generated_data/raw/fin_dataset"
dataset = load_from_disk(raw_dataset_path)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from utils import load_model_and_tokenizer

# Model identifier passed to the project's loader.
model_name = "Qwen/Qwen3-1.7B"

# The loader's result is unpacked into exactly three values.
loaded = load_model_and_tokenizer(model_name)
model, tokenizer, config = loaded

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.53it/s]


# Formatting

In [3]:
from functools import partial

import datasets
from utils import load_model_and_tokenizer
from datasets import load_from_disk

# NOTE(review): this repeats the dataset and model loading already done in the
# two cells above, presumably so this cell can be run on a fresh kernel by
# itself — confirm, and drop one of the copies if that is not needed.
raw_dataset_path = "../generated_data/raw/fin_dataset"
dataset = load_from_disk(raw_dataset_path)

# Model identifier passed to the project's loader.
model_name = "Qwen/Qwen3-1.7B"
loaded = load_model_and_tokenizer(model_name)
model, tokenizer, config = loaded

def format_passages(example, psgs_col):
    r"""Concatenate the passages of one example into a single string.

    The items of ``example[psgs_col]`` are joined with a blank line
    (``"\n\n"``) between them and the result is stored under the
    ``'formatted_passages'`` key of ``example``.

    Args:
        example: Mapping for a single row; it is modified in place.
        psgs_col: Key in ``example`` holding the iterable of passage strings.

    Returns:
        The same ``example`` object, now carrying ``'formatted_passages'``.
    """
    separator = "\n\n"
    passages = example[psgs_col]
    example['formatted_passages'] = separator.join(passages)
    return example

# Bind the passages column once so the mapper only needs the example itself.
_format_psgs = partial(format_passages, psgs_col='passages')

# Run the formatter over every split and collect the results, keyed by split
# name, in a DatasetDict.
processed_dataset = datasets.DatasetDict({
    split_name: split_data.map(_format_psgs)
    for split_name, split_data in dataset.items()
})


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 19.56it/s]


In [None]:
# import torch

# # Set print options to display every element of a tensor
# torch.set_printoptions(threshold=torch.inf)


In [22]:
# Display the splits with their feature names and row counts after formatting.
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'formatted_passages'],
        num_rows: 4542
    })
    dev: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'formatted_passages'],
        num_rows: 1143
    })
    test: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'formatted_passages'],
        num_rows: 565
    })
})

## Generate Summary & Save

In [4]:
from pathlib import Path
import time

from summarize import generate_summary_dataset
from datasets import DatasetDict, load_from_disk

# Where the combined DatasetDict is written once every split is done.
save_path = "generated_data/draft_summary_dataset"

# Each finished split is also written here right away, so hours of generation
# are not lost if a later split fails. This lives next to (not inside)
# save_path so the final save never writes into a directory that a reloaded
# checkpoint is being read from.
checkpoint_root = Path(f"{save_path}_checkpoints")

# Opt-in resume: set to True to reuse checkpoints left by an earlier run
# instead of regenerating those splits. Keep it False after changing the
# model, the prompt, or the input data, otherwise stale summaries are reused.
reuse_checkpoints = False

# Summarised result for each split, keyed by split name.
processed_splits = {}

# Handle the splits (train, dev, test) one at a time.
for split in processed_dataset.keys():
    split_checkpoint = checkpoint_root / split

    if reuse_checkpoints and split_checkpoint.exists():
        print(f"⏭️ Split {split} dimuat dari checkpoint: {split_checkpoint}")
        processed_splits[split] = load_from_disk(str(split_checkpoint))
        continue

    print(f"🔄 Memproses split: {split}")

    # perf_counter is monotonic, so the measured duration is not distorted by
    # system clock adjustments during a multi-hour run.
    start = time.perf_counter()

    # Summarise this split only.
    processed_split = generate_summary_dataset(
        dataset=processed_dataset[split],
        query_col="query",
        psgs_col="formatted_passages",
        model=model,
        tokenizer=tokenizer,
        batch_size=1
    )

    duration = time.perf_counter() - start
    print(f"✅ Split {split} selesai dalam {duration:.2f} detik")

    # Persist immediately. This relies on the result exposing save_to_disk,
    # which the DatasetDict save at the end of the cell requires as well.
    processed_split.save_to_disk(str(split_checkpoint))

    processed_splits[split] = processed_split

# Recombine the per-split results and write the final dataset.
final_dataset = DatasetDict(processed_splits)
final_dataset.save_to_disk(save_path)

🔄 Memproses split: train


Summarizing dataset:   0%|          | 0/4542 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Summarizing dataset: 100%|██████████| 4542/4542 [5:09:44<00:00,  4.09s/it]  


✅ Split train selesai dalam 18585.36 detik
🔄 Memproses split: dev


Summarizing dataset: 100%|██████████| 1143/1143 [1:17:18<00:00,  4.06s/it]


✅ Split dev selesai dalam 4638.98 detik
🔄 Memproses split: test


Summarizing dataset: 100%|██████████| 565/565 [37:41<00:00,  4.00s/it]


✅ Split test selesai dalam 2261.74 detik


Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 121556.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 110557.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 52817.92 examples/s]


# Answer Generation with Summary

In [1]:
import json
import time

from generate_answer import generate_answers_and_evaluate
from datasets import load_from_disk, DatasetDict, Dataset
from utils import load_model_and_tokenizer

# Same location the summarisation cell above saves its DatasetDict to.
summary_dataset_path = "./generated_data/draft_summary_dataset"
loaded_dataset = load_from_disk(summary_dataset_path)

# Model identifier passed to the project's loader.
model_name = "Qwen/Qwen3-1.7B"
loaded = load_model_and_tokenizer(model_name)
model, tokenizer, config = loaded

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 16.42it/s]


Membuat dataset latihan (tuning) dengan aturan berikut:

1. Jika jawaban dengan rangkuman lebih baik (EM lebih besar atau F1 lebih besar), rangkuman (summary) disimpan di kolom `final_summary`.
2. Jika jawaban tanpa rangkuman lebih baik, kolom `final_summary` diisi dengan string kosong (`""`).

Baris yang memenuhi kondisi 2 dapat dilihat di `EDA-tydiqa.ipynb` (cell-cell terakhir).

In [None]:
from pathlib import Path

# Where the combined tuning DatasetDict is written once every split is done.
save_path = "./generated_data/RECOMP_tuning"

# Each finished split is also written here right away, so hours of generation
# are not lost if a later split fails. This lives next to (not inside)
# save_path so the final save never writes into a directory that a reloaded
# checkpoint is being read from.
checkpoint_root = Path(f"{save_path}_checkpoints")

# Opt-in resume: set to True to reuse checkpoints left by an earlier run
# instead of regenerating those splits. Keep it False after changing the
# model, the prompt, or the input data, otherwise stale results are reused.
reuse_checkpoints = False

# Evaluated result for each split, keyed by split name.
processed_splits = {}

# Generate and evaluate answers one split at a time.
for split in loaded_dataset.keys():
    split_checkpoint = checkpoint_root / split

    if reuse_checkpoints and split_checkpoint.exists():
        print(f"⏭️ Split {split} dimuat dari checkpoint: {split_checkpoint}")
        processed_splits[split] = load_from_disk(str(split_checkpoint))
        continue

    print(f"🔄 Memproses split: {split}")

    # perf_counter is monotonic, so the measured duration is not distorted by
    # system clock adjustments during a multi-hour run.
    start_time = time.perf_counter()
    processed_split = generate_answers_and_evaluate(
        dataset=loaded_dataset[split],
        model=model,
        tokenizer=tokenizer
    )
    elapsed = time.perf_counter() - start_time

    # Report the execution time for this split.
    print(f"✅ Proses selesai dalam {elapsed:.2f} detik")

    # Convert the returned rows to a Dataset and persist them immediately.
    split_dataset = Dataset.from_list(processed_split)
    split_dataset.save_to_disk(str(split_checkpoint))
    processed_splits[split] = split_dataset

# Recombine the per-split results and write the final dataset.
processed_dataset = DatasetDict(processed_splits)
processed_dataset.save_to_disk(save_path)

print(f"📄 Hasil telah disimpan dalam {save_path}")

🔄 Memproses split: train


Generating responses:   0%|          | 0/4542 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating responses: 100%|██████████| 4542/4542 [7:54:47<00:00,  6.27s/it]  


✅ Proses selesai dalam 28487.57 detik
🔄 Memproses split: dev


Generating responses: 100%|██████████| 1143/1143 [1:59:48<00:00,  6.29s/it] 


✅ Proses selesai dalam 7188.78 detik
🔄 Memproses split: test


Generating responses: 100%|██████████| 565/565 [58:07<00:00,  6.17s/it] 


✅ Proses selesai dalam 3487.64 detik


Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 130544.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 93871.07 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 71981.71 examples/s]

📄 Hasil telah disimpan dalam ./generated_data/RECOMP_tuning



