# Load Model, Tokenizer, dan Dataset

In [1]:
from datasets import load_from_disk
from utils import load_model_and_tokenizer

# Dataset previously saved to disk; loaded first because it is the cheaper step.
finished_dataset = load_from_disk("./generated_data/mr_tydi_tydiqa_final")

# Checkpoint whose model/tokenizer are passed to the summarization step below.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
model, tokenizer, config = load_model_and_tokenizer(model_name)

Loading model on cuda with torch.float16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Summarizing

In [13]:
import datasets

def format_passages(example, top_k=3):
    """
    Merge an example's positive and negative passages and render the first
    ``top_k`` of them as a single structured string.

    Positive passages are placed before negative ones, so when the combined
    list is longer than ``top_k`` the positives are kept first.

    Args:
        example: Mapping with "positive_passages" and "negative_passages",
            each a list of dicts that carry "title" and "text".
        top_k: Number of leading passages to keep. Defaults to 3, which is
            the value this function used before it became a parameter.

    Returns:
        dict: ``{"formatted_passages": <str>}`` where passages are rendered as
        "Judul: ... / Teks: ..." blocks separated by a blank line. The string
        is empty when there are no passages.

    Raises:
        ValueError: If ``top_k`` is negative (a negative slice bound would
            silently drop passages from the end instead of limiting the count).
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Positives first, then negatives.
    all_passages = example["positive_passages"] + example["negative_passages"]

    # Render only the leading `top_k` passages.
    formatted_passages = [
        f"Judul: {p['title']} \nTeks: {p['text']}" for p in all_passages[:top_k]
    ]

    return {"formatted_passages": "\n\n".join(formatted_passages)}

# Apply the transformation to every split (train, dev, test) with a single
# `.map()` call: `DatasetDict.map` runs the function on each split and returns
# a new DatasetDict, so no manual per-split loop or re-wrapping is needed.
processed_dataset = finished_dataset.map(format_passages)

# Inspect one formatted example from the train split.
print(processed_dataset["train"][0])


Map:   0%|          | 0/4542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1143 [00:00<?, ? examples/s]

Map:   0%|          | 0/565 [00:00<?, ? examples/s]

{'query_id': '0', 'query': 'dimanakah Dr. Ernest François Eugène Douwes Dekker meninggal?', 'positive_passages': [{'docid': '7080#33', 'text': 'Ernest Douwes Dekker wafat dini hari tanggal 28 Agustus 1950 (tertulis di batu nisannya; 29 Agustus 1950 versi van der Veur, 2006) dan dimakamkan di TMP Cikutra, Bandung.', 'title': 'Ernest Douwes Dekker'}], 'negative_passages': [{'docid': '20439#25', 'text': 'Eduard Douwes Dekker kemudian pindah ke Ingelheim am Rhein dekat Sungai Rhein sampai akhirnya meninggal 19 Februari 1887.', 'title': 'Eduard Douwes Dekker'}, {'docid': '7080#0', 'text': 'Dr. Ernest François Eugène Douwes Dekker (umumnya dikenal dengan nama Douwes Dekker atau Danudirja Setiabudi; ) adalah seorang pejuang kemerdekaan dan pahlawan nasional Indonesia.', 'title': 'Ernest Douwes Dekker'}, {'docid': '7080#2', 'text': 'Douwes Dekker terlahir di Pasuruan, Jawa Timur, pada tanggal 8 Oktober 1879, sebagaimana yang dia tulis pada riwayat hidup singkat saat mendaftar di Universitas Zu

In [15]:
# Display the splits with their feature columns and row counts after formatting.
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answers', 'formatted_passages'],
        num_rows: 4542
    })
    dev: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answers', 'formatted_passages'],
        num_rows: 1143
    })
    test: Dataset({
        features: ['query_id', 'query', 'positive_passages', 'negative_passages', 'tydiqa_id', 'answers', 'formatted_passages'],
        num_rows: 565
    })
})

#### Buat sampel kecil untuk testing awal (jika perlu)

In [None]:
import random

SEED = 42
random.seed(SEED)

# Draw a fixed-size random sample from each split. The draws happen in
# train -> dev -> test order so the seeded RNG is consumed in the same sequence.
train_sample, dev_sample, test_sample = (
    random.sample(list(finished_dataset[split_name]), sample_size)
    for split_name, sample_size in (("train", 132), ("dev", 40), ("test", 28))
)

# Concatenate the three samples into one list.
sample_data = [*train_sample, *dev_sample, *test_sample]

print(f"Total samples: {len(sample_data)}")  # Expected: 200 (132 + 40 + 28)

def combine_passages(example):
    """Attach an "all_passages" entry: positive passages followed by negatives.

    The example is modified in place and the same object is returned.
    """
    positives = example["positive_passages"]
    negatives = example["negative_passages"]
    example["all_passages"] = positives + negatives
    return example

# Add the merged passage list to every sampled example.
sample_data = list(map(combine_passages, sample_data))

def format_passages(example):
    """Render the first three entries of example["all_passages"] as one string.

    Each passage contributes a "Judul: <title>" line followed by a
    "Teks: <text>" line; passages are separated by a blank line.

    NOTE(review): this redefines the `format_passages` from the earlier cell
    that is passed to `.map()`. That version returns a dict and reads
    "positive_passages"/"negative_passages"; this one returns a plain string
    and reads "all_passages". Re-running the earlier `.map()` cell after this
    one would pick up this definition instead.
    """
    blocks = []
    for passage in example["all_passages"][:3]:  # keep the first 3 passages
        blocks.append(f"Judul: {passage['title']} \nTeks: {passage['text']}")
    # Join the rendered passages with a blank line between them.
    return "\n\n".join(blocks)

# Store the rendered passage text on each sample under "formatted_passages".
for sample in sample_data:
    sample["formatted_passages"] = format_passages(sample)

Total samples: 200


## Generate Summary & Save

In [17]:
from summarize import summarize_top_5_combined
import time
from datasets import DatasetDict

model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Summarized output of every split, keyed by split name.
processed_splits = {}

# Summarize one split at a time (train, dev, test) and time each run.
for split_name, split_dataset in processed_dataset.items():
    print(f"🔄 Memproses split: {split_name}")

    started_at = time.time()

    # Run the summarizer on this split only, reusing the already-loaded
    # model and tokenizer.
    processed_splits[split_name] = summarize_top_5_combined(
        model_name=model_name,
        dataset=split_dataset,
        query_col="query",
        docs_col="formatted_passages",
        model=model,
        tokenizer=tokenizer,
        batch_size=1,
    )

    elapsed = time.time() - started_at
    print(f"✅ Split {split_name} selesai dalam {elapsed:.2f} detik")

# Re-assemble the per-split results into a DatasetDict.
final_dataset = DatasetDict(processed_splits)

# Destination directory for the combined result.
save_path = "generated_data/draft_summary_dataset"

# Persist all splits in one call.
final_dataset.save_to_disk(save_path)

print(f"✅ DatasetDict telah disimpan di {save_path}")

🔄 Memproses split: train


Summarizing dataset: 100%|██████████| 4542/4542 [5:51:08<00:00,  4.64s/it]   


✅ Split train selesai dalam 21068.54 detik
🔄 Memproses split: dev


Summarizing dataset: 100%|██████████| 1143/1143 [1:26:45<00:00,  4.55s/it]


✅ Split dev selesai dalam 5206.19 detik
🔄 Memproses split: test


Summarizing dataset: 100%|██████████| 565/565 [43:37<00:00,  4.63s/it]


✅ Split test selesai dalam 2617.99 detik


Saving the dataset (0/1 shards):   0%|          | 0/4542 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1143 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/565 [00:00<?, ? examples/s]

✅ DatasetDict telah disimpan di generated_data/draft_summary_dataset


# Answer Generation with Summary

In [None]:
from generate_answer import generate_answers_and_evaluate
from datasets import load_from_disk, DatasetDict, Dataset
from utils import load_model_and_tokenizer
import json
import time

# Reload the summaries written by the previous step and load the model that
# generates the answers.
loaded_dataset = load_from_disk("./generated_data/draft_summary_dataset")
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model, tokenizer, config = load_model_and_tokenizer(model_name)

# Evaluated output of every split, keyed by split name.
processed_splits = {}

# Generate and evaluate answers split by split, timing each run.
for split_name, split_dataset in loaded_dataset.items():
    print(f"🔄 Memproses split: {split_name} dengan model 1B")

    started_at = time.time()
    split_records = generate_answers_and_evaluate(
        dataset=split_dataset,
        model=model,
        tokenizer=tokenizer,
    )
    elapsed = time.time() - started_at

    # Report the wall-clock time spent on this split.
    print(f"✅ Proses selesai dalam {elapsed:.2f} detik")

    # Dataset.from_list turns the returned list of records into a Dataset.
    processed_splits[split_name] = Dataset.from_list(split_records)

# NOTE: this rebinds `processed_dataset`, which earlier cells use for the
# formatted-passages DatasetDict.
processed_dataset = DatasetDict(processed_splits)

save_path = "./generated_data/TUNING_final_summary"
processed_dataset.save_to_disk(save_path)

# Disabled alternative: dump the per-split results to a JSON file.
# NOTE(review): at this point `processed_splits` holds Dataset objects, which
# json.dump presumably cannot serialize — confirm before re-enabling.
# output_file = f"./generated_data/evaluated_summary_result_1B_FOR_TUNING.json"
# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(processed_splits, f, indent=4, ensure_ascii=False)

print(f"📄 Hasil telah disimpan dalam {save_path}")



Loading model on cuda with torch.float16
🔄 Memproses split: train dengan model 1B


Generating responses:   0%|          | 0/4542 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Generating responses: 100%|██████████| 4542/4542 [1:55:24<00:00,  1.52s/it]  


✅ Proses selesai dalam 6924.12 detik
🔄 Memproses split: dev dengan model 1B


Generating responses: 100%|██████████| 1143/1143 [33:26<00:00,  1.76s/it]


✅ Proses selesai dalam 2006.35 detik
🔄 Memproses split: test dengan model 1B


Generating responses: 100%|██████████| 565/565 [14:10<00:00,  1.50s/it]

✅ Proses selesai dalam 850.10 detik





Saving the dataset (0/1 shards):   0%|          | 0/4542 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1143 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/565 [00:00<?, ? examples/s]

📄 Hasil telah disimpan dalam ./generated_data/TUNING_final_summary


In [None]:
# Reload the saved answer-generation results. Both names are bound to the
# same loaded object.
loaded_dataset = load_from_disk("./generated_data/TUNING_final_summary")
dataset_unduh = loaded_dataset


DatasetDict({
    train: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'answer', 'generated_results'],
        num_rows: 4542
    })
    dev: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'answer', 'generated_results'],
        num_rows: 1143
    })
    test: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'answer', 'generated_results'],
        num_rows: 565
    })
})

In [15]:
import json

# Load both JSON result files (1B and 3B); each is parsed into a Python object.
with open(
    "./generated_data/SAMPEL_evaluated_summary_result_1B_latest.json",
    "r",
    encoding="utf-8",
) as results_file_1b:
    dataset_1B = json.load(results_file_1b)

with open(
    "./generated_data/SAMPEL_evaluated_summary_result_3B_latest.json",
    "r",
    encoding="utf-8",
) as results_file_3b:
    dataset_3B = json.load(results_file_3b)

def _queries_superior_without_summary(rows):
    """Return the queries of rows where answering without a summary did better.

    A row qualifies when either:
      1. the no-summary result has EM == 1 and its EM is >= the with-summary
         EM, or
      2. the with-summary F1 is strictly lower than the no-summary F1.

    NOTE(review): rule 1 also counts rows where both variants have EM == 1
    (a tie) — confirm that ties are meant to be included.

    Args:
        rows: Iterable of dicts with a "query" entry and a "generated_results"
            entry holding "w_summary" and "wo_summary" dicts, each of which
            has "em" and "f1" values.

    Returns:
        list: The "query" values of the qualifying rows, in input order.
    """
    superior = []
    for row in rows:
        w_summary = row["generated_results"]["w_summary"]
        wo_summary = row["generated_results"]["wo_summary"]

        # Rule 1 is checked first; the F1 comparison is only evaluated when
        # rule 1 does not hold (short-circuit `or`).
        em_rule = wo_summary["em"] == 1 and wo_summary["em"] >= w_summary["em"]
        if em_rule or w_summary["f1"] < wo_summary["f1"]:
            superior.append(row["query"])
    return superior


# Per-split counts and query lists for each result set.
superior_counts_1B = {}
superior_counts_3B = {}
superior_queries_1B = {}  # queries of rows that are superior without summary (1B)
superior_queries_3B = {}  # queries of rows that are superior without summary (3B)

# The splits of the 1B results drive the loop; the 3B results are expected to
# contain the same split names.
for split in dataset_1B.keys():
    superior_queries_1B[split] = _queries_superior_without_summary(dataset_1B[split])
    superior_queries_3B[split] = _queries_superior_without_summary(dataset_3B[split])

    superior_counts_1B[split] = len(superior_queries_1B[split])
    superior_counts_3B[split] = len(superior_queries_3B[split])

# Print the analysis: counts first, then the matching queries per result set.
print("=== Perbandingan superior_by_having_no_summary ===")
for split in dataset_1B.keys():
    print(f"\n📂 Split: {split}")
    print(f"🔹 Jumlah superior_by_having_no_summary di 1B: {superior_counts_1B[split]}")
    print(f"🔹 Jumlah superior_by_having_no_summary di 3B: {superior_counts_3B[split]}")

    for label, queries in (
        ("1B", superior_queries_1B[split]),
        ("3B", superior_queries_3B[split]),
    ):
        print(f"\n📌 Query dari row yang superior_by_having_no_summary di dataset {label}:")
        for query in queries:
            print(f"   - {query}")

    print("\n" + "=" * 60)


=== Perbandingan superior_by_having_no_summary ===

📂 Split: train
🔹 Jumlah superior_by_having_no_summary di 1B: 4
🔹 Jumlah superior_by_having_no_summary di 3B: 5

📌 Query dari row yang superior_by_having_no_summary di dataset 1B:
   - Perusahaan apa yang membuat Accuracy International Arctic Warfare?
   - darimanakah taekwondo berasal?
   - apakah nama ibukota Argentina?
   - berapakah luas Spanyol ?

📌 Query dari row yang superior_by_having_no_summary di dataset 3B:
   - Perusahaan apa yang membuat Accuracy International Arctic Warfare?
   - darimanakah taekwondo berasal?
   - Siapa ibu Yesus kristus?
   - apakah nama ibukota Argentina?
   - Kapan hari kemerdekaan Kamboja


📂 Split: dev
🔹 Jumlah superior_by_having_no_summary di 1B: 0
🔹 Jumlah superior_by_having_no_summary di 3B: 1

📌 Query dari row yang superior_by_having_no_summary di dataset 1B:

📌 Query dari row yang superior_by_having_no_summary di dataset 3B:
   - Ada berapa bulan dalam tahun Hijriah ?


📂 Split: test
🔹 Jumlah s

### (untuk diisi)