# Load Model, Tokenizer, dan Dataset

In [1]:
from datasets import load_dataset, load_from_disk
dataset = load_from_disk("generated_data/research-raw-dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from general_utils import load_model_and_tokenizer

model_name = "Qwen/Qwen3-1.7B" 
model, tokenizer, config = load_model_and_tokenizer(model_name)

print("BERHASIL MELOAD MODEL DAN DATASET\n")

  from .autonotebook import tqdm as notebook_tqdm


Loading model on cuda with torch.bfloat16


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  9.01it/s]


BERHASIL MELOAD MODEL DAN DATASET



In [3]:
# for split in dataset.keys():
#     dataset[split] = dataset[split].select(range(5))

dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 4542
    })
    dev: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 1143
    })
    test: Dataset({
        features: ['query_id', 'query', 'tydiqa_id', 'answer', 'passages', 'trunc_passages', 'ranked_truncPassages_with_labels'],
        num_rows: 565
    })
})

# RECOMP

## Membuat dataset finetuning untuk melatih kompresor RECOMP

In [4]:
from copy import deepcopy
recomp_draft_dataset = deepcopy(dataset)

Mengekstrak text & is_positive dari ranked passages

In [5]:
from functools import partial
from general_utils import extract_topk_texts
extract_fn = partial(
    extract_topk_texts, 
    ranked_units='ranked_truncPassages_with_labels', 
    returned_units_col='joined_passages', 
    returned_labels_col='passages_label'
)
for split in recomp_draft_dataset.keys():
    recomp_draft_dataset[split] = recomp_draft_dataset[split].map(extract_fn)

In [6]:
from general_utils import generate_per_row
from tqdm import tqdm
model.eval()
instruction = '{context}\n\nRingkaslah teks di atas menjadi tepat 2 kalimat (40 kata) agar menjawab pertanyaan secara mendetail. TANPA PENGANTAR. Pertanyaan: "{query}' 
for split in recomp_draft_dataset.keys():
    current_dataset = recomp_draft_dataset[split]
    decoded_outputs = []
    summaries = []

    for row in tqdm(current_dataset, desc=f"Summarizing split: {split}"):
        try:
            answer = generate_per_row(  # decoded_output, 
                row, 
                'query', 
                'joined_passages', 
                tokenizer, 
                model, 
                config.device_type, 
                instruction
            )
            # decoded_outputs.append(decoded_output)
            summaries.append(answer)
        except Exception as e:
            raise RuntimeError(
                f"Error during summarization for row:\n"
                f"Query: {row['query']}\n"
                f"Context: {row['joined_passages'][:200]}...\n"
                f"Error message: {str(e)}"
            )

    # dataset[split] = current_dataset.add_column("decoded_output", decoded_outputs)
    recomp_draft_dataset[split] = recomp_draft_dataset[split].add_column("summary", summaries)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Summarizing split: train:   0%|          | 3/4542 [00:16<6:58:15,  5.53s/it]


KeyboardInterrupt: 

In [7]:
recomp_draft_dataset.save_to_disk('generated_data/RECOMP_draft')

Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 87498.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 65990.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 38314.36 examples/s]


In [7]:
from datasets import load_from_disk
recomp_draft_dataset = load_from_disk('generated_data/RECOMP_draft')

In [None]:
from metrics import evaluate_substringmatch_f1
from datasets import Dataset, DatasetDict
from tqdm import tqdm
model.eval()

finetuning_dict = {}

instruction_w_summary = "Konteks: {context}\nBerdasarkan konteks sebelumnya, jawab pertanyaan berikut dalam dua kalimat. Pertanyaan: {query}"
instruction_wo_summary= "Jawab pertanyaan berikut dalam dua kalimat. Pertanyaan: {query}"
for split in recomp_draft_dataset.keys():
    current_dataset = recomp_draft_dataset[split]
    results = []
    for row in tqdm(current_dataset, desc=f"Generating responses (w/ & wo/ summary) on split: {split}"):
        label = row['answer']
        completion_w_summary = generate_per_row( # decoded_summ_output, 
            row=row, 
            query_col='query', 
            ctx_col='summary', 
            tokenizer=tokenizer, 
            model=model, 
            device_type=config.device_type, 
            instruction=instruction_w_summary
        )
        completion_wo_summary = generate_per_row(  #decoded_wo_summ_output, 
            row=row, 
            query_col='query', 
            ctx_col=None, 
            tokenizer=tokenizer, 
            model=model, 
            device_type=config.device_type, 
            instruction=instruction_wo_summary
        )

        sm_w, f1_w = evaluate_substringmatch_f1(completion_w_summary.strip(), label.strip())
        sm_wo, f1_wo=evaluate_substringmatch_f1(completion_wo_summary.strip(), label.strip())

        if (sm_wo == 1 and sm_wo > sm_w) or (f1_wo >= f1_w):
            final_summary = ""
        else:
            final_summary = row['summary']
        
        results.append(
            {
                "query": row['query'], 
                "passages":row['ranked_truncPassages_with_labels'], 
                "summary": row['summary'], 
                "final_summary": final_summary, 
                "label": label, 
                "model_outputs": {
                    "w_summary": {
                        "completion": completion_w_summary,
                        "em": sm_w,
                        "f1": f1_w, 
                        # "decoded_output": decoded_summ_output
                    },
                    "wo_summary": {
                        "completion": completion_wo_summary,
                        "em": sm_wo,
                        "f1": f1_wo, 
                        # "decoded_output": decoded_wo_summ_output
                    }
                }
            }
        )
    finetuning_dict[split] = Dataset.from_list(results)
finetuning_dataset = DatasetDict(finetuning_dict)


Generating responses (w/ & wo/ summary) on split: train: 100%|██████████| 4542/4542 [5:54:29<00:00,  4.68s/it]  
Generating responses (w/ & wo/ summary) on split: dev: 100%|██████████| 1143/1143 [1:30:02<00:00,  4.73s/it]
Generating responses (w/ & wo/ summary) on split: test: 100%|██████████| 565/565 [45:39<00:00,  4.85s/it] 


In [20]:
import random
from copy import deepcopy
import pandas as pd

# Membuat salinan dataset
ds = deepcopy(finetuning_dataset)

# Tentukan jumlah baris yang ingin dipindahkan
num_rows_to_move = 578

# Pilih 577 baris acak dari split 'dev'
dev_dataset = ds['dev']

selected_rows = dev_dataset.select(range(num_rows_to_move))  # Ambil 577 baris pertama setelah shuffle

# Hapus 577 baris yang sudah dipilih dari 'dev'
remaining_dev = dev_dataset.select(range(num_rows_to_move, len(dev_dataset)))

# Konversi ke DataFrame pandas untuk dapat menggunakan concat
train_df = ds['train'].to_pandas()
selected_rows_df = selected_rows.to_pandas()

# Gabungkan keduanya dengan pandas.concat
new_train_df = pd.concat([train_df, selected_rows_df], ignore_index=True)

# Kembali ke dataset HuggingFace dari DataFrame
new_train = Dataset.from_pandas(new_train_df)

# Perbarui split train dan dev
ds['train'] = new_train
ds['dev'] = remaining_dev

# Sekarang, finetuning_dataset['train'] sudah berisi 577 baris tambahan, dan finetuning_dataset['dev'] sudah dikurangi.
ds

DatasetDict({
    train: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 5120
    })
    dev: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 565
    })
    test: Dataset({
        features: ['query', 'passages', 'summary', 'final_summary', 'label', 'generated_answer'],
        num_rows: 565
    })
})

In [23]:
ds = ds.rename_column("generated_answer", "model_outputs")

In [None]:
# tanpa kemampuan selektif
!python train_summarizer.py \
  --dataset_name khalidrizki/RECOMP-finetuning-final \
  --text_column joined_passages \
  --query_column query \
  --summary_column summary \
  --model_name_or_path google/flan-t5-base \
  --seed 42 \
  --num_train_epochs 3 \
  --per_device_train_batch_size=4 \
  --gradient_accumulation_steps=2 \
  --per_device_eval_batch_size=32 \
  --learning_rate 1e-5 \
  --max_source_length 1620 
  --resize_position_embeddings True
  --max_target_length 52 \
  --output_dir ./models/RECOMP/ \
  --logging_first_step True \
  --do_train \
  --do_eval \
  --predict_with_generate \
  --save_total_limit 3

# CRAG

In [None]:
crag_dataset = deepcopy(dataset)
crag_dataset

In [None]:
import re
from typing import List, Dict, Any, Tuple
from nltk.tokenize import sent_tokenize

def split_between_title_and_text(text: str) -> Tuple[str, str]:
    """Memisahkan bagian Judul dan Teks dari input lengkap."""
    title, content = text.split("|", 1)
    return title.strip(), content.strip()

def split_sentences(text: str) -> List[str]:
    """Hilangkan sitasi dan pecah teks menjadi kalimat-kalimat."""
    cleaned_text = re.sub(r'\[\d+\]', '', text)
    return [s.strip() for s in sent_tokenize(cleaned_text) if s.strip()]

def create_rolling_segments(title: str, sentences: List[str], window_size: int = 3, stride: int = 2) -> List[str]:
    """Buat rolling window segment dari kalimat-kalimat dengan format 'Title | Kalimat…'."""
    segments = []
    if len(sentences) < window_size:
        segments.append(f"{title} | {' '.join(sentences)}")
    else:
        for i in range(0, len(sentences) - window_size + 1, stride):
            group = sentences[i:i + window_size]
            combined = f"{title} | {' '.join(group)}"
            segments.append(combined)
    return segments

def prepare_context_chunks(text: str, is_positive: bool) -> List[Dict[str, Any]]:
    """Proses satu passage menjadi rolling segments dan labelnya."""
    title, content = split_between_title_and_text(text)
    sentences = split_sentences(content)
    segments = create_rolling_segments(title, sentences, window_size=3)
    return [{"text": seg, "is_positive": is_positive} for seg in segments]

def split_each_passages(example):
    ranked_passages = example["ranked_truncPassages_with_labels"]
    all_chunks = []
    all_labels = []

    for psg in ranked_passages:
        chunks = prepare_context_chunks(psg["text"], psg["is_positive"])
        for c in chunks:
            all_chunks.append(c["text"])
            all_labels.append(c["is_positive"])

    return {
        "context_chunks": all_chunks,
        "chunk_labels": all_labels
    }

for split in crag_dataset.keys():
    crag_dataset[split] = crag_dataset[split].map(split_each_passages)
    total_chunks = sum(len(row["context_chunks"]) for row in crag_dataset[split])
    average_chunks = total_chunks / len(crag_dataset[split])
    print(f"Rerata jumlah elemen di 'context_chunks' split {split}: {average_chunks:.2f}")

In [None]:
from transformers import AutoTokenizer, AutoModel
from datasets import DatasetDict
from general_utils import apply_similarity_ranking_to_dataset
embedding_tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
embedding_model = AutoModel.from_pretrained('intfloat/multilingual-e5-small')
embedding_model.eval()

for split in crag_dataset.keys():
    crag_dataset[split] = apply_similarity_ranking_to_dataset(
        crag_dataset[split], 
        text_col="context_chunks", 
        label_col="chunk_labels", 
        output_col="ranked_chunks_with_labels", 
        tokenizer=embedding_tokenizer,
        model=embedding_model
    )

In [None]:
crag_dataset.save_to_disk('../generated_data/CRAG_dataset_NEW')