In [1]:
from preprocessing import XRAG_TOKEN, _concat_messages_qwen
from datasets import load_from_disk, load_dataset
import torch
from transformers import AutoTokenizer, set_seed
import sys
sys.path.append('..')
from model.E5Retriever import E5Retriever
from model.xQwen3 import XQwen3ForCausalLM
from functools import partial
from tokenizers import AddedToken

# Fix the random seed (transformers' set_seed) before anything else runs.
set_seed(42)

# Evaluation configuration: base LLM, trained projector checkpoint, and retriever.
model_name_or_path = "Qwen/Qwen3-1.7B"
projector_path = "../output/finetuned/2025-07-19_projector_learningrate_1e-3/projector_checkpoints/epoch_10/projector.pth"
retriever_name_or_path = 'intfloat/multilingual-e5-small'
# Left padding is requested for the LLM tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side='left'
)

# Add the xRAG token to the tokenizer's token list (because we inject the projector into Qwen3).
num_added_tokens = 0
num_added_tokens += tokenizer.add_tokens([AddedToken(XRAG_TOKEN,lstrip=False,rstrip=False)])
xrag_token_id = tokenizer.convert_tokens_to_ids(XRAG_TOKEN)

# None evaluates the full split; an integer keeps only the first N rows.
max_test_samples = None
device = torch.device("cuda:0")

if retriever_name_or_path.lower().startswith('intfloat/multilingual-e5'):
    retriever = E5Retriever(retriever_name_or_path)
    retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_name_or_path)
else:
    # Without this branch `retriever` would be unbound and the next line would
    # fail with an unhelpful NameError.
    raise ValueError(
        f"Unsupported retriever '{retriever_name_or_path}': "
        "only 'intfloat/multilingual-e5*' checkpoints are handled by this notebook."
    )
retriever_hidden_size = retriever.get_embed_dim()
retriever.eval()
retriever = retriever.to(device)

def load_xrag_dataset(test_path, test_split, background_col, retriever_name):
    """Load one split of the evaluation data and prefix passages for E5 retrievers.

    Args:
        test_path: Dataset location. Paths starting with ``'khalidrizki'`` are
            passed to ``load_dataset``; anything else goes to ``load_from_disk``.
        test_split: Key of the split to take from the loaded dataset dict.
        background_col: Column holding, per example, a list of passage strings.
        retriever_name: Retriever identifier, or ``None``. When it starts with
            ``'intfloat/multilingual-e5'`` (case-insensitive) every passage in
            ``background_col`` gets the ``"passage: "`` prefix.

    Returns:
        The selected split, with prefixed passages when an E5 retriever is named.
    """
    loader = load_dataset if test_path.startswith('khalidrizki') else load_from_disk
    split_data = loader(test_path)[test_split]

    is_e5_retriever = (
        retriever_name is not None
        and retriever_name.lower().startswith('intfloat/multilingual-e5')
    )
    if not is_e5_retriever:
        return split_data

    def add_passage_prefix(example):
        # Rebuild the passage list with the "passage: " marker in front of each entry.
        prefixed = []
        for passage in example[background_col]:
            prefixed.append("passage: " + passage)
        example[background_col] = prefixed
        return example

    return split_data.map(add_passage_prefix)

# Load the test split. `retriever_name_or_path` is the value configured at the top
# of this cell; it is deliberately NOT re-assigned here, so the retriever that was
# loaded and the passage-prefix decision can never drift apart.
test_data = load_xrag_dataset(
    test_path="khalidrizki/postretrieve-raw-dataset-v2",
    test_split='test',
    background_col="sorted_truncPassages",
    retriever_name=retriever_name_or_path
)

def str_format(row, background_col, query_col):
    """Build the user prompt text for one example.

    The background slot holds one ``XRAG_TOKEN`` per entry of
    ``row[background_col]``, joined by single spaces; the query slot holds
    ``row[query_col]`` with surrounding whitespace stripped.

    Returns:
        The filled-in prompt string.
    """
    placeholder_count = len(row[background_col])
    placeholders = " ".join(XRAG_TOKEN for _ in range(placeholder_count))
    question = row[query_col].strip()
    return "Rujuklah latar belakang: {background} Pertanyaan: {query}".format(
        background=placeholders,
        query=question,
    )

def prepare_prompt(row, tokenizer, query_col, background_col):
    """Turn one dataset row into a chat-formatted prompt.

    The text from ``str_format`` becomes a single user message, which is rendered
    by ``_concat_messages_qwen`` with ``add_generation_prompt=True``.

    Returns:
        ``{'prompt': <rendered prompt>}`` so the function can be used with
        ``Dataset.map``.
    """
    user_text = str_format(row, background_col, query_col)
    chat = [{'role': 'user', 'content': user_text}]
    rendered = _concat_messages_qwen(chat, tokenizer, add_generation_prompt=True)
    return {'prompt': rendered}

# Optionally keep only the first `max_test_samples` rows (None = whole split).
if max_test_samples is not None:
    # Clamp the limit: asking `select` for indices beyond the split size is an error.
    test_data = test_data.select(range(min(max_test_samples, len(test_data))))
# Bind the tokenizer and column names once, then add a 'prompt' column to every row.
prep_prompt = partial(prepare_prompt, tokenizer=tokenizer, query_col='query', background_col='sorted_truncPassages')
formatted_test_data = test_data.map(prep_prompt)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils import get_retrieval_embeds

@torch.no_grad()
def compute_retrieval_embeds_per_example(example, background_col):
    """Embed every passage of one example with the notebook-level retriever.

    Relies on the notebook globals ``retriever``, ``retriever_tokenizer`` and
    ``device`` defined in the setup cell.

    Args:
        example: One dataset row.
        background_col: Column of ``example`` holding a list of passage strings.

    Returns:
        ``{"retrieval_embeds": [...]}`` with one plain Python list per passage,
        so the result can be stored by ``Dataset.map``.
    """
    passages = example[background_col]

    tokenized = retriever_tokenizer(
        passages,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    # Use the configured `device` (the one the retriever was moved to) instead of a
    # hard-coded "cuda", so inputs and retriever always sit on the same device.
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)

    # Embeddings come back as one row per passage (noted by the original author as
    # [num_passages, dim]); move them to the CPU before serialising.
    embeds = get_retrieval_embeds(
        retriever=retriever,
        input_ids=input_ids,
        attention_mask=attention_mask,
    ).cpu()

    # Convert each vector to a plain list for serialization.
    embed_list = [embed.tolist() for embed in embeds]

    return {"retrieval_embeds": embed_list}

# Announce which retriever is used for the passage embeddings.
print(f"Preparing document embeddings with {retriever_name_or_path}...")

# Bind the passage column so the function takes a single example, as Dataset.map expects.
compute_retrieval_embeds = partial(
    compute_retrieval_embeds_per_example, 
    background_col = 'sorted_truncPassages'
)

# Add a 'retrieval_embeds' column, one example at a time.
formatted_test_data = formatted_test_data.map(
    compute_retrieval_embeds,
    batched=False
)
# Drop the retriever's `model` attribute now that the embeddings are stored.
# `retriever` itself stays bound (a later cell checks `retriever is not None`).
# NOTE(review): re-running this cell without re-creating the retriever will
# presumably fail because `retriever.model` no longer exists — confirm.
del retriever.model

Preparing document embeddings with intfloat/multilingual-e5-small...


## Score the generated answers

### Dengan model lengkap

In [3]:
from transformers import AutoConfig
from model.xQwen3 import XQwen3ForCausalLM, XQwen3Config

# Build the xRAG-augmented Qwen3. The projector's input width is taken from the
# retriever (`retriever_hidden_size`, read in the setup cell) instead of repeating
# the literal 384, so swapping the retriever cannot leave a stale size behind.
# (A plain base model could be loaded via AutoConfig / config.architectures instead.)
config = XQwen3Config.from_pretrained(model_name_or_path, retriever_hidden_size=retriever_hidden_size)
model = XQwen3ForCausalLM.from_pretrained(  # XLlamaForCausalLM
    model_name_or_path,
    config=config,
    torch_dtype = 'bfloat16'
).to(device)

# Load the trained projector weights on top of the base checkpoint.
# - map_location="cpu": read the file independently of the device it was saved from;
#   load_state_dict copies the values into the model's own parameters.
# - weights_only=True: restrict unpickling to tensors/containers (safe loading).
# - strict=False: presumably the file holds only projector weights, so the base-model
#   keys are expected to be "missing".
projector_state = torch.load(projector_path, map_location="cpu", weights_only=True)
load_result = model.load_state_dict(projector_state, strict=False)

# strict=False also hides real mismatches. Fail loudly if the checkpoint did not
# actually fill the projector, otherwise evaluation would silently run with the
# randomly initialised 'projector.*' weights reported by from_pretrained.
unloaded_projector_keys = [key for key in load_result.missing_keys if key.startswith("projector.")]
if load_result.unexpected_keys or unloaded_projector_keys:
    raise RuntimeError(
        f"Projector checkpoint '{projector_path}' does not match the model: "
        f"unexpected keys={list(load_result.unexpected_keys)}, "
        f"projector keys left uninitialised={unloaded_projector_keys}"
    )
model.eval()

if retriever is not None:
    # The xRAG placeholder must be known to the tokenizer before its id is registered.
    assert XRAG_TOKEN in tokenizer.get_vocab()
    model.set_xrag_token_id(xrag_token_id)
    if num_added_tokens > 0:
        # Grow the embedding matrix to cover the newly added token(s).
        model.resize_token_embeddings(len(tokenizer))

from utils import stop_sequences_criteria
from tqdm import tqdm
import torch

def llm_for_open_generation_dataset(
    llm,
    llm_tokenizer,
    formatted_dataset,
    batch_size=2,
    enable_progress_bar=True,
):
    """Greedy-decode one completion per row of ``formatted_dataset``.

    Args:
        llm: Model exposing ``.device`` and ``.generate(...)``.
        llm_tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        formatted_dataset: Dataset with a ``"prompt"`` column and, optionally, a
            ``"retrieval_embeds"`` column (per row: a list of embedding vectors).
        batch_size: Number of rows per ``generate`` call; must be >= 1.
        enable_progress_bar: Show a tqdm bar (one tick per batch) when True.

    Returns:
        List of decoded, whitespace-stripped strings in dataset order. The text is
        whatever ``llm.generate`` returns, decoded with special tokens skipped.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    full_completion = []

    total_test_number = len(formatted_dataset)
    device = llm.device

    # Iterate over the bar itself so it advances exactly once per batch. The previous
    # code sized the bar by the number of batches but advanced it by `batch_size`,
    # which overshoots the total for any batch_size > 1.
    batch_starts = tqdm(
        range(0, total_test_number, batch_size),
        ncols=60,
        disable=not enable_progress_bar,
    )

    for start_idx in batch_starts:
        stop_idx = min(start_idx + batch_size, total_test_number)
        batch = formatted_dataset.select(range(start_idx, stop_idx))

        prompts = batch["prompt"]
        tokenized_prompt = llm_tokenizer(
            prompts,
            padding='longest',
            return_tensors='pt',
            truncation=True,
            max_length=512,
        )

        input_ids = tokenized_prompt.input_ids.to(device)
        attention_mask = tokenized_prompt.attention_mask.to(device)

        retrieval_kwargs = {}
        if "retrieval_embeds" in batch.column_names:
            # Flatten the per-row lists of vectors into one stacked tensor holding
            # every passage embedding of the batch, in row order.
            embeds = [
                torch.tensor(vec, dtype=torch.float32)
                for sublist in batch["retrieval_embeds"]
                for vec in sublist
            ]
            retrieval_kwargs["retrieval_embeds"] = torch.stack(embeds).to(device)

        # No stopping criteria are passed to generate (they were built but never
        # used before); the output length is bounded by max_new_tokens.
        with torch.no_grad():
            generated_output = llm.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=False,
                max_new_tokens=52,
                pad_token_id=llm_tokenizer.pad_token_id,
                **retrieval_kwargs,
            )

        output = llm_tokenizer.batch_decode(generated_output, skip_special_tokens=True)
        full_completion.extend(text.strip() for text in output)

    return full_completion

# Number of prompts sent to `generate` per call.
eval_batch_size = 1

# Run generation over the whole formatted test set; the result is a list with one
# decoded string per row, in dataset order.
prompt_and_results = llm_for_open_generation_dataset(
    llm=model,
    llm_tokenizer=tokenizer,
    formatted_dataset=formatted_test_data,
    batch_size=eval_batch_size,
    enable_progress_bar=True
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 11.89it/s]
Some weights of XQwen3ForCausalLM were not initialized from the model checkpoint at Qwen/Qwen3-1.7B and are newly initialized: ['projector.projector.0.bias', 'projector.projector.0.weight', 'projector.projector.2.bias', 'projector.projector.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(projector_path), strict=False)
100%|█████████████████████| 565/565 [20:20<00:00,  2.16s/it]


In [4]:
import sys
sys.path.append('../..')
from metrics import evaluate_substringmatch_f1
from datasets import Dataset
# Score every generation against its gold label and keep one record per example.
if len(prompt_and_results) != len(formatted_test_data):
    raise ValueError(
        f"Got {len(prompt_and_results)} generations for {len(formatted_test_data)} test rows"
    )
if not prompt_and_results:
    raise ValueError("No generations to score")

xrag_generations = []
# Walk the dataset row by row. Indexing `formatted_test_data[col][i]` inside the loop
# can re-read the entire column on every iteration (depending on the `datasets`
# version), which is especially costly for the embedding column.
for generation, row in zip(prompt_and_results, formatted_test_data):
    sm, f1 = evaluate_substringmatch_f1(generation.strip(), row['label'].strip())
    xrag_generations.append({
        'query': row['query'],
        'passages': row['ranked_truncPassages_with_labels'],
        'retrieval_embeds': row['retrieval_embeds'],
        'label': row['label'],
        'generated_answer': generation,
        'substring_match': sm,
        'f1': f1
    })

xrag_result = Dataset.from_list(xrag_generations)
# Report the mean of both metrics over the scored examples.
print("HASIL xRAG")
print("rerata substring match:", sum(xrag_result['substring_match'])/len(xrag_result))
print("rerata F1:", sum(xrag_result['f1'])/len(xrag_result))

HASIL xRAG
rerata substring match: 0.0831858407079646
rerata F1: 0.0963426216019985


In [5]:
# Persist the per-example generations and scores so they can be reloaded later
# without re-running generation.
xrag_result.save_to_disk('../output/test_scores/generation_and_score')

Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 37023.79 examples/s]


In [6]:
# Print each query, its gold label and the generated text for manual inspection.
# Columns are fetched once up front rather than once per loop iteration.
preview_queries = formatted_test_data['query']
preview_labels = formatted_test_data['label']
for query, label, pr_and_res in zip(preview_queries, preview_labels, prompt_and_results):
    print('query:', query)
    print('label:', label)
    print(pr_and_res)
    print()

query: Siapakah yang menemuka benua Amerika ?
label: orang-orang Viking
Jawaban: Amerika adalah benua yang ditemukan oleh Amerika Serikat, yang berarti bahwa benua itu ditemukan oleh orang-orang Amerika.

query: Dimanakah letak Donggala ?
label: Sulawesi Tengah, Indonesia
Kabupaten Donggala adalah sebuah kabupaten di Sulawesi Tengah, Indonesia.

query: Siapa bapak Teknik industri?
label: Frederick Winslow Taylor
1. **Robert Owen** adalah bapak teknik industri.

query: Kapan Penghulu Rasyid meninggal ?
label: 15 Desember 1861
1998

query: seberapa luas kah samudera pasifik?
label: 179,7 juta km²
16.500.000 km²

query: apakah yang dimaksud denga geisha ?
label: seniman-penghibur (entertainer) tradisional Jepang
seorang wanita yang berpakaian dengan pakaian khas, berpakaian dengan kain khusus, dan berpakaian dengan pakaian yang terkunci, dan berpakaian dengan pakaian yang ter

query: Kapan Bank BCA mengeluarkan kartu debit?
label: 2000-an
1998

query: Dimana kantor pusat General Motors?
l

In [None]:
# import unicodedata
# import regex

# class SimpleTokenizer(object):
#     ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
#     NON_WS = r'[^\p{Z}\p{C}]'

#     def __init__(self):
#         """
#         Args:
#             annotators: None or empty set (only tokenizes).
#         """
#         self._regexp = regex.compile(
#             '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
#             flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
#         )

#     def tokenize(self, text, uncased=False):
#         matches = [m for m in self._regexp.finditer(text)]
#         if uncased:
#             tokens = [m.group().lower() for m in matches]
#         else:
#             tokens = [m.group() for m in matches]
#         return tokens

# def _normalize(text):
#     return unicodedata.normalize('NFD', text)

# def has_answer(answers, text, tokenizer=SimpleTokenizer()) -> bool:
#     """Check if a document contains an answer string."""
#     text = _normalize(text)
#     text = tokenizer.tokenize(text, uncased=True)

#     for answer in answers:
#         answer = _normalize(answer)
#         answer = tokenizer.tokenize(answer, uncased=True)
#         for i in range(0, len(text) - len(answer) + 1):
#             if answer == text[i: i + len(answer)]:
#                 return True
#     return False

# def get_substring_match_score(outputs,answers):
#     """
#     outputs: [string1,string2]
#     answers: [
#                 [string1_1,string1_2],
#                 [string2_1,string2_2]
#              ]
#     """
#     import numpy as np
#     assert len(outputs) == len(answers)
#     if not isinstance(answers[0],list):
#         answers = [[x] for x in answers]
#     substring_match_scores = []
#     answer_lengths = []
#     for output,answer in zip(outputs,answers):
#         if has_answer(answer,output): # EM evaluation
#             substring_match_scores.append(1.0)
#         else:
#             substring_match_scores.append(0.0)
        
#         answer_lengths.append(len(output.split()))

#     substring_match = round(sum(substring_match_scores)/len(outputs), 4)
#     lens = round(np.mean(answer_lengths), 4)

#     return substring_match,substring_match_scores

In [None]:
# from collections import Counter
# import numpy as np
# import string


# def normalize_answer(s):
#     def remove_articles(text):
#         return regex.sub(r'\b(a|an|the)\b', ' ', text)

#     def white_space_fix(text):
#         return ' '.join(text.split())

#     def remove_punc(text):
#         exclude = set(string.punctuation)
#         return ''.join(ch for ch in text if ch not in exclude)

#     def lower(text):
#         return text.lower()

#     return white_space_fix(remove_articles(remove_punc(lower(s))))

# def f1_score(prediction, ground_truth):
#     prediction_tokens = normalize_answer(prediction).split()
#     ground_truth_tokens = normalize_answer(ground_truth).split()
#     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
#     num_same = sum(common.values())
#     if num_same == 0:
#         return 0
#     precision = 1.0 * num_same / len(prediction_tokens)
#     recall = 1.0 * num_same / len(ground_truth_tokens)
#     f1 = (2 * precision * recall) / (precision + recall)
#     return f1


# def f1(prediction, ground_truths):
#     return max([f1_score(prediction, gt) for gt in ground_truths])

# def eval_truthfulqa(outputs,answers):

#     f1_scores = []
#     # rl_scores = []
#     for output,answer in zip(outputs,answers):

#         f1_scores.append(f1(output, answer))
#         # rl_scores.append(rl(output, answer))

#     F1 = round(np.mean(f1_scores), 4)
#     # RL = round(np.mean(rl_scores), 4)

#     return F1,  f1_scores #,RL,rl_scores

# import json

# # Ambil jawaban referensi dari data
# answers = [x['label'] for x in formatted_test_data]

# # Ganti args dengan parameter eksplisit
# eval_metrics = 'substring_match'  # atau 'fact_checking_acc', 'truthfulqa_f1_rl'
# use_rag = True
# avg_prompt_length = sum(len(x) for x in formatted_test_data["prompt"]) / len(formatted_test_data)
# model_name_or_path = "../output/finetuned/finished_model"
# retriever_name_or_path = "intfloat/multilingual-e5-small"

# # Evaluasi
# if eval_metrics == 'substring_match':
#     score, score_per_sample = get_substring_match_score(generated_results, answers)
# elif eval_metrics == 'truthfulqa_f1_rl':
#     f1, rl, f1_scores, rl_scores = eval_truthfulqa(generated_results, answers)
#     score = f"{f1}-{rl}"
#     score_per_sample = [(f1_score, rl_score) for f1_score, rl_score in zip(f1_scores, rl_scores)]

# # Tampilkan hasil
# result_dict = {
#     "dataset": "final_dataset",
#     "batch_size": eval_batch_size,
#     "include_retrieval": use_rag,
#     "avg_prompt_length": avg_prompt_length,
#     "model": model_name_or_path,
#     f"{eval_metrics}": score,
# }

# if retriever_name_or_path is not None:
#     result_dict["retriever"] = retriever_name_or_path

# print(json.dumps(result_dict, indent=4))

{
    "dataset": "final_dataset",
    "batch_size": 2,
    "include_retrieval": true,
    "avg_prompt_length": 186.92920353982302,
    "model": "../output/finetuned/finished_model",
    "substring_match": 0.0,
    "retriever": "intfloat/multilingual-e5-small"
}


In [None]:
# import sys
# sys.path.append('../..')
# from datasets import Dataset
# from metrics import evaluate_substringmatch_f1

# testing_results = []
# labels = formatted_test_data['label']
# for i, row in enumerate(formatted_test_data):
#     label = labels[i]
#     generated_result = generated_results[i]
#     sm, f1 = evaluate_substringmatch_f1(generated_result.strip(), label.strip())

#     testing_results.append(
#         {
#             'query': row['query'],
#             'passages': row['sorted_truncPassages'],
#             'full_prompt_and_completion': prompt_and_results[i], 
#             'completion': generated_result, 
#             'label': label, 
#             'em': sm, 
#             'f1': f1
#         }
#     )

# xRAG_results= Dataset.from_list(testing_results)
# print("Hasil dari xRAG")
# print("rerata substring match:", sum(xRAG_results['em'])/len(xRAG_results))
# print("rerata F1:", sum(xRAG_results['f1'])/len(xRAG_results))

In [None]:
# import os
# import json

# # Direktori tujuan
# output_dir = "../output/test_scores/qwen1.7_e5small_batch2_epoch3_2025-06-27-21.40"

# # Buat direktori jika belum ada
# os.makedirs(output_dir, exist_ok=True)

# # Simpan result_dict ke file JSON
# result_path = os.path.join(output_dir, "result_dict.json")
# with open(result_path, "w") as f:
#     json.dump(result_dict, f, indent=4)

# # Simpan score_per_sample ke file JSON
# score_path = os.path.join(output_dir, "score_per_sample.json")
# with open(score_path, "w") as f:
#     json.dump(score_per_sample, f, indent=4)