In [1]:
import re
from typing import List, Tuple
from nltk import sent_tokenize
from datasets import load_from_disk
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Load the prepared dataset from disk and pull out its three splits.
dataset = load_from_disk("../generated_data/raw/final_dataset")
train_dataset, dev_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'dev', 'test')
)
print('selesai memuat dataset...')

def split_between_title_and_text(text: str) -> Tuple[str, str]:
    """Split a ``"title | body"`` passage into its title and its body.

    Only the first ``|`` acts as the separator, so any later ``|`` characters
    remain part of the body. Both parts are stripped of surrounding whitespace.

    Args:
        text: Full passage string, expected in the form ``"title | body"``.

    Returns:
        A ``(title, body)`` tuple. When ``text`` contains no ``|`` at all, the
        title is ``""`` and the whole stripped input is returned as the body.
    """
    title, separator, content = text.partition("|")
    if not separator:
        # Unpacking ``text.split("|", 1)`` into two names raises ValueError for
        # a passage without a separator, which would abort an entire
        # ``Dataset.map`` run over one malformed row. Fall back to an empty
        # title so the passage body is still chunked.
        return "", text.strip()
    return title.strip(), content.strip()

def split_sentences(text: str):
    """Remove numeric citation markers and split the text into sentences.

    Markers of the form ``[1]``, ``[2]``, ... are deleted first, then the
    remaining text is passed to nltk's ``sent_tokenize``.

    Args:
        text: Passage body to split.

    Returns:
        The sentences with surrounding whitespace stripped; sentences that are
        empty after stripping are left out.
    """
    without_citations = re.sub(r'\[\d+\]', '', text)
    stripped = (sentence.strip() for sentence in sent_tokenize(without_citations))
    return [sentence for sentence in stripped if sentence]

def create_rolling_segments(title: str, sentences: List[str], window_size: int = 3, stride: int = 2) -> List[str]:
    """Build overlapping ``"title | sentences"`` segments with a sliding window.

    Windows of ``window_size`` consecutive sentences start at index 0 and
    advance by ``stride``. If the last regular window stops short of the final
    sentence, one extra window aligned to the end of the list is appended, so
    every sentence appears in at least one segment.

    Args:
        title: Text prefixed to every segment, followed by ``" | "``.
        sentences: Sentences to group, in order.
        window_size: Number of sentences per segment; must be >= 1.
        stride: Step between consecutive window starts; must be >= 1.

    Returns:
        The list of segment strings. When there are fewer sentences than
        ``window_size`` (including none), a single segment holding all of them
        is returned.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1 or stride < 1:
        raise ValueError("window_size and stride must be positive integers")

    if len(sentences) < window_size:
        return [f"{title} | {' '.join(sentences)}"]

    last_start = len(sentences) - window_size
    starts = list(range(0, last_start + 1, stride))
    if starts[-1] != last_start:
        # The stride stepped over the tail (e.g. 4 sentences with window 3 and
        # stride 2): add an end-aligned window so trailing sentences are kept.
        starts.append(last_start)

    return [
        title + " | " + " ".join(sentences[start:start + window_size])
        for start in starts
    ]

def prepare_context_chunks(text: str) -> List[str]:
    """Turn one full ``"title | body"`` passage into rolling segments.

    The passage is split into title and body, the body is split into
    sentences, and the sentences are grouped into windows of three (the stride
    is left at ``create_rolling_segments``'s default).

    Args:
        text: Full passage string containing the title and the body.

    Returns:
        The list of segment strings produced for this passage.
    """
    title, body = split_between_title_and_text(text)
    return create_rolling_segments(title, split_sentences(body), window_size=3)

def split_each_passages(example):
    """Collect the context chunks of every passage in one dataset example.

    Each entry of ``example["passages"]`` is run through
    ``prepare_context_chunks`` and the resulting chunks are concatenated in
    passage order.

    Args:
        example: Mapping with a ``"passages"`` entry that can be iterated over.

    Returns:
        A dict with the single key ``"context_chunks"`` holding the flat list
        of chunks.
    """
    chunks = [
        chunk
        for passage in example["passages"]
        for chunk in prepare_context_chunks(passage)
    ]
    return {"context_chunks": chunks}
# Run split_each_passages over every split (train, then dev, then test).
train_dataset, dev_dataset, test_dataset = (
    split.map(split_each_passages)
    for split in (train_dataset, dev_dataset, test_dataset)
)
print('selesai membuat chunks...')

  from .autonotebook import tqdm as notebook_tqdm


selesai memuat dataset...


Map: 100%|██████████| 4542/4542 [00:02<00:00, 2022.20 examples/s]
Map: 100%|██████████| 1143/1143 [00:00<00:00, 1809.48 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 1955.94 examples/s]

selesai membuat chunks...





In [3]:
# Load the tokenizer and encoder from the same checkpoint, then put the
# encoder in evaluation mode.
E5_CHECKPOINT = 'intfloat/multilingual-e5-small'
tokenizer = AutoTokenizer.from_pretrained(E5_CHECKPOINT)
model = AutoModel.from_pretrained(E5_CHECKPOINT)
model.eval()

# Pooling function taken from the model card
def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states over the positions the attention mask keeps.

    Positions where ``attention_mask`` is zero are zeroed out before summing
    along dimension 1; the sum is then divided by the per-row mask total.

    Args:
        last_hidden_states: Tensor of hidden states, indexed as
            (row, position, feature).
        attention_mask: Tensor indexed as (row, position); non-zero marks a
            position to keep.

    Returns:
        One pooled vector per row.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(~keep, 0.0)
    kept_per_row = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / kept_per_row

# Top-3 matching function
def get_top3_chunks(query, chunks):
    """Pick the (up to) three chunks most similar to the query.

    The query is prefixed with ``"query: "`` and every chunk with
    ``"passage: "``; all texts are tokenized together (truncated to 512
    tokens), encoded with the module-level ``model`` and mean-pooled. Scores
    are dot products of L2-normalised embeddings, multiplied by 100.

    Args:
        query: Query string.
        chunks: List of candidate chunk strings.

    Returns:
        A ``(top_chunks, top_scores, top_indices)`` tuple of lists in the order
        returned by ``torch.topk``; indices refer to positions in ``chunks``.
        Three empty lists are returned when ``chunks`` is empty.
    """
    if not chunks:
        return [], [], []

    prefixed_texts = ["query: " + query]
    prefixed_texts.extend("passage: " + chunk for chunk in chunks)
    encoded = tokenizer(prefixed_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

    with torch.no_grad():
        model_output = model(**encoded)
    pooled = average_pool(model_output.last_hidden_state, encoded['attention_mask'])
    unit_vectors = F.normalize(pooled, p=2, dim=1)

    # Row 0 is the query; the remaining rows line up with ``chunks``.
    query_vector, chunk_vectors = unit_vectors[0], unit_vectors[1:]
    similarity = (query_vector @ chunk_vectors.T) * 100

    # Never ask for more results than there are chunks.
    best = torch.topk(similarity, k=min(3, len(chunks)))
    top_indices = best.indices.tolist()
    top_scores = best.values.tolist()
    top_chunks = [chunks[position] for position in top_indices]

    return top_chunks, top_scores, top_indices

# Apply to a dataset
def process_dataset(dataset):
    """Attach the top-ranked chunks of every example to the dataset.

    For each example, ``get_top3_chunks`` is called with the example's
    ``'query'`` and ``'context_chunks'`` values; the results are collected and
    added as three new columns.

    Args:
        dataset: Iterable of examples that also provides ``add_column``.

    Returns:
        The dataset with the columns ``"top_chunks"``, ``"top_chunk_scores"``
        and ``"top_chunk_indices"`` added, in that order.
    """
    collected = {
        "top_chunks": [],
        "top_chunk_scores": [],
        "top_chunk_indices": [],
    }

    for row in tqdm(dataset, desc="Processing"):
        picked, scores, positions = get_top3_chunks(row['query'], row['context_chunks'])
        collected["top_chunks"].append(picked)
        collected["top_chunk_scores"].append(scores)
        collected["top_chunk_indices"].append(positions)

    # Add the new columns in insertion order of the dict above.
    for column_name, column_values in collected.items():
        dataset = dataset.add_column(column_name, column_values)
    return dataset

# Process every split (train, then dev, then test)
train_dataset, dev_dataset, test_dataset = (
    process_dataset(split)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Processing: 100%|██████████| 4542/4542 [12:37<00:00,  5.99it/s]
Processing: 100%|██████████| 1143/1143 [03:03<00:00,  6.24it/s]
Processing: 100%|██████████| 565/565 [01:31<00:00,  6.21it/s]


In [None]:
from datasets import DatasetDict

# Bundle the three processed splits and write them to disk.
reranked_splits = {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
dataset = DatasetDict(reranked_splits)
dataset.save_to_disk("../generated_data/CRAG_top_reranked_chunks")

Saving the dataset (1/1 shards): 100%|██████████| 4542/4542 [00:00<00:00, 154459.15 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1143/1143 [00:00<00:00, 83688.39 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 565/565 [00:00<00:00, 46257.70 examples/s]


Map: 100%|██████████| 4542/4542 [00:01<00:00, 3425.62 examples/s]
Map: 100%|██████████| 1143/1143 [00:00<00:00, 3494.14 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 3395.50 examples/s]
