# PLAYGROUND NOTEBOOK
Notebook untuk mengecek kode yang telah dibuat

### Cek CUDA

In [3]:
import torch

# Cek GPU yang sedang aktif (default)
current_device = torch.cuda.current_device()
print(f"Current active GPU: {current_device} ({torch.cuda.get_device_name(current_device)})")

# Pindahkan tensor ke GPU tertentu
tensor = torch.tensor([1, 2, 3])
gpu_tensor = tensor.to(f'cuda:{current_device}')
print(f"Tensor is on device: {gpu_tensor.device}")

Current active GPU: 0 (NVIDIA GeForce RTX 4050 Laptop GPU)
Tensor is on device: cuda:0


In [4]:
# GPU yang digunakan
def check_gpu():
    device = torch.device("cuda:0")

    # Total memori GPU
    total_memory = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)  # Dalam GB
    print(f"Total GPU Memory: {total_memory:.2f} GB")

    # Memori yang sudah dialokasikan oleh PyTorch
    allocated_memory = torch.cuda.memory_allocated(device) / (1024 ** 3)  # Dalam GB
    print(f"Allocated GPU Memory: {allocated_memory:.2f} GB")

    max_reserved_memory = torch.cuda.max_memory_reserved(device) / (1024 ** 3)  # Dalam GB
    print(f"Max Reserved GPU Memory: {max_reserved_memory:.2f} GB")

    # Memori GPU yang dicadangkan oleh PyTorch
    reserved_memory = torch.cuda.memory_reserved(device) / (1024 ** 3)  # Dalam GB
    print(f"Reserved GPU Memory: {reserved_memory:.2f} GB")

    # Memori GPU yang tersedia
    free_memory = reserved_memory - allocated_memory
    print(f"Free GPU Memory: {free_memory:.2f} GB")

In [5]:
### before cleaning cache and before loading embedding model
check_gpu()

Total GPU Memory: 6.00 GB
Allocated GPU Memory: 5.98 GB
Max Reserved GPU Memory: 6.03 GB
Reserved GPU Memory: 6.03 GB
Free GPU Memory: 0.04 GB


In [11]:
# After loading embedding model, but before executing embedding on input texts
check_gpu()

Total GPU Memory: 6.00 GB
Allocated GPU Memory: 0.53 GB
Max Reserved GPU Memory: 11.37 GB
Reserved GPU Memory: 0.55 GB
Free GPU Memory: 0.02 GB


In [18]:
# Clean cuda GPU cache
torch.cuda.empty_cache()

In [26]:
# setelah menjalankan program
check_gpu()

Total GPU Memory: 6.00 GB
Allocated GPU Memory: 2.84 GB
Max Reserved GPU Memory: 11.37 GB
Reserved GPU Memory: 3.15 GB
Free GPU Memory: 0.31 GB


### Import library dan load model utk generasi

In [1]:
import pandas as pd
from utils import load_model_and_tokenizer
from transformers import AutoTokenizer, AutoModel
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
pd.set_option('display.max_colwidth', None)

In [2]:
# # Muat tokenizer dan model
# embedding_id = 'intfloat/multilingual-e5-base'
# embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_id)
# embedding_model = AutoModel.from_pretrained(embedding_id, torch_dtype=torch.float16)
# embedding_model = embedding_model.to("cuda:0")

In [2]:
# llama32_results = pd.read_json("meta-llama_Llama-3.2-1B-Instruct-50_results.json")
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
model_name = "meta-llama/Llama-3.2-3B-Instruct"
model, tokenizer, config = load_model_and_tokenizer(model_name)

Loading model on cuda with torch.float16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return t.to(


In [3]:
import torch
# Cek apakah model telah berada di GPU
print(f"Model device: {next(model.parameters()).device}")

# Periksa penggunaan memori GPU
print(f"Allocated GPU Memory: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")
print(f"Reserved GPU Memory: {torch.cuda.memory_reserved() / (1024**3):.2f} GB")

Model device: cuda:0
Allocated GPU Memory: 5.98 GB
Reserved GPU Memory: 6.03 GB


## Cek Prioritas Memilih Token *Ground-Truth*

### Explore apakah ada token x di dalam ground truth yang mana token x tersebut dihasilkan berkali-kali di *generated answer*

menyimpan rows tersebut ke variabel *rows_with_duplicates*

#### meta-llama_Llama-3.2-1B-Instruct-50_results (NEW version)

In [3]:
from utils import is_groundtruth_duplicated_in_generation

# Tambahkan kolom baru untuk menandai apakah ada duplikasi kata, apakah kata tersebut ada di 'answer',
# dan daftar kata yang merupakan duplikat
llama32_results['duplicates_in_answer'], llama32_results['duplicate_words_list'] = zip(
    *llama32_results.apply(lambda row: is_groundtruth_duplicated_in_generation(row['generated_completion'], row['answer']), axis=1)
)

# Filter baris yang memiliki duplikasi kata
rows_with_duplicates = llama32_results[llama32_results['duplicates_in_answer']]
print(f"Banyak row yang ground truth terduplikat dalam generation: {rows_with_duplicates.shape[0]}")

In [10]:
rows_with_duplicates[['id', 'question', 'passage', 'generated_completion', 'answer','tokens', 'duplicate_words_list']].head(3)

Unnamed: 0,id,question,passage,generated_completion,answer,tokens,duplicate_words_list
1,-6515196497017052077-5,Siapa arsitek Balai Kota Seoul?,"Pada tanggal 18 Februari 2008, desain Yoo Kerl dari iArc terpilih dalam kompetisi desain Pemerintah Metropolitan Seoul.[6] Bagian atas depan bangunan baru ini dirancang untuk menonjol dalam bentuk lengkung. Perancang Yoo merangkul tiga kata kunci yakni ""tradisi, warga negara, dan masa depan"" dalam rancangannya, yang mencerminkan elemen horisontal dari arsitektur tradisional tingkat rendah Korea dan penafsiran ulang dari nuansa mendalam dan lekukan atap atap.[7]",Arsitek yang dirancang desain Yoo Kerl dari iArc adalah Yoo Kerl.,Yoo Kerl,"[Ar, site, k, Ġyang, Ġdir, anc, ang, Ġdes, ain, ĠY, oo, ĠKer, l, Ġdari, Ġi, Arc, Ġadalah, ĠY, oo, ĠKer, l, ., <|eot_id|>]","[Yoo, Kerl]"
4,7410193178422667195-3,Dimana Jamie Richard Vardy lahir?,"Lahir di Sheffield, South Yorkshire, Vardy memulai kariernya di tim muda Stocksbridge Park Steels pada umur 16 tahun setelah dilepas Sheffield Wednesday. Ia masuk ke tim reserve dan tim utama dibawah arahan Gary Marrow tahun 2007, menerima £30 perminggu di klub. Setelah performa menakjubkannya, beberapa tim dari Football League tertarik dan tahun 2009 ia menghabiskan seminggu melakukan trial dengan Crewe Alexandra. Transfer itu tidak terjadi dan ia lalu menolak kontrak jangka pendek dengan Rotherham United.","Jamie Richard Vardy lahir di Sheffield, South Yorkshire, Inggris.","Sheffield, South Yorkshire","[J, am, ie, ĠRichard, ĠV, ardy, Ġla, hir, Ġdi, ĠSheffield, ,, ĠSouth, ĠYorkshire, ,, ĠIng, gr, is, ., <|eot_id|>]","[,]"
5,7176612552296612393-4,siapakah orang tua John Fitzgerald Kennedy?,"John Fitzgerald Kennedy lahir di 83 Beals Street, Brookline, Massachusetts, pada tanggal 29 Mei 1917[10] dari pasangan pebisnis/politikus Joseph Patrick ""Joe"" Kennedy, Sr. (1888–1969) dan filantropis Rose Elizabeth Fitzgerald (1890–1995). Joe adalah putra sulung pebisnis/politikus Patrick Joseph ""P. J."" Kennedy (1858–1929) dan Mary Augusta Hickey (1857–1923). Rose adalah putri sulung Wali Kota Boston John Francis ""Honey Fitz"" Fitzgerald (1863–1950) dan Mary Josephine ""Josie"" Hannon (1865–1964). Keempat kakek-neneknya adalah anak-anak imigran Irlandia.[1]","Menurut teks, Joseph Patrick ""Joe"" Kennedy, Sr. adalah ayah John Fitzgerald Kennedy, sedangkan Rose Elizabeth Fitzgerald adalah ibu.","Joseph Patrick ""Joe"" Kennedy, Sr. (1888–1969) dan filantropis Rose Elizabeth Fitzgerald","[Men, ur, ut, Ġte, ks, ,, ĠJoseph, ĠPatrick, Ġ"", Joe, "", ĠKennedy, ,, ĠSr, ., Ġadalah, Ġay, ah, ĠJohn, ĠFitzgerald, ĠKennedy, ,, Ġsed, angkan, ĠRose, ĠElizabeth, ĠFitzgerald, Ġadalah, Ġib, u, ., <|eot_id|>]","[,, "", Kennedy, Fitzgerald, .]"


### Memahami kode retrieve_correct_token_logprobs

#### llama_32_1B_50_pairs (OLD version)

In [3]:
results = pd.read_json("llama_32_1B_50_pairs.json")
row = results.loc[[46], :].copy()
row[["id", "question", "generated_completion", "answer"]]

Unnamed: 0,id,question,generated_completion,answer
46,8223462044041100059-0,"Dimana letak daerah ""Pardembanan""?","Maaf, saya tidak bisa memberikan informasi tentang lokasi ""Pardembanan"" karena tidak ada informasi yang lengkap tentang daerah tersebut. Namun, saya bisa memberikan informasi umum tentang Kabupaten Asahan,","Kabupaten Asahan, Sumatera Utara, Indonesia"


In [4]:
from retrieve_logprobs import retrieve_correct_token_logprobs
row_updated = retrieve_correct_token_logprobs(row, tokenizer)

In [6]:
row_updated['answer_logprobs']

46    {' Kab': -2.058924436569214, 'upaten': -3.707477662828751e-05, ' As': -6.55653229841846e-06, 'ahan': -2.145769485650817e-06, ',': -1.106772899627685, 'Sum': None, 'ater': None, 'a': None, 'Ut': None, 'ara': None, 'Ind': None, 'onesia': None}
Name: answer_logprobs, dtype: object

## Prototipe membuat dataset

### Memilih subset dari TyDi QA

#### Load dataset

In [1]:
from datasets import load_dataset

tydiqa_gold = load_dataset("khalidalt/tydiqa-goldp", 'indonesian', trust_remote_code=True)
mr_tydi = load_dataset("castorini/mr-tydi", "indonesian")

In [4]:
tydiqa_gold

DatasetDict({
    train: Dataset({
        features: ['id', 'language', 'document_title', 'passage_text', 'question_text', 'answers'],
        num_rows: 5702
    })
    validation: Dataset({
        features: ['id', 'language', 'document_title', 'passage_text', 'question_text', 'answers'],
        num_rows: 565
    })
})

#### Filtering untuk row dimana jawaban tidak terlalu panjang (tidak lebih dari 6 kata)

In [12]:
# import json 

# Fungsi untuk memeriksa jumlah kata dalam key 'text'
def filter_short_answers(example):
    # Ambil jawaban pertama dari list 'text' (jika ada) dan hitung jumlah katanya
    text = example['answers']['text']
    if text:  # Pastikan tidak kosong
        return len(text[0].split()) <= 6
    return False

# Filter dataset train untuk mengambil hanya baris dengan key 'text' <= 6 kata
filtered_train = tydiqa_gold['train'].filter(filter_short_answers)

# filtered_train = load_dataset('json', data_files="tydiqa_answer_less_than_six_words.json")

# Konversi dataset ke list of dictionaries
filtered_train_as_list = [dict(row) for row in filtered_train]

# Simpan ke file JSON
# with open("tydiqa_answer_less_than_six_words.json", "w", encoding="utf-8") as f:
#     json.dump(filtered_train_as_list, f, indent=4, ensure_ascii=False)

Karena dataset mr tydi hanya memiliki data positive_passages dan negative_passages utk split train, maka akan difilter dulu row di mr-tydi-split-train lalu filter kembali untuk tydiqa-split-train

In [13]:
all_train_questions = set(row["question_text"] for row in filtered_train)

# Fungsi untuk mencocokkan query di mr_tydi dengan all_train_questions di tydiqa_gold
def filter_mr_tydi_to_the_six_word_answers(example):
    return example["query"] in all_train_questions

# Filter semua split di mr_tydi berdasarkan query yang cocok
six_words_anwers_mr_tydi_train = mr_tydi["train"].filter(filter_mr_tydi_to_the_six_word_answers)

Filter:   0%|          | 0/4902 [00:00<?, ? examples/s]

In [14]:
all_retrieval_questions = set(row["query"] for row in six_words_anwers_mr_tydi_train)

# Fungsi untuk mencocokkan query di mr_tydi dengan question_text di tydiqa_gold
def filter_tydiqa_to_the_six_word_answers(example):
    return example["question_text"] in all_retrieval_questions

# Filter semua split di mr_tydi berdasarkan query yang cocok
six_words_anwers_tydiqa_train = tydiqa_gold["train"].filter(filter_tydiqa_to_the_six_word_answers)

In [15]:
from collections import defaultdict

# Fungsi untuk memeriksa validitas passage
def is_valid_passage(passage):
    return passage["text"].strip() != "" and passage["title"].strip() != ""

# Fungsi untuk menghitung distribusi panjang passage dan mengembalikan query_id
def compute_passage_distribution_with_ids(dataset, column_name):
    distribution = defaultdict(list)  # Menyimpan query_id berdasarkan panjang passages
    for row in dataset:
        if column_name in row:
            valid_passages = [
                passage for passage in row[column_name] if is_valid_passage(passage)
            ]
            length = len(valid_passages)  # Hitung jumlah valid_passages
            distribution[length].append(row["query_id"])  # Simpan query_id berdasarkan panjang
    return distribution

# Hitung distribusi panjang untuk positive_passages
positive_distribution_with_ids = compute_passage_distribution_with_ids(six_words_anwers_mr_tydi_train, "positive_passages")

# Hitung distribusi panjang untuk negative_passages
negative_distribution_with_ids = compute_passage_distribution_with_ids(six_words_anwers_mr_tydi_train, "negative_passages")

# Tampilkan hasil
print("Distribusi Positive Passages:")
for length, query_ids in sorted(positive_distribution_with_ids.items()):
    print(f"{length} valid passages: {len(query_ids)} rows")
    print(f"Query IDs: {query_ids}\n")

print("\nDistribusi Negative Passages:")
for length, query_ids in sorted(negative_distribution_with_ids.items()):
    print(f"{length} valid passages: {len(query_ids)} rows")
    print(f"Query IDs: {query_ids}\n")

Distribusi Positive Passages:
1 valid passages: 3539 rows
Query IDs: ['0', '1', '2', '6', '7', '8', '9', '12', '13', '14', '16', '18', '20', '23', '26', '28', '29', '31', '33', '36', '38', '39', '40', '42', '43', '44', '45', '47', '48', '49', '51', '54', '56', '57', '59', '61', '63', '64', '66', '67', '68', '69', '70', '71', '73', '74', '75', '77', '80', '83', '84', '85', '87', '89', '90', '91', '93', '94', '97', '101', '102', '104', '105', '106', '107', '110', '112', '114', '115', '117', '118', '119', '120', '122', '124', '130', '132', '135', '137', '139', '141', '142', '143', '146', '147', '148', '152', '154', '158', '160', '162', '164', '166', '168', '173', '174', '175', '179', '183', '184', '185', '186', '188', '189', '190', '191', '192', '193', '194', '195', '197', '200', '201', '202', '206', '207', '208', '209', '210', '212', '214', '215', '217', '221', '222', '223', '228', '230', '233', '234', '240', '242', '243', '245', '248', '249', '250', '252', '253', '255', '258', '259', '2

##### Memilih 30 sample untuk percobaan kecil

In [18]:
# Ambil 30 baris secara random dari dataset yang telah difilter
selected_rows = six_words_anwers_tydiqa_train.shuffle(seed=42).select(range(30))

# Ambil semua question_text dari random_rows (tydi_qa subset)
question_texts_samples = set(row["question_text"] for row in selected_rows)

In [19]:
# Fungsi untuk mencocokkan query di mr_tydi dengan question_text di tydiqa_gold
def filter_queries(example):
    return example["query"] in question_texts_samples

# Filter semua split di mr_tydi berdasarkan query yang cocok
filtered_mr_tydi_samples = six_words_anwers_mr_tydi_train.filter(filter_queries)

Filter:   0%|          | 0/3539 [00:00<?, ? examples/s]

In [20]:
filtered_tydiqa_samples = selected_rows.rename_column("question_text", "query")

# Konversi filtered_mr_tydi menjadi dictionary dengan query sebagai kunci
filtered_mr_tydi_samples_dict = {row["query"]: row for row in filtered_mr_tydi_samples}

# Gabungkan selected_rows dengan filtered_mr_tydi berdasarkan query
joined_rows = []
for row in filtered_tydiqa_samples:
    query = row["query"]
    print(query)
    
    if query in filtered_mr_tydi_samples_dict:
        # Gabungkan baris dari selected_rows dan filtered_mr_tydi
        combined_row = {**row, **filtered_mr_tydi_samples_dict[query]}  # Merge kedua dictionary
        joined_rows.append(combined_row)

# Hasil akhir: dataset baru dengan hasil join
from datasets import Dataset
joined_dataset = Dataset.from_list(joined_rows)

# Tampilkan informasi hasil
print(f"Total rows in joined dataset: {len(joined_dataset)}")
print(joined_dataset[0])  # Contoh row

Siapakah penulis anime naruto ?
Apa novel pertama yang ditulis Ernest Miller Hemingway?
Berapa lama Gempa bumi Lisboa 1755 terjadi?
Apa itu Ligia?
Siapa pendiri Boeing Commercial Airplanes?
umur berapakah saat David Émile Durkheim mendirikan fakultas sosiologi pertama di Eropa?
Berapa luas kota Raha ?
Siapakah Menteri Transmigrasi dan Pemukiman Proyek lahan gambut satu juta hektar?
Ada berapa jenis gulma ?
Kapan universitas Sriwijaya berdiri ?
Siapa nama pemimipin perang Goguryeo-Su Dinasti Sui pertama ?
Darimana asal Robert Heinrich Herman Koch?
berapakah luas Provinsi Jawa Tengah?
Kapan pemerintahan Gugghiermu I dimulai?
dimanakah televisi pertama diciptakan?
apakah sistem pemerintahan di Amerika Serikat?
Apakah ibukota Serbia?
Dimana letak Universitas Cape Town?
Siapa raja pertama Dinasti Ming?
apakah nama ibukota Afganistan?
Siapa itu Bimasena?
Kapan Muhammad Saleh Werdisastro lahir ?
Apa mata uang jepang?
Dimana Samantha Louise Lewthwaite lahir?
berapakah luas benua Afrika?
Berapa

In [73]:
# Hitung distribusi panjang untuk positive_passages
positive_distribution_with_ids = compute_passage_distribution_with_ids(joined_dataset, "positive_passages")

# Hitung distribusi panjang untuk negative_passages
negative_distribution_with_ids = compute_passage_distribution_with_ids(joined_dataset, "negative_passages")

# Tampilkan hasil
print("Distribusi Positive Passages:")
for length, query_ids in sorted(positive_distribution_with_ids.items()):
    print(f"{length} valid passages: {len(query_ids)} rows")
    print(f"Query IDs: {query_ids}\n")

print("\nDistribusi Negative Passages:")
for length, query_ids in sorted(negative_distribution_with_ids.items()):
    print(f"{length} valid passages: {len(query_ids)} rows")
    print(f"Query IDs: {query_ids}\n")

Distribusi Positive Passages:
1 valid passages: 30 rows
Query IDs: ['1308', '5767', '2165', '3960', '1126', '5291', '365', '2426', '461', '2757', '6126', '84', '2489', '4276', '5455', '5295', '6032', '5432', '4309', '2347', '2890', '4740', '4541', '3520', '687', '3125', '2278', '5804', '3680', '4787']


Distribusi Negative Passages:
8 valid passages: 1 rows
Query IDs: ['3960']

29 valid passages: 21 rows
Query IDs: ['5767', '2165', '1126', '5291', '365', '2426', '2757', '6126', '84', '2489', '4276', '6032', '5432', '4740', '4541', '3520', '687', '3125', '2278', '5804', '4787']

30 valid passages: 8 rows
Query IDs: ['1308', '461', '5455', '5295', '4309', '2347', '2890', '3680']



In [None]:
from datasets import Dataset

# Fungsi untuk menggabungkan positive_passages dan negative_passages
def combine_passages(example):
    example['all_passages'] = example['positive_passages'] + example['negative_passages']
    return example

# Proses penggabungan menggunakan map
joined_dataset = joined_dataset.map(combine_passages)

# Mengecek hasil
print(joined_dataset[0]['all_passages'])  # Menampilkan isi kolom baru 'all_passages' pada row pertama

In [39]:
# Fungsi untuk memformat passages
def format_passages(example):
    # Memformat setiap dictionary dalam all_passages menjadi string
    example['formatted_passages'] = [
        f"judul: {passage['title']} teks: {passage['text']}"
        for passage in example['all_passages']
    ]
    return example

# Proses memformat dan menambahkan kolom formatted_passages
joined_dataset = joined_dataset.map(format_passages)

# Mengecek hasil
print(joined_dataset[0]['formatted_passages'])  # Menampilkan isi kolom baru 'formatted_passages' pada row pertama

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

['judul: Masashi Kishimoto teks: Masashi Kishimoto(岸本 斉史,Kishimoto Masashi, Template:Lahirmati[1]) adalah seorang Mangaka Jepang. Masashi Kishimoto mulai mengembangkan bakatnya akan menggambar semenjak usia SD. Dia menjadi mangaka terkenal semenjak karyanya, Naruto sukses besar baik di Jepang sendiri ataupun di negara-negara lain. Dia suka membaca manga sejak usia muda, sampai dia menunjukkan keinginannya untuk menulis manga sendiri. Akira Toriyama dan Katsuhiro Otomo adalah sebagai inspirasi utamanya. Pada tahun 1999 Naruto pertama kali dipublikasikan di Shounen jump membuat Kishimoto menerima penghargaan hop step. Saudara kembar Masashi Kishimoto, Seishi Kishimoto, juga merupakan seniman manga dengan karyanya yang terkenal 666 Satan (O-Parts Hunter) dan Blazer Drive. Selama penerbitan Naruto, Kishimoto menikah dan menjadi seorang ayah.[2]', 'judul: Daftar karakter Naruto teks: Beberapa daftar karakter yang muncul dalam serial anime atau manga Naruto.', 'judul: Sasuke Uchiha teks: Say

In [46]:
import json
# Konversi dataset ke list of dictionaries
samples = [dict(row) for row in joined_dataset]

# Simpan ke file JSON
with open("samples.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=4, ensure_ascii=False)

Mengecek 1 row spesifik berdasarkan query_id

In [26]:
# Query yang ingin dicari
target_query_id = '1308'
# Fungsi filter untuk mencocokkan query
def match_query(example):
    return example["query_id"] == target_query_id

# Filter dataset di semua split untuk mencari baris yang sesuai
inspect = joined_dataset.filter(match_query)
inspect

Filter:   0%|          | 0/30 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'language', 'document_title', 'passage_text', 'query', 'answers', 'query_id', 'positive_passages', 'negative_passages'],
    num_rows: 1
})

## Embedding

### Load dataset

In [6]:
from datasets import load_dataset
samples = load_dataset('json', data_files="samples.json")
samples = samples.remove_columns("language")
samples = samples.rename_column("id", "tydiqa_id")
samples = samples.rename_column("query_id", "mrtydi_id")

### Menghasilkan embedding dan similarity scores

__multilingual-e5-base__

source: https://huggingface.co/intfloat/multilingual-e5-base

In [7]:
import torch.nn.functional as F
import torch
from torch import Tensor
from tqdm import tqdm
import gc
from datasets import load_dataset

# Fungsi average pooling
def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def add_similarity_to_dataset(dataset, batch_size=4):
    """
    Tambahkan kolom 'scored_passages' ke dataset, berisi passage dan similarity score.

    Args:
        dataset: Dataset Hugging Face.

    Returns:
        Dataset dengan kolom baru 'scored_passages'.
    """
    scored_passages = []
    num_batches = len(range(0, len(dataset), batch_size))  # Total jumlah batch

    for start_idx in tqdm(range(0, len(dataset), batch_size), desc="Processing Batches", total=num_batches):
        end_idx = min(start_idx + batch_size, len(dataset))
        batch = dataset.select(range(start_idx, end_idx))
        
        formatted_texts = []  # List untuk menyimpan pasangan (formatted_text, row)
        # Loop pertama: Bangun formatted_texts
        for row in batch:
            formatted_text = [f"query: {row['query']}"] + [f"passage: {p}" for p in row['formatted_passages']]
            
            # Validasi panjang formatted_text
            assert len(formatted_text) == len(row['formatted_passages']) + 1, (
                f"Mismatch: formatted_text({len(formatted_text)}) vs passages({len(row['formatted_passages'])}) + 1"
            )

            # Simpan pasangan (formatted_text, row)
            formatted_texts.append((formatted_text, row))
            
        # Loop kedua: Proses formatted_texts untuk similarity scoring
        for row_texts, row in formatted_texts:
            # Tokenisasi
            batch_dict = embedding_tokenizer(row_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
            batch_dict = {k: v.to("cuda:0") for k, v in batch_dict.items()}
            try:
                outputs = embedding_model(**batch_dict)
            except Exception as e:
                print(f"Error: {e}")
                raise

            # Embedding dan normalisasi
            embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            embeddings = F.normalize(embeddings, p=2, dim=1)

            # Hitung similarity scores
            query_embedding = embeddings[0].unsqueeze(0)  # Query ada di indeks pertama
            passage_embeddings = embeddings[1:]  # Passages ada setelah query
            scores = (query_embedding @ passage_embeddings.T).squeeze(0)

            # Gabungkan passage dengan skor
            row_scored_passages = [
                {"passage": row['formatted_passages'][i], "similarity": scores[i].item()}
                for i in range(len(row['formatted_passages']))
            ]
            scored_passages.append(row_scored_passages)

        # Bersihkan cache GPU setelah memproses batch
        del batch_dict, outputs, embeddings
        torch.cuda.empty_cache()
        gc.collect()

    # Tambahkan kolom baru ke dataset
    print("DI LUAR FOR LOOP")
    print("len 'formatted_passages'indeks 0:", len(dataset[0]['formatted_passages']))
    dataset = dataset.add_column("scored_passages", scored_passages)
    torch.cuda.empty_cache()
    
    return dataset

# Muat dataset dan tambahkan kolom scored_passages
samples = load_dataset('json', data_files="samples.json")
samples["train"] = add_similarity_to_dataset(samples["train"])

Processing Batches: 100%|██████████| 8/8 [00:50<00:00,  6.35s/it]

DI LUAR FOR LOOP
len 'formatted_passages'indeks 0: 31





### Mengambil top 5 dan menggabungkannya menjadi 1 teks

In [8]:
def add_top_5_combined_passages(dataset):
    """
    Tambahkan kolom baru 'top_5_combined' ke dataset, berisi gabungan
    top 5 passages berdasarkan similarity score.

    Args:
        dataset: Dataset Hugging Face dengan kolom 'scored_passages'.

    Returns:
        Dataset dengan kolom baru 'top_5_combined'.
    """
    combined_passages = []

    for row in dataset:
        # Ambil dan urutkan top 5 passages berdasarkan similarity
        scored_passages = row["scored_passages"]
        top_5 = sorted(scored_passages, key=lambda x: x["similarity"], reverse=True)[:5]

        # Gabungkan hanya bagian 'passage' dengan pemisah newline
        combined_string = "\n".join(p["passage"] for p in top_5)
        combined_passages.append(combined_string)

    # Tambahkan kolom baru ke dataset
    dataset = dataset.add_column("top_5_combined", combined_passages)
    return dataset

# Tambahkan kolom 'top_5_combined' ke dataset samples["train"]
samples["train"] = add_top_5_combined_passages(samples["train"])

In [29]:
def wrap_text(text, line_length=203):
    return '\n'.join(text[i:i + line_length] for i in range(0, len(text), line_length))

In [16]:
samples = samples.remove_columns("language")
samples = samples.rename_column("id", "tydiqa_id")
samples = samples.rename_column("query_id", "mrtydi_id")

In [22]:
samples

DatasetDict({
    train: Dataset({
        features: ['tydiqa_id', 'document_title', 'passage_text', 'query', 'answers', 'mrtydi_id', 'positive_passages', 'negative_passages', 'all_passages', 'formatted_passages', 'scored_passages', 'top_5_combined'],
        num_rows: 30
    })
})

In [24]:
import json
# Konversi dataset ke list of dictionaries
samples_w_scores = [dict(row) for row in samples['train']]

# Simpan ke file JSON
with open("samples_w_scores.json", "w", encoding="utf-8") as f:
    json.dump(samples_w_scores, f, indent=4, ensure_ascii=False)

## Generate Rangkuman

In [3]:
from datasets import load_dataset
samples = load_dataset('json', data_files="samples_w_scores.json")

In [4]:
from summarize import summarize_top_5_combined
import time

model_name = "meta-llama/Llama-3.2-1B-Instruct"

start = time.time()
samples['train'] = summarize_top_5_combined(
    model_name=model_name,
    dataset=samples["train"],
    model=model, 
    tokenizer=tokenizer,
    batch_size=1
)
end = time.time()
duration = end - start
print(f"Durasi: {duration} detik")

Summarizing dataset:   0%|          | 0/30 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Summarizing dataset: 100%|██████████| 30/30 [04:50<00:00,  9.69s/it]


Durasi: 290.9894685745239 detik


Batch size = 2 --> Durasi: 356.94 detik

Batch size = 1 --> Durasi: 290.99 detik

In [5]:
dataset_iter = iter(range(len(samples['train'])))

def inspect_next():
    """Menampilkan pertanyaan, ground-truth, dan hasil generasi untuk item berikutnya."""
    try:
        i = next(dataset_iter)
        print(f"\nIndex: {i}")
        print("Pertanyaan:", samples['train'][i]['query'])
        print()
        print("Ground-truth:", samples['train'][i]['answers']['text'][0])
        print()
        print("Generasi:", samples['train'][i]['summary'])
        print("="*50)
    except StopIteration:
        print("Sudah mencapai akhir dataset.")

# Jalankan pertama kali
inspect_next()


Index: 0
Pertanyaan: Siapakah penulis anime naruto ?

Ground-truth: Masashi Kishimoto

Generasi: Masashi Kishimoto adalah penulis anime Naruto.


In [12]:
inspect_next()


Index: 7
Pertanyaan: Siapakah Menteri Transmigrasi dan Pemukiman Proyek lahan gambut satu juta hektar?

Ground-truth: Siswono Yudo Husodo

Generasi: Siswono Yudo Husodo merupakan Menteri Transmigrasi dan Pemukiman Perambah Hutan (PPH) yang digagas untuk meluncurkan proyek lahan gambut satu juta hektar


In [13]:
import json
# Konversi dataset ke list of dictionaries
samples_w_summary = [dict(row) for row in samples['train']]

# Simpan ke file JSON
with open("samples_w_summary.json", "w", encoding="utf-8") as f:
    json.dump(samples_w_summary, f, indent=4, ensure_ascii=False)