In [1]:
import os
import csv
import random
import json

In [8]:
from pathlib import Path

# Base directory for the processed dataset files.
BASE_DIR = Path("./dataset/processed/msmarco-amharic-news_dataset")

# Input files: BM25 ranking results and the training relevance judgments (qrels).
BM25_RANKINGS_PATH = Path("./models/bm25/outputs/bm25_rankings.tsv")
QRELS_PATH = BASE_DIR / "qrels_train.tsv"

# Output files. Both paths are derived from OUTPUT_DIR so that the directory
# created below is guaranteed to be the one the files are written into.
OUTPUT_DIR = Path("outputs")
JSONL_OUTPUT_PATH = OUTPUT_DIR / "bm25_triplets.jsonl"
CSV_OUTPUT_PATH = OUTPUT_DIR / "bm25_triplets.tsv"

# Ensure the outputs directory exists before any later cell writes to it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


In [9]:
# Load QRELs (ground-truth relevant documents) into {qid: [positive doc_id, ...]}.
# Every non-blank line must contain exactly four whitespace-separated integers:
# qid, an ignored column, doc_id, label. Only rows with label == 1 are kept.
qid2positives = {}
with open(QRELS_PATH, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        fields = line.split()
        if not fields:
            continue  # Tolerate blank lines instead of failing on the unpack below
        if len(fields) != 4:
            # Same exception type as the bare unpack would raise, but it now
            # tells the reader where the malformed line is.
            raise ValueError(
                f"{QRELS_PATH}:{line_no}: expected 4 fields, got {len(fields)}"
            )
        qid, _, doc_id, label = map(int, fields)
        if label == 1:
            qid2positives.setdefault(qid, []).append(doc_id)
print(f"Loaded {len(qid2positives)} queries with positive documents.")

Loaded 39871 queries with positive documents.


In [10]:
# Read the BM25 retrieval results into {qid: [(doc_id, rank), ...]}.
# Only the first three tab-separated columns of a row are used, each parsed as
# an integer; rows with fewer than three columns are ignored.
bm25_results = {}
with open(BM25_RANKINGS_PATH, "r", encoding="utf-8") as f:
    for row in csv.reader(f, delimiter="\t"):
        if len(row) >= 3:
            qid, doc_id, rank = (int(value) for value in row[:3])
            if qid not in bm25_results:
                bm25_results[qid] = []
            # Keep the rank next to the doc id so later cells can filter on it
            bm25_results[qid].append((doc_id, rank))

print(f"Loaded {len(bm25_results)} queries with BM25 ranking results.")

Loaded 39729 queries with BM25 ranking results.


In [11]:
# Show the first five query IDs from each mapping so the two ID spaces can be
# compared by eye before they are joined.
qrel_preview = list(qid2positives)[:5]
bm25_preview = list(bm25_results)[:5]
print("QREL query IDs:", qrel_preview)
print("BM25 query IDs:", bm25_preview)

QREL query IDs: [0, 1, 2, 3, 4]
BM25 query IDs: [0, 1, 2, 3, 4]


In [12]:
import random

# Generate triplets (Query, Positive, Negative): NUM_NEGATIVES triplets per query.
# For each query that has BM25 results, negatives are sampled at random from the
# BM25 candidates whose rank is greater than SKIP_TOP_RANKS, and each sampled
# negative is paired with the query's first judged positive.
No_BM25_results = 0
No_valid_negative_samples = 0
triplets = []
NUM_NEGATIVES = 5  # Number of negative samples per query
# Candidates with rank <= SKIP_TOP_RANKS are never used as negatives
# (presumably to avoid sampling unlabeled relevant documents; TODO confirm).
SKIP_TOP_RANKS = 10

# Set seed for reproducibility
random.seed(42)

for qid, pos_docs in qid2positives.items():  # Each query has at least one positive document
    if qid not in bm25_results:
        No_BM25_results += 1
        continue  # Skip if BM25 retrieved nothing

    # Only the first judged positive is used as the triplet's positive ...
    pos_doc_id = pos_docs[0]
    # ... but every judged positive is excluded from the negative pool, so a
    # relevant document can never be emitted as a negative for its own query.
    judged_positives = set(pos_docs)

    # Keep BM25 candidates below the skipped top ranks that are not judged positives
    neg_candidates = [
        doc_id
        for doc_id, rank in bm25_results[qid]
        if rank > SKIP_TOP_RANKS and doc_id not in judged_positives
    ]

    if len(neg_candidates) < NUM_NEGATIVES:
        print(f"No enough valid negative samples for QID {qid}. Skipping.")
        No_valid_negative_samples += 1
        continue

    # Select NUM_NEGATIVES negative documents randomly (without replacement)
    selected_negatives = random.sample(neg_candidates, NUM_NEGATIVES)

    # Create one triplet per sampled negative
    for neg_doc_id in selected_negatives:
        triplets.append([qid, pos_doc_id, neg_doc_id])

print("No BM25 results:", No_BM25_results)
print("No valid negative samples:", No_valid_negative_samples)
print(f"Total triplets generated: {len(triplets)}")


No BM25 results: 142
No valid negative samples: 0


In [13]:
# Save JSONL format: each triplet is written as one JSON value per line.
with open(JSONL_OUTPUT_PATH, "w", encoding="utf-8") as f:
    for triplet in triplets:
        f.write(json.dumps(triplet) + "\n")

# Save tab-separated format. The file is opened with newline="" as the csv
# module requires: the writer emits its own "\r\n" row terminator, and without
# newline="" platforms that translate "\n" on write would produce "\r\r\n".
with open(CSV_OUTPUT_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows(triplets)

print(f"Saved {len(triplets)} triplets:")
print(f"JSONL: {JSONL_OUTPUT_PATH}")
print(f"CSV: {CSV_OUTPUT_PATH}")

Saved 39729 triplets:
JSONL: outputs/bm25_triplets.jsonl
CSV: outputs/bm25_triplets.tsv
