# Synthesize data from TREC-DL

Process:
1. Get topics and qrels in Pyserini format
2. Retrieve top 100 documents using BM25 to generate rank results 
3. Evaluate nDCG@10 performance
4. Convert to Huggingface format and upload to Huggingface Hub

### step 123

In [32]:
import sys
sys.path.append('/home/mila/l/le.zhang/scratch/DeepRerank')
from pyserini.search import get_topics, get_qrels
# from pyserini.search.lucene import LuceneSearcher
from run_evaluation import THE_TOPICS, THE_INDEX
from trec_eval import EvalFunction
data = 'dl21'

# searcher = LuceneSearcher.from_prebuilt_index(THE_INDEX[data])
DLV2 = ['dl20', 'dl21', 'dl22', 'dl23']
topics = get_topics(THE_TOPICS[data] if data not in dl else data)
qrels = get_qrels(THE_TOPICS[data])




In [27]:
# 将dl21, dl22, dl23的qels txt 拼接在一起
import json
qrels = {}
for data in ['dl21', 'dl22', 'dl23']:
    qrels_file = get_qrels_file(THE_TOPICS[data])
    with open(qrels_file, 'r') as f:
        for line in f:
            qid, _, docid, rel = line.strip().split()
            if qid not in qrels:
                qrels[qid] = {}
            qrels[qid][docid] = int(rel)
           
# Save the combined qrels to a file
output_file = '/home/mila/l/le.zhang/scratch/DeepRerank/data/combined_qrels.txt'
with open(output_file, 'w') as f:
    for qid in sorted(qrels.keys()):
        for docid, rel in qrels[qid].items():
            f.write(f"{qid} 0 {docid} {rel}\n")
           
# sa


In [14]:
for data in ['dl21', 'dl22', 'dl23']:
    topics = get_topics(THE_TOPICS[data] if data not in dl else data)
    print(f"\nEvaluating {data}:, len(topics): {len(topics)}")
    EvalFunction.main(THE_TOPICS[data], f'/home/mila/l/le.zhang/scratch/DeepRerank/data/20{data[2:]}_passage_top100.txt')


Evaluating dl21:, len(topics): 477
{'NDCG@1': 0.45283, 'NDCG@5': 0.44937, 'NDCG@10': 0.44583, 'MAP@1': 0.00984, 'MAP@5': 0.03842, 'MAP@10': 0.06709, 'Recall@1': 0.00984, 'Recall@5': 0.04129, 'Recall@10': 0.07564}

Evaluating dl22:, len(topics): 500
{'NDCG@1': 0.29167, 'NDCG@5': 0.28879, 'NDCG@10': 0.26917, 'MAP@1': 0.00352, 'MAP@5': 0.01282, 'MAP@10': 0.02194, 'Recall@1': 0.00352, 'Recall@5': 0.01596, 'Recall@10': 0.0287}

Evaluating dl23:, len(topics): 700
{'NDCG@1': 0.32724, 'NDCG@5': 0.271, 'NDCG@10': 0.26242, 'MAP@1': 0.01349, 'MAP@5': 0.03043, 'MAP@10': 0.04414, 'Recall@1': 0.01349, 'Recall@5': 0.03519, 'Recall@10': 0.05851}


### step 4

In [31]:
# 读取3个top100文件，拼接在一起
combined_rank_results = []
for data in ['dl21', 'dl22', 'dl23']:
    with open(f'/home/mila/l/le.zhang/scratch/DeepRerank/data/{data}_bm25_rank_results.json', 'r') as f:
        rank_results = json.load(f)
        print(f"len(rank_results): {len(rank_results)}")
        combined_rank_results.extend(rank_results)
        


len(rank_results): 53
len(rank_results): 76
len(rank_results): 82


In [43]:
import copy
import random
replicate_rank_results = []
replicate_times = 50
for item in combined_rank_results:
    for _ in range(replicate_times):
        new_item = copy.deepcopy(item)
        # Randomly select 20 hits from original hits list
        new_item['hits'] = random.sample(new_item['hits'], min(20, len(new_item['hits'])))
        replicate_rank_results.append(new_item)


In [44]:
len(replicate_rank_results)

10550

In [45]:
from datasets import Dataset
import pandas as pd
from huggingface_hub import login
df = pd.DataFrame(replicate_rank_results)
dataset = Dataset.from_pandas(df)

# Print dataset info
print(f"Dataset size: {len(dataset)}")
print(dataset)

# Upload the dataset to Huggingface Hub
dataset.push_to_hub(
    "le723z/TREC-DL-TOP100",  # Replace with your desired repository name
    private=False,  # Set to True if you want a private dataset
)

Dataset size: 10550
Dataset({
    features: ['query', 'hits'],
    num_rows: 10550
})


Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 191.71ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.74it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/le723z/TREC-DL-TOP100/commit/30973904f974bf04f5f2e3702eb536973a0c38cc', commit_message='Upload dataset', commit_description='', oid='30973904f974bf04f5f2e3702eb536973a0c38cc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/le723z/TREC-DL-TOP100', endpoint='https://huggingface.co', repo_type='dataset', repo_id='le723z/TREC-DL-TOP100'), pr_revision=None, pr_num=None)

In [8]:
"""
Preprocess dataset for countdown task - given a target number and N numbers, generate equations to reach target
"""

import re
import os
from datasets import Dataset, load_dataset
from random import randint, seed, choice
from typing import List, Tuple
from tqdm import tqdm
from hdfs_io import copy, makedirs
import argparse


def make_prefix(dp, template_type):
    query = dp['query']
    hits = dp['hits']
    num_hits = len(hits)
    # NOTE: also need to change reward_score/countdown.py
    if template_type == 'base':
        raise NotImplementedError
    elif template_type == 'qwen-instruct':
        """This works for Qwen Instruct Models"""
        # prefix = (
        #     f"<|im_start|>system\n"
        #     f"You are DeepRerank, an intelligent assistant that can rank passages based on their relevancy to the search query. "
        #     f"You first thinks about the reasoning process in the mind and then provides the user with the answer."
        #     f"<|im_end|>\n"
        #     f"<|im_start|>user\n"
        #     f"I will provide you with passages, each indicated by number identifier []. Rank the passages based on their relevance to the search query.\n"
        #     f"Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags. \n"
        #     f"Sarch Query: {query}. \nRank the {num_hits} passages above based on their relevance to the search query."
        #     f"The passages should be listed in descending order using identifiers. The most relevant passages should be listed first. The output format should be <answer> [] > [] </answer>, e.g., <answer> [1] > [2] </answer>."
        #     f"<|im_end|>\n"
        #     f"<|im_start|>assistant\n"
        #     f"Okay, please provide the passages.\n"
        #     f"<|im_end|>\n"
        # )

        prefix = (
            f"I will provide you with passages, each indicated by number identifier []. Rank the passages based on their relevance to the search query."
            f"Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags. \n"
            f"Search Query: {query}. \nRank the {num_hits} passages above based on their relevance to the search query."
            f"The passages should be listed in descending order using identifiers. The most relevant passages should be listed first. The output format should be <answer> [] > [] </answer>, e.g., <answer> [1] > [2] </answer>."
        )
      
        prefix += f"<|im_start|>assistant\n  Let me rank the passages. <think> "
        return prefix



raw_dataset = load_dataset('le723z/TREC-DL-TOP100', split='train')

def make_map_fn(split):
    def process_fn(example, idx):
        question = make_prefix(example, template_type="qwen-instruct")
        messages = [
                {"role": "system", "content": "You are DeepRerank, an intelligent assistant that can rank passages based on their relevancy to the search query. You first thinks about the reasoning process in the mind and then provides the user with the answer."},
                {"role": "user","content": question},
                {"role": "assistant", "content": "Okay, please provide the passages."}
            ]
        rank = 0
        for hit in example['hits']:
            content = hit['content']
            content = content.replace('Title: Content: ', '')
            content = content.strip()
            content = ' '.join(content.split()[:int(300)]) # max token fea each passage is 300
            messages.append({"role": "user", "content": f"[{rank}] {content}"})
            messages.append({"role": "assistant", "content": f"Received passage [{rank}]."})
            rank += 1
        messages.append({"role": "user", "content": f"Please analyze and rank the passages based on their relevance to the search query. First explain your reasoning within <think> </think> tags, then provide the passage IDs in descending order of relevance within <answer> </answer> tags."})
        data = {
            "data_source": "TREC-DL-TOP100",
            "prompt": messages,
            "ability": "rerank",
            "reward_model": {
                "style": "rule",
                "ground_truth": ""
            },
            "extra_info": {
                'split': split,
                'index': idx,
            }
        }
        return data
    return process_fn

train_dataset = raw_dataset.map(function=make_map_fn('train'), with_indices=True)

local_dir = '/home/mila/l/le.zhang/scratch/DeepRerank/data'
hdfs_dir = None

train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))

if hdfs_dir is not None:
    makedirs(hdfs_dir)
    copy(src=local_dir, dst=hdfs_dir) 

Map: 100%|██████████| 10550/10550 [00:03<00:00, 2644.50 examples/s]
Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 28.79ba/s]


In [9]:
train_dataset[0]['prompt']

[{'content': 'You are DeepRerank, an intelligent assistant that can rank passages based on their relevancy to the search query. You first thinks about the reasoning process in the mind and then provides the user with the answer.',
  'role': 'system'},
 {'content': 'I will provide you with passages, each indicated by number identifier []. Rank the passages based on their relevance to the search query.Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags. \nSearch Query: what does prenatal care include. \nRank the 20 passages above based on their relevance to the search query.The passages should be listed in descending order using identifiers. The most relevant passages should be listed first. The output format should be <answer> [] > [] </answer>, e.g., <answer> [1] > [2] </answer>.<|im_start|>assistant\n  Let me rank the passages. <think> ',
  'role': 'user'},
 {'content': 'Okay, please provide the passages.', 'role': 'assistant'},
 {'content':

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# 统计所有样本的token长度
from tqdm import tqdm
lengths = []
for i in tqdm(range(len(train_dataset))):
    text = tokenizer.apply_chat_template(
        train_dataset[i]['prompt'],
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt")
    lengths.append(len(model_inputs['input_ids'][0]))
    
print(f"Average length: {sum(lengths)/len(lengths):.2f}")
print(f"Max length: {max(lengths)}")
print(f"Min length: {min(lengths)}")

100%|██████████| 10550/10550 [00:48<00:00, 219.10it/s]

Average length: 2018.18
Max length: 8837
Min length: 1598





In [7]:
print(text)

<|im_start|>system
You are DeepRerank, an intelligent assistant that can rank passages based on their relevancy to the search query. You first thinks about the reasoning process in the mind and then provides the user with the answer.<|im_end|>
<|im_start|>user
I will provide you with passages, each indicated by number identifier []. Rank the passages based on their relevance to the search query.Show your work in <think> </think> tags. And return the final answer in <answer> </answer> tags. 
Search Query: collins the good to great. 
Rank the 20 passages above based on their relevance to the search query.The passages should be listed in descending order using identifiers. The most relevant passages should be listed first. The output format should be <answer> [] > [] </answer>, e.g., <answer> [1] > [2] </answer>.<|im_start|>assistant
  Let me rank the passages. <think> <|im_end|>
<|im_start|>assistant
Okay, please provide the passages.<|im_end|>
<|im_start|>user
[0] See: http://www.chiefe