<h1>Generate Simulated Single Topics</h1>
<p>Script to generate simulated topics and qrels with different distributions of relevant documents.</p>

In [1]:
import shutil
from uuid import uuid4

from numpy import random

<h3>Parameters</h3>

In [2]:
# Length of the run: number of rank positions (documents) written for each simulated run
RUN_LENGTH = 1000
# Number of relevant documents: the first NUM_REL generated documents are marked
# relevant in the qrels, and the same number of extra relevant documents is
# appended to the qrels for the "recall base < 1" case
NUM_REL = 100

<h3>Topics with Recall Base 1</h3>
<p>Generate topics which retrieve all the relevant documents.</p>

In [3]:
# Build the ideal run, the qrels and the in-memory rankings used by the later cells.
#
# Runs produced from the documents created here:
#   sim_topic_ideal_rb1_100     - all relevant documents are at the top of the ranking
#   sim_topic_reversed_rb1_100  - all relevant documents are at the end of the ranking
#   sim_topic_realistic_rb1_100 - relevant documents are spread through the ranking
#                                 with decreasing probability

# Values shared by every line of the ideal run
topic = 1                        # single simulated topic
q0 = 'Q0'                        # literal column required by the TREC run format
tag = 'sim_topic_ideal_rb1_100'  # run name, same as the output file name

# One (topic, docid, rank, score) tuple per document, for each ordering
ideal_ranking = []
reversed_ranking = []

# Document ids split by relevance
rel_document = []
nrel_document = []

with open('sim_topic_ideal_rb1_100.txt', 'w') as f_ideal, open('sim_qrel_100.txt', 'w') as f_qrels:

    for r in range(RUN_LENGTH):
        # Fresh random identifier for the document at this position
        docid = uuid4().hex

        # Ideal ordering: rank grows with r while the fake score decreases linearly
        ideal_entry = (topic, docid, r, (RUN_LENGTH - r) / RUN_LENGTH)
        # Reversed ordering: same document with mirrored rank and score
        reversed_entry = (topic, docid, RUN_LENGTH - r, r / RUN_LENGTH)
        ideal_ranking.append(ideal_entry)
        reversed_ranking.append(reversed_entry)

        # Run line: topic Q0 docid rank score tag
        run_fields = (topic, q0, docid, ideal_entry[2], ideal_entry[3], tag)
        f_ideal.write(' '.join(map(str, run_fields)) + ' \n')

        # The first NUM_REL documents generated are the relevant ones
        if r < NUM_REL:
            rel = 1
            rel_document.append(docid)
        else:
            rel = 0
            nrel_document.append(docid)

        # Qrel line: topic 0 docid relevance
        f_qrels.write(' '.join(map(str, (topic, '0', docid, rel))) + ' \n')

In [4]:
# Write the reversed run.
# Each entry is (topic, docid, rank, score); order the entries by rank (index 2).
sorted_reversed_ranking = list(reversed_ranking)
sorted_reversed_ranking.sort(key=lambda entry: entry[2])

# Values shared by every line
q0 = 'Q0'                           # literal column required by the TREC run format
tag = 'sim_topic_reversed_rb1_100'  # run name, same as the output file name

with open('sim_topic_reversed_rb1_100.txt', 'w') as f_reversed:

    # One line per entry, in rank order: topic Q0 docid rank score tag
    for topic, docid, rank, score in sorted_reversed_ranking:
        f_reversed.write(' '.join(map(str, (topic, q0, docid, rank, score, tag))) + ' \n')

In [5]:
# Draw the rank positions of the relevant documents for the realistic ranking.
# NUM_REL distinct positions are sampled from an exponential distribution, so
# small positions (top of the ranking) are more likely than large ones.
# The positions are 1-based: every accepted value lies in 1 .. RUN_LENGTH.

# Beta parameter = 1/lambda (mean of the exponential distribution)
beta = 75

# Fix the seed so the sampled positions are the same on every run
random.seed(42)

# There are only RUN_LENGTH distinct positions; asking for more would make the
# sampling loop below run forever
if NUM_REL > RUN_LENGTH:
    raise ValueError('NUM_REL (%d) cannot exceed RUN_LENGTH (%d)' % (NUM_REL, RUN_LENGTH))

# A set keeps the sampled positions distinct
relevant_rank_positions = set()

while len(relevant_rank_positions) < NUM_REL:

    # Generate a random rank position with exponential distribution
    rank_position = int(round(random.exponential(beta)))

    # Discard draws that fall outside the run (0 or beyond RUN_LENGTH)
    if 0 < rank_position <= RUN_LENGTH:
        relevant_rank_positions.add(rank_position)


In [6]:
# Write the realistic run: relevant documents are placed at the sampled rank
# positions, non-relevant documents fill every other position.

# Values shared by every line
topic = 1                            # single simulated topic
q0 = 'Q0'                            # literal column required by the TREC run format
tag = 'sim_topic_realistic_rb1_100'  # run name, same as the output file name

# Walk the document lists with iterators instead of pop()-ing them, so the
# lists are left untouched and this cell can be re-run. reversed() yields the
# documents in the same order as repeated pop() calls would.
rel_docids = reversed(rel_document)
nrel_docids = reversed(nrel_document)

with open('sim_topic_realistic_rb1_100.txt', 'w') as f_realistic:

    # For each rank position
    for r in range(RUN_LENGTH):
        # Set the rank position equal to r
        rank = r
        # Fake score computed as a function of the rank position
        score = (RUN_LENGTH - r) / RUN_LENGTH

        # relevant_rank_positions holds 1-based positions (1 .. RUN_LENGTH)
        # while r is 0-based, hence the r + 1
        if (r + 1) in relevant_rank_positions:
            # The document should be relevant
            docid = next(rel_docids)
        else:
            # The document should be not relevant
            docid = next(nrel_docids)

        # Write the realistic run line: topic Q0 docid rank score tag
        f_realistic.write(' '.join([str(topic), q0, docid, str(rank), str(score), tag, '\n']))

<h3>Topics with Recall Base &#60; 1</h3>
<p>We simply need to create extra relevant documents that are added to the qrels but are not retrieved by the runs.</p>

In [7]:
# Copy the qrel file
# shutil is used instead of the `cp` shell command so the cell also runs on
# systems where `cp` is not available
shutil.copyfile('sim_qrel_100.txt', 'sim_qrel_200.txt')

In [8]:
# Append NUM_REL extra relevant documents, each with a freshly generated id,
# to the copied qrels file.
topic = 1  # single simulated topic
rel = 1    # every extra document is relevant

with open('sim_qrel_200.txt', 'a') as f_qrels:

    for _ in range(NUM_REL):
        docid = uuid4().hex
        # Qrel line: topic 0 docid relevance
        f_qrels.write(' '.join(map(str, (topic, '0', docid, rel))) + ' \n')