<h1>Reformat Original Runs</h1>
<p>This script parses the original runs and:</p>
<ol>
<li>Keeps only the assessed topics;</li>
<li>Removes extra documents, cutting each run at 1000 documents per topic.</li>
</ol>

In [1]:
# Import the libraries
import os
from operator import itemgetter

In [2]:
# Input parameters

# Path to the folder which contains the original runs:
base_path = "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/runs/original"
# Path to the qrels file, from which the ids of the assessed topics are read
qrels_file = "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/qrels/qrels_common_core_2017.txt"
# Run name: the run is read from "<run_name>.txt" inside base_path and the
# name is also written as the last column of the reformatted run
run_name = "WCrobust0405"

In [3]:
# Get the ids of the assessed topics

# We take them from the qrels file instead of parsing the topic file.
# Each qrels line has four whitespace-separated fields; only the first one
# (the topic id) is needed here.
# Initialize the set which will store the topic ids
assessed_topic_ids = set()

# Stream the file line by line rather than loading it into memory as a whole
with open(qrels_file) as f:
    for line in f:
        fields = line.split()
        # Skip blank lines (e.g. a trailing empty line) instead of failing
        # on the unpacking below
        if not fields:
            continue
        # A line with a different number of fields still raises ValueError,
        # so a malformed qrels file does not go unnoticed
        topic_id, _, _, _ = fields

        # Add the topic id to the set of topic ids
        assessed_topic_ids.add(int(topic_id))

In [4]:
# Import the original run

# Path to the original run
run_file = os.path.join(base_path, (run_name + ".txt"))

# The run is stored as a list of tuples
# (topic_id, doc_id, rank, score)
run = []

# Stream the run file line by line; the with-statement closes the file on
# exit, so no explicit close() is needed afterwards
with open(run_file) as f:
    for line in f:
        fields = line.split()
        # Skip blank lines instead of failing on the unpacking below
        if not fields:
            continue
        # Each line has six whitespace-separated fields; the second and the
        # sixth one are not needed
        topic_id, _, doc_id, rank, score, _ = fields
        # topic_id and rank are converted to int so they sort numerically;
        # the score is kept as the original string (it is not parsed)
        run.append((int(topic_id), doc_id, int(rank), score))

In [5]:
# Order the run by topic id first and by rank within each topic.
# list.sort is stable, so one pass on the (topic_id, rank) pair yields the
# same ordering as sorting by rank and then sorting again by topic id.
run.sort(key=itemgetter(0, 2))

In [6]:
# Write the new run: keep only the assessed topics and 1000 documents for each topic

# Path to the new run file (written next to the original run)
new_run_file = os.path.join(base_path, (run_name + "_replicability.txt"))
# Maximum number of documents for each topic
max_rank = 1000
# Topic currently being written; None until the first assessed topic is met
current_topic_id = None
# Number of documents already seen for the current topic
current_rank = 0

# The with-statement closes the file even if formatting or writing fails;
# write-only mode is enough because the file is never read back here
with open(new_run_file, "w") as new_run_f:
    # For each item in the run list
    for (topic_id, doc_id, rank, score) in run:
        # Drop the topics which are not in the set of assessed topics
        if topic_id not in assessed_topic_ids:
            continue

        # A new topic starts: restart the per-topic document counter
        if current_topic_id != topic_id:
            current_topic_id = topic_id
            current_rank = 0

        # Write the line only while fewer than max_rank documents have been
        # seen for this topic
        if current_rank < max_rank:
            # The original rank and score are written back unchanged
            string_results = "%i\tQ0\t%s\t%i\t%s\t%s\n" % (topic_id, doc_id, rank, score, run_name)
            new_run_f.write(string_results)

        # Count every document of the topic, written or not
        current_rank = current_rank + 1