<h1>Rename Runs</h1>
<p>This script parses each run in the directory and:</p>
<ol>
<li>Changes the run name, setting it equal to the file name (without the extension);</li>
<li>Removes extra documents, cutting each run at 1000 documents per topic;</li>
<li>Sorts the run by topic id and rank.</li>
</ol>

In [1]:
# Import the libraries
import os
import shutil
from operator import itemgetter

In [2]:
# Configuration
# Root folder: the run files are searched for in its subfolders
base_path = (
    "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/runs/reproducibility_0405_rename"
)

In [3]:
# List all the run files found under base_path (subfolders included)
# List of tuples: (run_path, run_id)
run_files = []
# Extension that identifies a run file
run_extension = ".txt"
# Walk through the subfolders
for path, subdirs, files in os.walk(base_path):
    # For each file in the current folder
    for name in files:
        # Only files ending with the run extension are run files
        if name.endswith(run_extension):
            # Strip only the trailing extension: str.replace would also remove
            # any other ".txt" occurring inside the file name
            run_id = name[:-len(run_extension)]
            # os.path.join(path, name) is the full path to the run file
            run_files.append((os.path.join(path, name), run_id))

In [4]:
# Read the run files and import the runs in a collection
# key = run_name
# value = list of tuples (topic_id, doc_id, rank, score)
run_set = {}
for (run_path, run_name) in run_files:

    # The collection is keyed by the run name alone: a second file with the
    # same name would silently replace the first one here, so fail loudly
    # before any data is lost
    if run_name in run_set:
        raise ValueError("Duplicate run name %r (second file: %s)" % (run_name, run_path))

    # Initialize the run as a list of tuples
    # (topic_id, doc_id, rank, score)
    run = []

    # The with statement closes the file, no explicit close() is needed
    with open(run_path) as f:
        # Parse the file line by line
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            # Skip blank lines (e.g. a trailing empty line) instead of failing
            if not fields:
                continue
            try:
                # Six whitespace-separated fields are expected; the second and
                # the last one are discarded
                topic_id, _, doc_id, rank, score, _ = fields
                # topic_id and rank are stored as integers, score is kept as text
                run.append((int(topic_id), doc_id, int(rank), score))
            except ValueError as err:
                # Same exception type as before, but it now says where the
                # malformed line is
                raise ValueError("%s, line %i: malformed run line %r" % (run_path, line_number, line)) from err

    # Add the list of tuples to the collection
    run_set[run_name] = run

In [5]:
# Write the new run files: each run file is overwritten in place with its
# sorted and truncated content, tagged with the run name

# Maximum number of documents for each topic
max_rank = 1000

for (run_path, run_name) in run_files:

    # Get the corresponding list of tuples from the collection
    run = run_set[run_name]
    # Sort the run by topic id, then by rank (both ascending); the sort is
    # stable, so ties keep the order they had in the input file
    run.sort(key=itemgetter(0, 2))

    # Topic currently being written (None until the first line is seen)
    current_topic_id = None
    # Number of documents already seen for the current topic
    current_rank = 0

    # The with statement closes the file even if a write fails
    with open(run_path, "w") as new_run_f:
        # For each item in the run list
        for (topic_id, doc_id, rank, score) in run:
            # Check if the topic_id has changed
            if current_topic_id != topic_id:
                # Update the current topic_id
                current_topic_id = topic_id
                # Restart the per-topic document counter
                current_rank = 0

            # Keep only the first max_rank documents of each topic
            if current_rank < max_rank:
                # The rank read from the input is written as it is;
                # the run name is written as the last field
                string_results = "%i\tQ0\t%s\t%i\t%s\t%s\n" % (topic_id, doc_id, rank, score, run_name)
                new_run_f.write(string_results)

            # Update the per-topic document counter
            current_rank = current_rank + 1