<h1>Compute RBO</h1>

In [1]:
# Import the needed libraries
import os
from operator import itemgetter
from rbo import rbo

<h3>Input Parameters</h3>

In [2]:
# Input parameters
# Name of the original run: either "WCrobust04" or "WCrobust0405"
original_run_name = "WCrobust04"

# Persistence parameter of RBO
p = 0.8

# Folder holding the replicated runs of the selected original run
# (the sub-folder is the lower-cased run name)
replicability_runs_folder = os.path.join(
    "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/runs/replicability_rename",
    original_run_name.lower(),
)

# File containing the original run
original_run_file = os.path.join(
    "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/runs/original",
    original_run_name + "_replicability.txt",
)

# Folder where the csv files with the RBO scores will be saved
output_path = "/Users/tjz514/Documents/uni/2020/sigir2020_reproducibility/experiments/matlab/results/measures"


<h3>Util Functions</h3>
<ul>
<li><code>import_run</code>: function to import the runs from a text file;</li>
<li><code>print_results</code>: function to write the RBO scores to a csv file</li>
</ul>

In [3]:
def import_run(run_file):
    """Import a run file as a dictionary of per-topic rankings.

    Every non-blank line must hold exactly six whitespace-separated fields;
    the 1st, 3rd and 5th are read as topic_id, doc_id and score, the others
    are ignored. Lines are grouped by topic_id wherever they appear in the
    file, so a topic whose lines are not contiguous is still imported as a
    single ranking.

    The rankings adopt the same ordering as trec_eval, i.e. descending score
    with ties broken by descending lexicographical order of doc_id. The score
    is represented as a float, and only the doc_ids are kept in the result.

    Args:
        run_file: path to the file to be imported.

    Returns:
        A dictionary run[topic_id] = [doc_id, ...] with string keys, in order
        of first appearance of each topic. An empty file (or one with only
        blank lines) yields an empty dictionary.

    Raises:
        ValueError: if a non-blank line does not have six fields, or if a
            score cannot be parsed as a float.
    """
    # topic_id -> list of (doc_id, score) tuples, in file order
    scored_docs = {}

    # The with block closes the file, no explicit close() is needed
    with open(run_file) as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()

            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not fields:
                continue

            if len(fields) != 6:
                raise ValueError(
                    "%s, line %d: expected 6 fields, found %d"
                    % (run_file, line_number, len(fields))
                )

            topic_id, _, doc_id, _, score, _ = fields
            scored_docs.setdefault(topic_id, []).append((doc_id, float(score)))

    run = {}
    for topic_id, ranking in scored_docs.items():
        # Sort as in trec_eval: score descending, then doc_id descending
        ranking.sort(key=itemgetter(1, 0), reverse=True)
        # We need just the doc_id
        run[topic_id] = [doc_id for (doc_id, _) in ranking]

    return run

In [4]:
def print_results(rbo_results, topic_ids, runsfile_names, output_path, original_run_name, measure_name, p):
    """Write the RBO scores to a csv file, one line per topic.

    The file is created (or overwritten) in output_path and is named
    rpl_<original_run_name lower-cased>_<measure_name>_p<p without the dot>.csv.
    Its first line is the header "topic_id,<run ids>"; every following line
    holds a topic id and the score of each run on that topic, in the order of
    runsfile_names. A (run_id, topic_id) pair missing from rbo_results is
    written as the text "None".

    Args:
        rbo_results: dictionary (run_id, topic_id) -> RBO score
        topic_ids: topic ids (strings), in the order the rows are written
        runsfile_names: run ids (strings), in the order the columns are written
        output_path: folder which will contain the result file
        original_run_name: original run name, used to define the file name
        measure_name: RBO with cut-off (e.g. "rbo_5"), used to define the file name
        p: RBO persistence parameter, used to define the file name
    """
    # Build the file name piece by piece
    persistence_tag = str(p).replace(".", "")
    csv_name = "rpl_" + original_run_name.lower() + "_" + measure_name + "_p" + persistence_tag + ".csv"
    csv_path = os.path.join(output_path, csv_name)

    # The with block takes care of closing the file
    with open(csv_path, "w") as csv_file:
        # Header: topic_id followed by the run ids
        csv_file.write("topic_id," + ",".join(runsfile_names) + "\n")

        # One line per topic, with the score of every run
        for topic_id in topic_ids:
            scores = [str(rbo_results.get((run_id, topic_id))) for run_id in runsfile_names]
            csv_file.write(topic_id + "," + ",".join(scores) + "\n")

<h3>Import the Original Run</h3>

In [5]:
# Import the original run as a dictionary: topic_id -> ranked list of doc_id
original_run = import_run(original_run_file)

# Get the topic ids of the original run
# (a view on the dictionary keys, not sorted at this point)
topic_ids = original_run.keys()

In [6]:
# Get the paths of the replicability runs
# run_files is a list of tuples: (run_path, run_id)
run_files = []
# Walk through the replicability folder and all of its subfolders
for path, subdirs, files in os.walk(replicability_runs_folder):
    # For each file in the current folder
    for name in files:
        # Only files with the .txt extension are treated as run files
        if name.endswith(".txt"):
            # The run id is the file name without the trailing extension.
            # Slice the suffix off instead of using str.replace, which would
            # also remove any ".txt" occurring earlier in the file name.
            run_id = name[:-len(".txt")]
            # os.path.join(path, name) returns the path to the run file
            run_files.append((os.path.join(path, name), run_id))

In [7]:
# One result dictionary per RBO cut-off: 5, 10, 20, 50, 100 and 1000
# key: (run_id, topic_id)
# value: RBO score of the run on that topic, float
results_rbo_5 = dict()
results_rbo_10 = dict()
results_rbo_20 = dict()
results_rbo_50 = dict()
results_rbo_100 = dict()
results_rbo_1000 = dict()

In [8]:
# Import the runs in the replicability folder
# For each run compute RBO between the original and the replicated run

# Cut-offs that are computed, each mapped to the dictionary collecting its scores.
# To enable another cut-off (10, 100, 1000) add its entry here and the
# matching print_results call where the csv files are written.
results_by_cutoff = {
    5: results_rbo_5,
    20: results_rbo_20,
    50: results_rbo_50,
}

for (run_path, run_name) in run_files:

    # Import the run
    replicated_run = import_run(run_path)

    # For each topic of the original run
    for topic_id in topic_ids:

        # Get the corresponding original and replicated ranking
        original_ranking = original_run[topic_id]
        replicated_ranking = replicated_run[topic_id]

        # Compute RBO for each cut-off
        for cutoff, results in results_by_cutoff.items():
            # ranking[:cutoff] keeps exactly the first `cutoff` documents.
            # The previous slices ([:4], [:19], [:49]) kept one document fewer
            # than the cut-off the scores are reported under.
            rbo_at_cutoff = rbo.rbo(original_ranking[:cutoff], replicated_ranking[:cutoff], p)

            # Store the "ext" entry of the RBO result in the dictionary
            results[(run_name, topic_id)] = rbo_at_cutoff["ext"]

    # Release the imported run before loading the next one
    replicated_run.clear()

<h3>Print the results in a CSV file</h3>
<p>This will generate the tables for the SIGIR paper.
The csv has one header line followed by one line per topic:<br>
<code>topic_id,score_run1,score_run2,...,score_runN</code><br>
The scores use the dot as decimal separator.<br>
For each measure, we generate a different file.
</p>

In [9]:
# Write the RBO results to CSV files, one file per cut-off.
# The files are saved in output_path as set in the input parameters cell;
# it is deliberately not redefined here, so that changing the parameter
# there is enough to redirect the output.

# Sorted list of topics
topic_ids = sorted(topic_ids)

# Sort the runs alphabetically
runsfile_names = sorted(run_id for (_, run_id) in run_files)

# Write the results of every computed cut-off.
# To enable another cut-off (10, 100, 1000) add its (name, dictionary) pair
# here once the corresponding scores are computed.
for measure_name, rbo_results in (
    ("rbo_5", results_rbo_5),
    ("rbo_20", results_rbo_20),
    ("rbo_50", results_rbo_50),
):
    print_results(rbo_results, topic_ids, runsfile_names, output_path, original_run_name, measure_name, p)