In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
# Notebook-wide configuration. All paths are relative to the notebook's working directory.

# Gold standard topics (gzipped, tab-separated).
goldfile = "../../resources/20180622processedGoldStandardTopics.tsv.gz"

# Directory holding the result files and directory holding the per-run stats CSV files.
resultsdir, statsdir = "../../results_disease_2017/", "../../stats_disease_2017/"

# Score columns that are selected from the stats files.
measures = [
    "ndcg",
    "infNDCG",
    "P_10",
]

In [7]:
# Load the complete gold standard table from the tab-separated file.
allgsdf = pd.read_csv(goldfile, sep="\t")
# Slimmer working copy without the title, abstract and MeSH columns.
gsdf = allgsdf.drop(
    columns=["title", "abstract", "major_mesh", "minor_mesh"],
)

# Read all results

In [8]:
def calculateRelFoundCounts(dir):
    """
    Counts, per run and topic, how many relevant gold standard documents exist and how many of them
    appear in the results of that run.

    Every file in the directory `dir` is read as a headerless, tab-separated table with the columns
    topic, Q0, docid, rank, score and run. The relevant documents (relevance_score > 0) of the
    module-level gold standard DataFrame `gsdf` are duplicated once per run and left-joined with the
    results, so documents a run did not return are kept with NaN result columns.

    Parameters:
        dir: path of the directory containing the result files. A trailing path separator is optional.
             (The name shadows the builtin `dir`; it is kept so that existing callers keep working.)

    Returns a tuple (relmerge, countsruntopic):
        relmerge:       the left-joined data, one row per run and relevant gold standard document, with
                        trec_topic_number appended to the index.
        countsruntopic: counts grouped by run and trec_topic_number with the columns 'relgs' (number of
                        relevant gold standard documents) and 'relfound' (number of those documents
                        that have a matching result row).

    Raises:
        ValueError: if the directory contains no files.
    """
    resultcolumns = ["topic", "Q0", "docid", "rank", "score", "run"]

    # Read all result files of the directory, in file name order.
    resultfiles = sorted(os.listdir(dir))
    if not resultfiles:
        raise ValueError("No result files found in " + repr(dir))
    resultdfs = [
        pd.read_csv(os.path.join(dir, f), delimiter="\t", names=resultcolumns)
        for f in resultfiles
    ]
    # One DataFrame with all results, indexed by the run name found in the files.
    resultsbyrun = pd.concat(resultdfs).set_index("run")

    # Duplicate the relevant GS documents once per run so that each run is matched against the full
    # set. Index.unique() keeps the order of first appearance, which makes the row order of the
    # merged frame reproducible (iterating a set of strings is not).
    gsreldocs = gsdf.query("relevance_score > 0")[["trec_topic_number", "trec_doc_id"]]
    gscopies = []
    for experiment in resultsbyrun.index.unique():
        gscopy = gsreldocs.copy()
        gscopy.index = pd.Index([experiment] * len(gsreldocs), name="run")
        gscopies.append(gscopy)
    gsdfs = pd.concat(gscopies)

    # Left join: keeps every relevant GS document per run and drops all result rows that are not
    # relevant GS documents. "run" refers to the index level of both frames.
    relmerge = pd.merge(
        gsdfs,
        resultsbyrun,
        how="left",
        left_on=["run", "trec_topic_number", "trec_doc_id"],
        right_on=["run", "topic", "docid"],
    )
    relmerge = relmerge.set_index("trec_topic_number", append=True)

    # count() ignores NaN: trec_doc_id is always set (-> relgs), whereas topic is only set for
    # documents with a matching result row (-> relfound).
    countsruntopic = relmerge.groupby(["run", "trec_topic_number"]).count()
    countsruntopic = countsruntopic.drop(["Q0", "docid", "rank", "score"], axis=1)
    countsruntopic.columns = ["relgs", "relfound"]

    return relmerge, countsruntopic

In [9]:
def prepareStats(path):
    """
    Reads a single stats CSV file and returns its per-topic rows.

    The row whose Topic is 'all' is excluded, the remaining topic numbers are converted to ints and
    the rows are sorted by topic number. The Topic column then becomes the index.

    Parameters:
        path: path of the stats CSV file; it must contain a 'Topic' column.

    Returns a new DataFrame indexed by the non-'all' topics.
    """
    stats = pd.read_csv(path)
    # Compare as strings so the filter does not depend on how pandas parsed the column
    # (e.g. as numbers when the file has no 'all' row).
    pertopic = stats[stats["Topic"].astype(str) != "all"]
    # assign() builds a new frame instead of writing into the filtered one, which avoids
    # pandas' SettingWithCopyWarning.
    return (
        pertopic
        .assign(Topic=pertopic["Topic"].astype(int))
        .sort_values(by="Topic")
        .set_index("Topic")
    )

In [27]:
def getMeanStatsPerRun(statspath):
    """
    Reads a directory of stats CSV files and calculates, for each run, the mean of every score
    measure over the topics of that run.

    The run name is the file name without the 'OFFICIAL_' prefix and the '.csv' extension. Note that
    the result is really just the mean over the per-topic values: this should equal the 'all' row of
    each file, but the actual 'all' value is not used here.

    Parameters:
        statspath: path of the directory containing the stats CSV files. A trailing path separator
                   is optional. Files not ending in '.csv' are ignored.

    Returns a DataFrame indexed by run that contains only the measures listed in the module-level
    'measures' list.

    Raises:
        ValueError: if the directory contains no '.csv' file.
    """
    statfiles = sorted(f for f in os.listdir(statspath) if f.endswith(".csv"))
    if not statfiles:
        raise ValueError("No stats CSV files found in " + repr(statspath))

    runstatsmap = {}
    for stat in statfiles:
        run = stat.replace("OFFICIAL_", "").replace(".csv", "")
        runstatsmap[run] = prepareStats(os.path.join(statspath, stat))

    # Concatenating the dict yields a MultiIndex whose first level holds the dict keys (run names).
    allstats = pd.concat(runstatsmap)
    allstats.index.names = ["run", "Topic"]

    # DataFrame.mean(level=...) no longer exists in pandas 2; grouping by the index level is the
    # replacement. sort=False keeps the runs in their order of appearance, as mean(level=...) did.
    meanstats = allstats[measures].groupby(level="run", sort=False).mean()
    return meanstats

In [11]:
def getRelFoundWithMeanRank(resultspath, statspath):
    """
    Builds a per-run overview that combines the relevant/found document counts, the mean score
    measures and the rank statistics of the found relevant documents.

    Parameters:
        resultspath: directory with the result files, passed to calculateRelFoundCounts.
        statspath:   directory with the stats CSV files, passed to getMeanStatsPerRun.

    Returns a DataFrame indexed by run and sorted by 'relfound'. It holds the summed 'relgs' and
    'relfound' counts, the columns returned by getMeanStatsPerRun, and 'meanrank' / 'stdrank',
    the mean and standard deviation of the ranks of the rows that have a rank.
    """
    gsleftmerged, counts = calculateRelFoundCounts(resultspath)
    meanstats = getMeanStatsPerRun(statspath)

    # sum/mean/std(level=...) no longer exist in pandas 2; grouping by the index level is the
    # replacement. sort=False keeps the order of appearance, as the level-based reductions did.
    counts = counts.groupby(level="run", sort=False).sum().sort_values("relfound")
    merge = pd.merge(counts, meanstats, on="run")

    # Rows without a rank are relevant documents the run did not return; leave them out.
    foundranks = gsleftmerged["rank"].dropna().groupby(level="run", sort=False)
    # Assignment aligns on the run index of `merge`.
    merge["meanrank"] = foundranks.mean()
    merge["stdrank"] = foundranks.std()
    return merge.sort_values("relfound")

In [28]:
# Display the per-run means of the configured measures for the stats directory
# (the returned DataFrame is rendered as the cell output).
getMeanStatsPerRun(statsdir)

Unnamed: 0_level_0,ndcg,infNDCG,P_10
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dishyper--dis1.0-hyp0.0-syn0.0,0.504927,0.35266,0.453333
dishyper--dis1.0-hyp0.0-syn1.0,0.504927,0.35266,0.453333
dishyper--dis1.0-hyp1.0-syn0.0,0.446457,0.281613,0.366667
dishyper--dis1.0-hyp1.0-syn1.0,0.446457,0.281613,0.366667
dissyn--dis1.0-hyp0.0-syn0.0,0.529317,0.388357,0.486667
dissyn--dis1.0-hyp0.0-syn1.0,0.53506,0.391587,0.483333
dissyn--dis1.0-hyp1.0-syn0.0,0.529317,0.388357,0.486667
dissyn--dis1.0-hyp1.0-syn1.0,0.53506,0.391587,0.483333
dissynhyper--dis1.0-hyp0.0-syn0.0,0.504927,0.35266,0.453333
dissynhyper--dis1.0-hyp0.0-syn1.0,0.51382,0.35823,0.453333


In [29]:
# NOTE(review): the wildcard import hides which names the third-party `parse` package adds to the
# notebook namespace; only `parse` is used in this cell.
from parse import *
# Example result file name used to try out the format below.
s = "DSYN_DHYP_DPT_GSYN_WR--mmm:phrase-op:OR-wr:false-sl:5.trec_results.gz"
# Named fields to extract from a result file name: run, multfields, op, wordremoval and slop.
filenameformat = "{run}--mmm:{multfields}-op:{op}-wr:{wordremoval}-sl:{slop}"
# NOTE(review): the format does not cover the file extension, so the trailing 'slop' field also
# captures ".trec_results.gz" (see the displayed result) - confirm whether that is intended.
parse(filenameformat, s)

<Result () {'run': 'DSYN_DHYP_DPT_GSYN_WR', 'multfields': 'phrase', 'op': 'OR', 'wordremoval': 'false', 'slop': '5.trec_results.gz'}>