In [207]:
import pandas as pd
import numpy as np
import os

In [None]:
goldfile   = "../../resources/20180622processedGoldStandardTopics.tsv.gz"
resultsdir = "../../results/"
statsdir   = "../../stats_pmclass/"
measures   = ["ndcg","infNDCG", "P_10"]

In [203]:
allgsdf = pd.read_csv(goldfile, delimiter="\t")
gsdf = allgsdf.drop(["title", "abstract", "major_mesh", "minor_mesh"], axis=1)

# Read all results

In [173]:
def calculateRelFoundCounts(dir):
    # Read all results in the given result directory
    resultfiles = sorted(os.listdir(dir))
    # Read the result files as DataFrames into a map
    resultdfmap = []
    for f in resultfiles:
        resultdfmap.append(pd.read_csv(dir+f, delimiter="\t", names=["topic", "Q0", "docid", "rank", "score", "run"]))
    # Create a DataFrame multiindexed with the file name (because those are the keys of the DF maps)
    resultmultidxdf = pd.concat(resultdfmap)
    resultmultidxdf.set_index(["run"], inplace=True)
    
    # Create a duplication of the relevant GS document to match the results
    gsreldocs = gsdf.query("relevance_score > 0")[["trec_topic_number", "trec_doc_id"]]
    l = []
    for experiment in set(resultmultidxdf.index):
        idx = pd.Index([experiment]*len(gsreldocs), name="run")
        gscopy = gsreldocs.copy()
        gscopy.index = idx
        l.append(gscopy)
    gsdfs = pd.concat(l)
    
    # Merge the duplicated GS with the results
    # With a `left` join, thus eliminating all irrelevant documents.
    relmerge = pd.merge(gsdfs, resultmultidxdf, how="left", left_on=["run", "trec_topic_number", "trec_doc_id"], right_on=["run", "topic", "docid"])
    relmerge.set_index("trec_topic_number", append="True", inplace=True)
    
    # Count the number of found documents per run and topic    
    countsruntopic = relmerge.groupby(["run", "trec_topic_number"]).count()
    countsruntopic = countsruntopic.drop(["Q0", "docid", "rank", "score"],axis=1)
    countsruntopic.columns = ["relgs", "relfound"]
    
    # return the left-merged data and the counts
    return relmerge,countsruntopic

In [174]:
def prepareStats(path):
    """
    Reads a single stats CSV file, excludes the 'all' row and converts the topic numbers to ints.
    Then sets the Topic columns as the new index.
    Returns a DataFrame indexed by the non-'all' topics.
    """
    df = pd.read_csv(path).query("Topic != 'all'")
    df["Topic"] = df["Topic"].astype(int)
    df.sort_values(by="Topic", inplace=True)
    df = df.set_index("Topic")
    return df

In [194]:
def getMeanStatsPerRun(statspath):
    """
    Reads a directory of stat CSV files. Concatenates all the DataFrames and calculates the means for all
    score measurements of the topics per run, effectively returning the 'all' row for each run. Note,
    however, that really just the mean over the measures is given which should be the 'all' value but the actual
    'all' value is not used here.
    Returns only those measures defined in the 'measures' list at the beginning of this cell.
    """
    statfiles = sorted(list(filter(lambda f: f.endswith(".csv") in f, os.listdir(statspath))))
    runstatsmap = {}
    for stat in statfiles:
        df = prepareStats(statspath+stat)
        run = stat.replace("OFFICIAL_", "").replace(".csv", "")
        runstatsmap[run] = df
    allstats = pd.concat(runstatsmap)
    allstats.index.names = ["run", "Topic"]
    allstats = allstats[measures]
    meanstats = allstats.mean(level="run")
    return meanstats

In [195]:
def getRelFoundWithMeanRank(resultspath, statspath):
    gsleftmerged,counts = calculateRelFoundCounts(resultspath)
    meanstats = getMeanStatsPerRun(statspath)
    counts    = counts.sum(level="run").sort_values("relfound")
    merge     = pd.merge(counts, meanstats, on="run")
    meanranks = gsleftmerged["rank"].dropna().mean(level="run")
    stdranks  = gsleftmerged["rank"].dropna().std(level="run")
    merge["meanrank"] = meanranks
    merge["stdrank"] = stdranks
    return merge.sort_values("relfound")

In [243]:
df = pd.DataFrame(np.random.rand(10,5))
df.columns = ["ndcg", "infNDCG", "Rprec", "P_5", "P_10"]

In [249]:
mi = pd.MultiIndex.from_product([["Baseline_wr", "Baseline", "Dis_wr", "Dis", "COSMIC"],[1,2]], names=["run", "topic"])
df = df.set_index(mi)

In [282]:
files = ["file--d:1-m:5-h:7"]

In [270]:
from parse import *

In [349]:
result = parse("file--d:{d}-m:{m}-h:{h}", "file--d:1-m:5-h:7")

In [350]:
result.named

{'d': '1', 'h': '7', 'm': '5'}

In [289]:
df = pd.DataFrame(np.random.rand(10,5))
df2 = pd.DataFrame(np.random.rand(10,2))

In [317]:
(pd.concat([df,df2], axis=1)*100).round(2)

Unnamed: 0,0,1,2,3,4,0.1,1.1
0,37.36,0.62,71.86,89.33,63.57,29.53,79.51
1,61.04,99.39,98.7,40.73,6.83,59.63,57.45
2,35.93,53.95,48.33,28.05,98.69,76.95,52.5
3,15.61,56.03,10.63,39.1,37.58,65.93,48.67
4,54.48,82.53,27.3,63.26,86.22,46.13,73.93
5,4.43,18.65,20.25,67.25,78.39,2.64,46.55
6,46.9,59.0,83.09,67.41,29.96,1.63,49.01
7,24.62,99.2,15.51,38.95,1.94,81.04,75.18
8,70.69,44.62,76.58,1.22,3.18,8.79,32.47
9,3.85,72.39,27.84,71.23,62.34,3.84,69.09


In [305]:
s = ["dissyn--mmm:most_fields-op:AND-wr:false", "gendisall--mmm:most_fields-op:AND-wr:true"]
dicts = []
for file in s:
    dicts.append(parse("{run}--mmm:{multfields}-op:{op}-wr:{wordremoval}",file).named)
dicts

[{'multfields': 'most_fields',
  'op': 'AND',
  'run': 'dissyn',
  'wordremoval': 'false'},
 {'multfields': 'most_fields',
  'op': 'AND',
  'run': 'gendisall',
  'wordremoval': 'true'}]

In [333]:
df = pd.DataFrame(dicts)
df

Unnamed: 0,multfields,op,run,wordremoval
0,most_fields,AND,dissyn,False
1,most_fields,AND,gendisall,True


In [348]:
[2] + list(range(len(df.columns)-1))

[2, 0, 1, 2]

In [342]:
df.columns[:-1].append("huhu")

TypeError: all inputs must be Index

In [332]:
df.columns

Index(['multfields', 'op', 'run', 'wordremoval'], dtype='object')