In [265]:
from itertools import chain

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [364]:
# Both evaluation inputs are gzipped, tab-separated tables. The paths are named
# constants so that the prediction file can be swapped in one place.
GOLD_STANDARD_PATH = "../../resources/20180622processedGoldStandardTopics.tsv.gz"
# FIXME(review): the predictions are read from the gold standard file itself, so
# the evaluation below compares the gold standard against an identical copy and
# every score is trivially perfect. Point this at the actual system output.
PREDICTIONS_PATH = GOLD_STANDARD_PATH

gsdatacsv = pd.read_csv(GOLD_STANDARD_PATH, delimiter="\t")
preddatacsv = pd.read_csv(PREDICTIONS_PATH, delimiter="\t")

In [None]:
# Index both data frames by (topic number, document id) and sort the index, so
# that the documents are grouped under the topic they belong to.
gsdata = gsdatacsv.set_index(['trec_topic_number', 'trec_doc_id']).sort_index()
preddata = preddatacsv.set_index(['trec_topic_number', 'trec_doc_id']).sort_index()
# Both tables are expected to have the same number of rows.
if gsdata.shape[0] != preddata.shape[0]:
    # The counts are ints and must be formatted into the message: concatenating
    # them to a str with "+" raises a TypeError that hides the intended ValueError.
    raise ValueError(
        "The gold standard has {} rows but the predicted data has {}".format(
            gsdata.shape[0], preddata.shape[0]
        )
    )
# Join on the shared index; overlapping column names are told apart by the
# _gs (gold standard) and _pred (prediction) suffixes.
joined = gsdata.join(preddata, lsuffix="_gs", rsuffix="_pred", how="left")

In [399]:
# Number of non-null annotations per gene column within topic 30, plus their sum
g1, g2, g3 = (
    len(gsdata.loc[30][genecol].dropna())
    for genecol in (
        "gene1_annotation_desc",
        "gene2_annotation_desc",
        "gene3_annotation_desc",
    )
)
print(g1, g2, g3, g1 + g2 + g3)

246 246 246 738


In [None]:
def evalColumns(cols, printTopics=True):
    """Print confusion matrices and accuracies of predicted vs. gold standard labels.

    Reads the module-level ``joined`` data frame. For every topic (first level
    of its row index) the ``<col>_gs`` columns of all given columns are stacked
    into one gold standard label series and the ``<col>_pred`` columns into one
    prediction label series. Rows in which either label is missing are dropped.
    The confusion matrix, the label order and the accuracy are printed for each
    topic (optional) and once over the labels of all topics.

    Parameters
    ----------
    cols : iterable of str
        Column base names, i.e. without the ``_gs`` / ``_pred`` suffix.
    printTopics : bool, default True
        Whether to print the per-topic results. The overall result is always
        printed.

    Raises
    ------
    ValueError
        If no column is given, or if a prediction is missing in a row where the
        gold standard has a label.
    """
    cols = list(cols)
    if not cols:
        raise ValueError("At least one column name is required")

    # Build the column names of the joined data frame by suffixing all the given
    # column names with the _gs and _pred suffixes
    gscols = [col + "_gs" for col in cols]
    predcols = [col + "_pred" for col in cols]

    # Extract exactly those columns we want to compare
    cols_joined = joined[gscols + predcols]

    # The 'confusion labels'. To have the confusion matrices comparable over the
    # topics, we always need to give the same, sorted list of labels
    conf_labels = sorted(set(chain.from_iterable(
        joined[col].dropna().values for col in gscols + predcols
    )))

    # The label pairs of all topics are collected here to get the overall results
    topicpairs = []
    # Iterate over the topics that actually occur in the first level of the
    # multi index (index.levels may also list values without any rows)
    for i in cols_joined.index.get_level_values(0).unique():
        topici = cols_joined.loc[i]
        # Concatenate the labels of the different columns that are given.
        # The idea is that we have multiple genes for some topics which have their
        # own columns, geneX_annotation_desc. We are interested in how well the
        # label is recognized, so all the information goes into a single column.
        # The document ids repeat once per stacked column, so the index is dropped
        # and the two series are paired by position instead; both are built from
        # the same rows in the same column order.
        gslabels = pd.concat([topici[col] for col in gscols], ignore_index=True)
        predlabels = pd.concat([topici[col] for col in predcols], ignore_index=True)

        joinedlabels = pd.DataFrame({"gslabels": gslabels, "predlabels": predlabels})

        if (joinedlabels["gslabels"].notnull() & joinedlabels["predlabels"].isnull()).any():
            raise ValueError("The prediction contains null values where the gold standard has a value")

        # Only rows that carry both a gold standard and a predicted label are scored
        joinedlabels = joinedlabels.dropna()
        topicpairs.append(joinedlabels)

        if printTopics:
            print("Topic", i)
            # labels must be passed by keyword: it is keyword-only in newer
            # scikit-learn releases
            print(confusion_matrix(joinedlabels["gslabels"], joinedlabels["predlabels"], labels=conf_labels))
            print(conf_labels)
            print("ACC:", accuracy_score(joinedlabels["gslabels"], joinedlabels["predlabels"]))

    # A single concatenation at the end instead of growing the series per topic
    alllabels = pd.concat(topicpairs, ignore_index=True)

    print("Allover")
    print(confusion_matrix(alllabels["gslabels"], alllabels["predlabels"], labels=conf_labels))
    print(conf_labels)
    print("ACC:", accuracy_score(alllabels["gslabels"], alllabels["predlabels"]))

In [454]:
# Overall result only for the disease annotation column
evalColumns(cols=["disease_desc"], printTopics=False)

Allover
[[4149    0    0    0]
 [   0  938    0    0]
 [   0    0 1273    0]
 [   0    0    0 2914]]
['Exact', 'More General', 'More Specific', 'Not Disease']
ACC: 1.0


In [455]:
# Overall result only, with the three gene annotation columns scored together
evalColumns(
    cols=[
        "gene1_annotation_desc",
        "gene2_annotation_desc",
        "gene3_annotation_desc",
    ],
    printTopics=False,
)

Allover
[[ 651    0    0    0]
 [   0 5065    0    0]
 [   0    0 3974    0]
 [   0    0    0 1655]]
['Different Variant', 'Exact', 'Missing Gene', 'Missing Variant']
ACC: 1.0
