In [227]:
from collections import defaultdict

def flatten_white_space(input_string, expression=" +", replacement=" "):
    """Return input_string with every match of expression replaced.

    With the default arguments each run of one or more spaces is collapsed
    into a single space.
    """
    pattern = re.compile(expression)
    return pattern.sub(replacement, input_string)

def parse_rel_result_str(string, delimiter=' ', empty_start=True):
    """Split a result string into its individual, still unparsed, results.

    Args:
        string: input string holding the results.
        delimiter: separates results.
        empty_start: is the first piece produced by the split empty (for
            example because the input starts with the delimiter)?  When
            true the first piece is dropped without being inspected.

    Returns:
        List of result strings, e.g. "(9090,3)"; nothing is converted to
        tuples or numbers here.
    """
    results = string.split(delimiter)
    return results[1:] if empty_start else results

def parse_tuple(string):
    """Convert a "(doc_num,score)" string into an (int, int) tuple."""
    # Drop surrounding parentheses, then split on the comma.
    fields = string.strip('()').split(',')
    doc_num, score = fields
    return int(doc_num), int(score)

def parse_document(string):
    """Build a Document from a "(doc_num,score)" string."""
    # Drop surrounding parentheses, then split on the comma.
    fields = string.strip('()').split(',')
    doc_num, score = fields
    return Document(int(doc_num), int(score))




class QuerySet:
    """Container that holds the queries handed to it at construction."""

    def __init__(self, queries):
        # Stored exactly as given; no copy is made.
        self.queries = queries

        
class Query:
    """Bundle of the rel_docs and ret_docs collections for one query.

    total_rel_docs falls back to len(rel_docs) when it is not supplied;
    score is stored as given.
    """

    def __init__(self, rel_docs, ret_docs, total_rel_docs=None, score=None):
        self.rel_docs = rel_docs
        self.ret_docs = ret_docs
        self.score = score
        self.total_rel_docs = (
            len(rel_docs) if total_rel_docs is None else total_rel_docs
        )

        
class DocumentSet:
    """Wraps a collection of objects that expose doc_num and score."""

    def __init__(self, list_or_set_of_documents):
        # Kept by reference, not copied.
        self.docs = list_or_set_of_documents

    def all_doc_nums(self):
        """Return the set of doc_num values of the wrapped documents."""
        return {doc.doc_num for doc in self.docs}

    def all_docs(self):
        """Return the wrapped collection itself."""
        return self.docs

    def all_docs_ordered_by_score(self, rev=True):
        """Return a new list of the documents sorted by score.

        rev=True (the default) puts the highest score first.
        """
        ordered = list(self.docs)
        ordered.sort(key=lambda doc: doc.score, reverse=rev)
        return ordered
    
class Document:
    """A document number paired with a score."""

    def __init__(self, doc_num, score):
        self.doc_num, self.score = doc_num, score

    def __repr__(self):
        # Rendered as "(doc_num,score)" with no space after the comma.
        return "({},{})".format(str(self.doc_num), str(self.score))

        
class ResultSet():
    """Collection of result objects with per-query lookup helpers.

    Each element of results is expected to expose query_num, doc_num, rank
    and score attributes.
    """

    def __init__(self, results):
        self.results = results

    def results_for_query(self, query_num):
        """Return the results whose query_num equals int(query_num).

        The results are returned in the order they appear in self.results.
        """
        # Convert once instead of once per result.
        wanted = int(query_num)
        return [result for result in self.results if result.query_num == wanted]

    def get_results_for_query(self, query_num):
        """Alias of results_for_query, kept for callers using this name."""
        return self.results_for_query(query_num)

    def _attrs_for_query(self, query_num, attribute, sort_key=None, rev=False):
        """Collect one attribute from a query's results, optionally sorted."""
        results = self.results_for_query(query_num)
        if sort_key is not None:
            results = sorted(
                results,
                key=lambda result: getattr(result, sort_key),
                reverse=rev,
            )
        return [getattr(result, attribute) for result in results]

    def get_doc_attributes_for_query(self, query_num, attribute='doc_num'):
        """Return the given attribute of every result of the query, unsorted."""
        return self._attrs_for_query(query_num, attribute)

    def ordered_doc_attrs_for_query_by_rank(self, query_num, attr='doc_num', rev=False):
        """Return attr of the query's results ordered by rank.

        rev=False (the default) gives the lowest rank number first.
        """
        return self._attrs_for_query(query_num, attr, sort_key='rank', rev=rev)

    def ordered_doc_attrs_for_query_by_score(self, query_num, attr='doc_num', rev=True):
        """Return attr of the query's results ordered by score.

        rev=True (the default) gives the highest score first.

        Note: many scores are identical.  sorted() is stable, so results
        with equal scores keep their relative order from self.results, and
        that also holds when rev=True.  Reversing the sort direction
        therefore does not reverse the order of tied results.
        """
        return self._attrs_for_query(query_num, attr, sort_key='score', rev=rev)
    

class Result:
    """One result row: query number, document number, rank and score.

    The constructor arguments may be strings; the first three are converted
    with int() and the score with float().
    """

    def __init__(self, query_num, doc_num, rank, score):
        self.query_num, self.doc_num, self.rank = (
            int(query_num),
            int(doc_num),
            int(rank),
        )
        self.score = float(score)

        
class System():
    """Loads the results file of one system and parses it on demand.

    Every non-blank line of the file is split on whitespace and the fields
    at the positions named below are handed to Result.  Both the raw lines
    and the parsed ResultSet are cached on the instance after first use.
    """

    # Positions of the fields used from one whitespace-separated line.
    QUERY_NUM_FIELD = 0
    DOC_NUM_FIELD = 2
    RANK_FIELD = 3
    SCORE_FIELD = 4

    def __init__(self, filepath):
        self.filepath = filepath
        self.raw_results = None  # list of lines, filled by read_raw_result_set
        self.results = None      # ResultSet, filled by result_set

    def read_raw_result_set(self):
        """Read the file at self.filepath into self.raw_results.

        Newline characters are stripped from each line.  Raises whatever
        open() raises when the file cannot be opened.
        """
        with open(self.filepath, "r") as results_file:
            self.raw_results = [line.strip('\n') for line in results_file]

    def raw_result_set(self):
        """Return the raw lines, reading the file first if necessary."""
        if self.raw_results is None:
            self.read_raw_result_set()
        return self.raw_results

    def result_set(self):
        """Return the parsed ResultSet, building and caching it on first call.

        Raises IndexError when a non-blank line has too few fields, and
        ValueError when Result cannot convert a field to a number.
        """
        if self.results is not None:
            return self.results
        parsed = []
        for line in self.raw_result_set():
            # split() without arguments tolerates repeated spaces and tabs.
            attrs = line.split()
            if not attrs:
                # Blank line (for example at the end of the file): skip it.
                continue
            parsed.append(Result(
                attrs[self.QUERY_NUM_FIELD],
                attrs[self.DOC_NUM_FIELD],
                attrs[self.RANK_FIELD],
                attrs[self.SCORE_FIELD],
            ))
        self.results = ResultSet(parsed)
        return self.results

    
class Evaluator():
    """Loads an evaluation file and exposes a DocumentSet per query.

    Each non-blank line of the file is expected to look like
    "<query number>: (doc_num,score) (doc_num,score) ...".
    """

    # Separates the query number from its documents on each line.
    QUERY_SEPARATOR = ':'

    def __init__(self, filepath):
        self.filepath = filepath
        self.raw_eval_file = None             # list of lines from the file
        self.relevent_documents_for_q = None  # query number -> DocumentSet

    def read_eval_file(self):
        """Read the file at self.filepath into self.raw_eval_file.

        Newline characters are stripped from each line.  Raises whatever
        open() raises when the file cannot be opened.
        """
        with open(self.filepath, "r") as evaluation_file:
            self.raw_eval_file = [line.strip('\n') for line in evaluation_file]

    def eval_file(self):
        """Return the raw lines, reading the file first if necessary."""
        if self.raw_eval_file is None:
            self.read_eval_file()
        return self.raw_eval_file

    def parse_queries_for_rel_docs(self):
        """Parse every line into a DocumentSet keyed by query number.

        The mapping is stored in self.relevent_documents_for_q.  Looking up
        a query number that is not in the file yields an empty DocumentSet,
        so the value type is the same for every query.
        """
        query_rel_docs = defaultdict(lambda: DocumentSet([]))
        # we're breaking the evaluation file in a query # and their relevent results
        for qrel in self.eval_file():
            if not qrel.strip():
                # Blank line: nothing to parse.
                continue
            query, _, results = qrel.partition(self.QUERY_SEPARATOR)
            # split() without arguments copes with the leading space after
            # the separator and with repeated spaces between documents.
            rel_docs = [parse_document(doc_tup) for doc_tup in results.split()]
            query_rel_docs[int(query)] = DocumentSet(rel_docs)
        self.relevent_documents_for_q = query_rel_docs

    def get_rel_docs_for_query(self, query_number):
        """Return the DocumentSet for query_number, parsing the file if needed."""
        if self.relevent_documents_for_q is None:
            self.parse_queries_for_rel_docs()
        return self.relevent_documents_for_q[query_number]

In [210]:
# Point an Evaluator at the qrels file.
e = Evaluator("../systems/qrels.txt")

In [211]:
# Load the raw lines of the file.
e.read_eval_file()

In [212]:
# Parse the raw lines into one DocumentSet per query number.
e.parse_queries_for_rel_docs()

In [213]:
# DocumentSet stored for query 1.
rel_doc_set_for_q_1 = e.get_rel_docs_for_query(1)

In [214]:
# Distinct document numbers in that set.
rel_doc_set_for_q_1.all_doc_nums()

{5011, 6850, 8709, 9090, 9574, 9684}

In [215]:
# Print the documents of rel_doc_set_for_q_1, highest score first.
for x in rel_doc_set_for_q_1.all_docs_ordered_by_score(rev=True):
    print(x)

(9090,3)
(6850,2)
(9574,2)
(8709,1)
(9684,1)
(5011,1)


In [216]:
# Point a System at the S1 results file.
s1 = System("../systems/S1.results")

In [217]:
# Load the raw lines of the file.
s1.read_raw_result_set()

In [218]:
# Parse the raw lines into a ResultSet (cached on s1).
res_set_for_s1 = s1.result_set()

In [219]:
# doc_num of every result of query 1, in file order.
res_set_for_s1.get_doc_attributes_for_query(1)

[6567,
 9652,
 9684,
 7844,
 9584,
 7853,
 9090,
 9574,
 6850,
 9616,
 9281,
 9891,
 9665,
 8446,
 9591,
 9198,
 5690,
 5493,
 6915,
 6171,
 5023,
 7732,
 8871,
 8124,
 6602,
 7353,
 8564,
 8419,
 7735,
 7730,
 9069,
 8179,
 7745,
 9625,
 9394,
 9156,
 9961,
 5936,
 5014,
 7811,
 6675,
 7980,
 7553,
 6473,
 8270,
 7771,
 8375,
 5660,
 9530,
 5368,
 7925,
 8170,
 6927,
 5019,
 6470,
 7821,
 6182,
 9572,
 6894,
 10034,
 6848,
 6640,
 9478,
 6050,
 7879,
 9595,
 7183,
 5687,
 7909,
 8922,
 7598,
 6381,
 5169,
 9860,
 9487,
 9371,
 9247,
 6907,
 6484,
 9192,
 7965,
 5776,
 5825,
 7151,
 6964,
 8741,
 9853,
 9499,
 8326,
 7142,
 9076,
 6601,
 6308,
 5506,
 8370,
 6342,
 7532,
 6967,
 5101,
 8684,
 7396,
 7206,
 5807,
 6006,
 6849,
 6471,
 6300,
 5698,
 5350,
 9315,
 8716,
 8193,
 7230,
 6971,
 5954,
 5697,
 9071,
 8740,
 8545,
 8187,
 7715,
 6780,
 6410,
 5625,
 5464,
 5229,
 5095,
 9079,
 8856,
 8860,
 7872,
 5495,
 9589,
 9484,
 9327,
 8276,
 7623,
 6578,
 6521,
 5569,
 9582,
 9078,
 7964

In [220]:
# Number of distinct doc_nums among the results of query 1.
len(set(res_set_for_s1.get_doc_attributes_for_query(1)))

500

In [226]:
# doc_nums of query 1 sorted by score; rev=False puts the lowest score first.
res_set_for_s1.ordered_doc_attrs_for_query_by_score(1, rev=False)

[10007,
 9951,
 9947,
 9831,
 9804,
 9739,
 9697,
 9696,
 9663,
 9660,
 9654,
 9642,
 9630,
 9519,
 9514,
 9460,
 9439,
 9404,
 9402,
 9390,
 9384,
 9360,
 9326,
 9232,
 9225,
 9206,
 9153,
 9152,
 9149,
 9116,
 9105,
 9012,
 8987,
 8954,
 8910,
 8875,
 8827,
 8731,
 8576,
 8575,
 8556,
 8537,
 8481,
 8436,
 8433,
 8389,
 8336,
 8325,
 8308,
 8183,
 8148,
 8139,
 8137,
 8136,
 8107,
 8103,
 8100,
 8080,
 8047,
 8046,
 8021,
 7975,
 7942,
 7906,
 7857,
 7838,
 7800,
 7779,
 7674,
 7513,
 7454,
 7419,
 7414,
 7409,
 7366,
 7362,
 7336,
 7298,
 7290,
 7279,
 7277,
 7226,
 7120,
 7036,
 7019,
 6987,
 6884,
 6872,
 6793,
 6781,
 6775,
 6766,
 6679,
 6641,
 6627,
 6559,
 6558,
 6556,
 6555,
 6549,
 6474,
 6378,
 6336,
 6259,
 8698,
 8407,
 8322,
 8015,
 7892,
 7734,
 7542,
 7106,
 6309,
 6158,
 6126,
 6032,
 5494,
 5486,
 5360,
 5266,
 8294,
 8243,
 8041,
 8013,
 7883,
 7216,
 6311,
 6262,
 8418,
 7723,
 6696,
 6263,
 5799,
 5608,
 10020,
 10015,
 9680,
 9633,
 9553,
 9549,
 9538,
 9299,
 91

In [49]:
# Read every line of the S1 results file.
with open("../systems/S1.results", "r") as f:
    data = f.readlines()

In [50]:
# Strip leading/trailing whitespace (including the newline) from each line.
y = []
for x in data:
    y.append(str.strip(x))

In [51]:
# Split each stripped line on single spaces into its fields.
z = []
for result in y:
    z.append(result.split(' '))

In [52]:
z

[['1', '0', '6567', '1', '5.0743', '0'],
 ['1', '0', '9652', '2', '4.4829', '0'],
 ['1', '0', '9684', '3', '4.3478', '0'],
 ['1', '0', '7844', '4', '4.3268', '0'],
 ['1', '0', '9584', '5', '4.2160', '0'],
 ['1', '0', '7853', '6', '4.0384', '0'],
 ['1', '0', '9090', '7', '4.0021', '0'],
 ['1', '0', '9574', '8', '4.0011', '0'],
 ['1', '0', '6850', '9', '3.9796', '0'],
 ['1', '0', '9616', '10', '3.9646', '0'],
 ['1', '0', '9281', '11', '3.8694', '0'],
 ['1', '0', '9891', '12', '3.8448', '0'],
 ['1', '0', '9665', '13', '3.8105', '0'],
 ['1', '0', '8446', '14', '3.8105', '0'],
 ['1', '0', '9591', '15', '3.7642', '0'],
 ['1', '0', '9198', '16', '3.7352', '0'],
 ['1', '0', '5690', '17', '3.6808', '0'],
 ['1', '0', '5493', '18', '3.6772', '0'],
 ['1', '0', '6915', '19', '3.6539', '0'],
 ['1', '0', '6171', '20', '3.6375', '0'],
 ['1', '0', '5023', '21', '3.6375', '0'],
 ['1', '0', '7732', '22', '3.6234', '0'],
 ['1', '0', '8871', '23', '3.5779', '0'],
 ['1', '0', '8124', '24', '3.5058', '0'],
 

In [53]:
# Build a Result from fields 0, 2, 3 and 4 of every split line
# (passed to Result as query_num, doc_num, rank and score).
result_objs = []
for a in z:
    result_objs.append(Result(a[0], a[2], a[3], a[4]))

In [54]:
# Wrap the Result objects in a ResultSet.
test_set = ResultSet(result_objs)

In [55]:
# doc_num of the first result of query 4, looked up through
# ResultSet.results_for_query.
test_set.results_for_query(4)[0].doc_num

9739

In [56]:
# Same file again, this time parsed through System.
s = System("../systems/S1.results")

In [58]:
# result_set() reads the file itself when it has not been read yet.
t = s.result_set()

In [59]:
# doc_num of the first result of query 4, looked up through
# ResultSet.results_for_query.
t.results_for_query(4)[0].doc_num

9739

In [61]:
# Read every line of the qrels file (newlines are still attached).
with open("../systems/qrels.txt") as eval_file:
    abc = eval_file.readlines()

In [62]:
abc

['1: (9090,3) (6850,2) (9574,2) (8709,1) (9684,1) (5011,1)\n',
 '2: (5715,2) (9677,2) (5766,2) (6327,1) (6079,1) (5653,1) (6498,1) (7117,1)\n',
 '3: (9743,3)\n',
 '4: (6491,3) (5269,3) (8032,3) (9444,3) (8988,2) (9445,2) (5883,2) (7435,2) (9745,1) (10029,1) (7224,1) (9038,1) (7827,1) (6675,1) (9720,1) (6289,1) (9746,1) (6836,1) (10119,1) (4742,1) (9739,1) (5783,1) (10117,1) (8414,1) (5865,1) (8315,1) (9523,1) (8318,1) (6288,1) (5268,1) (7620,1) (7046,1) (6054,1) (9744,1) (6743,1) (9278,1) (8562,1) (6382,1) (6334,1) (6292,1)\n',
 '5: (1646,1) (2126,1) (3111,1) (4983,1) (8646,1) (6669,1) (8282,1)\n',
 '6: (8433,3) (7487,3)  (6736,3) (5305,1) (8844,1) (9736,1) (9541,1) (8261,1) (8120,1) (7424,1) (8593,1) (7737,1)\n',
 '7: (7646,3) (3156,2) (4144,1)\n',
 '8: (9891,3) (7844,3) (9574,2) (9684,2) (9090,1) (9652,1) (9281,1) (6171,1)\n',
 '9: (5884,3) (5995,3) (5121,3) (6923,3) (7686,2) (7704,2) (6496,2) (5565,2) (9760,2) (8344,2) (5138,1) (9104,1) (7847,1) (5644,1) (7961,1) (7687,1) (9647,1) (

In [92]:
# Remove the newline characters from each qrels line.
bc = [x.strip('\n') for x in abc]

In [110]:
from collections import defaultdict

def flatten_white_space(input_string, expression=" +", replacement=" "):
    """Return input_string with every match of expression replaced.

    With the default arguments each run of one or more spaces is collapsed
    into a single space.
    """
    pattern = re.compile(expression)
    return pattern.sub(replacement, input_string)

def parse_rel_result_str(string, delimiter=' ', empty_start=True):
    """Split a result string into its individual, still unparsed, results.

    Args:
        string: input string holding the results.
        delimiter: separates results.
        empty_start: is the first piece produced by the split empty (for
            example because the input starts with the delimiter)?  When
            true the first piece is dropped without being inspected.

    Returns:
        List of result strings, e.g. "(9090,3)"; nothing is converted to
        tuples or numbers here.
    """
    results = string.split(delimiter)
    return results[1:] if empty_start else results

def parse_tuple(string):
    """Convert a "(doc_num,score)" string into an (int, int) tuple."""
    # Drop surrounding parentheses, then split on the comma.
    fields = string.strip('()').split(',')
    doc_num, score = fields
    return int(doc_num), int(score)

# Map each query number to its list of (doc_num, score) tuples.
# NOTE(review): the int default factory means a missing query yields 0
# rather than a list - confirm that is intended.
qrels = defaultdict(int)
# Each line is "<query>:<results>"; split it into those two parts.
for q, r in [x.split(':') for x in bc]:
    query = int(q)
    # Collapse repeated spaces so splitting on ' ' gives no empty pieces.
    result = flatten_white_space(r)
    # The first piece is dropped by parse_rel_result_str (empty_start=True);
    # with a space after the colon that piece is empty.
    qrels[query] = [parse_tuple(doc_tuple) for doc_tuple in parse_rel_result_str(result)]

In [111]:
qrels

defaultdict(int,
            {1: [(9090, 3),
              (6850, 2),
              (9574, 2),
              (8709, 1),
              (9684, 1),
              (5011, 1)],
             2: [(5715, 2),
              (9677, 2),
              (5766, 2),
              (6327, 1),
              (6079, 1),
              (5653, 1),
              (6498, 1),
              (7117, 1)],
             3: [(9743, 3)],
             4: [(6491, 3),
              (5269, 3),
              (8032, 3),
              (9444, 3),
              (8988, 2),
              (9445, 2),
              (5883, 2),
              (7435, 2),
              (9745, 1),
              (10029, 1),
              (7224, 1),
              (9038, 1),
              (7827, 1),
              (6675, 1),
              (9720, 1),
              (6289, 1),
              (9746, 1),
              (6836, 1),
              (10119, 1),
              (4742, 1),
              (9739, 1),
              (5783, 1),
              (10117, 1),
              (