In [101]:
# Basic Ranked Retrieval (RRI)
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
import ssl
import math
import csv

try:
     _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
     pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [102]:
## Imports
def documentReader():
    """
    DocString Reads the documents
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    documents_paths = os.listdir(documents_path)
    documents_paths.sort()
    #print(documents_paths)
    for filename in documents_paths:
        file_path = os.path.join(documents_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')
    return documentos

documentos = documentReader()
print(list(documentos.items())[0])


('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [103]:
def tokenization(documentos):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    p_stemmer = nltk.stem.porter.PorterStemmer()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    # nltk_stemedList_en = {key: [p_stemmer.stem(word) for word in doc] for key, doc in word_tok_sw.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in word_tok_sw.items()}

    return nltk_lemmaList

tokenized_docs = tokenization(documentos)
print(list(tokenized_docs.items())[0])

('d001', ['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', '.', 'William', 'Beaumont', ':', 'Physiology', 'digestion', 'Image', 'Source', '.', 'On', 'November', '21', ',', '1785', ',', 'US-American', 'surgeon', 'William', 'Beaumont', 'born', '.', 'He', 'became', 'best', 'known', '“', 'Father', 'Gastric', 'Physiology', '”', 'following', 'research', 'human', 'digestion', '.', 'William', 'Beaumont', 'born', 'Lebanon', ',', 'Connecticut', 'became', 'physician', '.', 'He', 'served', 'surgeon', '’', 'mate', 'Army', 'War', '1812', '.', 'He', 'opened', 'private', 'practice', 'Plattsburgh', ',', 'New', 'York', ',', 'rejoined', 'Army', 'surgeon', '1819', '.', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', '.', 'The', 'fort', 'became', 'refuge', 'wounded', '19-year-old', 'French-Canadian', 'fur', 'trader', 'named', 'Alexis', 'St.', 'Martin', 's

In [104]:
def makeInvertedIndex(tokenized_docs):
    index = {}

    for id, doc in tokenized_docs.items():
        #id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token in index :
                if index[token]['posting'][-1][0] == id:
                    index[token]['posting'][-1][1] += 1
                else:
                    index[token]['posting'].append([id, 1])
                    index[token]['freq'] += 1

            else:
                index[token] = {
                    'posting': [[id, 1]],
                    'freq': 1
                }
    return index


invertedIndex = makeInvertedIndex(tokenized_docs)
print(list(invertedIndex.items())[0])

('William', {'posting': [['d001', 6], ['d015', 6], ['d028', 4], ['d035', 2], ['d055', 4], ['d056', 5], ['d069', 6], ['d088', 3], ['d091', 1], ['d092', 1], ['d095', 1], ['d098', 2], ['d102', 5], ['d106', 1], ['d109', 1], ['d111', 1], ['d129', 1], ['d136', 8], ['d138', 3], ['d147', 1], ['d175', 1], ['d179', 2], ['d180', 1], ['d189', 2], ['d190', 1], ['d191', 1], ['d197', 1], ['d212', 1], ['d230', 1], ['d241', 2], ['d254', 1], ['d257', 1], ['d266', 2], ['d272', 1], ['d273', 8], ['d274', 1], ['d289', 1], ['d291', 1], ['d294', 1], ['d299', 1], ['d300', 1], ['d309', 1], ['d310', 5], ['d320', 6], ['d323', 1], ['d330', 7]], 'freq': 46})


In [105]:
def queries_reader():
    """
    DocString Reads the query
    :return:
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    #print(documents_paths)
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
    return queries

queries = queries_reader()
print(list(queries.items())[0])

('q01', 'Fabrication of music instruments')


In [106]:
def queries_tokenization(queries):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    #print("items", queries.items())
    tokenized_queries = {}
    for key,doc in queries.items():
        word_tok = nltk.word_tokenize(doc)
        word_tok_sw = [token for token in word_tok if token not in nltk_stop_words_en]
        nltk_lemmaList = [wordnet_lemmatizer.lemmatize(word) for word in word_tok_sw]
        #print(nltk_lemmaList)
        tokenized_queries[key] = nltk_lemmaList

    return tokenized_queries



tokenized_queries = queries_tokenization(queries)
print(list(tokenized_queries.items())[0])

('q01', ['Fabrication', 'music', 'instrument'])


In [107]:
def basic_ranked_retrieval(queries,invertedIndex,documents,N):
    """
    DocString
    :return: Nothing
    """
    scores= {}
    query_scores_template = {}
    for key, doc in documents.items():
        query_scores_template[key] = 0

    for query,tokens in queries.items():
        query_scores = query_scores_template.copy()
        for token in tokens:
            if token in invertedIndex:
                df = invertedIndex[token]["freq"]
                idf = math.log10( N / df )
                for docs in invertedIndex[token]["posting"]:
                    tf = docs[1]
                    tf_w = math.log10(1 + tf)
                    #if docs[0] not in query_scores:
                    #    query_scores[docs[0]] = 0
                    query_scores[docs[0]] += tf_w*idf
        clean_query_scores = { k : v for k,v in query_scores.items() if v != 0}
        clean_query_scores = dict(sorted(clean_query_scores.items(), key=lambda item: item[1], reverse=True))
        scores[query] = clean_query_scores#query_scores

    return scores

RRI = basic_ranked_retrieval(tokenized_queries,invertedIndex, documentos, len(documentos))
print(list(RRI.items())[0])

('q01', {'d254': 1.3322084124448144, 'd016': 1.280133266247014, 'd085': 0.7608038472948082, 'd185': 0.7210322829593981, 'd209': 0.7210322829593981, 'd060': 0.6881765238016224, 'd100': 0.6881765238016224, 'd153': 0.6881765238016224, 'd186': 0.6553207646438466, 'd006': 0.571404565150006, 'd215': 0.571404565150006, 'd099': 0.5193294189522057, 'd243': 0.5193294189522057, 'd004': 0.36051614147969907, 'd039': 0.36051614147969907, 'd065': 0.36051614147969907, 'd094': 0.36051614147969907, 'd130': 0.36051614147969907, 'd136': 0.36051614147969907, 'd152': 0.36051614147969907, 'd162': 0.36051614147969907, 'd164': 0.36051614147969907, 'd184': 0.36051614147969907, 'd195': 0.36051614147969907, 'd312': 0.36051614147969907, 'd316': 0.36051614147969907, 'd028': 0.3276603823219233, 'd038': 0.3276603823219233, 'd074': 0.3276603823219233, 'd082': 0.3276603823219233, 'd116': 0.3276603823219233, 'd170': 0.3276603823219233, 'd172': 0.3276603823219233, 'd212': 0.3276603823219233, 'd229': 0.3276603823219233, '

In [108]:
def score(query, document_id, invertedIndex, documents):
    """
    DocString
    :return: Nothing
    """
    score = 0
    N = len(documents)

    tokens = query[1]
    for token in tokens:
        token_in_index = invertedIndex.get(token, "unknown_token")
        if token_in_index != "unknown_token":
            token_posting = token_in_index["posting"]
            tf = 0
            tf_w = 0
            for doc_freq_pair in token_posting:
                if doc_freq_pair[0] == document_id:
                    tf = doc_freq_pair[1]
                    tf_w = math.log10(1 + tf)
                    break
            df = token_in_index["freq"]
            idf = math.log10( N / df )
            score += tf_w*idf
    return  score


query1 = list(tokenized_queries.items())[3]

scoreQ1 = score(query1, 'd001', invertedIndex, documentos)
print(scoreQ1)

0.2564797561654694


In [152]:
def read_judgemnts_file():
    """
    DocString
    :return: Nothing
    """
    document_path = os.path.join(os.getcwd(), 'docs/relevance-judgments.tsv')
    tsv_file = open(document_path)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    relevance = {}
    for row in read_tsv:
        documents = row[1].split(',')
        query_relevance = {pair.split(':')[0] : int(pair.split(':')[1]) for pair in documents }
        query_relevance = dict(sorted(query_relevance.items(), key=lambda item: item[0]))
        relevance[row[0]] = query_relevance
    return relevance


relevance = read_judgemnts_file()
print(relevance)


{'q01': {'d016': 5, 'd186': 4, 'd254': 5}, 'q02': {'d136': 2, 'd139': 2, 'd143': 4, 'd147': 2, 'd149': 2, 'd164': 4, 'd228': 4, 'd283': 4, 'd291': 4, 'd293': 4, 'd318': 2}, 'q03': {'d105': 2, 'd147': 3, 'd152': 3, 'd283': 4, 'd291': 4, 'd318': 2}, 'q04': {'d010': 3, 'd019': 2, 'd049': 2, 'd270': 3, 'd275': 3, 'd286': 2, 'd330': 2}, 'q06': {'d026': 4, 'd069': 2, 'd233': 3, 'd257': 2, 'd297': 3, 'd329': 5}, 'q07': {'d004': 3, 'd077': 3, 'd179': 3, 'd266': 2}, 'q08': {'d005': 4, 'd028': 3, 'd081': 2, 'd108': 3, 'd110': 4, 'd117': 3, 'd121': 2, 'd180': 2, 'd205': 2, 'd251': 5, 'd271': 3, 'd292': 2}, 'q09': {'d177': 2, 'd198': 3, 'd199': 5, 'd205': 3, 'd217': 2, 'd223': 2}, 'q10': {'d052': 2, 'd065': 3, 'd068': 2, 'd076': 3, 'd100': 2, 'd199': 4, 'd215': 2, 'd231': 4}, 'q12': {'d239': 4, 'd250': 4, 'd258': 3, 'd277': 4}, 'q13': {'d049': 4, 'd056': 4, 'd239': 2, 'd258': 2, 'd277': 2}, 'q14': {'d002': 2, 'd005': 3, 'd041': 3, 'd081': 4, 'd091': 4, 'd093': 3, 'd117': 2, 'd130': 3, 'd142': 2, '

In [180]:
def precision_at_k(relevance: list, k: int):
    """
    DocString
    :return: Nothing
    """
    if k == 0:
        return 0
    l = np.array(relevance[:k]).sum()/k
    return l

def recall_at_k(relevance: list, nr_relevant: int, k: int):
    """
    DocString
    :return: Nothing
    """
    l = np.array(relevance[:k]).sum()/nr_relevant
    return l

def average_precision(relevance):
    """
    DocString
    :return: Nothing
    """

    length = len(relevance)
    sum = 0
    for i in range(length):
        if relevance[i]:
            sum += precision_at_k(relevance, i+1)
    return sum / np.array(relevance).sum() if np.array(relevance).sum()!=0 else 0

def mean_avg_precision(l):
    """
    DocString
    :return: Nothing
    """
    average = 0
    for lista in l:
        #print(average_precision(lista))
        average+= average_precision(lista)

    mean = average / len(l)
    return mean

def dcg_at_k(relevance, k: int):
    """
    DocString
    :return: Nothing
    """
    sum = 0
    i =  0
    for rel_i in relevance[: k]:
        i+= 1
        sum += rel_i/np.log2(max(i, 2))

    return sum

def ndcg_at_k(relevance, rel_sorted, k):
    """
    DocString
    :return: Nothing
    """
    #rel_sorted = sorted(relevance, reverse=True)
    max = dcg_at_k(rel_sorted, k)
    real = dcg_at_k(relevance, k)

    return real/ max if max != 0 else 0

In [138]:
def make_binary_score(query_tuple,relevance):
    query = query_tuple[0]
    ranking = query_tuple[1]
    binary_score = []
    M = len(relevance[query])
    i=1;
    for document, score in ranking.items():
        if i>M:
            break
        if document in relevance[query]:
            binary_score.append(1)
        else:
            binary_score.append(0)
        i += 1
    #print(binary_score)
    return binary_score, M


In [None]:
def precision_for_RRI(RRI,relevance):
    """
    DocString
    :return: Nothing
    """
    precisions = {}
    for query in RRI.items():
        binary_score, M = make_binary_score(query,relevance)
        precisions[query[0]] = precision_at_k(binary_score,M)
    return precisions

precisions = precision_for_RRI(RRI,relevance)
print(precisions)

In [126]:
def recall_for_RRI(RRI,relevance):
    """
    DocString
    :return: Nothing
    """
    recalls = {}
    for query in RRI.items():
        binary_score, M = make_binary_score(query,relevance)
        recalls[query[0]] = recall_at_k(binary_score,M,M)
    return recalls

recalls = recall_for_RRI(RRI,relevance)
print(recalls)



{'q01': 0.6666666666666666, 'q02': 0.6363636363636364, 'q03': 1.0, 'q04': 0.8571428571428571, 'q06': 0.8333333333333334, 'q07': 0.25, 'q08': 0.75, 'q09': 0.8333333333333334, 'q10': 0.5, 'q12': 0.75, 'q13': 0.6, 'q14': 0.4166666666666667, 'q16': 0.5, 'q17': 0.75, 'q18': 0.7142857142857143, 'q19': 0.5, 'q22': 0.5714285714285714, 'q23': 0.25, 'q24': 0.0, 'q25': 0.75, 'q26': 1.0, 'q27': 0.5, 'q28': 0.6666666666666666, 'q29': 0.4166666666666667, 'q32': 1.0, 'q34': 1.0, 'q36': 0.5, 'q37': 0.6666666666666666, 'q38': 0.375, 'q40': 0.7777777777777778, 'q41': 0.8571428571428571, 'q42': 0.6666666666666666, 'q44': 0.5, 'q45': 0.75, 'q46': 0.5}


In [151]:
def map_for_RRI(RRI,relevance):
    """
    DocString
    :return: Nothing
    """
    precisions = []
    for query in RRI.items():
        binary_score, M = make_binary_score(query,relevance)
        precisions.append(binary_score)
    #print(precisions)
    map = mean_avg_precision(precisions)
    return map

map = map_for_RRI(RRI,relevance)
print(map)

0.882196866625438


In [176]:
def make_non_binary_score(query_tuple,relevance):
    query = query_tuple[0]
    ranking = query_tuple[1]
    non_binary_score = []
    M = len(relevance[query])
    i=1;
    for document, score in ranking.items():
        if i>M:
            break
        if document in relevance[query]:
            non_binary_score.append(relevance[query][document])
        else:
            non_binary_score.append(0)
        i += 1
    rel_sorted = [rel for doc,rel in relevance[query].items() ]
    rel_sorted = sorted(rel_sorted, reverse=True)
    return non_binary_score, rel_sorted, M

query1 = list(RRI.items())[0]
prueba = make_non_binary_score(query1,relevance)
print(prueba)

([5, 5, 0], [5, 5, 4], 3)


In [179]:
def ndcg_for_RRI(RRI,relevance):
    """
    DocString
    :return: Nothing
    """
    ndcgs = {}
    for query in RRI.items():
        non_binary_score, rel_sorted, M  = make_non_binary_score(query,relevance)
        #print(non_binary_score)
        ndcgs[query[0]] = ndcg_at_k(non_binary_score, rel_sorted, M)
    return ndcgs

ndcg = ndcg_for_RRI(RRI,relevance)
print(ndcg)

av_ndgc = 0
for key,val in ndcg.items():
    av_ndgc += val
print(av_ndgc/len(ndcg))

{'q01': 0.7984848580994974, 'q02': 0.6664051841014526, 'q03': 0.9125990044855482, 'q04': 0.8375872815409342, 'q06': 0.9135933534799239, 'q07': 0.3373519727104165, 'q08': 0.8723596292045686, 'q09': 0.8823021871008767, 'q10': 0.474468013427548, 'q12': 0.7920776427801035, 'q13': 0.8326604750636971, 'q14': 0.42286177976615147, 'q16': 0.6, 'q17': 0.8770340301127121, 'q18': 0.8675918411468322, 'q19': 0.7142857142857143, 'q22': 0.6619973907677502, 'q23': 0.6037460821441866, 'q24': 0.0, 'q25': 0.6626480272895834, 'q26': 1.0, 'q27': 0.7659193871914459, 'q28': 0.8086698088039842, 'q29': 0.6638253243294739, 'q32': 0.9947696772861268, 'q34': 1.0, 'q36': 0.6072766647980451, 'q37': 0.8637574337885663, 'q38': 0.3732229932956358, 'q40': 0.7648371537954458, 'q41': 0.7698204997888964, 'q42': 0.7579237460681981, 'q44': 0.6058794570542323, 'q45': 0.8922521020745832, 'q46': 0.7337495705923308}
0.7237702367535559
