In [37]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import nltk
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import csv
import numpy as np
import pandas as pd

# Gensim Corpus and Tf.idf Model

# Gensim Corpus and Tf.Idf Model

## Document Reader

In [4]:

def documentReader(path, queries = False):
    """
    DocString
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), path)
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = '' if queries else xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documentos = documentReader('docs/docs-raw-texts')
NRO_DOCS = len(documentos)
DOCS_IDs = list(documentos.keys())
print(list(documentos.items())[0])

('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

### Tokenize

In [5]:
p = PorterStemmer()
def process(text):
    doc_nor = text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    return nltk.word_tokenize(doc_stem)

docDict = []
for key, doc in documentos.items():
    docDict.append(process(doc))

docDict[0][:5]

NameError: name 'documentos' is not defined

In [6]:
dictionary = corpora.Dictionary(docDict)
dictionary.save('docs/midict.dict')
print(dictionary.token2id['information'])

76


In [7]:
##  Market Matrix format
# Step 1: Build the corpus from big file
class MyCorpus():
    def __init__(self, documents):
        self.documents = documents
    def __iter__(self):
        for key, doc in self.documents.items():
            yield dictionary.doc2bow(process(doc))

corpus_memory_friendly = MyCorpus(documentos)
corpora.MmCorpus.serialize("docs/corpus.mm",corpus_memory_friendly)

#### Read Maket Matrix format from disk

In [8]:
corpus = corpora.MmCorpus("docs/corpus.mm")
# No hacer esto en una implementacion real
for doc in corpus:
    print(doc[:10])
    break

[(0, 20.0), (1, 21.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0)]


#### Build tf.idf model from corpus

In [3]:
dictionary = corpora.Dictionary.load('docs/midict.dict')
corpus = corpora.MmCorpus('docs/corpus.mm')
tfidf = models.TfidfModel(corpus)

In [6]:
#Test to verify correct reading
query = "Machine learning"
query_doc_bow = dictionary.doc2bow(process(query)) # Important: Same corpus preprocess
print(query_doc_bow)
print(tfidf[query_doc_bow])

[(241, 1), (5809, 1)]
[(241, 0.2642196547502339), (5809, 0.9644625311766483)]


#### Make similarity matrix

In [7]:
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save('docs/similmatrix.index')
print('Finished')

Finished


## Querying and validating

In [None]:
index = similarities.MatrixSimilarity.load('docs/similmatrix.index')
sims = index[tfidf[query_doc_bow]]
print(list(enumerate(sims))[:10])


### Read and proccess queries

In [29]:
def queries_reader():
    """
    :return:
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    #print(documents_paths)
    query_index = []
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
        query_index.append(id)
    return queries, query_index

queries, query_index = queries_reader()

In [34]:
def queries_evaluation(queries):
    queries_rank = {}
    for idq, query in queries.items():
        query_doc_bow = dictionary.doc2bow(process(query))
        sims = index[tfidf[query_doc_bow]]
        sorted_vals = sorted(list(enumerate(sims)), key=lambda x: x[1], reverse=True)
        clean_query_scores = [ "d{0:0=3d}".format(id+1) for id,v in sorted_vals if v != 0]
        queries_rank[idq] = clean_query_scores
    return queries_rank


queries_ranking = queries_evaluation(queries)
print(queries_ranking["q01"])

['d016', 'd259', 'd254', 'd186', 'd085', 'd209', 'd215', 'd170', 'd153', 'd008', 'd185', 'd154', 'd163', 'd315', 'd296', 'd060', 'd089', 'd243', 'd004', 'd006', 'd162', 'd100', 'd094', 'd179', 'd145', 'd059', 'd039', 'd329', 'd299', 'd273', 'd312', 'd028', 'd311', 'd082', 'd281', 'd255', 'd065', 'd074', 'd317', 'd265', 'd229', 'd275', 'd130', 'd021', 'd077', 'd152', 'd195', 'd052', 'd316', 'd038', 'd164', 'd024', 'd123', 'd136', 'd184']


In [26]:
def read_judgemnts_file():
    """
    DocString
    :return: Nothing
    """
    document_path = os.path.join(os.getcwd(), 'docs/relevance-judgments.tsv')
    tsv_file = open(document_path)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    relevance = {}
    for row in read_tsv:
        documents = row[1].split(',')
        query_relevance = {pair.split(':')[0] : pair.split(':')[1] for pair in documents }
        query_relevance = dict(sorted(query_relevance.items(), key=lambda item: item[0]))
        relevance[row[0]] = query_relevance
    return relevance


relevance = read_judgemnts_file()
print(relevance['q02'])

{'d136': '2', 'd139': '2', 'd143': '4', 'd147': '2', 'd149': '2', 'd164': '4', 'd228': '4', 'd283': '4', 'd291': '4', 'd293': '4', 'd318': '2'}


In [35]:
def make_binary_result(results, relevant_res):
    bin_relevant = {}
    rel_scale_repr = {}
    for query, relevant_docs in relevant_res.items():
        bin_repr = []
        scaled_repr = []
        M = len(relevant_docs)
        for doc_id, rel_scale in relevant_docs.items():
            bin = 1 if doc_id in results[query][:M] else 0
            bin_repr.append(bin)
            scaled_repr.append(bin * int(rel_scale))
        bin_relevant[query] = bin_repr
        rel_scale_repr[query] = scaled_repr
    return bin_relevant, rel_scale_repr

bin_results, scaled_results = make_binary_result(queries_ranking, relevance)
bin_results['q02']

[0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0]

### Definition of IR metrics functions

In [38]:
def precision_at_k(relevance: list, k: int):
    """
    DocString
    :return: Nothing
    """
    if k == 0:
        return 0
    l = np.array(relevance[:k]).sum()/k
    return l

def recall_at_k(relevance: list, nr_relevant: int, k: int):
    """
    DocString
    :return: Nothing
    """
    l = np.array(relevance[:k]).sum()/nr_relevant
    return l

def average_precision(relevance):
    """
    DocString
    :return: Nothing
    """

    length = len(relevance)
    sum = 0
    for i in range(length):
        if relevance[i]:
            sum += precision_at_k(relevance, i+1)
    if np.array(relevance).sum()==0:
        return 0
    else:
        return sum / np.array(relevance).sum()

def mean_avg_precision(l):
    """
    DocString
    :return: Nothing
    """
    average = 0
    for lista in l:
        #print(average_precision(lista))
        average+= average_precision(lista)

    mean = average / len(l)
    return mean

mean_avg_precision([[0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 1], [0, 1, 0, 1, 1, 1, 1]])

0.35468253968253965

In [39]:
def dcg_at_k(relevance, k: int):
    """
    DocString
    :return: Nothing
    """

    sum = 0
    i =  0
    for rel_i in relevance[: k]:
        i+= 1
        sum += rel_i/np.log2(max(i, 2))

    return sum

dcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

def ndcg_at_k(relevance, k):
    """
    DocString
    :return: Nothing
    """
    rel_sorted = sorted(relevance, reverse=True)
    max = dcg_at_k(rel_sorted, k)
    real = dcg_at_k(relevance, k)

    return real/ max


ndcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

0.7424602308163405

In [40]:
print(recall_at_k(bin_results['q01'], 3, 3))


0.6666666666666666


## Compute Evaluation Metrics for each query

In [42]:
def evaluation_metric(bin_queries, query_index, scaled_results):
    COLUMNS = ['P@M', 'R@M', 'NDCG@M']
    records = []
    for query, bin_vec in bin_queries.items():
        scaled = scaled_results[query]
        M = len(bin_vec)
        pm = precision_at_k(bin_vec, M)
        rm = recall_at_k(bin_vec, M, M)
        ndcg = ndcg_at_k(scaled, M)
        records.append([pm, rm, ndcg])

    return pd.DataFrame.from_records(records, index=query_index, columns=COLUMNS)

metrics = evaluation_metric(bin_results, query_index, scaled_results)
metrics.head(10)


  return real/ max


Unnamed: 0,P@M,R@M,NDCG@M
q01,0.666667,0.666667,0.815465
q02,0.363636,0.363636,0.428656
q03,0.5,0.5,0.567635
q04,0.714286,0.714286,0.92821
q06,0.666667,0.666667,0.691704
q07,0.5,0.5,0.8
q08,0.666667,0.666667,0.837297
q09,0.833333,0.833333,0.880115
q10,0.375,0.375,0.577633
q12,1.0,1.0,0.989111


### MAP

In [43]:
def overall_map(bin_results):
    matrix = [vector for key, vector in bin_results.items() ]
    return mean_avg_precision(matrix)

print(f'MAP resultante de todas las queries: {overall_map(bin_results)}')

MAP resultante de todas las queries: 0.6533977342259655
