In [37]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import nltk
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import csv
import numpy as np
import pandas as pd

# Gensim Corpus and Tf.idf Model

# Gensim Corpus and Tf.Idf Model

## Document Reader

In [4]:

def documentReader(path, queries = False):
    """
    Read every XML file of a directory into a dictionary of raw texts.

    Files are visited in sorted filename order so that the resulting
    dictionary has the same order on every run and on every platform.
    :param path: directory with the XML files, resolved against the current
    working directory
    :param queries: when True the title attribute is not read and the text
    consists of the raw body only (preceded by a single space)
    :return: Dictionary of documents {dXXX: content of document dXXX}
    """
    documents_path = os.path.join(os.getcwd(), path)
    documentos = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # dictionary order reproducible.
    for filename in sorted(os.listdir(documents_path)):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        doc_id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = '' if queries else xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        # An empty <raw/> element has text None; treat it as an empty body
        # instead of failing on the string concatenation below.
        data = next(ElementTree.parse(file_path).iter('raw')).text or ''
        # Non-breaking spaces and newlines are flattened to plain spaces.
        documentos[doc_id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')
    return documentos

# Load the document collection and keep its ids and size at hand.
documentos = documentReader('docs/docs-raw-texts')
DOCS_IDs = list(documentos)
NRO_DOCS = len(DOCS_IDs)

# Peek at the first (id, text) pair as a sanity check.
print(list(documentos.items())[0])

('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

### Tokenize

In [5]:
p = PorterStemmer()
def process(text):
    """
    Normalise a raw text into a list of tokens.

    The text is lower-cased, stripped of stopwords with gensim's
    ``remove_stopwords``, stemmed with the Porter stemmer and finally
    split with ``nltk.word_tokenize``.
    :param text: the text to be tokenized
    :return: the list of tokens produced by ``nltk.word_tokenize``
    """
    without_stopwords = remove_stopwords(text.lower())
    stemmed = p.stem_sentence(without_stopwords)
    tokens = nltk.word_tokenize(stemmed)
    return tokens

# Tokenize every document of the collection; the result is a list of
# token lists in the iteration order of `documentos`.
docDict = [process(text) for text in documentos.values()]

docDict[0][:5]

NameError: name 'documentos' is not defined

In [6]:
# Build the gensim dictionary from the tokenized documents and save it
# to disk so that it can be reloaded later.
dictionary = corpora.Dictionary(docDict)
dictionary.save('docs/midict.dict')
# Show the integer id assigned to one sample token.
print(dictionary.token2id['information'])

76


In [7]:
## Matrix Market format
# Streaming corpus: documents are converted to bag-of-words one at a time
class MyCorpus():
    """
    Iterable over a {doc id: text} dictionary that yields, for each text,
    the bag-of-words vector produced by the global `dictionary` after the
    text went through `process`.
    """

    def __init__(self, documents):
        self.documents = documents

    def __iter__(self):
        # Only the texts are needed; the document ids are ignored here.
        for text in self.documents.values():
            yield dictionary.doc2bow(process(text))

# Wrap the documents in the streaming corpus and write it to disk with
# gensim's MmCorpus serializer.
corpus_memory_friendly = MyCorpus(documentos)
corpora.MmCorpus.serialize("docs/corpus.mm",corpus_memory_friendly)

#### Read Matrix Market format from disk

In [8]:
# Reload the serialized corpus from disk and print the first ten entries
# of its first document as a sanity check.
corpus = corpora.MmCorpus("docs/corpus.mm")
for doc in corpus:
    print(doc[:10])
    break  # only the first document is inspected

[(0, 20.0), (1, 21.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0)]


#### Build tf.idf model from corpus

In [3]:
# Reload the saved dictionary and corpus from disk, then build the tf-idf
# model from the corpus.
dictionary = corpora.Dictionary.load('docs/midict.dict')
corpus = corpora.MmCorpus('docs/corpus.mm')
tfidf = models.TfidfModel(corpus)

In [6]:
# Smoke test: convert a sample query to bag-of-words and to tf-idf weights.
query = "Machine learning"
# The query goes through the same `process` function as the documents.
query_doc_bow = dictionary.doc2bow(process(query)) # Important: Same corpus preprocess
print(query_doc_bow)
print(tfidf[query_doc_bow])

[(241, 1), (5809, 1)]
[(241, 0.2642196547502339), (5809, 0.9644625311766483)]


#### Make similarity matrix

In [7]:
# Build the similarity index over the tf-idf weighted corpus and save it.
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save('docs/similmatrix.index')
print('Finished')

Finished


## Querying and validating

In [None]:
# Reload the saved similarity index and score the sample query with it.
index = similarities.MatrixSimilarity.load('docs/similmatrix.index')
sims = index[tfidf[query_doc_bow]]
# Print the first ten (position, similarity) pairs.
print(list(enumerate(sims))[:10])


### Read and process queries

In [29]:
def queries_reader(path='docs/queries-raw-texts'):
    """
    Read every query XML file of a directory.

    Files are visited in sorted filename order.
    :param path: directory with the query files, resolved against the
    current working directory; defaults to the location used so far
    :return: Tuple (queries, query_index) where queries is the dictionary
    {qYY: content of query qYY} and query_index is the list of query ids in
    the order in which the files were read
    """
    queries_path = os.path.join(os.getcwd(), path)
    queries = {}
    query_index = []
    for filename in sorted(os.listdir(queries_path)):
        file_path = os.path.join(queries_path, filename)
        xmldoc = minidom.parse(file_path)
        query_id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        # An empty <raw/> element has text None; treat it as an empty query
        # instead of failing on the .replace() call below.
        raw_text = next(ElementTree.parse(file_path).iter('raw')).text or ''
        # Non-breaking spaces and newlines are flattened to plain spaces.
        queries[query_id] = raw_text.replace(u'\xa0', u' ').replace('\n', ' ')
        query_index.append(query_id)
    return queries, query_index

# Read all queries: `queries` maps query id to text, `query_index` lists the ids in reading order.
queries, query_index = queries_reader()

In [34]:
def queries_evaluation(queries, doc_ids=None):
    """
    Rank the documents for every query.

    Each query is tokenized with `process`, converted to bag-of-words with
    the global `dictionary`, weighted with the global `tfidf` model and
    scored against the global similarity `index`. Documents whose
    similarity is exactly 0 are left out of the ranking.
    :param queries: queries in a dictionary, {qYY: content of query qYY}
    :param doc_ids: optional sequence giving the document id for each
    position of the index. When omitted, position i is labelled
    "d" + (i + 1) zero-padded to three digits, which is only right if the
    corpus was built from documents d001, d002, ... in that exact order
    :return: Dictionary of queries and their ranked relevant documents
    {'qXX': ['dYYY', 'dZZZ',...]}, most similar document first
    """
    queries_rank = {}
    for idq, query in queries.items():
        query_doc_bow = dictionary.doc2bow(process(query))
        sims = index[tfidf[query_doc_bow]]
        # sorted() is stable, so equally scored documents keep index order.
        ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
        if doc_ids is None:
            ranking = ["d{0:0=3d}".format(pos + 1) for pos, score in ranked if score != 0]
        else:
            ranking = [doc_ids[pos] for pos, score in ranked if score != 0]
        queries_rank[idq] = ranking
    return queries_rank


# Rank the documents for every query and show the ranking obtained for q01.
queries_ranking = queries_evaluation(queries)
print(queries_ranking["q01"])

['d016', 'd259', 'd254', 'd186', 'd085', 'd209', 'd215', 'd170', 'd153', 'd008', 'd185', 'd154', 'd163', 'd315', 'd296', 'd060', 'd089', 'd243', 'd004', 'd006', 'd162', 'd100', 'd094', 'd179', 'd145', 'd059', 'd039', 'd329', 'd299', 'd273', 'd312', 'd028', 'd311', 'd082', 'd281', 'd255', 'd065', 'd074', 'd317', 'd265', 'd229', 'd275', 'd130', 'd021', 'd077', 'd152', 'd195', 'd052', 'd316', 'd038', 'd164', 'd024', 'd123', 'd136', 'd184']


## Evaluation

In [35]:
def read_judgemnts_file(path='docs/relevance-judgments.tsv'):
    """
    Read the judges' relevance file.

    Each row holds a query id in the first column and, in the second
    column, comma-separated "doc_id:score" pairs. Blank rows are skipped.
    :param path: location of the TSV file, resolved against the current
    working directory; defaults to the location used so far
    :return: Dictionary whose keys are the query ids and whose values are
    dictionaries {doc id: score (as read, a string)} of the relevant
    documents of that query, ordered by ascending doc id.
    """
    document_path = os.path.join(os.getcwd(), path)
    relevance = {}
    # The with-block guarantees the file is closed even if a row is malformed.
    with open(document_path, newline='') as tsv_file:
        for row in csv.reader(tsv_file, delimiter="\t"):
            if not row:
                continue
            query_relevance = {}
            for pair in row[1].split(','):
                parts = pair.split(':')
                query_relevance[parts[0]] = parts[1]
            relevance[row[0]] = dict(sorted(query_relevance.items(), key=lambda item: item[0]))
    return relevance


# Load the relevance judgements and show the entry for q01.
relevance = read_judgemnts_file()
print(relevance['q01'])

[0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0]

In [None]:
def make_binary_result(results, relevant_res):
    """
    Turn the raw rankings into the three representations used by the metrics.

    For every query of `relevant_res`, with M the number of judged
    documents of that query:
    1. binary representation at M: one entry per judged document, in the
       order of the judgement dictionary, 1 if that document appears among
       the first M ranked documents and 0 otherwise;
    2. the same list with every 1 replaced by the judged score (as int);
    3. for MAP: a pair [hits, M] where hits follows the ranking order
       (1 if the ranked document is judged, else 0) and stops right after
       the M-th judged document, or at the end of the ranking.
    A query that has judgements but no ranking is treated as an empty ranking.
    NOTE(review): representations 1 and 2 follow the judgement order, not
    the ranking order - confirm this is intended for order-sensitive metrics.
    :param results: Dictionary with the raw ranking of each query.
    E.g.: {'q01': ['d254', 'd016', 'd153', ...]}
    :param relevant_res: Dictionary {query id: {doc id: score}} with the
    judged documents of each query
    :return: Tuple with the three dictionaries described above, each keyed
    by query id
    """
    bin_relevant = {}
    rel_scale_repr = {}
    map_relevant_docs = {}
    for query, relevant_docs in relevant_res.items():
        M = len(relevant_docs)
        ranked_docs = results.get(query, [])
        # Built once per query instead of slicing the ranking per document.
        top_m = set(ranked_docs[:M])
        bin_repr = [1 if doc_id in top_m else 0 for doc_id in relevant_docs]
        scaled_repr = [hit * int(rel_scale)
                       for hit, rel_scale in zip(bin_repr, relevant_docs.values())]
        map_repr = []
        found = 0
        for doc_id in ranked_docs:
            if found >= M:
                # Every judged document has been seen; the tail is not needed.
                break
            hit = 1 if doc_id in relevant_docs else 0
            found += hit
            map_repr.append(hit)
        bin_relevant[query] = bin_repr
        rel_scale_repr[query] = scaled_repr
        map_relevant_docs[query] = [map_repr, M]
    return bin_relevant, rel_scale_repr, map_relevant_docs

# The rankings produced by queries_evaluation live in `queries_ranking`;
# the name `results` is never defined in this notebook.
bin_results, scaled_results, map_relevant_docs = make_binary_result(queries_ranking, relevance)
print(bin_results['q01'])
print(scaled_results['q01'])

In [38]:
# Side-by-side view for q01: top of the ranking, the judges' documents and
# the binary representation used for MAP. The ranking is read from
# `queries_ranking`; the name `results` is never defined in this notebook.
print('Primeros 5 documentos devueltos como relevantes para q01: \n', queries_ranking['q01'][:5])
print('Documentos relevantes para q01 según jueces: \n' , relevance['q01'])
print('Representación binaria de q01, hasta el último doc relevante: \n' ,map_relevant_docs['q01'])

0.35468253968253965

### Definition of IR metrics functions

In [40]:
def precision_at_k(relevance: list, k: int):
    """
    Precision at cut-off k.
    :param relevance: list of relevance flags, one per position
    :param k: cut-off; the sum of the first k entries is divided by k
    :return: the sum of the first k entries divided by k, or 0 when k is 0
    """
    if k != 0:
        return np.sum(relevance[:k]) / k
    return 0

def recall_at_k(relevance: list, nr_relevant: int, k: int):
    """
    Recall at cut-off k.
    :param relevance: list of relevance flags, one per position
    :param nr_relevant: total number of relevant documents for the query
    :param k: cut-off; only the first k entries are counted
    :return: the sum of the first k entries divided by nr_relevant, or 0
    when nr_relevant is 0 (mirrors the k == 0 guard of precision_at_k
    instead of dividing by zero)
    """
    if nr_relevant == 0:
        return 0
    return np.array(relevance[:k]).sum() / nr_relevant

def average_precision(relevance):
    """
    Average precision of one query.
    :param relevance: pair [hits, nr_relevant] where hits is the list of
    relevance flags in ranking order and nr_relevant is the number of
    relevant documents of the query
    :return: the sum of the precision values at every position holding a
    truthy flag, divided by nr_relevant; 0 when the flags add up to 0
    """
    hits, nr_relevant = relevance[0], relevance[1]
    precision_sum = 0
    running_total = 0
    # A running prefix sum gives precision at each rank in a single pass
    # instead of re-summing the prefix for every hit.
    for rank, flag in enumerate(hits, start=1):
        running_total += flag
        if flag:
            precision_sum += running_total / rank
    if running_total == 0:
        return 0
    return precision_sum / nr_relevant

def mean_avg_precision(l):
    """
    Mean average precision over several queries.
    :param l: iterable with one entry per query, each entry being a valid
    argument for average_precision
    :return: the mean of the average precision of every entry
    """
    per_query = [average_precision(entry) for entry in l]
    return np.mean(per_query)

# Sanity check on three hand-made queries, each given as [relevance flags, number of relevant documents].
mean_avg_precision([[[0, 0, 0, 0, 0, 0, 1], 1], [[0, 0, 0, 1, 1], 2], [[0, 1, 0, 1, 1, 1, 1], 5]])

0.6666666666666666


In [None]:
def dcg_at_k(relevance, k: int):
    """
    Discounted cumulative gain at cut-off k.

    The gain at rank i is divided by log2(max(i, 2)), so ranks 1 and 2
    are both left undiscounted.
    :param relevance: relevance values in ranking order
    :param k: cut-off; only the first k values are used
    :return: DCG at K of the query
    """
    total = 0
    for rank, gain in enumerate(relevance[:k], start=1):
        total += gain / np.log2(max(rank, 2))
    return total

# Example call on a graded relevance vector, cut off at the first 6 positions.
dcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

def ndcg_at_k(relevance, k):
    """
    Normalized DCG at cut-off k.

    The DCG of the vector is divided by the DCG of the same values sorted
    in descending order.
    :param relevance: relevance values in ranking order
    :param k: cut-off passed on to dcg_at_k
    :return: NDCG at K, or 0.0 when the ideal DCG is 0 (no gain in the
    vector), which would otherwise be a division of zero by zero
    """
    ideal_dcg = dcg_at_k(sorted(relevance, reverse=True), k)
    if ideal_dcg == 0:
        return 0.0
    return dcg_at_k(relevance, k) / ideal_dcg


# Example call on the same graded relevance vector, cut off at the first 6 positions.
ndcg_at_k([4, 4, 3, 0, 0, 1, 3, 3, 3, 0], 6)

In [42]:
# Recall at 3 for q01, using 3 as the number of relevant documents.
print(recall_at_k(bin_results['q01'], 3, 3))


  return real/ max


Unnamed: 0,P@M,R@M,NDCG@M
q01,0.666667,0.666667,0.815465
q02,0.363636,0.363636,0.428656
q03,0.5,0.5,0.567635
q04,0.714286,0.714286,0.92821
q06,0.666667,0.666667,0.691704
q07,0.5,0.5,0.8
q08,0.666667,0.666667,0.837297
q09,0.833333,0.833333,0.880115
q10,0.375,0.375,0.577633
q12,1.0,1.0,0.989111


## Compute Evaluation Metrics for each query

In [43]:
def evaluation_metric(bin_queries, query_index, scaled_results):
    """
    Compute P@M, R@M and NDCG@M for every query, where M is the length of
    the query's binary vector.

    :param bin_queries: Dictionary {query key: vector}, where the vector is
    the binary representation of the results found for a query with respect
    to the ones given in the evaluation file. E.g., for q01 the relevant
    documents are d186, d254, d016 and the system returns d254, d016, d153;
    therefore the binary representation of q01, in the order of the
    evaluation file, is [0, 1, 1]
    :param query_index: List with the query ids, ['q01', 'q02', ...]. It
    sets the rows of the result and their order; ids that have no entry in
    bin_queries are skipped
    :param scaled_results: Scaled representation of the query results using
    the scale given in the evaluation file. E.g., q01 goes from [0, 1, 1]
    to [0, 5, 5]
    :return: A dataframe with P@M, R@M and NDCG@M for each query
    """
    COLUMNS = ['P@M', 'R@M', 'NDCG@M']
    labels = []
    records = []
    # Look every vector up by its query id so that a row can never be
    # labelled with a different query than the one it was computed from.
    for query in query_index:
        if query not in bin_queries:
            continue
        bin_vec = bin_queries[query]
        M = len(bin_vec)
        records.append([
            precision_at_k(bin_vec, M),
            recall_at_k(bin_vec, M, M),
            ndcg_at_k(scaled_results[query], M),
        ])
        labels.append(query)

    return pd.DataFrame(records, index=labels, columns=COLUMNS)
        
# The list of query ids returned by queries_reader is named `query_index`;
# `queries_index` is never defined in this notebook.
metrics = evaluation_metric(bin_results, query_index, scaled_results)
metrics.head(10)

MAP resultante de todas las queries: 0.6533977342259655


### MAP

In [None]:
def overall_map(map_relevant_docs):
    """
    Compute the MAP of the query results.
    :param map_relevant_docs: Dictionary {query id: entry}; the keys are
    ignored and every entry is handed to mean_avg_precision
    :return: The mean average precision of the query results.
    """
    per_query_entries = list(map_relevant_docs.values())
    return mean_avg_precision(per_query_entries)

# Report the mean average precision over all the evaluated queries.
print(f'MAP resultante de todas las queries: {overall_map(map_relevant_docs)}')