In [1]:
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import nltk
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os



# Gensim Corpus and Tf.idf Model

# Gensim Corpus and Tf.Idf Model

## Document Reader

In [9]:

def documentReader(path, queries = False):
    """
    DocString
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), path)
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = '' if queries else xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documentos = documentReader('docs/docs-raw-texts')
NRO_DOCS = len(documentos)
DOCS_IDs = list(documentos.keys())
print(list(documentos.items())[0])

('d038', 'Evangelista Torricelli and the Barometer Evangelista Torricelli and the Barometer.  Evangelista Torricelli (1608-1647). On October 15, 1608, Italian physicist and mathematician Evangelista Torricelli was born, best known for his invention of the barometer, but is also known for his advances in Optics. Evangelista Torricelli was born in Rome, the firstborn child of Gaspare Ruberti, a poor textile worker, and Giacoma Torricelli. His family was from Faenza in the Province of Ravenna, then part of the Papal States. His parents sent Evangelista to be educated in Faenza, under the care of his uncle, Jacobo, a Camaldolese monk, who after a basic education took Torricelli into a Jesuit College in 1624, to study mathematics and philosophy. Then Torricelli went to Rome to study science under the Benedictine monk Benedetto Castelli, a student of Galileo Galilei. While in Rome, Torricelli became also the student of the mathematician, Bonaventura Cavalieri, with whom he became great frien

### Tokenize

In [10]:
p = PorterStemmer()
def process(text):
    doc_nor = text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    return nltk.word_tokenize(doc_stem)

docDict = []
for key, doc in documentos.items():
    docDict.append(process(doc))

docDict[0][:5]

['evangelista', 'torricelli', 'baromet', 'evangelista', 'torricelli']

In [6]:
dictionary = corpora.Dictionary(docDict)
dictionary.save('docs/midict.dict')
print(dictionary.token2id['information'])

76


In [7]:
##  Market Matrix format
# Step 1: Build the corpus from big file
class MyCorpus():
    def __init__(self, documents):
        self.documents = documents
    def __iter__(self):
        for key, doc in self.documents.items():
            yield dictionary.doc2bow(process(doc))

corpus_memory_friendly = MyCorpus(documentos)
corpora.MmCorpus.serialize("docs/corpus.mm",corpus_memory_friendly)

#### Read Maket Matrix format from disk

In [8]:
corpus = corpora.MmCorpus("docs/corpus.mm")
# No hacer esto en una implementacion real
for doc in corpus:
    print(doc[:10])
    break

[(0, 20.0), (1, 21.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0)]


#### Build tf.idf model from corpus

In [4]:
dictionary = corpora.Dictionary.load('docs/midict.dict')
corpus = corpora.MmCorpus('docs/corpus.mm')
tfidf = models.TfidfModel(corpus)

In [11]:
#Test to verify correct reading
query = "Machine learning"
query_doc_bow = dictionary.doc2bow(process(query)) # Important: Same corpus preprocess
print(query_doc_bow)
print(tfidf[query_doc_bow])

[(241, 1), (5809, 1)]
[(241, 0.2642196547502339), (5809, 0.9644625311766483)]


#### Make similarity matrix

In [5]:
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save('docs/similmatrix.index')
print('Finished')

Finished


## Querying and validating

In [12]:
index = similarities.MatrixSimilarity.load('docs/similmatrix.index')
sims = index[tfidf[query_doc_bow]]
print(list(enumerate(sims))[:10])


[(0, 0.0), (1, 0.0017983386), (2, 0.0008260712), (3, 0.0), (4, 0.0), (5, 0.0012694553), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0)]


### Read and proccess queries

In [13]:
def queries_reader():
    """
    :return:
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    #print(documents_paths)
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
    return queries

queries = queries_reader()

In [16]:
def queries_evaluation(queries):
    queries_rank = {}
    for idq, query in queries.items():
        query_doc_bow = dictionary.doc2bow(process(query))
        sims = index[tfidf[query_doc_bow]]
        #print(list(enumerate(sims)))
        sorted_vals = sorted(list(enumerate(sims)), key=lambda x: x[1], reverse=True)
        #print(sorted_vals)
        clean_query_scores = [ id+1 for id,v in sorted_vals if v != 0]
        queries_rank[idq] = clean_query_scores

    return queries_rank


queries_ranking = queries_evaluation(queries)
print(queries_ranking["q01"])




[16, 259, 254, 186, 85, 209, 215, 170, 153, 8, 185, 154, 163, 315, 296, 60, 89, 243, 4, 6, 162, 100, 94, 179, 145, 59, 39, 329, 299, 273, 312, 28, 311, 82, 281, 255, 65, 74, 317, 265, 229, 275, 130, 21, 77, 152, 195, 52, 316, 38, 164, 24, 123, 136, 184]
