In [2]:
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Ranked Retrieval and Document Vectorization

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def documentReader():
    """
    DocString
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documentos = documentReader()
NRO_DOCS = len(documentos)
DOCS_IDs = list(documentos.keys())
print(list(documentos.items())[0])

('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [7]:
def tokenization(documentos):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    p_stemmer = nltk.stem.porter.PorterStemmer()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    # nltk_stemedList_en = {key: [p_stemmer.stem(word) for word in doc] for key, doc in word_tok_sw.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in word_tok_sw.items()}

    return nltk_lemmaList
tokenized_docs = tokenization(documentos)
print(list(tokenized_docs.items())[0])

('d001', ['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', '.', 'William', 'Beaumont', ':', 'Physiology', 'digestion', 'Image', 'Source', '.', 'On', 'November', '21', ',', '1785', ',', 'US-American', 'surgeon', 'William', 'Beaumont', 'born', '.', 'He', 'became', 'best', 'known', '“', 'Father', 'Gastric', 'Physiology', '”', 'following', 'research', 'human', 'digestion', '.', 'William', 'Beaumont', 'born', 'Lebanon', ',', 'Connecticut', 'became', 'physician', '.', 'He', 'served', 'surgeon', '’', 'mate', 'Army', 'War', '1812', '.', 'He', 'opened', 'private', 'practice', 'Plattsburgh', ',', 'New', 'York', ',', 'rejoined', 'Army', 'surgeon', '1819', '.', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', '.', 'The', 'fort', 'became', 'refuge', 'wounded', '19-year-old', 'French-Canadian', 'fur', 'trader', 'named', 'Alexis', 'St.', 'Martin', 's

In [8]:
def makeInvertedIndex(tokenized_docs):
    index = {}

    for id, doc in tokenized_docs.items():
        id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token in index :
                if index[token]['posting'][-1][0] == id:
                    index[token]['posting'][-1][1] += 1
                else:
                    index[token]['posting'].append([id, 1])
                    index[token]['freq'] += 1

            else:
                index[token] = {
                    'posting': [[id, 1]],
                    'freq': 1
                }
    return index

invertedIndex = makeInvertedIndex(tokenized_docs)
print(list(invertedIndex.items())[0])


('William', {'posting': [[1, 6], [15, 6], [28, 4], [35, 2], [55, 4], [56, 5], [69, 6], [88, 3], [91, 1], [92, 1], [95, 1], [98, 2], [102, 5], [106, 1], [109, 1], [111, 1], [129, 1], [136, 8], [138, 3], [147, 1], [175, 1], [179, 2], [180, 1], [189, 2], [190, 1], [191, 1], [197, 1], [212, 1], [230, 1], [241, 2], [254, 1], [257, 1], [266, 2], [272, 1], [273, 8], [274, 1], [289, 1], [291, 1], [294, 1], [299, 1], [300, 1], [309, 1], [310, 5], [320, 6], [323, 1], [330, 7]], 'freq': 46})


In [10]:
print(list(invertedIndex.items())[1])
print(len(list(invertedIndex.keys())))

('Beaumont', {'posting': [[1, 13]], 'freq': 1})
20446


In [17]:
def tfidfWeightedVector(invertedIndex):

    weightedVectorMatrix = []
    index = []
    columns = []
    for term, term_dict in invertedIndex.items():
        weighted_vector = np.zeros(NRO_DOCS)
        freq = term_dict['freq']
        index.append(term)
        for id, t_freq in term_dict['posting']:
            tfidf = np.log(1 + t_freq) * np.log10(NRO_DOCS / freq)
            weighted_vector[ id - 1] = tfidf

        weightedVectorMatrix.append(weighted_vector)


    weighted_vector_df = pd.DataFrame.from_records(data=weightedVectorMatrix, index=index, columns=DOCS_IDs)
    return weighted_vector_df

weighted_vector_df = tfidfWeightedVector(invertedIndex)
weighted_vector_df.head()

Unnamed: 0,d001,d002,d003,d004,d005,d006,d007,d008,d009,d010,...,d322,d323,d324,d325,d326,d327,d328,d329,d330,d331
William,1.667782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.594076,0.0,0.0,0.0,0.0,0.0,0.0,1.782227,0.0
Beaumont,6.649971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Human,1.36346,0.860247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Digestion,3.493223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
print(f'Matriz tfidf de dimension {weighted_vector_df.shape}')

Matriz tfidf de dimension (20446, 331)


In [None]:

def cosine_Similarity(doc_vec1, doc_vec2):
    return (doc_vec1 @ doc_vec2) / (doc_vec1.abs() * doc_vec2.abs())


In [25]:
pdf = pd.DataFrame.from_records([[1, 2, 3], [5, 6, 7]], index=['a', 'b'], columns=['c', 'd', 'e'])
pdf2 = pd.DataFrame.from_records([[1, 2, 3], [5, 6, 7]], index=['a', 'b'], columns=['c', 'd', 'e'])
pdf

Unnamed: 0,c,d,e
a,1,2,3
b,5,6,7


In [42]:
print((pdf @ pdf2) / (pdf.iloc[[1]].abs()*pdf2.iloc[[1]].abs()))
pdf.iloc[[1]]

Unnamed: 0,c,d,e
b,5,6,7
