In [61]:
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Ranked Retrieval and Document Vectorization

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [54]:
def documentReader():
    """
    DocString
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documentos = documentReader()
print(list(documentos.items())[0])

[('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and s

In [64]:
def tokenization(documentos):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    p_stemmer = nltk.stem.porter.PorterStemmer()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    nltk_stemedList_en = {key: [p_stemmer.stem(word) for word in doc] for key, doc in word_tok_sw.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in nltk_stemedList_en.items()}

    return nltk_lemmaList
tokens = tokenization(documentos)
print(list(tokens.items())[0])

('d001', ['william', 'beaumont', 'human', 'digest', 'william', 'beaumont', 'human', 'digest', '.', 'william', 'beaumont', ':', 'physiolog', 'digest', 'imag', 'sourc', '.', 'On', 'novemb', '21', ',', '1785', ',', 'us-american', 'surgeon', 'william', 'beaumont', 'born', '.', 'He', 'becam', 'best', 'known', '“', 'father', 'gastric', 'physiolog', '”', 'follow', 'research', 'human', 'digest', '.', 'william', 'beaumont', 'born', 'lebanon', ',', 'connecticut', 'becam', 'physician', '.', 'He', 'serv', 'surgeon', '’', 'mate', 'armi', 'war', '1812', '.', 'He', 'open', 'privat', 'practic', 'plattsburgh', ',', 'new', 'york', ',', 'rejoin', 'armi', 'surgeon', '1819', '.', 'beaumont', 'station', 'fort', 'mackinac', 'mackinac', 'island', 'michigan', 'earli', '1820', 'exist', 'protect', 'interest', 'american', 'fur', 'compani', '.', 'the', 'fort', 'becam', 'refug', 'wound', '19-year-old', 'french-canadian', 'fur', 'trader', 'name', 'alexi', 'st.', 'martin', 'shotgun', 'went', 'accid', 'american', 'fur

('d001', ['william', 'beaumont', 'human', 'digest', 'william', 'beaumont', 'human', 'digest', '.', 'william', 'beaumont', ':', 'physiolog', 'digest', 'imag', 'sourc', '.', 'On', 'novemb', '21', ',', '1785', ',', 'us-american', 'surgeon', 'william', 'beaumont', 'born', '.', 'He', 'becam', 'best', 'known', '“', 'father', 'gastric', 'physiolog', '”', 'follow', 'research', 'human', 'digest', '.', 'william', 'beaumont', 'born', 'lebanon', ',', 'connecticut', 'becam', 'physician', '.', 'He', 'serv', 'surgeon', '’', 'mate', 'armi', 'war', '1812', '.', 'He', 'open', 'privat', 'practic', 'plattsburgh', ',', 'new', 'york', ',', 'rejoin', 'armi', 'surgeon', '1819', '.', 'beaumont', 'station', 'fort', 'mackinac', 'mackinac', 'island', 'michigan', 'earli', '1820', 'exist', 'protect', 'interest', 'american', 'fur', 'compani', '.', 'the', 'fort', 'becam', 'refug', 'wound', '19-year-old', 'french-canadian', 'fur', 'trader', 'name', 'alexi', 'st.', 'martin', 'shotgun', 'went', 'accid', 'american', 'fur