In [64]:
# Basic Ranked Retrieval (RRI)
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
import ssl
import math
try:
     _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
     pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
## Imports
def documentReader():
    """
    DocString Reads the documents
    :return: Nothing
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    documents_paths = os.listdir(documents_path)
    documents_paths.sort()
    #print(documents_paths)
    for filename in documents_paths:
        file_path = os.path.join(documents_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')
    return documentos

documentos = documentReader()
print(list(documentos.items())[0])


('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [66]:
def tokenization(documentos):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    p_stemmer = nltk.stem.porter.PorterStemmer()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    # nltk_stemedList_en = {key: [p_stemmer.stem(word) for word in doc] for key, doc in word_tok_sw.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in word_tok_sw.items()}

    return nltk_lemmaList

tokenized_docs = tokenization(documentos)
print(list(tokenized_docs.items())[0])

('d001', ['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', '.', 'William', 'Beaumont', ':', 'Physiology', 'digestion', 'Image', 'Source', '.', 'On', 'November', '21', ',', '1785', ',', 'US-American', 'surgeon', 'William', 'Beaumont', 'born', '.', 'He', 'became', 'best', 'known', '“', 'Father', 'Gastric', 'Physiology', '”', 'following', 'research', 'human', 'digestion', '.', 'William', 'Beaumont', 'born', 'Lebanon', ',', 'Connecticut', 'became', 'physician', '.', 'He', 'served', 'surgeon', '’', 'mate', 'Army', 'War', '1812', '.', 'He', 'opened', 'private', 'practice', 'Plattsburgh', ',', 'New', 'York', ',', 'rejoined', 'Army', 'surgeon', '1819', '.', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', '.', 'The', 'fort', 'became', 'refuge', 'wounded', '19-year-old', 'French-Canadian', 'fur', 'trader', 'named', 'Alexis', 'St.', 'Martin', 's

In [67]:
def makeInvertedIndex(tokenized_docs):
    index = {}

    for id, doc in tokenized_docs.items():
        #id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token in index :
                if index[token]['posting'][-1][0] == id:
                    index[token]['posting'][-1][1] += 1
                else:
                    index[token]['posting'].append([id, 1])
                    index[token]['freq'] += 1

            else:
                index[token] = {
                    'posting': [[id, 1]],
                    'freq': 1
                }
    return index


invertedIndex = makeInvertedIndex(tokenized_docs)
print(list(invertedIndex.items())[0])

('William', {'posting': [['d001', 6], ['d015', 6], ['d028', 4], ['d035', 2], ['d055', 4], ['d056', 5], ['d069', 6], ['d088', 3], ['d091', 1], ['d092', 1], ['d095', 1], ['d098', 2], ['d102', 5], ['d106', 1], ['d109', 1], ['d111', 1], ['d129', 1], ['d136', 8], ['d138', 3], ['d147', 1], ['d175', 1], ['d179', 2], ['d180', 1], ['d189', 2], ['d190', 1], ['d191', 1], ['d197', 1], ['d212', 1], ['d230', 1], ['d241', 2], ['d254', 1], ['d257', 1], ['d266', 2], ['d272', 1], ['d273', 8], ['d274', 1], ['d289', 1], ['d291', 1], ['d294', 1], ['d299', 1], ['d300', 1], ['d309', 1], ['d310', 5], ['d320', 6], ['d323', 1], ['d330', 7]], 'freq': 46})


In [68]:
def queries_reader():
    """
    DocString Reads the query
    :return:
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    #print(documents_paths)
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
    return queries

queries = queries_reader()
print(list(queries.items())[0])

('q01', 'Fabrication of music instruments')


In [69]:
def queries_tokenization(queries):
    """
    :param documentos:
    :return:
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    #print("items", queries.items())
    tokenized_queries = {}
    for key,doc in queries.items():
        word_tok = nltk.word_tokenize(doc)
        word_tok_sw = [token for token in word_tok if token not in nltk_stop_words_en]
        nltk_lemmaList = [wordnet_lemmatizer.lemmatize(word) for word in word_tok_sw]
        #print(nltk_lemmaList)
        tokenized_queries[key] = nltk_lemmaList

    return tokenized_queries



tokenized_queries = queries_tokenization(queries)
print(list(tokenized_queries.items())[0])

('q01', ['Fabrication', 'music', 'instrument'])


In [71]:
def basic_ranked_retrieval(queries,invertedIndex,documents,N):
    """
    DocString
    :return: Nothing
    """
    scores= {}
    query_scores_template = {}
    for key, doc in documents.items():
        query_scores_template[key] = 0

    for query,tokens in queries.items():
        query_scores = query_scores_template.copy()
        for token in tokens:
            if token in invertedIndex:
                df = invertedIndex[token]["freq"]
                idf = math.log10( N / df )
                for docs in invertedIndex[token]["posting"]:
                    tf = docs[1]
                    tf_w = math.log10(1 + tf)
                    #if docs[0] not in query_scores:
                    #    query_scores[docs[0]] = 0
                    query_scores[docs[0]] += tf_w*idf
        clean_query_scores = { k : v for k,v in query_scores.items() if v != 0}
        clean_query_scores = dict(sorted(clean_query_scores.items(), key=lambda item: item[1], reverse=True))
        scores[query] = clean_query_scores#query_scores

    return scores

RRI = basic_ranked_retrieval(tokenized_queries,invertedIndex, documentos, len(documentos))
print(list(RRI.items())[0])


('q01', {'d254': 1.3322084124448144, 'd016': 1.280133266247014, 'd085': 0.7608038472948082, 'd185': 0.7210322829593981, 'd209': 0.7210322829593981, 'd060': 0.6881765238016224, 'd100': 0.6881765238016224, 'd153': 0.6881765238016224, 'd186': 0.6553207646438466, 'd006': 0.571404565150006, 'd215': 0.571404565150006, 'd099': 0.5193294189522057, 'd243': 0.5193294189522057, 'd004': 0.36051614147969907, 'd039': 0.36051614147969907, 'd065': 0.36051614147969907, 'd094': 0.36051614147969907, 'd130': 0.36051614147969907, 'd136': 0.36051614147969907, 'd152': 0.36051614147969907, 'd162': 0.36051614147969907, 'd164': 0.36051614147969907, 'd184': 0.36051614147969907, 'd195': 0.36051614147969907, 'd312': 0.36051614147969907, 'd316': 0.36051614147969907, 'd028': 0.3276603823219233, 'd038': 0.3276603823219233, 'd074': 0.3276603823219233, 'd082': 0.3276603823219233, 'd116': 0.3276603823219233, 'd170': 0.3276603823219233, 'd172': 0.3276603823219233, 'd212': 0.3276603823219233, 'd229': 0.3276603823219233, '