# Search Engine. Binary Search

In [2]:
!pip install nltk
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def document_reader():
    """
    This method reads the documents
    :return: Dictionary of documents (di: content of document i)
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documents = document_reader()
print(list(documents.items())[0])

('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [5]:
def tokenization(documentos):
    """
    :param documentos: Receives a dictionary  
    :return: dict with key id of documents/queries and value is an array of terms
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    p_stemmer = nltk.stem.porter.PorterStemmer()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    word_tok = {key: nltk.word_tokenize(doc) for key, doc in documentos.items()}
    word_tok_sw = {key: [token for token in doc if token not in nltk_stop_words_en] for key, doc in word_tok.items()}
    # nltk_stemedList_en = {key: [p_stemmer.stem(word) for word in doc] for key, doc in word_tok_sw.items()}
    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(word) for word in doc] for key, doc in word_tok_sw.items()}

    return nltk_lemmaList
tokenized_docs = tokenization(documents)


print((list(tokenized_docs.items())[0]))

('d001', ['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', '.', 'William', 'Beaumont', ':', 'Physiology', 'digestion', 'Image', 'Source', '.', 'On', 'November', '21', ',', '1785', ',', 'US-American', 'surgeon', 'William', 'Beaumont', 'born', '.', 'He', 'became', 'best', 'known', '“', 'Father', 'Gastric', 'Physiology', '”', 'following', 'research', 'human', 'digestion', '.', 'William', 'Beaumont', 'born', 'Lebanon', ',', 'Connecticut', 'became', 'physician', '.', 'He', 'served', 'surgeon', '’', 'mate', 'Army', 'War', '1812', '.', 'He', 'opened', 'private', 'practice', 'Plattsburgh', ',', 'New', 'York', ',', 'rejoined', 'Army', 'surgeon', '1819', '.', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', '.', 'The', 'fort', 'became', 'refuge', 'wounded', '19-year-old', 'French-Canadian', 'fur', 'trader', 'named', 'Alexis', 'St.', 'Martin', 's

In [9]:
def binary_search(tokenized_docs):
    """
    :param tokenized_docs: dict with key id of documents and value an array of terms
    :return: Matrix term-document t1 = [d1, d2, ..., dn], where di is 1 o 0
    """
    term_document = {}
    for id,doc in tokenized_docs.items():
        id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token not in term_document:
                term_document[token] = [0] * len(list(tokenized_docs.items())) 
            
            term_document[token][id-1] = 1
    
    return term_document

term_document = binary_search(tokenized_docs)

# base = [1]*331
# for query in list(tokenized_queries.items()):
#     result = [1] * 331
#     bad_result = [0] * 331
#     for token in query[1][1:]:
#         if token in term_document:
#             result = np.bitwise_and(term_document[token], result)
#         else:
#             result = np.bitwise_and(bad_result, result)
#     print(query, sum(result))

print('Size of vocabulary:', len(list(term_document.items())))
print('Size of the matrix: ',len(list(term_document.items())), 'x', len(list(term_document.items())[0][1]) )


Size of vocabulary: 20447
Size of the matrix:  20447 x 331


In [10]:
import csv
def serialize_and_save(term_document):
    """
    :param term_document: Matrix term-document t1 = [d1, d2, ..., dn], where di is 1 o 0
    :return: csv with columns (token,vector) donde vector es el string concatenado de 0s y 1s de la funcion binary_search
    """
    serialize = [['token','vector']]
    
    for token,arr in term_document.items():
        row =  [token, "".join([str(val) for val in arr])]
        serialize.append(row)
    
    with open('document_term_file.csv', mode='w', encoding='utf-8', newline='') as document_term_file:
        term_document_writer = csv.writer(document_term_file, delimiter=',')
        term_document_writer.writerows(serialize)

    return serialize
value = serialize_and_save(term_document)
    
# print(type())
# ping = np.packbits([int(val) for val in vector_list])  
# print(ping[0])

print(value[1])

['William', '1000000000000010000000000001000000100000000000000000001100000000000010000000000000000001001100100100010001001010000000000000000010000001010000000010000000000000000000000000001000110000000011100000100000000000000100000000000000000100000000001000000000000100100000000100000111000000000000001010010000110000000011000000000100100000010']


In [15]:
def queries_reader():
    """
    Reads the query
    :return: Dictionary of documents (di: content of document i)
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    #print(documents_paths)
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        #print(filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
    return queries

queries = queries_reader()

#this is for tokenization of the queries
tokenized_queries = tokenization(queries)

In [19]:
import csv
def find_documents(queries):
    """
    This method prints tokenized the queries, and print the list of documents that a query appears completely
    :param queries: dict with key the id of a query and value the query
    :return: Nothing
    """
    with open('document_term_file.csv', 'r', encoding='utf-8', newline='') as document_term_file:
        rows = csv.reader(document_term_file)
        dict_with_vector = {}
        idx = 0
        for row in rows:
            idx+=1
            if idx == 1:
                continue
            token = row[0]
            string = row[1]
            vector = [int(letter) for letter in list(string)]
            dict_with_vector[token] = vector
        base = [1]*331
        for query in queries:
            result = [1] * 331
            bad_result = [0] * 331
            id_document = 0
            documents = []
            for token in query[1][1:]:
                id_document += 1
                if token in dict_with_vector:
                    documents.append(id_document)
                    result = np.bitwise_and(dict_with_vector[token], result)
                else:
                    result = np.bitwise_and(bad_result, result)
            docs = [str(str(doc)) for doc in documents]
            print(query[0], docs)
            

find_documents(list(tokenized_queries.items()))


q01 ['1', '2']
q02 ['1', '2']
q03 []
q04 ['1', '2']
q06 ['1']
q07 ['1', '2']
q08 ['1', '2', '3']
q09 ['1']
q10 ['1', '2']
q12 ['1', '2', '3']
q13 ['1', '2', '3']
q14 ['1', '2', '3']
q16 ['1']
q17 ['1', '2', '3']
q18 ['1', '2', '3', '4']
q19 ['1', '2']
q22 ['1', '2', '3', '4', '5']
q23 ['1', '2', '3', '4']
q24 ['1']
q25 ['1']
q26 ['1', '2', '3']
q27 ['1', '2', '3', '4']
q28 ['1', '2', '3']
q29 ['1']
q32 ['1']
q34 ['1', '2']
q36 ['1', '2', '3']
q37 ['1']
q38 ['1', '2', '3']
q40 ['1', '2', '3', '4']
q41 ['1']
q42 ['1', '2', '3', '4', '5']
q44 ['1', '2']
q45 ['1', '2', '3']
q46 ['1', '2', '3']
