# Search Engine. Binary Search

In [1]:
!pip install nltk
import numpy as np
import pandas as pd
from xml.dom import minidom
from xml.etree import cElementTree as ElementTree
import os
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
def document_reader():
    """
    This method reads the documents
    :return: Dictionary of documents (di: content of document i)
    """
    documents_path = os.path.join(os.getcwd(), 'docs/docs-raw-texts')
    documentos = {}
    for filename in os.listdir(documents_path):
        file_path = os.path.join(documents_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        title = xmldoc.getElementsByTagName('fileDesc')[0].attributes['title'].value
        data = next(ElementTree.parse(file_path).iter('raw')).text
        documentos[id] = (title + ' ' + data).replace(u'\xa0', u' ').replace('\n', ' ')

    return documentos
documents = document_reader()
print(list(documents.items())[0])


('d001', 'William Beaumont and the Human Digestion William Beaumont and the Human Digestion.  William Beaumont: Physiology of digestion Image Source.  On November 21, 1785, US-American surgeon William Beaumont was born. He became best known as “Father of Gastric Physiology” following his research on human digestion. William Beaumont was born in Lebanon, Connecticut and became a physician. He served as a surgeon’s mate in the Army during the War of 1812. He opened a private practice in Plattsburgh, New York, but rejoined the Army as a surgeon in 1819. Beaumont was stationed at Fort Mackinac on Mackinac Island in Michigan in the early 1820s when it existed to protect the interests of the American Fur Company. The fort became the refuge for a wounded 19-year-old French-Canadian fur trader named Alexis St. Martin when a shotgun went off by accident in the American Fur Company store at close range June 6th, 1822. St. Martin’s wound was quite serious because his stomach was perforated and se

In [5]:
def queries_reader():
    """
    Reads the query
    :return: Dictionary of documents (di: content of document i)
    """
    queries_path = os.path.join(os.getcwd(), 'docs/queries-raw-texts')
    queries = {}
    queries_paths = os.listdir(queries_path)
    queries_paths.sort()
    for filename in queries_paths:
        file_path = os.path.join(queries_path, filename)
        xmldoc = minidom.parse(file_path)
        id = xmldoc.getElementsByTagName('public')[0].attributes['publicId'].value
        query = next(ElementTree.parse(file_path).iter('raw')).text
        queries[id] = query.replace(u'\xa0', u' ').replace('\n', ' ')
    return queries

queries = queries_reader()

print(queries)

{'q01': 'Fabrication of music instruments', 'q02': 'famous German poetry', 'q03': 'Romanticism', 'q04': 'University of Edinburgh research', 'q06': 'bridge construction', 'q07': 'Walk of Fame stars', 'q08': 'Scientists who worked on the atomic bomb', 'q09': 'Invention of the Internet', 'q10': 'early telecommunication methods', 'q12': 'Who explored the South Pole', 'q13': 'famous members of the Royal Navy', 'q14': 'Nobel Prize winning inventions', 'q16': 'South America', 'q17': 'Edward Teller and Marie Curie', 'q18': 'Computing Language for the programming of Artificial Intelligence', 'q19': 'William Hearst movie', 'q22': 'How did Captain James Cook become an explorer', 'q23': 'How did Grace Hopper get famous', 'q24': 'Computers in Astronomy', 'q25': 'WWII aircraft', 'q26': 'Literary critics on Thomas Moore', 'q27': 'Nazis confiscate or destroy art and literature', 'q28': 'Modern Age in English Literature', 'q29': 'modern Physiology', 'q32': 'Roman Empire', 'q34': 'Scientists who have co

In [69]:
def tokenization(documentos):
    """
    :param documentos: Receives a dictionary  
    :return: dict with key id of documents/queries and value is an array of terms
    """
    nltk_stop_words_en = set(nltk.corpus.stopwords.words("english"))
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    nltk_lemmaList = {key: [wordnet_lemmatizer.lemmatize(token) 
                             for token in nltk.word_tokenize(doc) if (token.isalnum()) and (token not in nltk_stop_words_en)] for key, doc in documentos.items()}

    return nltk_lemmaList

tokenized_docs = tokenization(documents)
tokenized_queries = tokenization(queries)

print(list(tokenized_docs.items())[0][1])


['William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Human', 'Digestion', 'William', 'Beaumont', 'Physiology', 'digestion', 'Image', 'Source', 'On', 'November', '21', '1785', 'surgeon', 'William', 'Beaumont', 'born', 'He', 'became', 'best', 'known', 'Father', 'Gastric', 'Physiology', 'following', 'research', 'human', 'digestion', 'William', 'Beaumont', 'born', 'Lebanon', 'Connecticut', 'became', 'physician', 'He', 'served', 'surgeon', 'mate', 'Army', 'War', '1812', 'He', 'opened', 'private', 'practice', 'Plattsburgh', 'New', 'York', 'rejoined', 'Army', 'surgeon', '1819', 'Beaumont', 'stationed', 'Fort', 'Mackinac', 'Mackinac', 'Island', 'Michigan', 'early', '1820s', 'existed', 'protect', 'interest', 'American', 'Fur', 'Company', 'The', 'fort', 'became', 'refuge', 'wounded', 'fur', 'trader', 'named', 'Alexis', 'Martin', 'shotgun', 'went', 'accident', 'American', 'Fur', 'Company', 'store', 'close', 'range', 'June', '6th', '1822', 'Martin', 'wound', 'quite', 'serious', 's

In [70]:
def matrix_construction(tokenized_docs):
    """
    :param tokenized_docs: dict with key id of documents and value an array of terms
    :return: Matrix term-document t1 = [d1, d2, ..., dn], where di is 1 o 0
    """
    term_document = {}
    for id,doc in tokenized_docs.items():
        id = int(id[-3:]) #paasa dnjk al entero njk.
        for token in doc:
            if token not in term_document:
                term_document[token] = [0] * len(list(tokenized_docs.items())) 
            
            term_document[token][id-1] = 1
    
    return term_document

term_document = matrix_construction(tokenized_docs)

print('Size of vocabulary:', len(list(term_document.items())))
print('Size of the matrix: ',len(list(term_document.items())), 'x', len(list(term_document.items())[0][1]) )


Size of vocabulary: 18936
Size of the matrix:  18936 x 331


In [55]:
import csv
def serialize_and_save(term_document):
    """
    :param term_document: Matrix term-document t1 = [d1, d2, ..., dn], where di is 1 o 0
    :return: csv with columns (token,vector) donde vector es el string concatenado de 0s y 1s de la funcion binary_search
    """
    serialize = [['token','vector']]
    
    for token,arr in term_document.items():
        row =  [token, "".join([str(val) for val in arr])]
        serialize.append(row)
    
    with open('document_term_file.csv', mode='w', encoding='utf-8', newline='') as document_term_file:
        term_document_writer = csv.writer(document_term_file, delimiter=',')
        term_document_writer.writerows(serialize)

    return serialize
value = serialize_and_save(term_document)
    
print('columns: [token,vector]')
print('Row in the csv:',value[1])

columns: [token,vector]
Row in the csv: ['William', '1000000000000010000000000001000000100000000000000000001100000000000010000000000000000001001100100100010001001010000000000000000010000001010000000010000000000000000000000000001000110000000011100000100000000000000100000000000000000100000000001000000000000100100000000100000111000000000000001010010000110000000011000000000100100000010']


{'q01': ['Fabrication', 'music', 'instrument'], 'q02': ['famous', 'German', 'poetry'], 'q03': ['Romanticism'], 'q04': ['University', 'Edinburgh', 'research'], 'q06': ['bridge', 'construction'], 'q07': ['Walk', 'Fame', 'star'], 'q08': ['Scientists', 'worked', 'atomic', 'bomb'], 'q09': ['Invention', 'Internet'], 'q10': ['early', 'telecommunication', 'method'], 'q12': ['Who', 'explored', 'South', 'Pole'], 'q13': ['famous', 'member', 'Royal', 'Navy'], 'q14': ['Nobel', 'Prize', 'winning', 'invention'], 'q16': ['South', 'America'], 'q17': ['Edward', 'Teller', 'Marie', 'Curie'], 'q18': ['Computing', 'Language', 'programming', 'Artificial', 'Intelligence'], 'q19': ['William', 'Hearst', 'movie'], 'q22': ['How', 'Captain', 'James', 'Cook', 'become', 'explorer'], 'q23': ['How', 'Grace', 'Hopper', 'get', 'famous'], 'q24': ['Computers', 'Astronomy'], 'q25': ['WWII', 'aircraft'], 'q26': ['Literary', 'critic', 'Thomas', 'Moore'], 'q27': ['Nazis', 'confiscate', 'destroy', 'art', 'literature'], 'q28': 

In [68]:
import csv

def calculate_result(query, dict_with_vector):
    result = [1] * 331
    bad_result = [0] * 331
    for token in query[1]:
        if token in dict_with_vector:
            result = np.bitwise_and(dict_with_vector[token], result) 
        else:
            result = bad_result
            break
    return result

def binary_search(queries):
    """
    This method prints tokenized the queries, and print the list of documents that a query appears completely
    :param queries: dict with key the id of a query and value the query
    :return: Nothing
    """
    with open('document_term_file.csv', 'r', encoding='utf-8', newline='') as document_term_file:
        rows = csv.reader(document_term_file)
        dict_with_vector = {}
        idx = 0
        for row in rows:
            idx+=1
            if idx == 1:
                continue
            token = row[0]
            string = row[1]
            vector = [int(letter) for letter in list(string)]
            dict_with_vector[token] = vector
        ans = {}
        for query in queries:
            result = calculate_result(query, dict_with_vector)

            idDoc = 0
            string = ''
            for res in result:
                idDoc += 1
                if res == 1:
                    string += 'd' + str(idDoc) + ','
            ans[query[0]] = string[:-1]
        with open('BS-queries_results.csv', mode='w', encoding='utf-8', newline='') as BS_queries:
            term_document_writer = csv.writer(BS_queries, delimiter='\t')
            for k,v in ans.items():
                term_document_writer.writerow([k,v])

binary_search(list(tokenized_queries.items()))

