# Simple search engine

Let's say we want to search every document contains 'apple', how should we do? In this script 

First we have to tokenized the text file.


In [13]:
# read file line by line and 
def readfile(path):
    raw_text = []
    tokenized_text = []
    with open(path, 'r') as f:
        lines =  f.readlines()
        for line in lines:
            text = line.strip().split('\t')[-1]
            raw_text.append(text)
            tokens = text.split()
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()
            tokenized_text.append(tokens)
    return raw_text, tokenized_text
raw_text, tokenized_text = readfile('data/tweets_10k')
# print the first document
print(raw_text[0])
print(tokenized_text[0])

I have no problem with this, Anderson did not deserve to hang around for a win
['i', 'have', 'no', 'problem', 'with', 'this,', 'anderson', 'did', 'not', 'deserve', 'to', 'hang', 'around', 'for', 'a', 'win']


The simplest way is to search document by document

In [16]:
def search_line_by_line(tokenized_text, query_word):
    result = []
    for i in range(len(tokenized_text)):
        if query_word in tokenized_text[i]:
            result.append(i)
    if len(result) == 0:
        print('No document can be found.')
    return result
    
query_word = 'apple'
doc_id = search_line_by_line(tokenized_text, query_word)
print(doc_id)
print(raw_text[doc_id[0]])

[228, 1428, 1935, 2401, 3221, 3688, 7212, 8420, 8604, 9201, 9749, 9997]
Decifrando o convite do evento Apple ![NEWLINE]#Me #tbt #photooftheday #apple #nissan #pokemon #abreusnett #olympics... https://t.co/vEW0x9zW8D


However, it is very troublesome if the data set is huge, and you have to repeat the searching process everytime.
Another method is to build an inverted index in advanced.



## inverted index

The idea is simple, we build a dictionary contains every word type and its relevant doc id.

In [30]:
def build_index(tokenized_text): 
    inverted_index = {}
    doc_no = 0
    vocabulary = set()
    for i in range(len(tokenized_text)):
        for token in tokenized_text[i]:
            if token in inverted_index.keys():
                try:
                    inverted_index[token][doc_no] += 1
                except:
                    inverted_index[token][doc_no] = 1
            else:
                inverted_index[token] = {}
                inverted_index[token][doc_no] = 1
        
        #counting the lines
        doc_no += 1
    return inverted_index

In [50]:
# try to build the index
inverted_index = build_index(tokenized_text)
# print an item to see the result
print(list(inverted_index.items())[3])

('problem', {0: 1, 69: 1, 269: 1, 278: 1, 362: 1, 693: 1, 816: 1, 1200: 1, 1230: 1, 1614: 1, 2122: 1, 2128: 1, 2204: 1, 3007: 1, 3079: 1, 3292: 1, 3362: 1, 3633: 1, 4249: 1, 4343: 1, 4345: 1, 4734: 1, 5189: 1, 5354: 1, 5619: 1, 5688: 1, 6339: 1, 6435: 1, 6521: 1, 6609: 1, 6673: 1, 7052: 1, 7063: 1, 7084: 1, 7185: 1, 7319: 1, 7540: 1, 7796: 1, 7865: 1, 8177: 1, 8647: 1, 8895: 1, 9034: 1, 9147: 1, 9420: 1, 9538: 1, 9847: 1, 9913: 1, 9987: 1})


In [54]:
# now we can search by inverted index
def search_by_index(inverted_index, query_word):
    try:
        return list(inverted_index[query_word].keys())
    except:
        print('No result')
        return []

In [57]:
query_word = 'apple'
result = search_by_index(inverted_index, query_word )
print(result[0])
print(raw_text[result[0]])

228
Decifrando o convite do evento Apple ![NEWLINE]#Me #tbt #photooftheday #apple #nissan #pokemon #abreusnett #olympics... https://t.co/vEW0x9zW8D


We can modify it to search with more than one word

In [79]:
def search_more_words(inverted_index, query_words):
    all_results = []
    for word in query_words:
        try:
            all_results.append(list(inverted_index[word].keys()))
        except:
            print('No result for ' + word)
            return []
    common_result = set(all_results[0])
    for i in range(1,len(all_results)):
        common_result = common_result.intersection(all_results[i])
    if len(common_result)==0:
        print('No document contains all word.')
    return common_result

In [82]:
query_word = ['apple','i']
result = search_more_words(inverted_index, query_word )
print(result)

{9201}


In [83]:
print(raw_text[result[0]])

TypeError: 'set' object does not support indexing