In [None]:
import json
import re
import os

from flask import Flask, request
import nltk

import config


In [None]:
# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed version.
nltk.download('punkt')

# In-memory index state shared by the request handlers of this notebook.
forward_index = dict()    # str(document id) -> document
inverted_index = dict()   # token -> list of posting dicts
documents_id = list()     # str ids of the documents already indexed


In [None]:
def save_index(path, forward_file="forward_index",
               inverted_file="inverted_index",
               id_file="documents_id"):
    """Write the in-memory index structures to JSON files under ``path``.

    Serialises the module-level ``forward_index``, ``inverted_index`` and
    ``documents_id`` objects to ``<path>/<name>.json``.

    Args:
        path: Target directory; created (with parents) when it does not exist.
        forward_file: Base name, without extension, of the forward index file.
        inverted_file: Base name, without extension, of the inverted index file.
        id_file: Base name, without extension, of the document id list file.

    Raises:
        OSError: If the directory cannot be created or a file cannot be written.
    """
    # An empty path means "current directory"; os.makedirs("") would raise.
    if path:
        os.makedirs(path, exist_ok=True)

    targets = (
        (forward_file, forward_index),
        (inverted_file, inverted_index),
        (id_file, documents_id),
    )
    for base_name, payload in targets:
        file_path = os.path.join(path, base_name + ".json")
        # Every file is written as UTF-8, the encoding load_index reads with.
        with open(file_path, 'w', encoding='utf8') as outfile:
            json.dump(payload, outfile, ensure_ascii=False)


def load_index(path, forward_file="forward_index",
               inverted_file="inverted_index",
               id_file="documents_id"):
    """Read the three index structures back from JSON files under ``path``.

    Args:
        path: Directory that holds the ``<name>.json`` files.
        forward_file: Base name, without extension, of the forward index file.
        inverted_file: Base name, without extension, of the inverted index file.
        id_file: Base name, without extension, of the document id list file.

    Returns:
        The tuple ``(forward_index, inverted_index, documents_id)`` as decoded
        from the JSON files.
    """
    def _read_json(base_name):
        # Decode one "<path>/<base_name>.json" file as UTF-8 JSON.
        json_path = os.path.join(path, base_name + ".json")
        with open(json_path, 'r', encoding='utf8') as infile:
            return json.load(infile)

    loaded_forward = _read_json(forward_file)
    loaded_inverted = _read_json(inverted_file)
    loaded_ids = _read_json(id_file)
    return loaded_forward, loaded_inverted, loaded_ids


def search_boolean(search_query, forward_index, inverted_index, documents_id):
    """Find the documents that contain every indexed word of ``search_query``.

    The query is split on single spaces. Words that are absent from
    ``inverted_index`` do not restrict the result set; they are reported in
    ``terms`` with an empty posting list.

    Args:
        search_query: Query string whose words are separated by single spaces.
        forward_index: Mapping of ``str(document id)`` to the stored document.
        inverted_index: Mapping of word to a list of posting dicts, each
            carrying an ``"id"`` key.
        documents_id: Not used by the search; kept so existing callers keep
            working.

    Returns:
        ``{"documents": [...], "terms": [...]}`` when at least one query word
        is present in ``inverted_index``. ``documents`` is ordered by
        ascending document id and ``terms`` has one entry per query word with
        the postings restricted to the matching documents. Otherwise the
        string ``"Documents aren't found."``.
    """
    words = search_query.split(" ")
    id_sets = [{posting["id"] for posting in inverted_index[word]}
               for word in words if word in inverted_index]
    if not id_sets:
        return "Documents aren't found."

    # Kept as a set so the per-posting membership tests below stay O(1).
    matching_ids = set.intersection(*id_sets)

    # Sorted so the response order does not depend on set iteration order.
    documents = [forward_index[str(doc_id)] for doc_id in sorted(matching_ids)]

    # .get(): a query word without postings must not raise KeyError here.
    terms = [{"term": word,
              "inverted_index": [posting
                                 for posting in inverted_index.get(word, [])
                                 if posting["id"] in matching_ids]}
             for word in words]
    return {"documents": documents, "terms": terms}


def add_forward_index(document, forward_index):
    """Store ``document`` in ``forward_index`` under its stringified id.

    The mapping is modified in place and the same object is returned.
    """
    doc_key = str(document["id"])
    forward_index[doc_key] = document
    return forward_index


def add_inverted_index(document, inverted_index):
    """Add one posting per distinct token of ``document`` to ``inverted_index``.

    Tokens come from ``nltk.word_tokenize(document["text_normalized"])``.
    Each posting records:

    * ``id``: ``int(document["id"])``
    * ``count``: occurrences of the token in the token list
    * ``count_title``: substring occurrences in ``document["title_normalized"]``
    * ``length``: length of the token
    * ``pos``: start offsets of the literal token in ``document["text_normalized"]``
    * ``pos_raw``: start offsets of the literal token in ``document["text"]``

    Args:
        document: Mapping with ``id``, ``text``, ``text_normalized`` and
            ``title_normalized`` entries.
        inverted_index: Mapping of token to list of postings; updated in place.

    Returns:
        The same ``inverted_index`` object.
    """
    tokens = nltk.word_tokenize(document["text_normalized"])

    # Count every distinct token once (dicts keep first-occurrence order), so
    # a repeated token yields a single posting instead of one per occurrence.
    token_counts = {}
    for token in tokens:
        token_counts[token] = token_counts.get(token, 0) + 1

    for token, count in token_counts.items():
        # Tokens are literal text; escaping keeps punctuation such as "(" or
        # "." from being interpreted as regex syntax.
        literal = re.compile(re.escape(token))
        posting = {"id": int(document["id"]),
                   "count": count,
                   "count_title": document["title_normalized"].count(token),
                   "length": len(token),
                   "pos": [m.start() for m in
                           literal.finditer(document["text_normalized"])],
                   "pos_raw": [m.start() for m in
                               literal.finditer(document["text"])]}
        inverted_index.setdefault(token, []).append(posting)
    return inverted_index
 

In [None]:
# Flask application object; the @app.route decorators in this notebook
# register their handlers on it.
app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the root URL with a fixed text identifying the indexer."""
    banner = "Main page of indexer"
    return banner


@app.route('/indexer', methods=["POST"])
def add_to_index():
    """Index the document posted as the JSON body of the request.

    The body must be a JSON object with the ``id``, ``text``,
    ``text_normalized`` and ``title_normalized`` entries that the indexing
    helpers read.

    Returns:
        A confirmation string when the document was added or was already
        indexed, or an error message with HTTP status 400 when the body is
        not a JSON object or lacks a required field.
    """
    global forward_index
    global inverted_index
    global documents_id

    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so all bad payloads are reported the same way below.
    document = request.get_json(silent=True)
    if not isinstance(document, dict):
        return "request body must be a JSON object.", 400

    required_fields = ("id", "text", "text_normalized", "title_normalized")
    missing = [field for field in required_fields if field not in document]
    if missing:
        return ("document is missing required field(s): "
                + ", ".join(missing)), 400

    doc_key = str(document["id"])
    if doc_key in documents_id:
        return "document already exist in index."

    # The id is registered last: if indexing raises, the document is not
    # marked as present and the request can be retried.
    inverted_index = add_inverted_index(document, inverted_index)
    forward_index = add_forward_index(document, forward_index)
    documents_id.append(doc_key)
    return "document is successfully added."


@app.route("/search", methods=["POST"])
def search():
    """Run a boolean search for the query posted as a JSON string body.

    Returns:
        The result of ``search_boolean`` serialised with ``json.dumps``, or an
        error message with HTTP status 400 when the body is not a JSON string.
    """
    # silent=True yields None for a missing or malformed JSON body.
    search_query = request.get_json(silent=True)
    # search_boolean splits the query as a string; reject anything else here
    # rather than failing inside it.
    if not isinstance(search_query, str):
        return "search query must be a JSON string.", 400

    search_result = search_boolean(search_query, forward_index,
                                   inverted_index, documents_id)
    return json.dumps(search_result, ensure_ascii=False)


@app.route("/save_index", methods=["POST"])
def saving():
    """Write the current index to ``config.data_dir`` and confirm."""
    target_dir = config.data_dir
    save_index(target_dir)
    return "successfully saved."


In [None]:
if __name__ == "__main__":
    # The service starts with an empty index. To resume from files written
    # by save_index, re-enable the following line:
    # forward_index, inverted_index, documents_id = load_index(config.data_dir)

    # Serve on the host and port taken from the config module.
    app.run(host=config.INDEXER_HOST, port=config.INDEXER_PORT)
