In [None]:
import json
import pickle
import os

from flask import Flask, request
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import config

In [None]:
# Module-level state for the ranking service.
# The three index containers start out empty; nothing in the code visible in
# this section fills them, so they may only matter to other cells/modules.
inverted_index, forward_index = {}, {}
documents_id = []

# Placeholders for the fitted TF-IDF model. They stay None until the
# ``__main__`` block assigns the result of ``initialize_tfidf``; the
# ``/ranking`` handler reads them as globals.
vectorizer = tfidf = None

In [None]:
def load_index(path, forward_file="forward_index",
               inverted_file="inverted_index",
               id_file="documents_id"):
    """Load the index JSON files stored in directory ``path``.

    Each ``*_file`` argument is a file name without the ``.json`` extension.
    Files are read as UTF-8 in the order forward index, inverted index,
    document ids.

    Parameters
    ----------
    path : str
        Directory that contains the JSON files.
    forward_file : str
        Base name of the forward index file.
    inverted_file : str or None
        Base name of the inverted index file. Pass ``None`` to skip reading
        it; an empty dict is returned in its place.
    id_file : str
        Base name of the document id file.

    Returns
    -------
    tuple
        ``(forward_index, inverted_index, documents_id)`` exactly as decoded
        by ``json.load``.
    """
    def _read_json(stem):
        # Every index file lives directly in `path` and carries a .json suffix.
        with open(os.path.join(path, stem + ".json"), 'r', encoding='utf8') as infile:
            return json.load(infile)

    forward_index = _read_json(forward_file)
    inverted_index = {} if inverted_file is None else _read_json(inverted_file)
    documents_id = _read_json(id_file)

    return forward_index, inverted_index, documents_id


def compute_tfidf(forward_index, documents_id):
    """Fit a ``TfidfVectorizer`` on the normalized text of the listed documents.

    Parameters
    ----------
    forward_index : mapping
        Maps a document id to a record that has a ``"text_normalized"`` entry.
    documents_id : iterable
        Document ids; their order defines the row order of the matrix.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf)``: the fitted vectorizer and the densified
        TF-IDF matrix of the corpus.

    Raises
    ------
    KeyError
        If an id is missing from ``forward_index`` or its record has no
        ``"text_normalized"`` entry.
    """
    texts = [forward_index[doc_id]["text_normalized"] for doc_id in documents_id]

    fitted = TfidfVectorizer()
    # NOTE(review): .todense() materialises the whole documents x vocabulary
    # matrix in memory (as a numpy matrix, per the SciPy docs). Kept because
    # callers/pickle consumers may rely on that type -- confirm before changing.
    dense_tfidf = fitted.fit_transform(texts).todense()
    return fitted, dense_tfidf


def initialize_tfidf(index_path, save_tfidf_path):
    """Build the TF-IDF model from the stored index and pickle it to disk.

    The forward index and the document ids are loaded from ``index_path``
    (the inverted index is skipped), the model is fitted with
    ``compute_tfidf``, and both results are written to
    ``<save_tfidf_path>/vectorizer_tfidf.dat``.

    Parameters
    ----------
    index_path : str
        Directory holding the index JSON files.
    save_tfidf_path : str
        Directory in which ``vectorizer_tfidf.dat`` is created or overwritten.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf)`` as returned by ``compute_tfidf``.
    """
    forward_index, _, documents_id = load_index(index_path, inverted_file=None)
    model = compute_tfidf(forward_index, documents_id)
    vectorizer, tfidf = model

    dump_file = os.path.join(save_tfidf_path, "vectorizer_tfidf.dat")
    with open(dump_file, "wb") as ouf:
        # Two consecutive pickles in one file: a reader has to call
        # pickle.load twice, vectorizer first and matrix second.
        for obj in (vectorizer, tfidf):
            pickle.dump(obj, ouf)

    return vectorizer, tfidf


def ranking(documents, query, vectorizer, tfidf):
    """Order ``documents`` by TF-IDF cosine similarity to ``query``.

    Parameters
    ----------
    documents : list of dict
        Candidate documents; each must provide a ``"text_normalized"`` string.
    query : str
        Query text, vectorized with the same ``vectorizer``.
    vectorizer : object
        Fitted vectorizer exposing ``transform(list_of_str)``.
    tfidf : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    list of tuple
        ``(score, document)`` pairs, highest score first. ``score`` is a plain
        Python float. An empty ``documents`` list yields ``[]``.
    """
    # Nothing to rank: return early instead of handing an empty batch to the
    # vectorizer / similarity routine.
    if len(documents) == 0:
        return []

    # Keep the transformed matrices as returned by the vectorizer (sparse for
    # scikit-learn vectorizers). Densifying with .todense() produces
    # numpy.matrix objects, which recent scikit-learn releases refuse as
    # input, and it wastes memory; cosine_similarity accepts sparse input.
    query_vect = vectorizer.transform([query])
    doc_vects = vectorizer.transform(
        [doc["text_normalized"] for doc in documents]
    )

    # One column (the single query), one row per document -> flatten.
    similarities = cosine_similarity(doc_vects, query_vect)
    scores = np.asarray(similarities).reshape(-1).tolist()
    assert len(scores) == len(documents)

    # Ascending stable sort followed by a reversal, so documents with equal
    # scores come out in the same relative order as before.
    ranked_list = sorted(zip(scores, documents), key=lambda pair: pair[0])
    ranked_list.reverse()

    return ranked_list


In [None]:
app = Flask(__name__)


@app.route('/ranking', methods=["POST"])
def get_ranked():
    """Rank the posted documents against the posted query terms.

    Expected request body (JSON object)::

        {"documents": [...], "terms": [{"term": "..."}, ...]}

    ``documents`` is handed to ``ranking`` unchanged; the query is the
    ``"term"`` values joined by single spaces. The module-level ``vectorizer``
    and ``tfidf`` are used, so they must have been initialised beforehand
    (in this file that happens only in the ``__main__`` block).

    Returns
    -------
    A JSON response ``{"documents": [[score, document], ...], "query": ...}``
    on success. Any body that is not a JSON object of the expected shape gets
    the plain-text reply ``"Documents aren't found"`` with the default status
    code, as before.
    """
    # silent=True: a non-JSON content type or malformed JSON yields None
    # instead of aborting the request, so the fallback below handles it.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return "Documents aren't found"

    try:
        documents = payload["documents"]
        query = " ".join([entry["term"] for entry in payload["terms"]])
    except (KeyError, TypeError):
        # Missing keys, or "terms" entries that are not {"term": <str>}.
        # NOTE(review): a 4xx status would be more conventional here; the
        # existing reply is kept so current clients are not surprised.
        return "Documents aren't found"

    ranked_list = ranking(documents, query, vectorizer, tfidf)
    ranked_res = {"documents": ranked_list, "query": query}
    # Label the body as JSON; a bare string would be sent as text/html.
    return app.response_class(json.dumps(ranked_res, ensure_ascii=False),
                              mimetype="application/json")


In [None]:
if __name__ == "__main__":
    # Fit the TF-IDF model from the index in config.data_dir and store the
    # pickle in that same directory; the results become the module globals
    # that the /ranking handler reads.
    data_dir = config.data_dir
    vectorizer, tfidf = initialize_tfidf(index_path=data_dir,
                                         save_tfidf_path=data_dir)
    # Blocks here and serves requests until the process is stopped.
    app.run(port=config.RANKING_PORT, host=config.RANKING_HOST)
