In [36]:
import json
import pickle
import os

from flask import Flask, request
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import config

In [6]:
# Module-level state shared by the functions in this notebook.
# The three index objects are rebound by load_index(); they start out empty.
forward_index = {}
inverted_index = {}
documents_id = []

# TF-IDF objects read by the /ranking handler; assigned in the __main__ block.
vectorizer = tfidf = None

In [70]:
def load_index(path):
    """Load the index JSON files found in ``path`` into module globals.

    Reads ``forward_index.json``, ``inverted_index.json`` and
    ``documents_id.json`` (in that order, decoded as UTF-8) and rebinds the
    globals ``forward_index``, ``inverted_index`` and ``documents_id`` to the
    parsed contents.

    Args:
        path: Directory containing the three JSON files.

    Returns:
        None. The results are published through the module globals.
    """
    global forward_index, inverted_index, documents_id

    def _read_json(filename):
        # Every index file is opened and parsed the same way.
        with open(os.path.join(path, filename), 'r', encoding='utf8') as handle:
            return json.load(handle)

    forward_index = _read_json("forward_index.json")
    inverted_index = _read_json("inverted_index.json")
    documents_id = _read_json("documents_id.json")


def compute_tfidf():
    """Fit a TF-IDF model on the currently loaded documents.

    Collects the ``"text_normalized"`` entry of every id in the global
    ``documents_id`` (preserving that order) from the global
    ``forward_index`` and fits a fresh ``TfidfVectorizer`` on those texts.

    Returns:
        A ``(vectorizer, tfidf)`` tuple: the fitted vectorizer and the
        result of ``fit_transform`` converted with ``.todense()``.
    """
    texts = [forward_index[doc_id]["text_normalized"] for doc_id in documents_id]

    model = TfidfVectorizer()
    weights = model.fit_transform(texts)
    return model, weights.todense()


def initialize_tfidf(index_path, save_tfidf_path):
    """Load the index, fit TF-IDF, persist the result and return it.

    Args:
        index_path: Directory handed to ``load_index``.
        save_tfidf_path: Directory in which ``vectorizer_tfidf.dat`` is
            written. The file holds two consecutive pickles: the vectorizer
            first, then the TF-IDF matrix.

    Returns:
        The ``(vectorizer, tfidf)`` tuple produced by ``compute_tfidf``.

    Side effects:
        The globals ``inverted_index``, ``forward_index`` and
        ``documents_id`` are deleted before returning, so those names are
        unbound until ``load_index`` runs again.
    """
    global inverted_index, forward_index, documents_id

    load_index(index_path)
    model, weights = compute_tfidf()

    dump_file = os.path.join(save_tfidf_path, "vectorizer_tfidf.dat")
    with open(dump_file, "wb") as sink:
        # Order matters for readers: vectorizer first, matrix second.
        for obj in (model, weights):
            pickle.dump(obj, sink)

    # Drop the module-level references to the loaded index data.
    del inverted_index, forward_index, documents_id

    return model, weights

def ranking(documents, query, vectorizer, tfidf):
    """Rank ``documents`` by cosine similarity to ``query``.

    Args:
        documents: Sequence of dicts, each providing a ``"text_normalized"``
            string.
        query: Query text, vectorized with the same ``vectorizer``.
        vectorizer: Fitted object whose ``transform`` maps a list of strings
            to a 2-D matrix (one row per string).
        tfidf: Unused by this function; kept so existing callers keep working.

    Returns:
        A list of ``(score, document)`` tuples ordered from the highest score
        to the lowest. An empty ``documents`` sequence yields ``[]``.
    """
    # Nothing to rank: return early instead of failing inside the
    # vectorizer / similarity call on zero samples.
    if not documents:
        return []

    # Keep the vectors as returned by transform(): cosine_similarity accepts
    # sparse input, so the np.matrix that .todense() produces is not needed.
    query_vect = vectorizer.transform([query])
    doc_vects = vectorizer.transform(
        [doc["text_normalized"] for doc in documents])

    similarities = cosine_similarity(doc_vects, query_vect)
    # ravel() always gives a 1-D array. np.squeeze() collapsed the (1, 1)
    # single-document result to a 0-d array, which list() cannot iterate.
    scores = np.asarray(similarities).ravel().tolist()
    assert len(scores) == len(documents)

    # Ascending stable sort followed by a reversal, as before: best match
    # first, and equal scores appear in reverse input order.
    ranked_list = sorted(zip(scores, documents), key=lambda pair: pair[0])
    ranked_list.reverse()

    return ranked_list

In [71]:
# Flask application object; the /ranking route defined below is registered on it.
app = Flask(__name__)


@app.route('/ranking', methods=["POST"])
def get_ranked():
    """Handle ``POST /ranking``: rank the posted documents against the query.

    Expects a JSON object with:
        * ``"documents"``: the documents passed through to ``ranking``;
        * ``"terms"``: a list of objects whose ``"term"`` strings are joined
          with single spaces to form the query.

    Returns:
        A JSON response (``application/json``) holding the list produced by
        ``ranking``. If the body is not JSON or lacks the expected structure,
        a 400 response with a JSON error object is returned instead.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so every malformed request goes through the same 400 path.
    payload = request.get_json(silent=True)
    try:
        documents = payload["documents"]
        query = " ".join(item["term"] for item in payload["terms"])
    except (KeyError, TypeError):
        return app.response_class(
            json.dumps({"error": "expected a JSON body with 'documents' and 'terms'"}),
            status=400,
            mimetype="application/json",
        )

    ranked_list = ranking(documents, query, vectorizer, tfidf)
    # A bare string return would be served as text/html; label it as JSON.
    return app.response_class(
        json.dumps(ranked_list, ensure_ascii=False),
        mimetype="application/json",
    )


In [72]:
if __name__ == "__main__":
    # Build the TF-IDF objects before serving; config.data_dir is used both
    # as the index location and as the directory for the pickled output.
    vectorizer, tfidf = initialize_tfidf(index_path=config.data_dir,
                                         save_tfidf_path=config.data_dir)

    # NOTE(review): host and port are hard-coded here; an earlier variant used
    # config.RANKING_PORT -- confirm which one is intended.
    app.run(port=13510, host='0.0.0.0')



 * Running on http://0.0.0.0:13510/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Nov/2018 19:38:08] "POST /ranking HTTP/1.1" 200 -
