In [None]:
from flask import Flask, request, render_template
import json
import requests

import pandas as pd

from textblob import TextBlob
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Fetch the NLTK resources used further down: the stop-word lists and the
# 'punkt' tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

In [2]:
def _calculate_languages_ratios(text):
    """
    Count, for every language NLTK has a stop-word list for, how many
    distinct stop-words of that language occur in the given text.

    The result looks like {'french': 2, 'spanish': 4, 'english': 0}; the
    values are raw counts of unique matching stop-words, not normalised
    probabilities.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of its unique
        stop-words seen in the analyzed text
    @rtype: dict
    """
    # wordpunct_tokenize() splits punctuation into separate tokens, e.g.
    #   "I'll be there." -> ['I', "'", 'll', 'be', 'there', '.']
    # The lowercased vocabulary of the text does not depend on the language,
    # so it is built once instead of once per language.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    languages_ratios = {}

    # Score each language by the number of its stop-words present in the text
    for language in stopwords.fileids():
        common_elements = words_set.intersection(stopwords.words(language))
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language a text is written in.

    Every language is scored by _calculate_languages_ratios(), i.e. by the
    number of its unique stop-words found in the text, and the language
    with the highest score is returned.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the best-scoring language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    return max(scores, key=lambda language: scores.get(language))


def stemm_analyze(text, lang):
    '''
    Preprocess text data: tokenize, stem, drop stop-words and non-words
    
    Parameters
    ----------
    text: str
        raw text, e.g. a search query
    lang: str
        language of the text; accepted but currently not used - stemming is
        always done with RussianStemmer and filtering always uses the
        English stop-word list
    
    Returns
    -------
    indexes: list of str
        list of preprocessed words: lowercased stems that are purely
        alphabetic and are not English stop-words, in their original order
    '''
    stemmer = RussianStemmer()

    # The stop-word list does not change between tokens, so it is fetched
    # once and kept in a set instead of calling stopwords.words() per token.
    english_stopwords = set(stopwords.words('english'))

    #Stemming
    stems = [stemmer.stem(token) for token in word_tokenize(text)]

    #Excluding Stop-words
    # The stop-word test sees the stem as returned by the stemmer, i.e.
    # before it is lowercased.
    return [stem.lower() for stem in stems
            if stem not in english_stopwords and stem.isalpha()]

In [16]:
def add_forward_index(document):
    '''
    Add the document to forward index
    
    A document is stored at most once. Whether it is already stored is
    decided by looking at forward_index itself, not at the documents_id
    list: add_inverted_index() registers ids in documents_id, so consulting
    that list here would skip every document that was added to the inverted
    index first.
    
    Parameters
    ----------
    document: dictionary with keys like {"id": "12345", "text": "some text"}
    '''
    doc_id = int(document["id"])

    # Already present in the forward index -> nothing to do
    if any(entry["id"] == doc_id for entry in forward_index):
        return

    forward_index.append({"id": doc_id, "text": document["text"]})
        

def search_one(words):
    '''
    Search the words in inverted index
    
    Parameters
    ----------
    words: str
        words that are divided by whitespace and preprocessed in the same way as words in inverted index
    
    Returns
    -------
    indexes: list of int
        list of documents id that contain every searched word known to the
        inverted index; words missing from the index are ignored, and an
        empty list is returned when none of the words is indexed
    '''
    # One set of document ids per query word found in the index.
    # Postings are stored as {"id": <document id>} by add_inverted_index().
    postings = [
        {posting["id"] for posting in inverted_index[word]}
        for word in words.split()
        if word in inverted_index
    ]

    # Empty query or no indexed word: there is nothing to intersect
    if not postings:
        return []

    return list(set.intersection(*postings))


def add_inverted_index(document):
    '''
    Add the document to inverted index

    Every distinct token of the document text gets one posting
    {"id": <int document id>} appended to inverted_index[token]. Documents
    whose id is already listed in documents_id are skipped.

    Parameters
    ----------
    document: dictionary with keys like {"id": "12345", "text": "some text"}
    '''
    # Skip known documents before doing any tokenization work
    if document["id"] in documents_id:
        return

    # Tokenize and convert the id before registering the document, so a
    # failure here leaves documents_id untouched.
    tokens = TextBlob(document["text"]).words
    doc_id = int(document["id"])
    documents_id.append(document["id"])

    # dict.fromkeys keeps one entry per distinct token (first-seen order),
    # so a word repeated in the text yields a single posting for this document.
    for token in dict.fromkeys(tokens):
        inverted_index.setdefault(token, []).append({"id": doc_id})
          

In [17]:
# global variables for storing information
# token -> list of postings, each posting being {"id": <int document id>}
inverted_index: dict = {}
# list of {"id": <int document id>, "text": <document text>} entries
forward_index: list = []
# raw "id" values of documents that were already indexed (used to skip duplicates)
documents_id: list = []

In [5]:
# Load the tab-separated evaluation texts; the code below reads the "id" and
# "text" columns of this frame.
df = pd.read_csv("../data/eval_texts.csv", sep='\t')

In [6]:
%%time
# add documents to index
# to_dict("records") yields one {column: value} dict per row, which avoids a
# label lookup per row and does not require the frame to have a 0..n-1 index.
for doc in df.to_dict("records"):
    add_inverted_index(doc)
    add_forward_index(doc)

CPU times: user 3min 3s, sys: 1.27 s, total: 3min 5s
Wall time: 3min 5s


In [9]:
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField

class ReusableForm(Form):
    '''
    Search form with a single required text input named "name"
    '''

    # StringField is used rather than TextField: TextField is only a
    # deprecated alias of StringField in WTForms 2.x and no longer exists in
    # WTForms 3.
    name = StringField('Enter your request:', validators=[validators.DataRequired()])

In [18]:
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    '''
    Main page of the search service
    
    If GET -> renders Main page from templates/index.html
    If POST with a valid form -> preprocesses the submitted query and returns the search results
    If POST with an invalid form (missing or blank query) -> renders Main page again
    
    Parameters
    ---------
    query: str
        taken from the "name" field of the submitted form
    
    Returns
    -------
    JSON string produced by get_snippets() for a valid POST,
    otherwise the rendered templates/index.html
    '''
    form = ReusableForm(request.form)

    # validate() runs the DataRequired validator declared on the form, so a
    # missing or blank query never reaches the search code.
    if request.method == 'POST' and form.validate():
        query = stemm_serv(form.name.data)["stemmed_text"]
        #Ranging -> SERP, should be done instead, in future
        return get_snippets(q=query)
 
    return render_template('index.html', form=form)

@app.route("/det_lang", methods=["POST"])
def det_lang(query=None):
    '''
    Detect language of query
    (*should be rewritten to be independent server)
    
    Can be called directly with the text, or over HTTP: Flask invokes the
    view without arguments, in which case the text is read from the "query"
    key of the JSON request body.
    
    Parameters
    ---------
    query: str, optional
        text to analyse; when omitted it is taken from request.json["query"]
    
    Returns
    -------
    dictionary like {"status":"ok", "got_data": query, "language": l}
    '''
    if query is None:
        # HTTP call: no argument is passed by Flask, fall back to the JSON body
        query = request.json["query"]

    language = detect_language(query)
    return {"status":"ok", "got_data":query, "language":language}
    

@app.route("/stemm", methods=["POST"])
def stemm_serv(query=None):
    '''
    Preprocess the text (query)
    
    Can be called directly with the text, or over HTTP: Flask invokes the
    view without arguments, in which case the text is read from the "query"
    key of the JSON request body.
    
    Parameters
    ---------
    query: str, optional
        text to preprocess; when omitted it is taken from request.json["query"]
    
    Returns
    -------
    dictionary like {"lang": "language", "text": query, "stemmed_text": stemmed}
        "stemmed_text" is the preprocessed words joined by single spaces
    '''
    if query is None:
        # HTTP call: no argument is passed by Flask, fall back to the JSON body
        query = request.json["query"]

    language = det_lang(query)["language"]
    stemmed = " ".join(str(word) for word in stemm_analyze(query, language))
    return {"lang":language, "text":query, "stemmed_text":stemmed}

@app.route("/logging", methods=["POST"])
def loggs():
    '''
    Placeholder for the endpoint that should return all logs got from user;
    nothing is collected yet, so the list of logs is always empty
    
    Parameters
    ---------
    
    Returns
    -------
    JSON string like {"logs": logs}
    '''
    payload = {"logs": []}
    return json.dumps(payload)
       

@app.route('/ranging', methods=["POST"])
def ranging():
    '''
    *do ranging* - for now the documents are returned in the order received
    
    Parameters
    -----------------
    list_of_dicts
        JSON request body: list of dictionaries, each one passed as keyword
        arguments to Document
    
    Returns
    -------
    JSON string with the list of document dictionaries (Document.to_dict())
    '''
    # NOTE(review): Document is not defined in the visible part of this
    # file - confirm that it is provided elsewhere.
    payload = request.json
    built = [Document(**fields) for fields in payload]
    serialised = [item.to_dict() for item in built]
    return json.dumps(serialised)
    

@app.route('/return_page', methods=["POST"])
def SERP():
    '''
    Renders final search results page (placeholder: a fixed string is
    returned for now)
    
    Parameters
    ---------
    list of documents id to show
    
    Returns
    -------
    page in html
    '''
    page = "html"
    return page


@app.route('/indexator',  methods=["POST"])
def add_to_index():
    '''
    Adds the document from the JSON request body to inv_index and forw_index
    
    Parameters
    ---------
    document: dictionary
        JSON request body, handed to add_inverted_index() and then to
        add_forward_index()
    
    Returns
    -------
    the string "added to index"
    '''
    payload = request.json
    for register in (add_inverted_index, add_forward_index):
        register(payload)

    return "added to index"


@app.route("/search", methods=["POST"])
def search():
    '''
    Gets list of document`s id where search words are present
    
    Parameters
    ---------
    dict with preprocessed search words
        JSON request body; the words are read from its "search_word" key
    
    Returns
    -------
    result : JSON string with the list of documents id returned by search_one()
    '''
    return json.dumps(search_one(request.json["search_word"]))


@app.route('/snippets', methods=["POST"])
def get_snippets(q=None):
    '''
    Returns 10 first search results with snippets(all text in doc)
    *change to all results | rename | improve snippets*
    
    Parameters
    ---------
    q : str, optional
        search query; when omitted (HTTP call) the query is read from the
        "search_words" key of the JSON request body
    
    Returns
    -------
    dicts : JSON string with a list of dicts
        one {"<doc id>": [texts of the rows of df with that id]} per hit,
        at most 10 hits
    '''
    search_words = request.json["search_words"] if q is None else q

    # Only the first 10 hits are returned, so only those are looked up in df
    top_ids = search_one(search_words)[:10]
    dicts = [
        {str(doc_id): list(df[df.id == doc_id].text.values)}
        for doc_id in top_ids
    ]
    return json.dumps(dicts, ensure_ascii=False)

In [None]:
# Start the Flask application on port 13503 when this code runs as the main module.
if __name__ == "__main__":
    app.run(port=13503)
    
#if __name__ == "__main__":
#    app.run(host='0.0.0.0', port=13502)

In [None]:
# Report the number of distinct tokens in the inverted index, then the number
# of entries in the forward index.
for index_store in (inverted_index, forward_index):
    print(len(index_store))