In [None]:
import json

from flask import Flask, request
from bs4 import BeautifulSoup
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from pymystem3 import Mystem
import jsonpickle

import config
from document import Document

# Fetch the NLTK resources this module relies on when it is imported:
# 'punkt' backs nltk.word_tokenize, 'stopwords' backs language detection and
# stop-word filtering, and 'wordnet' is the corpus WordNetLemmatizer reads
# (without it the lemmatizer raises LookupError on first use).
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# rather than 'punkt' -- confirm against the NLTK version used in deployment.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
def _calculate_languages_ratios(text):
    """
    Score the given text against every language NLTK ships stopwords for.

    The score of a language is the number of distinct stopwords of that
    language found among the tokens of the lower-cased text, so the result
    looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the count of its unique
             stopwords seen in the analyzed text
    @rtype: dict
    """
    # The token set does not depend on the language, so build it once rather
    # than once per stopword list.
    unique_words = set(wordpunct_tokenize(text.lower()))

    return {
        language: len(unique_words.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }


def detect_language(text):
    """
    Guess the language of a text from the stopwords it contains.

    Every language known to the NLTK stopwords corpus is scored by the number
    of its unique stopwords that occur in the text; the best-scoring language
    is returned.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the highest-scoring language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    # On a tie max() keeps the first maximal key in the dict's iteration order.
    return max(scores, key=lambda language: scores[language])

def stemming(text, lang):
    """
    Stem every token with the Snowball stemmer of the given language.

    :param list of str text: list of tokens
    :param str lang: language name handed to SnowballStemmer
    :return list of str text: stemmed tokens, in the original order
    """
    stem = SnowballStemmer(lang).stem
    return list(map(stem, text))


def lemmatization(text, lang):
    """
    Lemmatize the tokens taking into account the language.

    Russian tokens are lemmatized with Mystem; tokens of any other language
    go through the WordNet lemmatizer.

    :param list of str text: list of tokens
    :param str lang: language of the tokens, as determined by the caller
    :return list of str text: lemmatized tokens
    """
    # The language comes from the caller. Re-detecting it here handed the
    # token list to detect_language(), which calls .lower() on its argument
    # and therefore failed for every list input.
    if lang == "russian":
        # Mystem analyses running text, so join the tokens back together.
        lemmas = Mystem().lemmatize(" ".join(text))
        # Its result interleaves lemmas with the separators between them
        # (spaces and a trailing newline); keep only non-blank items.
        return [lemma for lemma in lemmas if lemma.strip()]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]
  
    
def clean_html(text):
    """
    Clean text from html tags

    :param str text: markup to strip; None is tolerated
    :return str text: the text content of the markup, or "" when the input
        is None or could not be parsed
    """
    # Handle the missing-input case explicitly instead of relying on the
    # parser raising for None.
    if text is None:
        print("Exception in  clean_html. NoneType argument.")
        return ""

    try:
        return BeautifulSoup(text, "html").text
    except Exception as exc:
        # Best effort: a parse failure yields an empty string, but
        # KeyboardInterrupt / SystemExit are no longer swallowed and the
        # message reports what actually went wrong.
        print("Exception in  clean_html. {}".format(exc))
        return ""


def normalize_text(text, norm_type="stemming"):
    """
    Preprocess text data

    Steps: strip HTML, lower-case, detect the language, tokenize, optionally
    stem or lemmatize, then drop stop-words and non-alphabetic tokens.

    :param str text:
    :param str norm_type: "stemming", "lemmatization" or None; None and any
        other value skip the stemming/lemmatization step
    :return str text: the remaining tokens joined with single spaces
    """
    text = clean_html(text).lower()
    lang = detect_language(text)
    tokens = nltk.word_tokenize(text)
    # Languages the Snowball stemmer does not cover are handled as English.
    if lang not in SnowballStemmer.languages:
        lang = "english"

    # norm_type may be None, so only lower-case it when there is a value.
    mode = norm_type.lower() if norm_type is not None else None
    if mode == "lemmatization":
        tokens = lemmatization(tokens, lang)
    elif mode == "stemming":
        tokens = stemming(tokens, lang)

    # Excluding Stop-words. The list is loaded once into a set instead of
    # being re-read from the corpus for every token.
    stop_words = set(stopwords.words(lang))
    return " ".join(
        word for word in tokens
        if word not in stop_words and word.isalpha()
    )


In [None]:
# Flask application object; the @app.route decorators in this module register
# their handlers on it and the __main__ guard starts it.
app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the root URL: a fixed plain-text banner for GET and POST."""
    banner = "Main page of text preprocessor"
    return banner


@app.route("/normalize_document", methods=["POST"])
def normalize_document():
    """
    Fill in the normalized text and title of a posted document.

    The JSON request body is decoded with jsonpickle and must yield a
    Document. Its ``text_normalized`` and ``title_normalized`` attributes are
    set from ``text`` and ``title`` using the default normalization.

    :param Document document: jsonpickle-encoded, sent as the JSON body
    :return Document document: jsonpickle-encoded, with the normalized
        fields set; a 400 response when the body is not a Document
    """
    # SECURITY: jsonpickle.decode can instantiate arbitrary classes named in
    # the payload; only expose this endpoint to trusted callers.
    document = jsonpickle.decode(request.json)
    # Explicit check rather than `assert`, which is stripped under
    # `python -O` and would let any decoded object through.
    if not isinstance(document, Document):
        return "Request body must be a jsonpickle-encoded Document", 400

    document.text_normalized = normalize_text(document.text)
    document.title_normalized = normalize_text(document.title)

    return jsonpickle.encode(document)


@app.route("/normalize_query", methods=["POST"])
def normalize_query():
    """
    Normalize a query with the default settings of normalize_text.

    :param str text: query, sent as the JSON body
    :return str text: normalized query
    """
    return normalize_text(request.json)


@app.route("/stem_text", methods=["POST"])
def stem_text():
    """
    Normalize the posted text, explicitly requesting stemming.

    :param str text: text, sent as the JSON body
    :return str text: normalized text
    """
    payload = request.json
    return normalize_text(payload, norm_type="stemming")


@app.route("/lemmatize_text", methods=["POST"])
def lemmatize_text():
    """
    Normalize the posted text using lemmatization.

    :param str text: text, sent as the JSON body
    :return str text: normalized, lemmatized text
    """
    text = request.json
    # Request lemmatization by name; this handler passed norm_type=None,
    # which never selects the lemmatization branch of normalize_text.
    return normalize_text(text, norm_type="lemmatization")


@app.route("/tokenize_text", methods=["POST"])
def tokenize_text():
    """
    Normalize the posted text with norm_type="lemmatization".

    :param str text: text, sent as the JSON body
    :return str text: normalized text
    """
    # NOTE(review): despite the endpoint's name this requests lemmatization,
    # not plain tokenization -- confirm which behaviour clients expect.
    payload = request.json
    return normalize_text(payload, norm_type="lemmatization")


@app.route("/detect_language", methods=["POST"])
def det_lang():
    """
    Detect the language of a posted document's text.

    :param Document document: jsonpickle-encoded, sent as the JSON body
    :return Document document: With updated language attribute
    """
    # NOTE(review): jsonpickle.decode on request data can instantiate
    # arbitrary classes -- confirm that only trusted callers reach this route.
    doc = jsonpickle.decode(request.json)
    doc.language = detect_language(doc.text)
    return jsonpickle.encode(doc)


In [None]:
if __name__ == "__main__":
    # Start the Flask server on the host/port taken from the config module.
    server_options = {
        "host": config.TEXT_PROCESSING_HOST,
        "port": config.TEXT_PROCESSING_PORT,
    }
    app.run(**server_options)
