In [1]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def _calculate_languages_ratios(text):
    """
    Score the given text against every stopword list available in NLTK.

    The score of a language is the number of distinct stopwords of that
    language that occur in the text. It is an integer count, not a
    probability, e.g. {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language known to the NLTK stopwords
             corpus to the number of unique stopwords seen in the text
    @rtype: dict
    """
    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
    #
    #   >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    #   ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    #
    # Tokens are lower-cased before being compared with the stopword lists.
    # The set of words does not depend on the language, so build it once
    # instead of once per language.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    # Per language: how many unique stopwords of that language appear in the
    # text. Languages are visited in stopwords.fileids() order, which is also
    # the insertion order of the returned dictionary.
    return {
        language: len(words_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }


#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language of a text from the stopwords it contains.

    Every language is scored by _calculate_languages_ratios() with the number
    of unique stopwords of that language found in the text; the language with
    the highest score is returned. On a tie, max() keeps the first language
    encountered while iterating the scores dictionary.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the best scoring language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    return max(scores, key=lambda language: scores.get(language))

In [2]:
def stemm_analyze(text, lang):
    """
    Tokenize and stem a text, dropping stopwords and non-alphabetic tokens.

    The text is split with nltk.word_tokenize(), every token is stemmed with
    the Porter stemmer, and stems that are English stopwords or that are not
    purely alphabetic are discarded.

    @param text: Text to analyze
    @type text: str
    @param lang: Language of the text. Accepted for interface compatibility
                 but currently unused: stemming and stopword filtering are
                 always done for English.
    @type lang: str

    @return: Lower-cased stems of the remaining words, in text order
    @rtype: list
    """
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    # word_tokenize() needs the 'punkt' tokenizer models. Only download them
    # when they are missing instead of contacting the server on every call.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Stemming (of the caller's text, not of a hard-coded sample sentence)
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(text)]

    # Excluding stop-words; load the list once rather than once per token
    english_stopwords = set(stopwords.words('english'))
    return [
        stem.lower()
        for stem in stems
        if stem not in english_stopwords and stem.isalpha()
    ]

In [None]:
from flask import Flask, request
import json

# Flask application object; the @app.route decorators below register the
# request handlers on it.
app = Flask(__name__)

@app.route("/")
def hello():
    """Answer requests for the root URL with a fixed greeting string."""
    greeting = "Hello World Scissors!"
    return greeting

@app.route("/test", methods=["POST"])
def test():
    """
    Handle POST /test: detect the language of the posted text.

    Reads the "text" field of the JSON request body and returns a JSON
    string with the keys "status", "got_data" (the text as received) and
    "data_len".

    @return: JSON encoded response
    @rtype: str
    """
    payload = request.json
    language = detect_language(payload["text"])
    # NOTE(review): "data_len" carries the detected language, not a length.
    # The key is kept unchanged so existing clients keep working.
    return json.dumps({"status":"ok", "got_data":payload["text"], "data_len":language})
    
@app.route("/det_lang", methods=["POST"])
def det_lang():
    """
    Handle POST /det_lang: detect the language of the posted text.

    Reads the "text" field of the JSON request body and returns a JSON
    string with the keys "status", "got_data" (the text as received) and
    "language" (the result of detect_language()).

    @return: JSON encoded response
    @rtype: str
    """
    payload = request.json
    detected = detect_language(payload["text"])
    response = {"status": "ok", "got_data": payload["text"], "language": detected}
    return json.dumps(response)
    

@app.route("/stemm", methods=["POST"])
def stemm_serv():
    """
    Handle POST /stemm: stem the posted text.

    Reads the "text" and "lang" fields of the JSON request body, passes them
    to stemm_analyze() and returns a JSON string with the keys "lang",
    "text" and "stemmed_text".

    @return: JSON encoded response
    @rtype: str
    """
    payload = request.json
    stems = stemm_analyze(payload["text"], payload["lang"])
    response = {
        "lang": payload["lang"],
        "text": payload["text"],
        "stemmed_text": stems,
    }
    return json.dumps(response)
    
@app.route("/rev_ind", methods=["POST"])
def rev_ind():
    """
    Handle POST /rev_ind: report the length of every posted stem.

    Reads the "stemmed_text" field of the JSON request body and returns a
    JSON string with "idxs" (the len() of each element, in order) and
    "tstemmed_textext" (the posted "stemmed_text" value, echoed back).

    @return: JSON encoded response
    @rtype: str
    """
    payload = request.json
    stems = payload["stemmed_text"]
    lengths = [len(stem) for stem in stems]
    return json.dumps({"idxs": lengths, "tstemmed_textext": stems})

@app.route("/logging", methods=["POST"])
def loggs():
    """
    Handle POST /logging: report the length of every posted stem.

    Reads the "stemmed_text" field of the JSON request body and returns a
    JSON string with "idxs" (the len() of each element, in order) and
    "tstemmed_textext" (the posted "stemmed_text" value, echoed back).

    @return: JSON encoded response
    @rtype: str
    """
    body = request.json
    received = body["stemmed_text"]
    sizes = list(map(len, received))
    return json.dumps({"idxs": sizes, "tstemmed_textext": received})


# Start Flask's built-in server only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    # Host 0.0.0.0 makes the server listen on all network interfaces,
    # on port 13501.
    app.run(host='0.0.0.0', port=13501)

 * Running on http://0.0.0.0:13501/ (Press CTRL+C to quit)
127.0.0.1 - - [01/Nov/2018 18:43:24] "POST /rev_ind HTTP/1.1" 200 -
