In [1]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources requested by this notebook: the 'stopwords' corpus
# (read through nltk.corpus.stopwords) and the 'punkt' tokenizer data.
# These calls need network access on first use.
nltk.download('stopwords')
nltk.download('punkt')
def _calculate_languages_ratios(text):
    """
    Score the given text against every language of the NLTK stopwords corpus.

    The score of a language is the number of distinct stopwords of that
    language that occur in the text. It is a raw count, not a normalised
    probability. The result looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique
             stopwords of that language seen in the analyzed text
    @rtype: dict
    """

    # wordpunct_tokenize() splits all punctuation into separate tokens:
    #
    #   >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    #   ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    #
    # The set of lower-cased tokens does not depend on the language, so it is
    # built once here instead of once per loop iteration.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    languages_ratios = {}

    # For each language shipped with the corpus, count the unique stopwords
    # that also appear in the analyzed text.
    for language in stopwords.fileids():
        common_elements = words_set.intersection(stopwords.words(language))
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language of a text and return the best scoring candidate.

    Every candidate language is scored by _calculate_languages_ratios(),
    i.e. by the number of unique stopwords of that language found in the
    text. When several languages share the top score, the first one in the
    score dictionary's iteration order is returned.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the highest scored language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)

    # max() keeps the first maximal item, so ties resolve to the earliest key.
    best_language, _best_score = max(scores.items(), key=lambda item: item[1])
    return best_language

[nltk_data] Error loading stopwords: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading punkt: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


In [2]:
from wtforms import Form, StringField, SelectField

class SearchForm(Form):
    """WTForms form exposing a single text input for a search request."""
    # Text field labelled 'Enter your search request:'; no validators attached.
    search = StringField('Enter your search request:')

In [8]:
def stemm_analyze(text, lang):
    """
    Tokenize a text, drop stopwords and non-alphabetic tokens, and stem the rest.

    Stopwords are removed BEFORE stemming: the stopword lists contain
    unstemmed words, so filtering stemmed tokens lets stopwords slip through
    (e.g. "this" is stemmed to "thi", which is not in the list).

    Stemming always uses PorterStemmer, whatever the value of lang.

    @param text: Text to analyze
    @type text: str
    @param lang: Language name as used by the NLTK stopwords corpus
                 (e.g. 'english'); selects the stopword list. Values the
                 corpus does not know fall back to 'english'.
    @type lang: str

    @return: Lower-cased stems of the remaining words, in text order
    @rtype: list
    """
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    # Load the stopword list once, as a set, instead of re-reading the
    # corpus for every token.
    stopwords_language = lang if lang in stopwords.fileids() else 'english'
    stop_set = set(stopwords.words(stopwords_language))

    ps = PorterStemmer()
    filtered_words = []
    for token in word_tokenize(text):
        word = token.lower()
        # Keep purely alphabetic, non-stopword tokens only.
        if word.isalpha() and word not in stop_set:
            filtered_words.append(ps.stem(word).lower())
    return filtered_words

In [4]:
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField

class ReusableForm(Form):
    """WTForms form with a single required text input called ``name``."""
    # StringField / DataRequired replace the deprecated TextField / required
    # aliases, which emit a DeprecationWarning and are gone in WTForms 3.
    name = StringField('Name:', validators=[validators.DataRequired()])

  after removing the cwd from sys.path.


In [None]:
from flask import Flask, request, flash, render_template, redirect
import json

# Flask application object; the @app.route decorators in this cell register
# their view functions on it.
app = Flask(__name__)
@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Landing page.

    GET renders 'index.html' with a ReusableForm bound to the request's form
    data. POST takes the submitted 'name' field as the query and returns the
    response produced by rev_ind() for it.
    """
    form = ReusableForm(request.form)

    # Anything that is not a form submission just shows the page.
    if request.method != 'POST':
        return render_template('index.html', form=form)

    submitted_query = request.form['name']
    return rev_ind(submitted_query)
    

@app.route("/test", methods=["POST"])
def test(j=None):
    """
    Detect the language of a text and report it as a JSON string.

    Works both as a plain function (pass the text as ``j``) and as the
    POST /test view. The route has no URL variables, so Flask calls the view
    without arguments; in that case the text is read from the request body.

    @param j: Text to analyze; None means "read it from the request"
    @type j: str

    @return: JSON document with the keys "status", "got_data" and "data_len",
             or a (JSON error document, 400) pair when called as a view
             without a usable body
    @rtype: str
    """
    if j is None:
        # NOTE(review): assumes the POST body is a JSON string, as the
        # previously commented-out `j = request.json` suggests. Confirm
        # against the clients of this endpoint.
        j = request.get_json(silent=True)
        if not isinstance(j, str):
            return json.dumps({"status": "error", "message": "expected a JSON string body"}), 400

    l = detect_language(j)
    # NOTE(review): "data_len" carries the detected language name, not a
    # length; the key is kept as-is so existing consumers keep working.
    return json.dumps({"status":"ok", "got_data":j, "data_len":l})
    
@app.route("/det_lang", methods=["POST"])
def det_lang(query=None):
    """
    Detect the language of a query.

    Works both as a plain function (pass the text as ``query``) and as the
    POST /det_lang view. The route has no URL variables, so Flask calls the
    view without arguments; in that case the text is read from the request
    body.

    @param query: Text to analyze; None means "read it from the request"
    @type query: str

    @return: When called with a query: dict with the keys "status",
             "got_data" and "language". When called as a view: the same data
             serialized to a JSON string (dict return values are only
             accepted by Flask 1.1 and later), or a (JSON error document,
             400) pair if the body is unusable.
    @rtype: dict or str
    """
    called_as_view = query is None
    if called_as_view:
        # NOTE(review): assumes the POST body is a JSON string, as the
        # previously commented-out `j = request.json` suggests. Confirm
        # against the clients of this endpoint.
        query = request.get_json(silent=True)
        if not isinstance(query, str):
            return json.dumps({"status": "error", "message": "expected a JSON string body"}), 400

    l = detect_language(query)
    result = {"status":"ok", "got_data":query, "language":l}
    return json.dumps(result) if called_as_view else result
    

@app.route("/stemm", methods=["POST"])
def stemm_serv(query=None):
    """
    Detect the language of a query and stem it.

    Works both as a plain function (pass the text as ``query``) and as the
    POST /stemm view. The route has no URL variables, so Flask calls the view
    without arguments; in that case the text is read from the request body.

    @param query: Text to analyze; None means "read it from the request"
    @type query: str

    @return: When called with a query: dict with the keys "lang", "text" and
             "stemmed_text". When called as a view: the same data serialized
             to a JSON string (dict return values are only accepted by Flask
             1.1 and later), or a (JSON error document, 400) pair if the
             body is unusable.
    @rtype: dict or str
    """
    called_as_view = query is None
    if called_as_view:
        # NOTE(review): assumes the POST body is a JSON string; confirm
        # against the clients of this endpoint.
        query = request.get_json(silent=True)
        if not isinstance(query, str):
            return json.dumps({"status": "error", "message": "expected a JSON string body"}), 400

    # det_lang() is given the query explicitly, so it returns a dict here.
    lang_out = det_lang(query)
    stemmed = stemm_analyze(query, lang_out["language"])
    result = {"lang":lang_out["language"], "text":query, "stemmed_text":stemmed}
    return json.dumps(result) if called_as_view else result
    
@app.route("/rev_ind", methods=["POST"])
def rev_ind(query=None):
    """
    Stem a query and report the length of every stemmed word.

    Works both as a plain function (pass the text as ``query``, as index()
    does) and as the POST /rev_ind view. The route has no URL variables, so
    Flask calls the view without arguments; in that case the text is read
    from the request body.

    @param query: Text to analyze; None means "read it from the request"
    @type query: str

    @return: JSON document with "idxs" (length of each stemmed word) and
             "stemmed_text" (the stemmed words), or a (JSON error document,
             400) pair when called as a view without a usable body
    @rtype: str
    """
    if query is None:
        # NOTE(review): assumes the POST body is a JSON string, as the
        # previously commented-out `j = request.json` suggests. Confirm
        # against the clients of this endpoint.
        query = request.get_json(silent=True)
        if not isinstance(query, str):
            return json.dumps({"status": "error", "message": "expected a JSON string body"}), 400

    # stemm_serv() is given the query explicitly, so it returns a dict here.
    stemmed_text = stemm_serv(query)["stemmed_text"]
    idx = [len(word) for word in stemmed_text]
    return json.dumps({"idxs":idx, "stemmed_text":stemmed_text})

@app.route("/logging", methods=["POST"])
def loggs():
    """
    Report the length of every entry of an already stemmed text.

    Expects a JSON object body with a "stemmed_text" key holding the list of
    stemmed words.

    @return: JSON document with "idxs" (length of each entry) and
             "stemmed_text" (the entries as received), or a (JSON error
             document, 400) pair when the body is missing, is not a JSON
             object, or lacks the "stemmed_text" key
    @rtype: str
    """
    # get_json(silent=True) yields None instead of failing on a missing or
    # malformed JSON body, so bad requests get a 400 rather than a 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "stemmed_text" not in payload:
        return json.dumps({"status": "error", "message": "expected a JSON object with a \"stemmed_text\" key"}), 400

    stemmed_text = payload["stemmed_text"]
    idx = [len(word) for word in stemmed_text]
    # The response key used to be the typo "tstemmed_textext"; it now matches
    # the "stemmed_text" key returned by rev_ind().
    return json.dumps({"idxs":idx, "stemmed_text":stemmed_text})


# Start Flask's built-in server when this code runs as the main module.
# host='0.0.0.0' listens on every network interface, so the app is reachable
# from other machines on port 13501. app.run() blocks until the server stops.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=13501)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:13501/ (Press CTRL+C to quit)
127.0.0.1 - - [06/Nov/2018 13:26:55] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [06/Nov/2018 13:26:56] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2018 13:26:56] "GET /favicon.ico HTTP/1.1" 404 -
