In [88]:
import pandas
import textblob

In [1]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
# Download the NLTK 'stopwords' and 'punkt' packages; the stopword lists are
# read through `stopwords` by the language-detection helpers defined below.
nltk.download('stopwords')
nltk.download('punkt')
def _calculate_languages_ratios(text):
    """
    Score the given text against every language of the NLTK stopwords corpus.

    The score of a language is the number of distinct stopwords of that
    language that occur in the text (a count, not a probability), so the
    result looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language want to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of its unique
             stopwords seen in the analyzed text
    @rtype: dict
    """
    # wordpunct_tokenize() splits all punctuation into separate tokens, e.g.
    #   "I'll be there in ten." -> ['I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    # The lower-cased token set is the same for every language, so build it
    # once here instead of once per loop iteration.
    words_set = {token.lower() for token in wordpunct_tokenize(text)}

    # set.intersection() accepts any iterable, so the stopword list can be
    # passed directly; the length of the result is the language "score".
    return {
        language: len(words_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }


#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language of a text with a stopword-based heuristic.

    Every candidate language is scored by _calculate_languages_ratios() and
    the language with the highest score is returned. When several languages
    share the top score, the one that appears first in the score dictionary
    wins.

    @param text: Text whose language want to be detected
    @type text: str

    @return: Name of the best scoring language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)

    # max() keeps the first maximal item, so iterating the (language, score)
    # pairs preserves the dictionary order on ties.
    best_language, _best_score = max(scores.items(), key=lambda item: item[1])
    return best_language

[nltk_data] Downloading package stopwords to /home/math/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/math/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [87]:
def add_inverted_index(document):
    """
    Add the tokens of one document to the module-level inverted index.

    Relies on two module-level names that are not defined in the visible part
    of this notebook: `inverted_index` (token -> list of postings) and
    `documents_id` (ids that must be skipped). This function does not add the
    document id to `documents_id`.

    @param document: Mapping with at least the keys "id" and "text_searchable"
    @type document: dict
    """
    # Only the `textblob` module is imported by this notebook, so the class
    # has to be reached through it; a bare `TextBlob` raises NameError.
    tokens = textblob.TextBlob(document["text_searchable"]).words

    if document["id"] in documents_id:
        return

    for token in tokens:
        # setdefault() creates the posting list on first sight of a token.
        inverted_index.setdefault(token, []).append({"doc_id": document["id"]})

In [2]:
#!pip install wtforms
from wtforms import Form, StringField, SelectField

class SearchForm(Form):
    """Form with a single free-text field that holds the search request."""

    search = StringField(label='Enter your search request:')

In [3]:
def stemm_analyze(text, lang):
    """
    Tokenize a text, drop English stopwords and non-alphabetic tokens, and
    return the Porter stems of the remaining words.

    @param text: Text to analyze
    @type text: str
    @param lang: Language of the text. Currently unused: stopword removal and
                 stemming are always done for English. Kept so existing
                 callers keep working.
    @type lang: str

    @return: Lower-case stems, in the order the words appear in the text
    @rtype: list
    """
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    stemmer = PorterStemmer()
    # Load the stopword list once, as a set, instead of re-reading it for
    # every token.
    english_stopwords = set(stopwords.words('english'))

    stems = []
    for token in word_tokenize(text):
        word = token.lower()
        # Check the surface form BEFORE stemming: stems such as "thi" (this)
        # or "wa" (was) no longer match the stopword list and used to leak
        # into the result.
        if not word.isalpha() or word in english_stopwords:
            continue
        stem = stemmer.stem(word)
        # Keep the original post-stemming check too, so words whose stem is a
        # stopword are still dropped exactly as before.
        if stem in english_stopwords:
            continue
        stems.append(stem)
    return stems

In [4]:
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField

class ReusableForm(Form):
    """Single-field form; the submitted value is posted under the field name ``name``."""

    # StringField / DataRequired are the supported names for the deprecated
    # TextField / required aliases, which WTForms 3.0 removed.
    name = StringField('Name:', validators=[validators.DataRequired()])

  after removing the cwd from sys.path.


In [8]:
from flask import Flask, request, flash, render_template, redirect
import json

# Flask application object; the view functions below register their routes on it.
app = Flask(__name__)
@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Render the query form (GET) or process a submitted query (POST).

    A valid POST is handed to rev_ind() and its response is returned as-is.
    A GET, or a POST whose form does not validate, renders 'index.html'
    with the form.
    """
    form = ReusableForm(request.form)

    # Go through form validation instead of indexing request.form['name']
    # directly: a POST without a usable 'name' value used to abort with
    # HTTP 400; now the form is simply shown again.
    if request.method == 'POST' and form.validate():
        return rev_ind(form.name.data)

    return render_template('index.html', form=form)
    

@app.route("/test", methods=["POST"])
def test(j=None):
    """
    Detect the language of a text and echo it back as a JSON string.

    Usable both as a Flask view (no argument) and as a plain helper called
    with the text.

    @param j: Text to analyse. When None, the parsed JSON body of the current
              request is used.
              NOTE(review): the body is assumed to be a bare JSON string,
              since it is passed straight to detect_language() - confirm.
    @return: JSON string with the keys "status", "got_data" and "data_len"
    @rtype: str
    """
    if j is None:
        # Flask calls a view with no positional arguments when its URL rule
        # has no variables, so the former required parameter made this route
        # fail with TypeError.
        j = request.json
    language = detect_language(j)
    # NOTE(review): "data_len" carries the detected language, not a length;
    # the key is kept unchanged for existing clients.
    return json.dumps({"status":"ok", "got_data":j, "data_len":language})
    
@app.route("/det_lang", methods=["POST"])
def det_lang(query=None):
    """
    Detect the language of a query.

    Usable both as a Flask view (no argument) and as a plain helper called
    with the text.

    @param query: Text to analyse. When None, the parsed JSON body of the
                  current request is used.
                  NOTE(review): the body is assumed to be a bare JSON string,
                  since it is passed straight to detect_language() - confirm.
    @return: Dictionary with the keys "status", "got_data" and "language"
    @rtype: dict
    """
    if query is None:
        # Flask calls a view with no positional arguments when its URL rule
        # has no variables, so the former required parameter made this route
        # fail with TypeError.
        query = request.json
    language = detect_language(query)
    return {"status":"ok", "got_data":query, "language":language}
    

@app.route("/stemm", methods=["POST"])
def stemm_serv(query=None):
    """
    Detect the language of a query and stem it.

    Usable both as a Flask view (no argument) and as a plain helper called
    with the text.

    @param query: Text to process. When None, the parsed JSON body of the
                  current request is used.
                  NOTE(review): the body is assumed to be a bare JSON string - confirm.
    @return: Dictionary with the keys "lang", "text" and "stemmed_text"
    @rtype: dict
    """
    if query is None:
        # Flask calls a view with no positional arguments when its URL rule
        # has no variables, so the former required parameter made this route
        # fail with TypeError.
        query = request.json
    language = det_lang(query)["language"]
    stemmed = stemm_analyze(query, language)
    return {"lang":language, "text":query, "stemmed_text":stemmed}
    
@app.route("/rev_ind", methods=["POST"])
def rev_ind(query=None):
    """
    Stem a query and return the stems with one integer per stem.

    Usable both as a Flask view (no argument) and as a plain helper called
    with the text (this is how index() uses it).

    @param query: Text to process. When None, the parsed JSON body of the
                  current request is used.
                  NOTE(review): the body is assumed to be a bare JSON string - confirm.
    @return: JSON string with the keys "idxs" (the length of every stem) and
             "stemmed_text" (the stems)
    @rtype: str
    """
    if query is None:
        # Flask calls a view with no positional arguments when its URL rule
        # has no variables, so the former required parameter made this route
        # fail with TypeError.
        query = request.json
    stemmed_text = stemm_serv(query)["stemmed_text"]
    # NOTE(review): "idxs" currently holds word lengths, which looks like a
    # placeholder for a real index lookup - confirm.
    idx = [len(word) for word in stemmed_text]
    return json.dumps({"idxs":idx, "stemmed_text":stemmed_text})

@app.route("/logging", methods=["POST"])
def loggs():
    """
    Echo the "stemmed_text" list of the JSON request body together with the
    length of each of its items.

    @return: JSON string with the keys "idxs" and "tstemmed_textext"
    @rtype: str
    """
    payload = request.json
    stemmed_text = payload["stemmed_text"]
    word_lengths = [len(word) for word in stemmed_text]
    # NOTE(review): the key "tstemmed_textext" looks like a paste accident for
    # "stemmed_text"; it is kept as-is because clients may depend on it.
    response = {"idxs": word_lengths, "tstemmed_textext": stemmed_text}
    return json.dumps(response)


@app.route("/search", methods=["POST"])
def search():
    """
    Look up the "search_word" of the JSON request body in the module-level
    `inverse_ind` dictionary.

    @return: JSON string with the keys "idxs" (a list holding the index entry
             of the word, or an empty list when the word is unknown) and
             "words" (the word that was searched)
    @rtype: str
    """
    search_word = request.json["search_word"]
    try:
        # Direct dictionary lookup instead of comparing against every key.
        idx = [inverse_ind[search_word]]
    except (KeyError, TypeError):
        # KeyError: unknown word. TypeError: unhashable value (JSON list or
        # object), which can never equal a key - same empty result as before.
        idx = []
    return json.dumps({"idxs":idx, "words":search_word})






# Start Flask's built-in server when this code is executed directly.
# NOTE(review): host '0.0.0.0' listens on all network interfaces, so the
# server is reachable from other machines - confirm this is intended.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=13501)

 * Running on http://0.0.0.0:13501/ (Press CTRL+C to quit)
127.0.0.1 - - [08/Nov/2018 16:26:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Nov/2018 16:27:10] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Nov/2018 16:28:04] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Nov/2018 16:28:15] "POST / HTTP/1.1" 400 -
127.0.0.1 - - [08/Nov/2018 16:28:29] "GET / HTTP/1.1" 200 -
62.80.177.194 - - [08/Nov/2018 16:29:50] code 400, message Bad request syntax ('\x16\x03\x01\x02\x00\x01\x00\x01ü\x03\x03\x13p4vN¦O\x15ñdl\x8dO£yV8\x92¶')
62.80.177.194 - - [08/Nov/2018 16:29:50] "  üp4vN¦OñdlO£yV8¶" HTTPStatus.BAD_REQUEST -
62.80.177.194 - - [08/Nov/2018 16:29:50] code 400, message Bad request syntax ('\x16\x03\x01\x02\x00\x01\x00\x01ü\x03\x03h£¬%\x86\x11§ê¼\x9fpàv\x8ac\x0fOÈü\x95;\x8bÍr¬c\x7fY¢\x0b\x983 Âe\x116¼Ít^\x1a¹)\t\x7f\tl{Ê\x92\x8e#^é¶(Ô7¶\x0f±¼T\x87\x00$\x13\x01\x13\x03\x13\x02À+À/Ì©Ì¨À,À0À')
62.80.177.194 - - [08/Nov/2018 16:29:50] "  üh£¬%§ê¼pàvcOÈü;Ír¬cY¢3 Âe6¼Ít^¹)		l{

In [86]:
import pandas as pd

In [None]:
import 

In [117]:
# Load the tab-separated evaluation texts.
# NOTE(review): the path is relative to the working directory, and `df` is not
# used by any other cell visible in this notebook.
df = pd.read_csv("../../../Data/eval_texts.csv", sep='\t')

In [79]:
# Seed the forward index with one placeholder document. The values are
# wrapped in lists because a dict of bare scalars needs an explicit index.
rrrow = dict(id=[1], text=["fdfdf"])
forw_index = pd.DataFrame(rrrow)

ValueError: If using all scalar values, you must pass an index

In [74]:
# Display the current forward index.
forw_index

Unnamed: 0,id,text
0,1,fdfdf


In [77]:
def add_forw_index(row):
    """
    Return a copy of the forward index with one document row appended.

    The module-level `forw_index` is read but not modified, so the caller has
    to keep the returned frame. The index labels of both frames are kept,
    so the result can contain duplicate labels.

    @param row: Mapping with at least the keys "id" and "text"; it is left
                untouched
    @type row: dict

    @return: New DataFrame made of `forw_index` followed by the new row
    @rtype: pandas.DataFrame
    """
    # Work on a shallow copy: wrapping the values in place used to mutate the
    # caller's dict, so a second call with the same dict nested the lists.
    record = dict(row)
    record["id"] = [row["id"]]
    record["text"] = [row["text"]]
    new_row = pd.DataFrame.from_dict(record)

    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
    # supported equivalent and keeps the same index labels.
    return pd.concat([forw_index, new_row])

In [82]:
# Sample document row (scalar values) used to try out add_forw_index().
rrroww = {"id":3334, "text":"fdffssgddfdfdf"}

In [83]:
# The returned frame is only displayed here; it is not assigned back to
# `forw_index`, so the stored forward index stays unchanged.
add_forw_index(rrroww)

Unnamed: 0,id,text
0,1,fdfdf
0,3334,fdffssgddfdfdf


In [84]:
def add_inverted_index(document):
    """
    Add the tokens of one document to the module-level inverted index.

    NOTE(review): this notebook defines add_inverted_index in two cells with
    the same logic; whichever cell runs last wins - consider keeping one.

    Relies on two module-level names that are not defined in the visible part
    of this notebook: `inverted_index` (token -> list of postings) and
    `documents_id` (ids that must be skipped). This function does not add the
    document id to `documents_id`.

    @param document: Mapping with at least the keys "id" and "text_searchable"
    @type document: dict
    """
    # Only the `textblob` module is imported by this notebook, so the class
    # has to be reached through it; a bare `TextBlob` raises NameError.
    tokens = textblob.TextBlob(document["text_searchable"]).words

    if document["id"] in documents_id:
        return

    for token in tokens:
        # setdefault() creates the posting list on first sight of a token.
        inverted_index.setdefault(token, []).append({"doc_id": document["id"]})

In [114]:
# Toy inverted index (word -> {"docid": [...]}) read by search() and
# searchsearch_word().
# NOTE(review): add_inverted_index() fills a different name (`inverted_index`)
# with a different entry shape (a list of {"doc_id": ...}) - confirm which
# layout is intended.
inverse_ind = {"w1":{"docid":[1]}}

In [115]:
def searchsearch_word(search_word):
    """
    Look up a word in the module-level `inverse_ind` dictionary.

    @param search_word: Word to look up
    @type search_word: str

    @return: List holding the index entry of the word, or an empty list when
             the word is not in the index
    @rtype: list
    """
    try:
        # Direct dictionary lookup instead of comparing against every key.
        return [inverse_ind[search_word]]
    except (KeyError, TypeError):
        # KeyError: unknown word. TypeError: unhashable argument, which can
        # never equal a key - same empty result as before.
        return []

In [116]:
# Smoke check: "w1" is the only key of the toy index defined above.
searchsearch_word("w1")

[{'docid': [1]}]