In [1]:
from flask import Flask, request, render_template
import json
from textblob import TextBlob

import pandas as pd

In [23]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources requested by this notebook: the stop-word corpus and 'punkt'.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

def _calculate_languages_ratios(text):
    """
    Score ``text`` against the stop-word list of every language known to the
    NLTK stop-word corpus and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    The score of a language is the number of distinct stop-words of that
    language found among the lower-cased tokens of ``text``.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique stop-words seen
    @rtype: dict
    """
    # wordpunct_tokenize() splits punctuation into separate tokens, e.g.
    #   "That's thirty." -> ['That', "'", 's', 'thirty', '.']
    # Tokens are lower-cased and collected into a set once, before the per-language loop.
    seen_words = {token.lower() for token in wordpunct_tokenize(text)}

    return {
        language: len(seen_words & set(stopwords.words(language)))  # language "score"
        for language in stopwords.fileids()
    }


#----------------------------------------------------------------------
def detect_language(text):
    """
    Guess the language of ``text``.

    Every language is scored by _calculate_languages_ratios() (number of
    unique stop-words of that language seen in the text) and the language
    with the highest score is returned. On a tie the language that comes
    first in the score dictionary wins.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Name of the best-scoring language
    @rtype: str
    """
    scores = _calculate_languages_ratios(text)
    best_language, _best_score = max(scores.items(), key=lambda item: item[1])
    return best_language

def stemm_analyze(text, lang):
    """
    Tokenize ``text``, drop stop-words and non-alphabetic tokens, and stem the rest.

    @param text: Text to analyze
    @type text: str
    @param lang: Language name (an NLTK stop-word corpus id such as 'english'
        or 'russian'). If no Snowball stemmer exists for it the Russian
        stemmer is used, and if no stop-word list exists for it the English
        list is used.
    @type lang: str

    @return: Lower-cased stems, in the order the tokens appear in the text
    @rtype: list
    """
    from nltk.corpus import stopwords
    from nltk.stem.snowball import RussianStemmer, SnowballStemmer
    from nltk.tokenize import word_tokenize

    # Honour the requested language, falling back to the Russian stemmer.
    if lang in SnowballStemmer.languages:
        stemmer = SnowballStemmer(lang)
    else:
        stemmer = RussianStemmer()

    stopword_lang = lang if lang in stopwords.fileids() else 'english'
    # Build the stop-word set once instead of calling stopwords.words() for every token.
    stop_words = set(stopwords.words(stopword_lang))

    stems = []
    for token in word_tokenize(text):
        word = token.lower()
        # Compare the lower-cased token so capitalised stop-words ("The") are dropped too.
        if word.isalpha() and word not in stop_words:
            stems.append(stemmer.stem(word))
    return stems

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vzalevskyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vzalevskyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Only the first 100 rows are used, so let pandas stop reading there instead of
# parsing the whole file and slicing the result afterwards.
df = pd.read_csv("../../Downloads/eval_texts.csv", sep='\t', nrows=100)

KeyboardInterrupt: 

In [4]:
# In-memory search state shared by the indexing and search helpers.
inverted_index = dict()   # token -> list of {"doc_id": ...} postings
forward_index = list()    # {"doc_id": ..., "text": ...} records
documents_id = list()     # ids of documents that were already registered

In [5]:
def add_forward_index(document):
    """
    Store ``document`` in ``forward_index`` unless its id is already stored there.

    The duplicate check looks at ``forward_index`` itself rather than at
    ``documents_id``: that list can already contain the id when an indexing
    helper that appends to it ran first, and gating on it would then skip the
    document entirely. The id is still recorded in ``documents_id`` so code
    that consults that list keeps working.

    @param document: Mapping with an "id" (convertible to int) and a "text" entry
    @type document: dict
    """
    doc_id = int(document["id"])

    if all(entry["doc_id"] != doc_id for entry in forward_index):
        forward_index.append({"doc_id": doc_id, "text": document["text"]})

    if document["id"] not in documents_id:
        documents_id.append(document["id"])

In [None]:
def search_one(words):
    """
    Return the ids of the documents that contain every indexed word of the query.

    Query words that are absent from ``inverted_index`` are ignored. When none
    of the words is indexed (including an empty query) an empty list is
    returned instead of raising IndexError.

    @param words: Whitespace-separated query
    @type words: str

    @return: Matching document ids, in no particular order
    @rtype: list
    """
    doc_sets = [
        {posting["doc_id"] for posting in inverted_index[word]}
        for word in words.split()
        if word in inverted_index
    ]
    if not doc_sets:
        return []
    return list(set.intersection(*doc_sets))

In [None]:
def add_inverted_index(document):
    """
    Add the tokens of ``document["text_searchable"]`` to ``inverted_index``.

    Documents whose id is already in ``documents_id`` are skipped. Otherwise
    the id is appended to ``documents_id`` and one {"doc_id": int(id)} posting
    is added per distinct token, so a token repeated inside one document is
    listed once.

    NOTE: a later cell of this notebook defines ``add_inverted_index`` again;
    whichever definition ran last is the one in effect.

    @param document: Mapping with an "id" (convertible to int) and a "text_searchable" entry
    @type document: dict
    """
    if document["id"] in documents_id:
        return  # already indexed; skip tokenization altogether

    tokens = TextBlob(document["text_searchable"]).words
    doc_id = int(document["id"])
    documents_id.append(document["id"])

    # dict.fromkeys collapses repeated tokens while keeping first-seen order.
    for token in dict.fromkeys(tokens):
        inverted_index.setdefault(token, []).append({"doc_id": doc_id})
          

In [None]:
%%time
# The counter comes from range(len(df)), i.e. it is a position, so .iloc is the
# matching indexer (.loc would look the counter up as an index label).
for i in range(len(df)):
    doc = df.iloc[i].to_dict()
    add_inverted_index(doc)
    add_forward_index(doc)

In [9]:
def add_inverted_index(document):
    """
    Add the tokens of ``document["text_searchable"]`` to ``inverted_index``.

    Documents whose id is already in ``documents_id`` are skipped. This
    function only reads ``documents_id``; it does not append to it.
    One {"doc_id": int(id)} posting is added per distinct token: the id is
    stored as an int, matching what ``add_forward_index`` stores, and a token
    repeated inside one document is listed once.

    @param document: Mapping with an "id" (convertible to int) and a "text_searchable" entry
    @type document: dict
    """
    if document["id"] in documents_id:
        return  # already indexed; skip tokenization altogether

    tokens = TextBlob(document["text_searchable"]).words
    doc_id = int(document["id"])

    # dict.fromkeys collapses repeated tokens while keeping first-seen order.
    for token in dict.fromkeys(tokens):
        inverted_index.setdefault(token, []).append({"doc_id": doc_id})

In [10]:
# Toy index with a single key, used to try out searchsearch_word().
inverse_ind = {
    "w1": {"docid": [1]},
}

In [11]:
def searchsearch_word(search_word):
    """
    Look ``search_word`` up in the ``inverse_ind`` dictionary.

    @param search_word: Key to look for
    @return: List holding the entry stored under ``search_word``, or an empty
        list when no key of ``inverse_ind`` equals it
    @rtype: list
    """
    return [inverse_ind[key] for key in inverse_ind.keys() if search_word == key]

In [12]:
# Manual check of searchsearch_word() with the key "w1".
searchsearch_word("w1")

[{'docid': [1]}]

In [13]:
from wtforms import Form, TextField, TextAreaField, validators, StringField, SubmitField

class ReusableForm(Form):
    """Form with a single required text field, ``name``, holding the user's request."""
    # StringField / DataRequired replace the deprecated TextField /
    # validators.required aliases, which emit a deprecation warning.
    name = StringField('Enter your request:', validators=[validators.DataRequired()])

  after removing the cwd from sys.path.


In [29]:
import requests

In [61]:
# Flask application object; the @app.route decorators in this cell register their views on it.
app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    """
    Serve the search page.

    On POST the text of the form's ``name`` field is stemmed with
    stemm_serv() and the result of get_snippets() for the stemmed query is
    returned. Any other method renders ``index.html`` with the form.
    """
    form = ReusableForm(request.form)
    if request.method != 'POST':
        return render_template('index.html', form=form)

    stemmed_query = stemm_serv(request.form['name'])["stemmed_text"]
    return get_snippets(q=stemmed_query)
    
@app.route("/test", methods=["POST"])
def test(j=None):
    """
    Detect the language of the given text and echo it back as a JSON string.

    Flask calls a view without positional arguments when its URL rule has no
    variables, so ``j`` is optional; when omitted it is read from the JSON
    body of the current request.

    @param j: Text to analyse; defaults to the request's JSON body
    @return: JSON string with the keys "status", "got_data" and "data_len"
    @rtype: str
    """
    if j is None:
        # NOTE(review): assumes the body is the bare text (a JSON string) — confirm with clients.
        j = request.json
    language = detect_language(j)
    # The detected language is returned under "data_len"; the key is kept unchanged for existing clients.
    return json.dumps({"status":"ok", "got_data":j, "data_len":language})
    
@app.route("/det_lang", methods=["POST"])
def det_lang(query=None):
    """
    Detect the language of ``query``.

    ``query`` is optional because Flask calls the view for this URL rule
    without arguments; in that case the text is taken from the JSON body of
    the current request. The function is also called directly by other code
    in this notebook, which indexes the result, so it returns a plain dict.

    @param query: Text to analyse; defaults to the request's JSON body
    @type query: str

    @return: {"status": "ok", "got_data": <text>, "language": <detected language>}
    @rtype: dict
    """
    if query is None:
        # NOTE(review): assumes the body is the bare text (a JSON string) — confirm with clients.
        query = request.json
    language = detect_language(query)
    # NOTE(review): returning a dict from a view requires Flask >= 1.1.
    return {"status":"ok", "got_data":query, "language":language}
    

@app.route("/stemm", methods=["POST"])
def stemm_serv(query=None):
    """
    Detect the language of ``query`` and stem it.

    ``query`` is optional because Flask calls the view for this URL rule
    without arguments; in that case the text is taken from the JSON body of
    the current request. The function is also called directly by other code
    in this notebook, which indexes the result, so it returns a plain dict.

    @param query: Text to stem; defaults to the request's JSON body
    @type query: str

    @return: {"lang": <detected language>, "text": <input text>,
        "stemmed_text": <stems joined with single spaces>}
    @rtype: dict
    """
    if query is None:
        # NOTE(review): assumes the body is the bare text (a JSON string) — confirm with clients.
        query = request.json
    language = det_lang(query)["language"]
    stemmed = " ".join(str(stem) for stem in stemm_analyze(query, language))
    # NOTE(review): returning a dict from a view requires Flask >= 1.1.
    return {"lang":language, "text":query, "stemmed_text":stemmed}
    
@app.route("/rev_ind", methods=["POST"])
def rev_ind(query=None):
    """
    Stem ``query`` and report the length of every stemmed word.

    ``query`` is optional because Flask calls the view for this URL rule
    without arguments; in that case the text is taken from the JSON body of
    the current request.

    @param query: Text to process; defaults to the request's JSON body
    @type query: str

    @return: JSON string with "idxs" (one length per stemmed word) and
        "stemmed_text" (the space-joined stems)
    @rtype: str
    """
    if query is None:
        # NOTE(review): assumes the body is the bare text (a JSON string) — confirm with clients.
        query = request.json
    stemmed_text = stemm_serv(query)["stemmed_text"]
    # stemmed_text is one space-joined string; split it so the loop sees words
    # rather than single characters.
    idx = [len(word) for word in stemmed_text.split()]
    return json.dumps({"idxs":idx, "stemmed_text":stemmed_text}, ensure_ascii=False)

@app.route("/logging", methods=["POST"])
def loggs():
    """
    Return the length of every item of the posted "stemmed_text" value.

    @return: JSON string with "idxs" (the lengths) and "tstemmed_textext"
        (the posted "stemmed_text" value, echoed back)
    @rtype: str
    """
    payload = request.json
    # NOTE(review): if "stemmed_text" arrives as a string this iterates its characters — confirm the payload type.
    lengths = [len(item) for item in payload["stemmed_text"]]
    # NOTE(review): the key "tstemmed_textext" looks garbled; kept byte-for-byte for existing clients.
    return json.dumps({"idxs":lengths, "tstemmed_textext":payload["stemmed_text"]})

    
@app.route('/about')
def about():
    """Return the static text of the about page."""
    page_text = 'The about page'
    return page_text
        

@app.route('/ranging', methods=["POST"])
def ranging():
    """
    Build a Document from every dictionary of the posted JSON list and return
    the documents' ``to_dict()`` results.

    NOTE(review): ``Document`` is not defined in the visible cells of this
    notebook, and no re-ordering of the documents happens here.

    Returns:
    JSON string with the list of document dictionaries
    """
    payload = request.json

    documents = []
    for fields in payload:
        documents.append(Document(**fields))

    serialised = []
    for document in documents:
        serialised.append(document.to_dict())

    return json.dumps(serialised)
    

@app.route('/return_page', methods=["POST"])
def return_page():
    """
    Build a Document from every dictionary of the posted JSON list and call
    ``to_dict()`` on each of them.

    NOTE(review): the converted documents are not used; the view always
    answers with the literal string "html". ``Document`` is not defined in
    the visible cells of this notebook.

    Returns:
    The string "html"
    """
    payload = request.json

    documents = []
    for fields in payload:
        documents.append(Document(**fields))

    result = []
    for document in documents:
        result.append(document.to_dict())

    return "html"

@app.route('/indexator',  methods=["POST"])
def add_to_index():
    """
    Add the posted JSON document to the inverted index and then to the
    forward index.

    Returns:
    The string "added to index"
    """
    payload = request.json
    # The inverted index is updated first, then the forward index.
    for add_to in (add_inverted_index, add_forward_index):
        add_to(payload)

    return "added to index"


@app.route("/search", methods=["POST"])
def search():
    """
    Run search_one() on the "search_word" value of the posted JSON body.

    @return: JSON string with the list returned by search_one()
    @rtype: str
    """
    return json.dumps(search_one(request.json["search_word"]))


@app.route('/snippets', methods=["POST"])
def get_snippets(q=None):
    """
    Return the texts of the documents matching a query.

    @param q: Query string. When omitted (as happens when Flask calls the
        view), the query is read from the "search_words" value of the posted
        JSON body.
    @type q: str

    Returns:
    JSON string holding a list of one-entry dictionaries, each mapping a
    document id (as a string) to the list of ``text`` values of the rows of
    ``df`` whose ``id`` equals that document id.
    """
    # Both entry paths share the same lookup; only the source of the query differs.
    search_words = request.json["search_words"] if q is None else q

    dicts = []
    for doc_id in search_one(search_words):
        val = list(df[df.id==doc_id].text.values)
        dicts.append({str(doc_id): val})
    return json.dumps(dicts, ensure_ascii=False)

In [62]:
if __name__ == "__main__":
    # Start Flask's built-in server with the host and port below.
    server_options = {"host": '0.0.0.0', "port": 13502}
    app.run(**server_options)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:13502/ (Press CTRL+C to quit)
127.0.0.1 - - [12/Nov/2018 00:11:07] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:08] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [12/Nov/2018 00:11:09] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:10] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:13] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [12/Nov/2018 00:11:13] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:14] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:17] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [12/Nov/2018 00:11:18] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:19] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:26] "POST / HTTP/1.1" 200 -
127.0.0.1 - - [12/Nov/2018 00:11:27] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:28] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [12/Nov/2018 00:11:31] "POST / HTTP/1.1" 200

In [77]:
# Sizes of the two indexes: number of distinct tokens, then number of stored documents.
print(len(inverted_index), len(forward_index), sep="\n")

In [82]:
# Display every token currently present in the inverted index.
inverted_index.keys()

In [83]:
# NOTE(review): neither `add_forw_index` nor `rrroww` is defined in the visible cells of this
# notebook (the helper defined above is named `add_forward_index`) — confirm where they come
# from, otherwise this cell raises NameError on a fresh kernel.
add_forw_index(rrroww)

Unnamed: 0,id,text
0,1,fdfdf
0,3334,fdffssgddfdfdf
