In [76]:
#!pip install pymorphy2
from pymorphy2 import MorphAnalyzer
import re
from tqdm import tqdm_notebook
import torch
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem.snowball import RussianStemmer
import pickle
import os
import pandas as pd
from tqdm import tqdm
import json

In [14]:
import json
with open("team_keywords.json", "r") as fp_:
  json_ = json.load(fp_)

In [17]:
morph = MorphAnalyzer()
for team in json_.keys():
  json_[team] = [morph.parse(token)[0].normal_form for token in json_[team]]

In [23]:
def lemmatize_texts(texts):
    tokenized_text = []
    for text in tqdm_notebook(texts):
        #tokens = nltk.wordpunct_tokenize(text.lower())
        prog = re.compile(r'[а-яa-z]+')
        tokens = prog.findall(str(text).lower())
        res=[morph.parse(token)[0].normal_form for token in tokens]
        tokenized_text.append(res)
    return tokenized_text

In [44]:
def intersection(lemmatized_texts, json_):
  intersection_list = []
  for lemm in lemmatized_texts:

    intersection={}
    for team in json_.keys():
      intersection[team] = len(list(set(lemm) & set(json_[team])))/len(json_[team])
    intersection_list.append(intersection)
  return intersection_list

In [45]:
def top(intersection_list, top_=3):
  topn = []
  for intersection in intersection_list:
    intersection = {x:y for x,y in intersection.items() if y!=0}
    topn.append(sorted(intersection, key=intersection.get, reverse=True)[:top_])
  return topn

In [46]:
#!pip install rutermextract
def extract_keywords(text):
    from rutermextract import TermExtractor
    term_extractor = TermExtractor()
    return [term.normalized for term in term_extractor(text)]

In [57]:
VOCAB_SIZE = 5000
stemer = RussianStemmer()
stem_cache = {}
regex = re.compile('[^а-яА-Я ]')
tokenizer = TweetTokenizer()

In [58]:
class senNet(torch.nn.Module):
    def __init__(self):
        super(senNet,self).__init__()
        self.fc1=torch.nn.Linear(VOCAB_SIZE,125)
        self.act1=torch.nn.ReLU()
        self.fc2=torch.nn.Linear(125,25)
        self.act2=torch.nn.ReLU()
        self.fc3=torch.nn.Linear(25,2)
        
    
    def forward(self,x):
        x=self.fc1(x)
        x=self.act1(x)
        x=self.fc2(x)
        x=self.act2(x)
        x=self.fc3(x)
        return x

sen_net = senNet()

In [59]:
with open("model_v1.pk", "rb") as f:
  model = pickle.load(f)

with open("vocab.pkl", "rb") as pkl:
  vocab = pickle.load(pkl)

In [60]:
def tweet_to_vector(tweet, show_unknowns=False):
    vector = np.zeros(VOCAB_SIZE, dtype=np.int_)
    for token in tokenizer.tokenize(tweet):
        stem = get_stem(token)
        idx = token_2_idx.get(stem, None)
        if idx is not None:
            vector[idx] = 1
        elif show_unknowns:
            pass
            #print("Unknown token: {}".format(token))
    return vector

In [62]:
def get_stem(token):
    stem = stem_cache.get(token, None)
    if stem:
        return stem
    token = regex.sub('', token).lower()
    stem = stemer.stem(token)
    stem_cache[token] = stem
    return stem

token_2_idx = {vocab[i] : i for i in range(VOCAB_SIZE)}

In [66]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [70]:
def FLAIR_MODEL(text):

    with open(os.path.join('team_keywords.json'),'r') as f:
        team_names_keywords = json.load(f)

    with open(os.path.join('grouped_team_keywords.json'),'r') as f:
        groups_keywords = json.load(f)

    team_names_keywords_vects = [' '.join(team_names_keywords[team]) for team in team_names_keywords]
    
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
    from flair.data import Sentence
    from collections import OrderedDict
    from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

    glove_embedding = WordEmbeddings(os.path.join("../content/gdrive/My Drive/ru-wiki-fasttext-300d-1M"))

    document_embeddings = DocumentPoolEmbeddings([glove_embedding])
    
    import string
    def text_embeddings(text):
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        return sentence.embedding.numpy()
    
    text_vec = text_embeddings(text)
    
    teams_vecs = []
    for team in tqdm(team_names_keywords_vects):
        teams_vecs.append(text_embeddings(team))
    
    predictions = {}
    destinations = {}
    for i,team in enumerate(teams_vecs):
        dest = euclidean_distances([text_vec], [team])[0]
        destinations[list(team_names_keywords.keys())[i]] = dest
    scores = OrderedDict(sorted(destinations.items(), key=lambda kv: kv[1]))
    predictions['euclid_team'] = list(scores.keys())[0]
    predictions['euclid_score'] = scores[list(scores.keys())[0]][0]

    destinations = {}
    for i,team in enumerate(teams_vecs):
        dest = cosine_similarity([text_vec], [team])[0]
        destinations[list(team_names_keywords.keys())[i]] = dest
    scores = OrderedDict(sorted(destinations.items(), key=lambda kv: kv[1],reverse=True))
    predictions['cosine_team'] = list(scores.keys())[0]
    predictions['cosine_score'] = scores[list(scores.keys())[0]][0]
    predictions['text'] = text
    
    return predictions




def BERT_MODEL(text):

    with open(os.path.join('team_keywords.json'),'r') as f:
        team_names_keywords = json.load(f)

    with open(os.path.join('grouped_team_keywords.json'),'r') as f:
        groups_keywords = json.load(f)

    team_names_keywords_vects = [' '.join(team_names_keywords[team]) for team in team_names_keywords]

    from sentence_transformers import SentenceTransformer, util
    
    from collections import OrderedDict
    from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity

    embedder = SentenceTransformer(os.path.join('../content/gdrive/My Drive/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distiluse-base-multilingual-cased.zip'))

    teams_vecs = embedder.encode(team_names_keywords_vects)

    text_vec = embedder.encode(text)[0]
    
    predictions = {}
    destinations = {}
    for i,team in enumerate(teams_vecs):
        dest = euclidean_distances([text_vec], [team])[0]
        destinations[list(team_names_keywords.keys())[i]] = dest
    scores = OrderedDict(sorted(destinations.items(), key=lambda kv: kv[1]))
    predictions['euclid_team'] = list(scores.keys())[0]
    predictions['euclid_score'] = scores[list(scores.keys())[0]][0]

    destinations = {}
    for i,team in enumerate(teams_vecs):
        dest = cosine_similarity([text_vec], [team])[0]
        destinations[list(team_names_keywords.keys())[i]] = dest
    scores = OrderedDict(sorted(destinations.items(), key=lambda kv: kv[1],reverse=True))
    predictions['cosine_team'] = list(scores.keys())[0]
    predictions['cosine_score'] = scores[list(scores.keys())[0]][0]
    predictions['text'] = text
    
    return predictions

In [None]:
#!pip install flask-ngrok
#!pip install flask_cors
#!pip install flair
#!pip install sentence_transformers

from flask_ngrok import run_with_ngrok
from flask import Flask
from flask import request
from flask_cors import CORS
import collections
from flask import jsonify

app = Flask(__name__)
run_with_ngrok(app)
CORS(app)

@app.route("/")
def home():

  text = request.args.get('text')
  topn = request.args.get('topn')
  lemmatized_texts = lemmatize_texts([text])

  d = {}
  exctracted_keywords = extract_keywords(" ".join(lemmatized_texts[0]))
  d["keywords"] = exctracted_keywords

  intersection={}
  for team in json_.keys():
    intersection[team] = len(list(set(exctracted_keywords) & set(json_[team])))/len(json_[team])

  intersection = {x:y for x,y in intersection.items() if y!=0}
  teams = sorted(intersection, key=intersection.get, reverse=True)[:int(topn)]
  d["teams"] = teams

  tweet = text
  tweet_vector = tweet_to_vector(tweet, True)
  tweet_vector = torch.from_numpy(tweet_vector).float()
  positive_prob = model.forward(tweet_vector)[1]
  #print('Original tweet: {}'.format(tweet))
  #print('P(positive) = {:.5f}. Result: '.format(positive_prob), 'Positive' if positive_prob > 0.5 else 'Negative')
  tonality = 'Positive' if positive_prob > 0.5 else 'Negative'
  d["tonality"] = tonality

  flair_preds = FLAIR_MODEL(text)
  d["FLAIR"] = {}
  d["FLAIR"]["euclid_team"] = flair_preds["euclid_team"]
  d["FLAIR"]["cosine_team"] = flair_preds["cosine_team"]
  bert_preds = BERT_MODEL(text)
  d["BERT"] = {}
  d["BERT"]["euclid_team"] = bert_preds["euclid_team"]
  d["BERT"]["cosine_team"] = bert_preds["cosine_team"]

  return jsonify(d)
  
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://9bf34562b983.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

  return _compile(pattern, flags).split(string, maxsplit)
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL



Unknown token: бочке
Unknown token: дегтя
Unknown token: техподдержки
Unknown token: сбера
Unknown token: сбера
Unknown token: пос-терминалы
Unknown token: интернет-банкинг
Unknown token: выписка
Unknown token: неформативна
Unknown token: назначение
Unknown token: платежа
Unknown token: просмотреть
Unknown token: платежку
Unknown token: переслать
Unknown token: предъявить
Unknown token: сбера
Unknown token: платежку
Unknown token: неоперативно


100%|██████████| 42/42 [00:00<00:00, 1579.86it/s]
127.0.0.1 - - [01/Aug/2020 17:12:32] "[37mGET /?text=Здравствуйте!%20В%20этой%20прекрасной%20бочке%20меда%20уже%20давно%20имеется%20ложка%20дегтя,%20о%20которой%20пишу%20и%20говорю%20в%20телефон%20техподдержки%20сбера%20уже%20второй%20год:%20сделайте,%20пожалуйста,%20полную%20историю%20операций%20по%20счетам.%20Сейчас%20в%20историю%20попадают%20только%20переводы%20между%20своими%20счетами%20и%20на%20карты%20сбера%20и%20других%20банков.%20Не%20попадают%20операции%20оплаты%20в%20магазинах%20через%20пос-терминалы%20и%20через%20интернет-банкинг.%20Конечно%20для%20каждого%20счета%20имеется%20отчет%20-%20выписка%20за%20период.%20Но%20она%20абсолютно%20неформативна:%20отсутствует%20назначение%20платежа;%20невозможно%20просмотреть%20платежку,%20переслать%20ее%20(как%20это%20сделано%20в%20разделе%20История).%20Т.е.%20при%20необходимости%20предъявить%20кому-то%20факт%20оплаты%20невозможно%20поднять%20документ.%20Придется%20идти%20в%20отделение%2

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

  return _compile(pattern, flags).split(string, maxsplit)
[2020-08-01 17:12:54,158] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.6/dist-packages/flask_cors/extension.py", line 161, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/usr/local/lib/python3.6/dist-packages/flask/_compat.py", line 39, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/flask/app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.6/dist-pa


