## Importing libraries

In [None]:
#!pip install pymystem3
#!pip install future
#!pip install beautifulsoup4
#!pip install ufal.udpipe
from __future__ import print_function
from __future__ import division
from future import standard_library
import sys
import requests
from pymystem3 import Mystem
from bs4 import BeautifulSoup

In [None]:
import gensim
import pandas as pd
import numpy as np
import re

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# NLTK's Russian stop-word list with a few hand-picked tokens appended to it.
stop_words = stopwords.words('russian') + [
    'что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на',
]


#### Getting mapping for different tag systems

In [None]:
import requests
import re

# Pinned revision of the tag-conversion table: each useful line holds a source
# tag followed by the tag it maps to, separated by whitespace.
url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'

# Fail loudly on a hung connection or an HTTP error instead of silently
# parsing an error page into a garbage mapping.
r = requests.get(url, timeout=30)
r.raise_for_status()

mapping = {}
for pair in r.text.splitlines():
    # str.split() with no arguments splits on any run of whitespace and drops
    # leading/trailing blanks, so no regex is needed here.
    pair = pair.split()
    if len(pair) > 1:
        mapping[pair[0]] = pair[1]

print(mapping)

#### Loading the pretrained Russian word2vec model

In [None]:
import gensim.downloader as api
import gensim

# online loading
#ruscorpora_model = api.load("word2vec-ruscorpora-300")

# local upload
# Later cells refer to the embeddings as `ruscorpora_model`, so bind them under
# that name; otherwise those cells raise NameError on a fresh kernel.
ruscorpora_model = gensim.models.KeyedVectors.load_word2vec_format('ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz', binary=False)
# Backward-compatible alias for the name this cell used to define. NOTE: the
# Keras cell further down rebinds `model` to the neural network.
model = ruscorpora_model

## Importing and preprocessing dataset

In [None]:
# Load the tab-separated dataset; later cells read its `text_raw` and
# `requirement` columns.
df = pd.read_csv("../data/hh_dataset.csv", sep="\t")

In [None]:
# Peek at the raw text of the first row (label 0 of the default integer index).
df.loc[0, "text_raw"]

In [None]:
def getridoff_stopwords(text):
    """
    Tokenize ``text`` with ``split_text`` and drop every token listed in ``stop_words``.

    :param str text: raw input string
    :return list of the remaining tokens, in their original order
    """
    return [token for token in split_text(text) if token not in stop_words]

In [None]:
# Shared analyzer instance, created lazily by _get_mystem().
_MYSTEM = None


def _get_mystem():
    """Return the shared Mystem instance, creating it on first use."""
    global _MYSTEM
    if _MYSTEM is None:
        _MYSTEM = Mystem()
    return _MYSTEM


def tag_mystem(text='Текст нужно передать функции в виде строки!', postags=True):
    '''
    Lemmatize ``text`` with Mystem and (optionally) attach a converted POS tag.

    Tokens without a usable analysis (punctuation, whitespace, words Mystem
    cannot analyse) and tokens whose lemma is in ``stop_words`` are skipped.

    :param str text: input string
    :param bool postags: if True return ``lemma_TAG`` strings, otherwise bare lemmas
    :return list of strings, one per kept token, in text order
    '''
    processed = _get_mystem().analyze(text)
    kept = []
    for w in processed:
        try:
            first = w["analysis"][0]
            lex = first["lex"]
            gr = first["gr"]
        except (KeyError, IndexError):
            # KeyError: no "analysis" entry at all (e.g. punctuation, spaces).
            # IndexError: "analysis" is present but empty (token unknown to Mystem).
            continue
        if lex in stop_words:
            continue
        lemma = lex.lower().strip()
        # The POS is the first comma-separated field of the grammar string,
        # cut at '=' if one is present.
        pos = gr.split(',')[0].split('=')[0].strip()
        # Convert the tag; fall back to 'X' when it is missing from the mapping.
        kept.append((lemma, mapping.get(pos, 'X')))
    if postags:
        return [lemma + '_' + tag for lemma, tag in kept]
    # Return the lemma itself rather than re-splitting the tagged string, so a
    # lemma that contains '_' is not truncated.
    return [lemma for lemma, _ in kept]

In [None]:
def clean_html(text):
    """
    Strip HTML markup from ``text`` and return the plain text.

    A value the parser rejects with TypeError (e.g. ``None``) yields an empty
    string and a printed notice. Any other exception propagates to the caller
    instead of being silently swallowed.

    :param str text: HTML markup
    :return str text: text content without tags, or "" for unusable input
    """
    try:
        soup = BeautifulSoup(text, "html")
    except TypeError:
        # Only the "wrong argument type" case is treated as best-effort.
        print("Exception in  clean_html. NoneType argument.")
        return ""

    return soup.text

In [None]:
def split_text(text):
    """Return, in order, every run of word characters and apostrophes found in ``text``."""
    word_pattern = re.compile(r"[\w']+")
    return word_pattern.findall(text)

### !!! Building the target mask (ideally this should be precomputed when the dataset is created)

In [None]:
def get_mask(text, req):
    """
    Build a per-token 0/1 mask marking where ``req`` occurs inside ``text``.

    Both strings are lemmatized with ``tag_mystem(..., postags=False)``. The
    mask always has exactly one float per lemmatized token of ``text``: 1. for
    tokens inside a (non-overlapping, left-to-right) occurrence of the
    lemmatized ``req``, 0. elsewhere.

    :param str text: document text
    :param str req: phrase to locate in the document
    :return list of floats, same length as the lemmatized ``text``
    """
    # FOR TRAINING PURPOSES
    # SHOULD HAVE BEEN DONE PREVIOUSLY!
    # WHILE CREATING A DATASET
    text = tag_mystem(text, postags=False)
    req = tag_mystem(req, postags=False)

    mask = [0.] * len(text)
    span = len(req)
    if span == 0:
        # An empty query matches nothing; without this guard the slice
        # comparison below would be true at every position.
        return mask

    i = 0
    # Stop once a full-length window no longer fits, so no extra entry is emitted.
    while i + span <= len(text):
        if text[i:i + span] == req:
            mask[i:i + span] = [1.] * span
            # Jump exactly past the match so the next token is not skipped.
            i += span
        else:
            i += 1
    return mask

In [None]:
def get_training_sample(model, text, query):
    """
    Gets 1 sample of training data

    :param model: word-vector lookup indexed by ``lemma_TAG`` strings that
        raises KeyError for unknown words (presumably a gensim KeyedVectors)
    :param str text: document text
    :param str query: phrase whose occurrences in ``text`` are marked in the mask
    :return (vec_list, mask): one vector per tagged token of ``text``, and the
        0/1 mask produced by ``get_mask(text, query)``
    """
    tagged_list = tag_mystem(text)
    mask = get_mask(text, query)

    # Size the out-of-vocabulary placeholder from the model when it exposes
    # `vector_size`; fall back to the previously hard-coded 300.
    dims = getattr(model, "vector_size", 300)

    # Look every token up in the embedding model.
    vec_list = []
    for word in tagged_list:
        try:
            vec_list.append(model[word])
        except KeyError:
            # Only "word not in vocabulary" is handled; other errors surface.
            print("Word " + word + " isn't in vocab. Embeding as zeros")
            vec_list.append(np.zeros(dims))
    return vec_list, mask

In [None]:
# Smoke test on a toy sentence: embed its tokens and mask the query phrase.
vvec_list, mmask = get_training_sample(ruscorpora_model, "Мартышка бежала по берегу наполненому змеями", "бежала по берегу")

In [None]:
# Length of the first token's vector.
len(vvec_list[0])

In [None]:
# Show the mask as a NumPy array.
np.array(mmask)

In [None]:
# Scratch check: a length-5 vector of zeros.
np.zeros(5)

In [None]:
# `requirement` value of the first row (positional).
req = df["requirement"].iloc[0]

In [None]:
# `text_raw` value of the first row (positional).
text = df["text_raw"].iloc[0]

In [None]:
# Strip HTML markup. This rebinds `text`, so re-running only this cell feeds
# the already-cleaned string back into clean_html.
text = clean_html(text)

## Implementing the Keras Model

In [None]:
from random import random
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional


In [None]:
# create a sequence of classification instance
def get_sequence(model, text, query, n_timesteps=500, dims=300):
    """
    Gets 1 sequence of training data fully prepared for NN

    The token vectors and the mask returned by ``get_training_sample`` are
    zero-padded, or truncated, to exactly ``n_timesteps`` entries, so the
    result always has a fixed shape.

    :param model: word-vector model forwarded to ``get_training_sample``
    :param str text: document text
    :param str query: phrase to mark inside ``text``
    :param int n_timesteps: max number of words in one document
    :param int dims: dimensionality of each word vector
    :return X y: arrays of shape (1, n_timesteps, dims) and (1, n_timesteps, 1)
    """
    vectors, mask = get_training_sample(model, text, query)

    # Start from all-zero buffers and copy in what fits. This covers padding,
    # truncation and the empty-document case with one code path.
    X = np.zeros((n_timesteps, dims))
    y = np.zeros((n_timesteps, 1))

    n_vec = min(len(vectors), n_timesteps)
    if n_vec:
        X[:n_vec] = np.asarray(vectors[:n_vec])

    # The mask is sized independently so a length mismatch with the vectors
    # cannot break the reshape below.
    n_mask = min(len(mask), n_timesteps)
    if n_mask:
        y[:n_mask, 0] = np.asarray(mask[:n_mask], dtype=float)

    X = X.reshape(1, n_timesteps, dims)
    y = y.reshape(1, n_timesteps, 1)
    return X, y

**TODO:** try using a Keras `Embedding` layer instead.

In [None]:
# Demo: build one padded (X, y) pair for a toy sentence and query.
X, y = get_sequence(ruscorpora_model, "Лужа и курица пошли гулять", "курица пошли", n_timesteps=500, dims=300)

In [None]:
# dimensionality of each word-vector
dims = 300
# number of timesteps = max words in one document
n_timesteps = 5
# number of LSTM units per direction; the Bidirectional wrapper (default merge
# mode) concatenates the forward and backward outputs
output_dim = 500*2

# Toy example used for both fitting and evaluation. get_sequence needs the
# word-vector model, a text and a query; the earlier calls passed only
# (n_timesteps, dims) and therefore failed with a TypeError.
sample_text = "Лужа и курица пошли гулять"
sample_query = "курица пошли"

# define LSTM
model = Sequential()
model.add(Bidirectional(LSTM(output_dim, return_sequences=True), input_shape=(n_timesteps, dims)))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# The sample is deterministic, so build it once and reuse it.
# `model` now names the Keras network, so the embeddings are passed as
# `ruscorpora_model`.
X, y = get_sequence(ruscorpora_model, sample_text, sample_query, n_timesteps=n_timesteps, dims=dims)

# train LSTM
for epoch in range(2):
    # fit model for one epoch on this sequence
    model.fit(X, y, epochs=1, batch_size=1, verbose=2)

# evaluate LSTM
# `predict_classes` is no longer available in current Keras; for a sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
yhat = (model.predict(X, verbose=0) > 0.5).astype("int32")
for i in range(n_timesteps):
    print('Expected:', y[0, i], 'Predicted', yhat[0, i])