In [None]:
import os
import re
import json
import pickle

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout
from keras.layers import SpatialDropout1D, Conv1D, MaxPooling1D
from gensim.models import Word2Vec
from keras.utils import np_utils
from keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
import seaborn as sns
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import model_from_json
from bs4 import BeautifulSoup
import jsonpickle
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate
from keras.utils import to_categorical
from keras.models import load_model

from config_src import config
from document import Document
import text_processor

pd.options.display.max_rows = 100
pd.set_option('display.max_columns', None)

In [None]:
# Input locations, all resolved relative to directories taken from config_src.config.
# Gzipped word2vec vectors in text format (loaded further down with binary=False).
w2v_path = os.path.join(config.data_dir, "word_embedding/ruwikiruscorpora-nobigrams_upos_skipgram_300_5_2018.vec.gz")
# Tab-separated Jooble dataset. The labelling loop near the end of the notebook
# writes its periodic checkpoints to a file with this same directory and name.
jooble_data_path = os.path.join(config.src_dir, "labeled_jooble_data", "dataset_jooble_labeled.csv")
# Tab-separated HeadHunter dataset.
hh_data_path = os.path.join(config.headhunter_dir, "hh_dataset.csv")

## Save index

In [None]:
dataset_jooble = pd.read_csv(jooble_data_path, sep="\t")

In [None]:
dataset_hh = pd.read_csv(hh_data_path , sep="\t")

In [None]:
dataset_hh.head()

In [None]:
dataset_hh.drop_duplicates(["text"], inplace=True)
print("Final size of dataset =", len(dataset_hh))
dataset_hh.reset_index(drop=True, inplace=True)
dataset_hh.info()

In [None]:
dataset_jooble.info()

In [None]:
def add_inverted_index(document, inverted_index):
    """Register every token of a document in the inverted index.

    The normalized text is split on single spaces and the document id is
    appended to the posting list of each resulting token, once per
    occurrence (a token repeated in the text yields repeated ids).

    :param Document document: preprocessed document; its ``text_normalized``
        and ``id`` attributes are read.
    :param dict inverted_index: mapping token -> list of document ids;
        modified in place.

    :return dict inverted_index: the same mapping, updated with the document.
    """
    terms = document.text_normalized.split(" ")
    for term in terms:
        # setdefault creates the posting list the first time a token is seen.
        inverted_index.setdefault(term, []).append(document.id)
    return inverted_index



def create_index_from_dataframe(dataframe, forward_index=None,
                               inverted_index=None, documents_id=None):
    """Build (or extend) the forward and inverted indexes from a dataframe.

    Side effect: ``dataframe`` itself is de-duplicated on the "text" column
    and re-indexed from 0, in place.

    :param pandas.DataFrame dataframe: must provide the columns "id", "title",
        "text", "lang_text", "title_normalized", "text_normalized" and "url".
    :param dict forward_index: existing mapping str(document id) -> Document to
        extend; a new dict is created when omitted.
    :param dict inverted_index: existing mapping token -> list of document ids
        to extend; a new dict is created when omitted.
    :param list documents_id: existing list of str document ids to extend;
        a new list is created when omitted.

    :return tuple: (forward_index, inverted_index, documents_id).
    """
    # Create the containers per call: mutable default arguments are evaluated
    # once and would silently carry documents over from one call to the next.
    if forward_index is None:
        forward_index = {}
    if inverted_index is None:
        inverted_index = {}
    if documents_id is None:
        documents_id = []

    dataset = dataframe
    dataset.drop_duplicates(["text"], inplace=True)
    dataset.reset_index(drop=True, inplace=True)

    # Set mirror of documents_id so the "already indexed?" check does not scan
    # the whole list for every row.
    known_ids = set(documents_id)

    for index, row in dataset.iterrows():
        doc_id = row.loc["id"]
        title = row.loc["title"]
        text = row.loc["text"]
        lang = row.loc["lang_text"]
        title_normalized = row.loc["title_normalized"]
        text_normalized = row.loc["text_normalized"]
        url = row.loc["url"]

        document = Document(doc_id, title, text, lang,
                            title_normalized, text_normalized, url)
        try:
            doc_key = str(document.id)
            if doc_key not in known_ids:
                known_ids.add(doc_key)
                documents_id.append(doc_key)
                forward_index[doc_key] = document
                # Raises AttributeError when text_normalized is not a string.
                # NOTE(review): at that point the id and the Document are
                # already registered above, so a "bad" row stays in
                # documents_id / forward_index without postings.
                inverted_index = add_inverted_index(document, inverted_index)
        except AttributeError:
            print("Bad vacancy index =", index)
            continue

    return forward_index, inverted_index, documents_id


def save_index(path, forward_index, inverted_index, documents_id,
               forward_file="forward_index",
               inverted_file="inverted_index",
               id_file="documents_id"):
    """Write the three index structures to ``<path>/<name>.json``.

    Each object is first serialized with ``jsonpickle.encode`` and the
    resulting string is then written with ``json.dump``, so every file holds
    a single JSON string that itself contains the jsonpickle payload.

    :param str path: path to folder
    :param dict forward_index: forward index to store.
    :param dict inverted_index: inverted index to store.
    :param list of str documents_id: document ids to store.

    :param str forward_file: file name for forward_index without extension.
    :param str inverted_file: file name for inverted_index without extension.
    :param str id_file: file name for documents_id without extension.
    """
    # (file stem, object, extra open() kwargs, extra json.dump() kwargs).
    # The two indexes are written as UTF-8 with non-ASCII characters kept
    # verbatim; the id list uses the defaults of open() and json.dump().
    targets = (
        (forward_file, forward_index, {"encoding": "utf8"}, {"ensure_ascii": False}),
        (inverted_file, inverted_index, {"encoding": "utf8"}, {"ensure_ascii": False}),
        (id_file, documents_id, {}, {}),
    )

    for stem, payload, open_kwargs, dump_kwargs in targets:
        target_path = os.path.join(path, stem + ".json")
        with open(target_path, 'w', **open_kwargs) as outfile:
            encoded = jsonpickle.encode(payload)
            json.dump(encoded, outfile, **dump_kwargs)

        
def clean_html(text):
    """
    Strip HTML markup from a text and return the remaining plain text.

    :param str text: HTML fragment to clean.
    :return str text: text content without tags, or "" when the input
        could not be parsed (a message is printed in that case).
    """
    # NOTE(review): the parser is requested as "html" rather than a specific
    # builder name, so bs4 chooses among the installed HTML parsers - confirm
    # the output is the same on every machine that rebuilds the index.
    try:
        text = BeautifulSoup(text, "html").text
    except Exception:
        # Best effort for non-text cells (e.g. None / NaN). Catching Exception
        # rather than using a bare except keeps KeyboardInterrupt and
        # SystemExit working while this runs over a whole dataset.
        print("Exception in  clean_html. NoneType argument.")
        return ""

    return text

def create_index_from_dataframe_hh(dataframe, forward_index=None,
                               inverted_index=None, documents_id=None):
    """Build (or extend) the indexes from a HeadHunter dataframe.

    Besides the common document fields this variant strips HTML from "text",
    combines the normalized requirement / responsibility columns and parses
    the professional-area labels.

    Side effect: ``dataframe`` itself is de-duplicated on the "text" column
    and re-indexed from 0, in place.

    :param pandas.DataFrame dataframe: must provide the columns "id", "title",
        "text", "lang_text", "title_normalized", "text_normalized", "url",
        "requirement_norm", "responsibility_norm" and "profarea_names".
    :param dict forward_index: existing mapping str(document id) -> Document to
        extend; a new dict is created when omitted.
    :param dict inverted_index: existing mapping token -> list of document ids
        to extend; a new dict is created when omitted.
    :param list documents_id: existing list of str document ids to extend;
        a new list is created when omitted.

    :return tuple: (forward_index, inverted_index, documents_id).
    """
    # Create the containers per call: mutable default arguments are evaluated
    # once and would silently carry documents over from one call to the next.
    if forward_index is None:
        forward_index = {}
    if inverted_index is None:
        inverted_index = {}
    if documents_id is None:
        documents_id = []

    dataset = dataframe
    dataset.drop_duplicates(["text"], inplace=True)
    dataset.reset_index(drop=True, inplace=True)

    # Set mirror of documents_id so the "already indexed?" check does not scan
    # the whole list for every row.
    known_ids = set(documents_id)

    for index, row in dataset.iterrows():
        doc_id = row.loc["id"]
        title = row.loc["title"]
        text = clean_html(row.loc["text"])
        lang = row.loc["lang_text"]
        title_normalized = row.loc["title_normalized"]
        text_normalized = row.loc["text_normalized"]
        url = row.loc["url"]

        # Join the two normalized texts with a space so the last token of one
        # does not fuse with the first token of the other, and skip missing
        # cells instead of embedding the literal "nan" that str(NaN) yields.
        requirement_normalized = " ".join(
            part
            for part in (row.loc["requirement_norm"], row.loc["responsibility_norm"])
            if isinstance(part, str)
        )

        profarea_names = row["profarea_names"]
        if isinstance(profarea_names, str):
            # The cell holds the text form of a list, e.g. "['a', 'b']":
            # split it on the item separator, then drop brackets and quotes.
            labels_norm = []
            labels = []
            for spec in profarea_names.lower().split("', "):
                # Raw string: same pattern as before, without relying on
                # Python leaving the invalid "\[" escape untouched.
                spec = re.sub(r'[\[\'\]]', '', spec)
                labels.append(spec)
                labels_norm.append(text_processor.normalize_text(spec.strip()))

            prof_area_normalized = " ".join(labels_norm)
            prof_area = " ".join(labels)
        else:
            prof_area_normalized = ""
            prof_area = ""

        document = Document(doc_id, title, text, lang,
                            title_normalized, text_normalized, url,
                            requirement_normalized, prof_area, prof_area_normalized)
        try:
            doc_key = str(document.id)
            if doc_key not in known_ids:
                known_ids.add(doc_key)
                documents_id.append(doc_key)
                forward_index[doc_key] = document
                # Raises AttributeError when text_normalized is not a string.
                # NOTE(review): at that point the id and the Document are
                # already registered above, so a "bad" row stays in
                # documents_id / forward_index without postings.
                inverted_index = add_inverted_index(document, inverted_index)
        except AttributeError:
            print("Bad vacancy index =", index)
            continue

    return forward_index, inverted_index, documents_id


In [None]:
%%time
forward_index, inverted_index, documents_id = create_index_from_dataframe_hh(dataset_hh)

In [None]:
# NOTE(review): this rebinds forward_index / inverted_index / documents_id to the
# result for dataset_jooble only. The values returned by the
# create_index_from_dataframe_hh(dataset_hh) call in the previous cell are not
# passed in here, so they are no longer reachable through these names.
# Confirm that discarding the HeadHunter-specific index is intended.
forward_index, inverted_index, documents_id = create_index_from_dataframe(dataset_jooble)

In [None]:
len(forward_index)

In [None]:
dataset_hh.head()

In [None]:
# Replace every vacancy text with its HTML-free version (column-wise instead
# of assigning cell by cell).
dataset_hh["text"] = dataset_hh["text"].apply(clean_html)

In [None]:
dataset_hh.head()

In [None]:
dataset_hh.to_csv(os.path.join(config.headhunter_dir, "hh_dataset_cleaned_html.csv"),
                  sep='\t', header=True, index=False)

In [None]:
# Add the HeadHunter rows to the index built so far; ids already present in
# documents_id are skipped by the function.
# NOTE(review): this goes through the generic create_index_from_dataframe, which
# builds Document objects from the seven common fields only (no requirement /
# professional-area values) - confirm that is what should be saved.
forward_index, inverted_index, documents_id = create_index_from_dataframe(dataset_hh,
                                                                         forward_index=forward_index,
                                                                         inverted_index=inverted_index,
                                                                         documents_id=documents_id)

In [None]:
print("Final index size:", len(forward_index), " ", len(inverted_index), " ", len(documents_id))

In [None]:
save_index(config.index_dir, forward_index, inverted_index, documents_id,
               forward_file="forward_index", 
               inverted_file="inverted_index", 
               id_file="documents_id")
    

In [None]:
corpus = []

def build_tfidf_from_index(save_tfidf_path):
    """
    Fit a TF-IDF vectorizer on the indexed documents and pickle it.

    Reads the module-level ``documents_id`` and ``forward_index`` and rebinds
    the module-level ``corpus`` to the list of normalized texts used for
    fitting. Documents whose ``text_normalized`` is not a string are reported
    and left out.

    :param str save_tfidf_path: directory in which "vectorizer_tfidf.dat"
        is written.
    :return TfidfVectorizer vectorizer: the fitted vectorizer.
    """
    global corpus
    # Start from an empty corpus on every call; appending to the previous
    # contents would duplicate every document when the function is re-run.
    corpus = []
    for index, i in enumerate(documents_id):
        text_normalized = forward_index[i].text_normalized
        # Explicit check instead of assert, which is removed under "python -O".
        if isinstance(text_normalized, str):
            corpus.append(text_normalized)
        else:
            print("Bad document index =", index, " id =", i)
    print(len(corpus))
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 2),
                                lowercase=False)
    vectorizer.fit(corpus)

    save_tfidf_path = os.path.join(save_tfidf_path, "vectorizer_tfidf.dat")
    with open(save_tfidf_path, "wb") as ouf:
        pickle.dump(vectorizer, ouf)
    return vectorizer

In [None]:
vectorizer_tfidf = build_tfidf_from_index(config.index_dir)

In [None]:
forward_index["-4974176290193358031"]

In [None]:
len(vectorizer_tfidf.get_feature_names())

In [None]:
#vectorizer_tfidf.get_feature_names()[130000:]

# RNN seq2seq

In [None]:
# Number of tokens every input sequence is padded / truncated to.
max_len = 200

# NOTE(review): not referenced by the visible code; presumably the size of one
# encoder input vector (the word vectors are padded with np.zeros(300)) - confirm.
num_encoder_tokens = 300
# NOTE(review): not referenced by the visible code; presumably the recurrent
# state size of the saved models - confirm.
latent_dim = 400

# Length of the one-hot vector fed to the decoder at each step.
num_decoder_tokens = 2
# Number of decoding steps performed for one input.
max_decoder_seq_length = max_len

# NOTE(review): training settings; the visible code only loads saved models
# and runs inference, so these are not read here.
batch_size = 64
epochs = 40

In [None]:
%%time
vectorizer = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

In [None]:
def load_model(model_filename, model_weights_filename):
    """Rebuild a model from a JSON architecture file and load its weights.

    NOTE(review): this definition replaces the ``load_model`` imported from
    ``keras.models`` at the top of the notebook for all following cells.

    :param str model_filename: path to the JSON file holding the architecture.
    :param str model_weights_filename: path to the weights file.
    :return: the model with the weights loaded.
    """
    with open(model_filename, 'r', encoding='utf8') as architecture_file:
        architecture_json = architecture_file.read()
    restored = model_from_json(architecture_json)
    restored.load_weights(model_weights_filename)
    return restored

# Both files are looked up relative to the current working directory.
encoder_model = load_model('encoder_model.json', 'encoder_model_weights.h5')
decoder_model = load_model('decoder_model.json', 'decoder_model_weights.h5')

In [None]:
def decode_sequence(input_seq):
    """Run the encoder once and then the decoder step by step.

    Uses the module-level ``encoder_model``, ``decoder_model``,
    ``num_decoder_tokens`` and ``max_decoder_seq_length``. Decoding is greedy
    (argmax at every step) and has no stop token: it ends only when
    ``max_decoder_seq_length`` outputs have been collected.

    :param input_seq: batch passed unchanged to ``encoder_model.predict``
        (the code assumes a batch of size 1).
    :return list: the argmax index chosen at each decoding step.
    """
    def one_hot(position):
        # (1, 1, num_decoder_tokens) array with a single 1 at `position`.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, position] = 1.
        return step_input

    # Encoder output initialises the decoder states.
    states = encoder_model.predict(input_seq)
    # Position 0 is used as the start symbol.
    decoder_input = one_hot(0)

    labels = []
    while True:
        output_tokens, h, c = decoder_model.predict([decoder_input] + states)

        # Greedy choice for this step.
        best = np.argmax(output_tokens[0, -1, :])
        labels.append(best)

        finished = len(labels) >= max_decoder_seq_length

        # Feed the choice and the new states back into the next step.
        decoder_input = one_hot(best)
        states = [h, c]

        if finished:
            break

    return labels

In [None]:
# NOTE(review): not read by the function below, which pads to max_len.
maxlen = 200
def get_sample_nonlabeled(row):
    """Turn one dataset row into a padded model input.

    The "text_lemmas_tags" value is converted to word vectors with the
    module-level ``vectorizer`` and padded / truncated at the end to
    ``max_len`` steps.

    :param row: mapping-like row providing "text_lemmas_tags".
    :return numpy.ndarray: batch containing the single padded sample.
    """
    token_vectors = get_training_sample(vectorizer, row["text_lemmas_tags"])
    padded = sequence.pad_sequences([token_vectors], maxlen=max_len, dtype='float',
                                    padding="post", truncating="post")
    return np.array(padded)
  

In [None]:
def get_training_sample(model, text):
    """
    Convert a space-separated token string into a list of word vectors.

    :param model: embedding lookup where ``model[token]`` returns the vector
        of a known token and raises KeyError for an unknown one (e.g. gensim
        KeyedVectors, or a plain dict).
    :param str text: tokens separated by single spaces.
    :return list vec_list: one vector per token, in order; unknown tokens are
        represented by a zero vector.
    """
    # Width of the zero vector: the embedding's own size when it exposes
    # ``vector_size``, otherwise the 300 dimensions used so far.
    vector_size = getattr(model, "vector_size", 300)

    vec_list = []
    for word in text.split(" "):
        try:
            vec_list.append(model[word])
        except KeyError:
            # Only "token not in vocabulary" is expected here; anything else
            # (including KeyboardInterrupt) must not be swallowed.
            vec_list.append(np.zeros(vector_size))
    return vec_list
  

In [None]:
%%time
# Row label of dataset_jooble used for a manual spot check; `index` and
# `predict` are read again by the next cell.
index = 8001

x = get_sample_nonlabeled(dataset_jooble.loc[index])

predict = decode_sequence(x)

# Sum of the decoded class indices for this row.
print(np.sum(predict))

print(dataset_jooble.loc[index, "title"])
print()
print(dataset_jooble.loc[index, "text"])

In [None]:
print(np.sum(predict))
words = dataset_jooble.loc[index, "text_lemmas"].split(" ")
words2 = dataset_jooble.loc[index, "text_normalized"].split(" ")
# Show at most the first 50 decoder outputs next to the tokens at the same
# position in both tokenisations.
for i in range(0, min(50, len(predict))):
    # Pad each column on its own: the two token lists may differ in length,
    # and a position past the end must print "END" rather than a stale token
    # from the previous iteration (or fail with IndexError / NameError).
    word = words[i] if i < len(words) else "END"
    word2 = words2[i] if i < len(words2) else "END"
    print(i, 'Predicted', predict[i], " - ", word, " - ", word2)

In [None]:
dataset_jooble = pd.read_csv(jooble_data_path, sep="\t")

In [None]:
dataset_jooble.loc[2000: 2100]

In [None]:
%%time
# Label of the first row to process; rows from here to the end get a
# "requirement_normalized" value built from the tokens the decoder tags as 1.
n_test = 8349
# Number of tokens tagged 1 since the last checkpoint (reported, then reset).
count = 0

# Checkpoint target, written every time the row label is a multiple of 100.
file_name = "dataset_jooble_labeled.csv"
save_path = os.path.join(config.src_dir, "labeled_jooble_data", file_name)

for i in dataset_jooble.loc[n_test:].index:
    try:
        x = get_sample_nonlabeled(dataset_jooble.loc[i])
        predict = decode_sequence(x)

        tokens = dataset_jooble.loc[i, "text_normalized"].split(" ")
        # zip stops at the shorter of the two, so tokens beyond the decoded
        # length are ignored.
        req = [token for token, label in zip(tokens, predict) if label == 1]
        count += len(req)

        dataset_jooble.loc[i, "requirement_normalized"] =  " ".join(req)

        if i % 100 == 0:
            dataset_jooble.to_csv(save_path, sep='\t', header=True, index=False)
            print(i, "Saved")
            print("Count =" , count)
            count = 0
    except AttributeError:
        # Raised when a text cell of the row is not a string (no .split).
        print("Some error, index =", i, "id =", dataset_jooble.loc[i, "id"])
        print("Count =" , count)
        print(dataset_jooble.loc[i, "text"])
        print()
        dataset_jooble.loc[i, "requirement_normalized"] =  " "

In [None]:
dataset_jooble.loc[8300: 8400, "text_lemmas_tags"]

In [None]:
dataset_jooble.loc[8397, "text"]

In [None]:
file_name = "dataset_jooble_labeled_all.csv"
save_path = os.path.join(config.src_dir, "labeled_jooble_data", file_name)

In [None]:
dataset_jooble.to_csv(save_path, sep='\t', header=True, index=False)