In [1]:
import os
import re

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import SpatialDropout1D
from gensim.models import Word2Vec
from keras.utils import np_utils
from keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
import seaborn as sns

import config


Using TensorFlow backend.


In [2]:
hh_data_path = os.path.join("headHunter_data", "hh_dataset.csv")

In [3]:
dataset_hh = pd.read_csv(hh_data_path, sep="\t")
dataset_jooble = pd.read_csv("by_jobs.csv", sep="\t")

In [None]:
#dataset_hh.head()

In [None]:
#dataset_jooble.head()

In [6]:
corpus = []
spec_list = []
for i, row in dataset_hh.iterrows():
    specs = row["profarea_names"].lower()
    specs = specs.split("', ") 
    for spec in specs: 
        spec = re.sub('[\[\'\]]', '', spec)
        spec_list.append(spec.strip())
    corpus.append(row["text_normalized"].split(" "))
    
for i, row in dataset_jooble.loc[:5000].iterrows():
    corpus.append(row["text_normalized"].split(" "))
    

In [7]:
print(len(spec_list))
print("Corpus size =", len(corpus))
spec_list = list(set(spec_list))
print("set of prof areas =", len(spec_list))
spec_list

14399
Corpus size = 8654
set of prof areas = 28


['управление персоналом, тренинги',
 'начало карьеры, студенты',
 'административный персонал',
 'консультирование',
 'закупки',
 'автомобильный бизнес',
 'безопасность',
 'добыча сырья',
 'наука, образование',
 'высший менеджмент',
 'продажи',
 'транспорт, логистика',
 'медицина, фармацевтика',
 'информационные технологии, интернет, телеком',
 'юристы',
 'рабочий персонал',
 'искусство, развлечения, масс-медиа',
 'инсталляция и сервис',
 'маркетинг, реклама, pr',
 'производство',
 'домашний персонал',
 'строительство, недвижимость',
 'государственная служба, некоммерческие организации',
 'бухгалтерия, управленческий учет, финансы предприятия',
 'спортивные клубы, фитнес, салоны красоты',
 'туризм, гостиницы, рестораны',
 'страхование',
 'банки, инвестиции, лизинг']

### Word embedding

In [None]:
#vectorizer = Word2Vec.load('word2vec_simple.bin')
vectorizer = Word2Vec(corpus, min_count=5)
print(vectorizer)
vectorizer.save('word2vec_simple.bin')

In [None]:
word = np.reshape(vectorizer.wv["свеж"], (1, -1))
cosine_similarity(vectorizer.wv["компан", "молок"], word)

In [None]:
def vectorize(text, vectorizer, max_len):
    """
    :param str text: normalized text
    :param Word2Vec vectorizer:
    :param int max_len:
    :return np.array text_vect: of shape like (1, max_len, 100)
    """
    text_vect = []
    words = text.split(" ")
    for word in words:
        try:
            word_vec = vectorizer.wv[word]
            text_vect.append(word_vec)
        except KeyError:
            None
      
    np.array(text_vect)
    text_vect = np.reshape(text_vect, (1, -1, 100))
    text_vect = sequence.pad_sequences(text_vect, maxlen=max_len, dtype='float')
            
    return text_vect
    

### Prepare data for RNN

In [None]:
labels = spec_list
n_classes = len(labels)
n_examples = len(dataset_hh)
X = []
Y = np.zeros((n_examples, n_classes))
for i, row in dataset_hh.iterrows():
    X.append([])
    words = row["text_normalized"].split(" ")
    for word in words:
        try:
            word_vec = vectorizer.wv[word]
            X[i].append(word_vec)
        except KeyError:
            None
    
    specs = row["profarea_names"].lower() 
    specs = specs.split("', ") 
    for spec in specs: 
        spec = re.sub('[\[\'\]]', '', spec)
        pos = labels.index(spec.strip())
        Y[i][pos] = 1
        
assert(len(X) == len(Y))

In [None]:
max_len = max([len(i) for i in X])
print("Max len =", max_len)

In [None]:
X = sequence.pad_sequences(X, maxlen=max_len, dtype='float')
X = np.array(X)

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
sns.countplot(np.argmax(Y, axis=1))

In [None]:
del dataset_hh
del dataset_jooble

In [None]:
del X
del Y

In [None]:
def get_classes(y_one_hot, labels, threshold=0.5):
    """
    :param np.array y_one_hot: shape like (1, n_classes)
    :param list of str labels:
    :return list of str classes: 
    """
    classes = []
    y_rounded = np.zeros(y_one_hot.shape)
    y_rounded[y_one_hot > threshold] = 1
    for i in range(len(labels)):
        if y_rounded[0][i] == 1:
            classes.append(labels[i])
            
    return classes
        

### Create RNN

In [None]:
sample_weight = np.ones((n_classes,))
sample_weight[12] = 4
sample_weight[11] = 2
sample_weight[7] = 2
sample_weight[5] = 2
sample_weight[3] = 2
sample_weight = [list(sample_weight)]


In [None]:
model = Sequential()

model.add(Bidirectional(layer=LSTM(10, dropout=0.3, recurrent_dropout=0.3),
                        input_shape=(max_len, 100)))

model.add(Dense(n_classes, activation="relu", kernel_regularizer=l2(0.1)))
model.add(Dense(n_classes, activation="sigmoid", kernel_regularizer=l2(0.1)))

model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"],
              loss_weights=sample_weight)

model.summary()

In [None]:
%%time
model.fit(X_train, Y_train,
          batch_size=64, 
          epochs=2,
          validation_split=0.15,
          verbose=1)


In [None]:
%%time
scores = model.evaluate(X_test, Y_test, batch_size=64)
print("Test accuracy: %.2f%%" % (scores[1] * 100))

### Check model on real data

In [None]:
#dataset_jooble = pd.read_csv("by_jobs.csv", sep="\t")
dataset_jooble = pd.read_csv(hh_data_path, sep="\t")

In [None]:
%%time
#for i in range(1005, 1010):
index = 3005
title = dataset_jooble.loc[index, "title"]
title_norm = dataset_jooble.loc[index, "title_normalized"]
vacancy = dataset_jooble.loc[index, "text_raw"]
vacancy_norm = dataset_jooble.loc[index, "text_normalized"]
true_label = dataset_jooble.loc[index, "profarea_names"]

vacancy_vect = vectorize(vacancy_norm, vectorizer, max_len)

predict = model.predict(vacancy_vect)
result = get_classes(predict, labels, threshold=0.4)

title_vect = vectorize(title_norm, vectorizer, max_len)

predict_title = model.predict(title_vect)
result_title = get_classes(predict_title, labels, threshold=0.45)

    #if np.sum(np.round(predict)) > 0:
     #   break
print("List of classes:", labels)
print()
print("Classes for vacancy:", result)
print("Classes for title:", result_title)
print("True labels:", true_label)
print(np.argmax(predict))
#print(predict_title)
print("Vacancy title:", title)
print()
print(vacancy)

### Save model

In [None]:
model.save_weights("class1_simple_weights.hdf5")

saved_model = model.to_json()
with open("class1_simple.json", "w") as json_file:
    json_file.write(saved_model)
    

In [None]:
from gensim.models import Word2Vec
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
			['this', 'is', 'the', 'second', 'sentence'],
			['yet', 'another', 'sentence'],
			['one', 'more', 'sentence'],
			['and', 'the', 'final', 'sentence']]
# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
print(model['sentence'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)