In [2]:
import json
import gzip
import os

import pandas as pd
import numpy as np
import requests

import config

# Folder with the gzipped vacancy dumps, relative to the notebook's working directory.
data_path = os.path.join("by_jobs")
# Base URL of the text-processing service; the value comes from the local config module.
text_processing_url = config.text_processing_url
# Not referenced by the cells visible here — presumably a batch size; confirm before relying on it.
n_vacancies_to_save = 10_000

# Cap the number of displayed rows at 20, but never hide columns.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", None)

### Read data from "by_jobs" folder

In [3]:
def _read_json_lines(path):
    """
    Parse one gzip-compressed JSON-lines file

    :param str path: Path to a gzipped file holding one JSON document per line
    :return list records: Decoded documents in file order
    """
    with gzip.open(path, "rb") as lines:
        # A blank line carries no document and would make json.loads raise.
        return [json.loads(line) for line in lines if line.strip()]


def load_data(data_path):
    """
    Load all data to pandas.DataFrame

    Files whose name contains "text" are read as vacancy texts (key "id_job"),
    every other file as vacancy info (key "id"). Both sets are de-duplicated
    on their key and outer-merged on id == id_job.

    :param str data_path: Path to folder with data
    :return pd.DataFrame data: Info columns joined with text columns
    :raises AssertionError: If the folder yields no records, the two record
        counts differ, or the key fields are missing
    """
    docs_info = []
    docs_text = []

    # os.listdir gives entries in arbitrary order; sorting keeps the row order
    # (and therefore the resulting index labels) reproducible between runs.
    for file_name in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-folders cannot be opened as gzip files.
            continue
        target = docs_text if "text" in file_name else docs_info
        target.extend(_read_json_lines(file_path))

    assert docs_info and docs_text, "no vacancy records found in " + repr(data_path)
    assert len(docs_info) == len(docs_text), "info/text record counts differ"
    assert "id" in docs_info[0], "info records have no 'id' field"
    assert "id_job" in docs_text[0], "text records have no 'id_job' field"

    # Non-inplace de-duplication: same result, no mutation of intermediate frames.
    docs_info = pd.DataFrame(docs_info).drop_duplicates(["id"])
    docs_text = pd.DataFrame(docs_text).drop_duplicates(["id_job"])

    return docs_info.merge(docs_text, left_on="id", right_on="id_job", how="outer")

In [21]:
full_df = load_data(data_path)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90081 entries, 0 to 90080
Data columns (total 41 columns):
campaigns              0 non-null object
checksum               90081 non-null object
company_name           90081 non-null object
company_name_hash64    90081 non-null object
date_created_x         90081 non-null object
date_expired           90081 non-null object
date_updated           90081 non-null object
emails                 90081 non-null object
emails_src             90081 non-null object
html_desc_mode         90081 non-null object
id                     90081 non-null object
id_campaign            90081 non-null object
id_currency            90081 non-null object
id_language            90081 non-null object
id_project             90081 non-null object
id_region              90081 non-null object
id_salary_rate         90081 non-null object
id_similar_group       90081 non-null object
inactive               90081 non-null object
is_active              90081 non-null ob

In [22]:
# Persist the merged frame as a tab-separated file: header row kept, index column omitted.
full_df.to_csv(
    "by_jobs_full.csv",
    sep="\t",
    header=True,
    index=False,
)

In [23]:
# Keep "id" and "url" plus, for each of the two text fields, the raw value and
# its five derived columns. reindex() creates any listed column that full_df
# does not have yet and fills it with NaN.
nessesary_df = full_df.reindex(
    columns=[
        "id",
        *(
            column
            for field in ("title", "text")
            for column in (
                field,
                "lang_" + field,
                field + "_normalized",
                field + "_lemmas",
                field + "_lemmas_tags",
                field + "_tokens",
            )
        ),
        "url",
    ]
)

In [24]:
# info() writes the summary to stdout and returns None; print() around it
# only appended a literal "None" to the cell output.
nessesary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90081 entries, 0 to 90080
Data columns (total 14 columns):
id                   90081 non-null object
title                90081 non-null object
lang_title           90081 non-null object
title_normalized     90081 non-null object
title_lemmas         0 non-null float64
title_lemmas_tags    0 non-null float64
title_tokens         0 non-null float64
text                 90081 non-null object
lang_text            90081 non-null object
text_normalized      0 non-null float64
text_lemmas          0 non-null float64
text_lemmas_tags     0 non-null float64
text_tokens          0 non-null float64
url                  90081 non-null object
dtypes: float64(7), object(7)
memory usage: 10.3+ MB
None


### Normalize text and title using text_preprocessing service

In [25]:
# Skip the rows that were already processed earlier (the "prepared" data).
# .loc slices by index label and includes the start label, so label 10_000 itself is kept.
# NOTE(review): the previously prepared file read further down reports 10001 rows —
# confirm that row 10_000 is not counted twice.
nessesary_df = nessesary_df.loc[10_000:]

In [None]:
%%time
# (target-column template, endpoint URL) pairs, in the order the service is
# called for every field. Built once: nothing here changes between rows.
field_endpoints = [
    ("{}_normalized", text_processing_url + config.STEM_TEXT_PATH),
    ("{}_lemmas", text_processing_url + config.LEMM_TEXT_PATH),
    ("{}_lemmas_tags", text_processing_url + config.TAG_TEXT_PATH),
    ("{}_tokens", text_processing_url + config.TOKEN_TEXT_PATH),
    ("lang_{}", text_processing_url + config.DETECT_LANG_PATH),
]
fields = ["title", "text"]

# A single Session reuses the HTTP connection instead of opening a new one
# for each of the ten requests made per row.
with requests.Session() as session:
    for index, row in nessesary_df.iterrows():
        for field in fields:
            text = row[field]
            for column_template, url in field_endpoints:
                r = session.post(url, json=text)
                # NOTE(review): the response body is stored as-is whatever the
                # HTTP status; consider r.raise_for_status() if error pages
                # must never end up in the dataset.
                nessesary_df.loc[index, column_template.format(field)] = r.text

        # Checkpoint: every time the index label is a multiple of 1000, dump
        # the whole frame to a file named after that label.
        if index % 1000 == 0:
            file_name = "by_jobs_10K_to_" + str(index) + ".csv"
            print(index)
            nessesary_df.to_csv(file_name, sep='\t', header=True, index=False)
  

In [33]:
# Keep only the rows whose "lang_text" value is "russian" or "english".
nessesary_df = nessesary_df[nessesary_df["lang_text"].isin(["russian", "english"])]

In [34]:
print(len(nessesary_df))
# info() prints by itself and returns None; no print() wrapper, so no trailing "None".
nessesary_df.info()

50809
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50809 entries, 10000 to 60808
Data columns (total 14 columns):
id                   50809 non-null object
title                50809 non-null object
lang_title           50809 non-null object
title_normalized     50809 non-null object
title_lemmas         50809 non-null object
title_lemmas_tags    50809 non-null object
title_tokens         50809 non-null object
text                 50809 non-null object
lang_text            50809 non-null object
text_normalized      50809 non-null object
text_lemmas          50809 non-null object
text_lemmas_tags     50809 non-null object
text_tokens          50809 non-null object
url                  50809 non-null object
dtypes: object(14)
memory usage: 5.8+ MB
None


In [35]:
# Display the last five rows (the default for tail()) of the language-filtered frame.
nessesary_df.tail()

Unnamed: 0,id,title,lang_title,title_normalized,title_lemmas,title_lemmas_tags,title_tokens,text,lang_text,text_normalized,text_lemmas,text_lemmas_tags,text_tokens,url
60804,-6380895435018986256,Художник векторной графики. Отрисовать из jpg ...,russian,художник векторн график отрисова jpg a удален,художник векторный график рисовать jpg ai удал...,художник_NOUN векторный_ADJ график_NOUN рисова...,художник векторной графики отрисовать jpg ai у...,Необходимо отрисовать рисунок jpg в ai (2-3 шт),russian,необходим отрисова рисунок jpg a шт,необходимый рисовать рисунок jpg ai штука,необходимый_ADJ рисовать_VERB рисунок_NOUN jpg...,необходимо отрисовать рисунок jpg ai шт,https://www.fl.ru/projects/3107090/otrisovat-i...
60805,-5127522319541783233,Музыкальный руководитель,russian,музыкальн руководител,музыкальный руководитель,музыкальный_ADJ руководитель_NOUN,музыкальный руководитель,Режим работы:Полный рабочий деньЗарплата:320.0...,russian,реж работ полн рабоч музыкальн руководител спе...,режим работа полный рабочий музыкальный руково...,режим_NOUN работа_NOUN полный_ADJ рабочий_ADJ ...,режим работы полный рабочий музыкальный руково...,http://minsk.regiony.by/работа/вакансии/#!jobs...
60806,-2700786999414105615,Грузчик-комплектовщик,russian,,,,,"работа на складе или на производстве (в цеху),...",russian,работ склад производств цех машин комплектац п...,работа склад производство цех машина комплекта...,работа_NOUN склад_NOUN производство_NOUN цех_N...,работа складе производстве цеху машин комплект...,http://rdw.by/vakansii/gruzchik-komplektovshch...
60807,3725440889222302714,Создание бота для сообщества в социальной сети...,russian,создан бот сообществ социальн сет контакт удален,создание бовать сообщество социальный сеть кон...,создание_NOUN бовать_NOUN сообщество_NOUN соци...,создание бота сообщества социальной сети конта...,Я занимаюсь преподаванием и разработал некотор...,russian,занима преподаван разработа некотор количеств ...,заниматься преподавание разрабатывать некоторы...,заниматься_VERB преподавание_NOUN разрабатыват...,занимаюсь преподаванием разработал некоторое к...,https://www.fl.ru/projects/3106788/sozdanie-bo...
60808,3209976227911552970,Operations - Junior Specialist with Scandinavi...,english,oper junior specialist scandinavian languag,operation junior specialist scandinavian language,operation junior specialist scandinavian language,operations junior specialist scandinavian lang...,"Accenture is a global management consulting, t...",english,accentur global manag consult technolog servic...,accenture global management consulting technol...,accenture global management consulting technol...,accenture global management consulting technol...,https://www.careersinpoland.com/job/accenture/...


In [37]:
# Load the previously prepared vacancies, keep only the "russian"/"english"
# descriptions and renumber the rows from 0. Written as one chain so no frame
# is mutated in place.
exist_df = (
    pd.read_csv(os.path.join("headHunter_data", "by_jobs.csv"), sep='\t')
    .loc[lambda frame: frame["lang_text"].isin(["russian", "english"])]
    .reset_index(drop=True)
)
# info() prints the summary itself and returns None, so it is not wrapped in print().
exist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 14 columns):
id                   10001 non-null int64
title                10001 non-null object
lang_title           10001 non-null object
title_normalized     9048 non-null object
title_lemmas         9048 non-null object
title_lemmas_tags    9048 non-null object
title_tokens         9048 non-null object
text                 10001 non-null object
lang_text            10001 non-null object
text_normalized      10000 non-null object
text_lemmas          10000 non-null object
text_lemmas_tags     10000 non-null object
text_tokens          10000 non-null object
url                  10001 non-null object
dtypes: int64(1), object(13)
memory usage: 1.1+ MB
None


In [38]:
# Column summary of the newly processed frame, shown before it is appended to exist_df.
nessesary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50809 entries, 10000 to 60808
Data columns (total 14 columns):
id                   50809 non-null object
title                50809 non-null object
lang_title           50809 non-null object
title_normalized     50809 non-null object
title_lemmas         50809 non-null object
title_lemmas_tags    50809 non-null object
title_tokens         50809 non-null object
text                 50809 non-null object
lang_text            50809 non-null object
text_normalized      50809 non-null object
text_lemmas          50809 non-null object
text_lemmas_tags     50809 non-null object
text_tokens          50809 non-null object
url                  50809 non-null object
dtypes: object(14)
memory usage: 5.8+ MB


In [40]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with the same ignore_index/sort flags gives the same stacked frame
# (fresh 0..n-1 index, columns sorted by name).
# NOTE: exist_df is overwritten, so re-running this cell stacks nessesary_df again.
exist_df = pd.concat([exist_df, nessesary_df], ignore_index=True, sort=True)
print("Final size of dataset =", len(exist_df))

Final size of dataset = 60810


In [42]:
# Save the combined dataset as TSV without the index column. index=False is
# the explicit boolean form, matching the earlier full-dump export.
exist_df.to_csv("by_jobs.csv", sep='\t', header=True, index=False)

### Check saved data

In [30]:
# low_memory=False makes read_csv infer each column's dtype from the whole
# file instead of chunk by chunk.
# NOTE(review): the stored cell output looks like the tail of pandas'
# mixed-types DtypeWarning — confirm that this is what was raised.
exist_df = pd.read_csv("by_jobs.csv", sep='\t', low_memory=False)
exist_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80081 entries, 0 to 80080
Data columns (total 14 columns):
id                   80081 non-null int64
title                80081 non-null object
lang_title           80081 non-null object
title_normalized     75770 non-null object
title_lemmas         46496 non-null object
title_lemmas_tags    46496 non-null object
title_tokens         46499 non-null object
text                 80081 non-null object
lang_text            80081 non-null object
text_normalized      50800 non-null object
text_lemmas          50800 non-null object
text_lemmas_tags     50800 non-null object
text_tokens          50800 non-null object
url                  80081 non-null object
dtypes: int64(1), object(13)
memory usage: 8.6+ MB


In [31]:
# Drop the name so the re-read frame can be garbage-collected.
del exist_df