In [None]:
import json
import pickle
import os

import pandas as pd
import requests
import numpy as np

import config

# Directory that holds the HeadHunter JSON dumps read below and receives the
# generated dataset file.
data_path = "headHunter_data"

# URL prefix of the text-processing service, taken from the local config module.
text_processing_url = config.text_processing_url

# Notebook display settings: show at most 20 rows, never hide columns.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", None)

In [None]:

def _read_json(filename):
    """Parse and return the JSON document stored as ``filename`` inside data_path."""
    with open(os.path.join(data_path, filename), 'rb') as handle:
        return json.load(handle)


# The three dumps used by this notebook.
ids = _read_json("hh_ids.json")
vacancies = _read_json("hh_vacancies.json")
vacancies_ext = _read_json("hh_vacancies_ext.json")
    

In [None]:
# Column names of the assembled dataset. The last entry must be spelled
# "text_normalized" so that it matches the column that is actually created and
# filled in this notebook; the former "text_normilized" spelling named a column
# that does not exist.
columns = ["id", "title", "specializations", "profarea_names", "requirement",
           "requirement_norm", "responsibility_norm", "responsibility",
           "url", "title_normalized", "text_raw", "text_normalized"]


In [None]:
# Fields taken from the search-result records in `vacancies`. The normalized and
# language columns are created as empty-string placeholders.
snippet_records = [
    {
        "id": item["id"],
        "title": item["name"],
        "title_normalized": "",
        "lang_title": "",
        "requirement_norm": "",
        "responsibility_norm": "",
        "requirement": item["snippet"]["requirement"],
        "responsibility": item["snippet"]["responsibility"],
        "url": item["url"],
    }
    for item in vacancies
]
vac_df = pd.DataFrame(snippet_records).drop_duplicates(["id"])

# Fields taken from the extended records in `vacancies_ext`: the raw description
# plus the names of every specialization and its professional area.
detail_records = [
    {
        "id": item["id"],
        "text_raw": item["description"],
        "text_normalized": "",
        "lang_text": "",
        "specializations": [spec["name"] for spec in item["specializations"]],
        "profarea_names": [spec["profarea_name"] for spec in item["specializations"]],
    }
    for item in vacancies_ext
]
vac_df_ext = pd.DataFrame(detail_records).drop_duplicates(["id"])

# Outer join on id: a vacancy that appears in only one of the two sources is
# kept, with missing values in the columns coming from the other source.
full_df = vac_df.merge(vac_df_ext, on="id", how="outer")
   

### Normalize text, title, requirement and responsibility using the text-processing service

In [None]:
%%time
# Raw column -> column that receives the service's response for that value.
NORMALIZATION_TARGETS = {
    "text_raw": "text_normalized",
    "title": "title_normalized",
    "requirement": "requirement_norm",
    "responsibility": "responsibility_norm",
}
STEM_URL = text_processing_url + config.STEM_TEXT_PATH
# Seconds to wait for the service before giving up; without a timeout a stalled
# service would block the notebook indefinitely.
REQUEST_TIMEOUT_S = 60


def normalize_value(session, value):
    """Send one raw value to the text-processing service and return its reply.

    Args:
        session: ``requests.Session`` used for the call, so that consecutive
            requests can reuse the same connection.
        value: Raw string to normalize. May be ``None`` or ``NaN`` for rows
            that lack this field (e.g. after the outer merge).

    Returns:
        The response body as text, or ``""`` when ``value`` is missing (no
        request is sent in that case).

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status. Raising
            here prevents an error page from being stored as normalized text.
        requests.Timeout: The service did not answer within REQUEST_TIMEOUT_S.
    """
    if pd.isna(value):
        return ""
    response = session.post(STEM_URL, json=value, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return response.text


with requests.Session() as session:
    for index, row in full_df.iterrows():
        for source_col, target_col in NORMALIZATION_TARGETS.items():
            full_df.loc[index, target_col] = normalize_value(session, row[source_col])
  

In [None]:
# Row count first, then the rich preview of the merged dataset as cell output.
print(full_df.shape[0])
full_df


In [None]:
# Write the dataset as a tab-separated file with a header row and without the
# DataFrame index.
dataset_path = os.path.join(data_path, "hh_dataset.csv")
full_df.to_csv(dataset_path, sep='\t', header=True, index=False)

In [None]:
# Show the first record of `vacancies` (loaded from hh_vacancies.json) to
# inspect its structure.
vacancies[0]

In [None]:
# Show the first record of `vacancies_ext` (loaded from hh_vacancies_ext.json)
# to inspect its structure.
vacancies_ext[0]