In [None]:
import json
import gzip
import os

import pandas as pd
import requests

import config

# Folder (relative to the notebook) that holds the gzip-compressed dumps.
data_path = "by_jobs"

# Base URL of the text-processing service, read from the local config module.
text_processing_url = config.text_processing_url

# Number of vacancies that are sent through normalization further down.
n_vacancies_to_save = 5000

In [None]:
def _read_gzip_json_lines(path):
    """Return the JSON objects stored one per line in a gzip file, skipping blank lines."""
    with gzip.open(path, "rb") as stream:
        return [json.loads(line) for line in stream if line.strip()]


def load_data(data_path):
    """
    Load all data to pandas.DataFrame

    Every regular file directly inside ``data_path`` is read as gzip-compressed
    JSON Lines. Records from files whose name contains ``"text"`` are treated
    as vacancy texts (keyed by ``id_job``); records from all other files are
    treated as vacancy info (keyed by ``id``). Both sets are de-duplicated on
    their key and outer-merged on ``id == id_job``.

    :param str data_path: Path to folder with data
    :return pd.DataFrame data: One row per vacancy id, info and text columns side by side
    :raises ValueError: If the folder contains no records at all
    :raises AssertionError: If the info/text record counts differ or the key
        columns ``id`` / ``id_job`` are missing from the first record
    """
    docs_info = []
    docs_text = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any later "first N rows" selection) reproducible.
    for file in sorted(os.listdir(data_path)):
        file_path = os.path.join(data_path, file)
        # Skip sub-directories such as ``.ipynb_checkpoints`` that Jupyter
        # creates next to the data; gzip.open() cannot read them.
        if not os.path.isfile(file_path):
            continue

        records = _read_gzip_json_lines(file_path)
        if "text" in file:
            docs_text.extend(records)
        else:
            docs_info.extend(records)

    # Fail with a clear message instead of an IndexError on ``docs_info[0]``.
    if not docs_info and not docs_text:
        raise ValueError("No vacancy records found in {!r}".format(data_path))

    assert len(docs_info) == len(docs_text), \
        "info/text record counts differ: {} != {}".format(len(docs_info), len(docs_text))
    assert "id" in docs_info[0], "info records must contain an 'id' field"
    assert "id_job" in docs_text[0], "text records must contain an 'id_job' field"

    info_df = pd.DataFrame(docs_info).drop_duplicates(["id"])
    text_df = pd.DataFrame(docs_text).drop_duplicates(["id_job"])

    return info_df.merge(text_df, left_on='id', right_on='id_job', how='outer')

In [None]:
# Read every dump found in data_path into one frame and report its row count.
full_df = load_data(data_path)
print(full_df.shape[0])

In [None]:
# Persist the complete merged frame as tab-separated text: header row, no index column.
full_df.to_csv("by_jobs_full.csv", sep="\t", index=False)

In [None]:
# Columns kept for the exported dataset, in output order. reindex() creates any
# column that full_df does not have and fills it with NaN.
output_columns = [
    "id",
    "title",
    "lang_title",
    "title_normalized",
    "text",
    "lang_text",
    "text_normalized",
]
nessesary_df = full_df.reindex(columns=output_columns)

### Normalize text and title using text_preprocessing service

In [None]:
%%time
# Normalize the text and title of the first ``n_vacancies_to_save`` vacancies
# by posting each value to the stemming endpoint and storing the response body.
stem_url = text_processing_url + config.STEM_TEXT_PATH
request_timeout = 60  # seconds; without a timeout a stalled service blocks the cell forever

# (source column, column that receives the service response)
columns_to_normalize = (("text", "text_normalized"),
                        ("title", "title_normalized"))

# Make sure the target columns can hold strings (an all-NaN column is float64).
for _, target_column in columns_to_normalize:
    nessesary_df[target_column] = nessesary_df[target_column].astype(object)

# ``.loc[:n]`` slices by label and includes the end label, i.e. n + 1 rows.
# ``.iloc[:n]`` takes exactly the first n rows by position.
rows_to_normalize = nessesary_df.iloc[:n_vacancies_to_save]

# One Session reuses the HTTP connection instead of reconnecting per request.
with requests.Session() as session:
    for position, (index, row) in enumerate(rows_to_normalize.iterrows()):
        for source_column, target_column in columns_to_normalize:
            value = row[source_column]
            # Rows without a text/title (possible after the outer merge) hold
            # NaN, which is not valid JSON; leave their normalized value empty.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                continue

            response = session.post(stem_url, json=value, timeout=request_timeout)
            # Do not store an error page as if it were normalized text.
            response.raise_for_status()
            nessesary_df.at[index, target_column] = response.text

        if position % 500 == 0:
            print(position)
        

In [None]:
# Preview the first five rows to check the normalized columns.
nessesary_df.head(5)

In [None]:
# Write the selected columns as tab-separated text: header row, no index column.
# NOTE(review): this writes every row of nessesary_df, not only the rows that
# went through normalization — confirm that this is intended.
nessesary_df.to_csv("by_jobs.csv", sep="\t", index=False)