In [1]:
import logging
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Load the reference vacancies dataset and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv call - TODO confirm).
data = pd.read_csv('../data/vacancies_ru.csv').drop(columns='Unnamed: 0')
data

Unnamed: 0,vacancy_id,name,company_id,keySkills,compensation_from,compensation_to,compensation_currencyCode,area_id,area_regionId,employment,workSchedule,workExperience,clean_name,area_regionId_encoder,area_id_encoder,company_id_encoder,employment_encoder,workSchedule_encoder,workExperience_encoder
0,v_862116,Смотритель музейный,c_162972,"['Пользователь ПК', 'Работа в команде', 'Умени...",16500.0,,RUR,a_4761,ar_33,full,fullDay,noExperience,Смотритель музейный,32,4181,69972,5,5,1
1,v_288642,Ведущий менеджер по работе с физическими лицами,c_208672,"['Активные продажи', 'Холодные продажи', 'Кред...",50000.0,,RUR,a_744,ar_2,full,fullDay,noExperience,Ведущий менеджер по работе с физическими лицами,17,6731,120750,5,5,1
2,v_1840054,Бухгалтер (по расчету зарплаты),c_198109,,50000.0,65000.0,RUR,a_6223,ar_78,full,fullDay,between3And6,Бухгалтер,81,5806,109013,5,5,3
3,v_2346232,"Пекарь (Токсово, Привокзальная, 16)",c_6137,,38500.0,42000.0,RUR,a_4795,ar_51,full,fullDay,noExperience,Пекарь,52,4218,235992,5,5,1
4,v_312507,Торговый представитель (г. Абакан),c_206699,"['Продуктивность', 'Клиентоориентированность',...",60000.0,,RUR,a_6837,ar_4,full,fullDay,between1And3,Торговый представитель,39,6487,118557,5,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2209567,v_2484959,Сборщик-упаковщик,c_203256,"['Пользователь ПК', 'Умение работать в команде...",40000.0,90000.0,RUR,a_5387,ar_71,full,flexible,noExperience,Сборщик-упаковщик,74,4876,114733,5,2,1
2209568,v_205163,Сварщик на полуавтомат,c_158695,"['Желание работать и зарабатывать', 'Высокая э...",80000.0,130000.0,RUR,a_5527,ar_69,full,fullDay,between1And3,Сварщик на полуавтомат,71,5032,65219,5,5,2
2209569,v_639897,Главный инженер / Технический директор,c_209365,"['Контроль исправности оборудования', 'Инженер...",200000.0,,RUR,a_1756,ar_41,full,fullDay,between1And3,Главный инженер,41,842,121520,5,5,2
2209570,v_1636531,"Провизор/Фармацевт (г.Адыгейск, 20 км от Красн...",c_246244,"['Предпечатная подготовка', 'Статистический ан...",25000.0,,RUR,a_3403,ar_60,full,fullDay,noExperience,Фармацевт,62,2673,162497,5,5,1


In [3]:
# Download the professional-roles reference from the hh.ru API.
url = "https://api.hh.ru/professional_roles"

# Without a timeout a stalled connection would block the cell forever.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error: silently skipping the assignment would leave
# `data_list` undefined and surface later as a confusing NameError.
response.raise_for_status()
data_list = response.json()

In [4]:
# Inspect the first category to see the structure of the API response.
data_list['categories'][0]

{'id': '19',
 'name': 'Автомобильный бизнес',
 'roles': [{'id': '4',
   'name': 'Автомойщик',
   'accept_incomplete_resumes': True,
   'is_default': False,
   'select_deprecated': False,
   'search_deprecated': False},
  {'id': '5',
   'name': 'Автослесарь, автомеханик',
   'accept_incomplete_resumes': True,
   'is_default': False,
   'select_deprecated': False,
   'search_deprecated': False},
  {'id': '62',
   'name': 'Мастер-приемщик',
   'accept_incomplete_resumes': True,
   'is_default': False,
   'select_deprecated': False,
   'search_deprecated': False},
  {'id': '70',
   'name': 'Менеджер по продажам, менеджер по работе с клиентами',
   'accept_incomplete_resumes': False,
   'is_default': False,
   'select_deprecated': False,
   'search_deprecated': False}]}

In [5]:
# Flatten the category -> roles hierarchy into one list of integer role ids,
# keeping the order in which the API returned them.
prof_roles_codes = [
    int(role['id'])
    for category in data_list['categories']
    for role in category['roles']
]

In [6]:
# CSV file the collected vacancies are saved to.
filename = 'vacancy_dataset.csv'

# Paging: pages requested per professional role and vacancies per page.
page_cnt, item_per_page = 20, 20

# Search filters sent with every vacancy query.
area_code = 113        # presumably Russia in the hh.ru areas dictionary - TODO confirm
currency_code = 'RUR'
order_by = 'publication_time'
search_period = 30     # presumably a number of days - TODO confirm against the API docs

# API endpoints derived from the common base URL.
url_hh = 'https://api.hh.ru/'
url_roles = f'{url_hh}professional_roles'
url_vacancies = f'{url_hh}vacancies'

In [7]:
# Re-fetch the professional roles through the configured endpoint.
# The timeout prevents the cell from hanging on a stalled connection.
response = requests.get(url_roles, timeout=30)

if response.status_code != 200:
    print('Error:', response.text)
else:
    data_list = response.json()

    # Flatten every category's roles into a single list of integer role ids.
    prof_roles_codes = [
        int(role['id'])
        for category in data_list['categories']
        for role in category['roles']
    ]

In [8]:
# How many role ids were collected.
len(prof_roles_codes)

270

In [9]:
# Schema of the collected dataset, in output column order.
# 'alternate_url' is listed explicitly because the scraper writes it into every
# row; leaving it out made the declared schema (13 columns) disagree with the
# frame that is actually produced (14 columns).
columns = ['vacancy_id', 'name', 'company_id', 'keySkills', 'compensation_from',
           'compensation_to', 'compensation_currencyCode', 'area_id', 'employment',
           'workSchedule', 'workExperience', 'description', 'published_at',
           'alternate_url']

In [10]:
# Detach every handler already attached to the root logger. Iterating over a
# snapshot is required because removeHandler mutates root.handlers.
root = logging.getLogger()
for attached in tuple(root.handlers):
    root.removeHandler(attached)

In [11]:
# Route INFO-and-above records to hh_logs.log with a timestamped format.
log_settings = {
    'filename': 'hh_logs.log',
    'level': logging.INFO,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**log_settings)

In [12]:
def _fetch_json(session, url, params=None, timeout=30):
    """GET ``url`` and return the decoded JSON body, or None on failure.

    Network errors and non-200 answers are logged as warnings instead of being
    dropped silently, so gaps in the collected data can be traced in the log.
    """
    try:
        response = session.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        logging.warning('Request to %s failed: %s', url, error)
        return None
    if response.status_code != 200:
        logging.warning('Request to %s returned HTTP %s', url, response.status_code)
        return None
    return response.json()


def _nested_id(payload, key):
    """Return ``payload[key]['id']``, or None when the nested object is absent or null."""
    nested = payload.get(key)
    return nested.get('id') if nested else None


def _vacancy_to_row(vacancy_id, vac, columns):
    """Flatten one full vacancy payload into a flat dict keyed by ``columns``.

    Fields missing from the payload are left as None rather than raising, so a
    single incomplete vacancy cannot abort the whole collection run.
    """
    row = {column: None for column in columns}
    row['vacancy_id'] = vacancy_id
    row['name'] = vac.get('name')
    row['company_id'] = _nested_id(vac, 'employer')

    # 'salary' may be null in the payload; treat that as "no compensation data".
    salary = vac.get('salary') or {}
    row['compensation_from'] = salary.get('from')
    row['compensation_to'] = salary.get('to')
    row['compensation_currencyCode'] = salary.get('currency')

    row['area_id'] = _nested_id(vac, 'area')
    row['employment'] = _nested_id(vac, 'employment')
    row['workSchedule'] = _nested_id(vac, 'schedule')
    row['workExperience'] = _nested_id(vac, 'experience')

    # The description arrives as HTML; keep only its text content.
    description = vac.get('description')
    if description is not None:
        row['description'] = BeautifulSoup(description, 'html.parser').get_text()

    row['keySkills'] = [item['name'] for item in vac.get('key_skills') or []]

    # Drop the UTC offset suffix and normalise to 'YYYY-MM-DD HH:MM:SS'.
    published_at = vac.get('published_at')
    if published_at:
        row['published_at'] = datetime.strptime(
            published_at[:19], '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')

    row['alternate_url'] = vac.get('alternate_url')
    return row


request_timeout = 30  # seconds; a stalled connection must not freeze a multi-hour run
records = []          # rows are accumulated here and turned into a frame once

# Final column order: the declared schema, plus 'alternate_url' if it is not listed yet.
all_columns = list(dict.fromkeys([*columns, 'alternate_url']))

try:
    # One session reuses the underlying connection across thousands of requests.
    with requests.Session() as session:
        for ind, code in enumerate(prof_roles_codes, start=1):
            with tqdm(total=page_cnt,
                      desc=f"Professional role {code}, {ind}/{len(prof_roles_codes)}") as progress_bar:
                for i in range(page_cnt):
                    progress_bar.update(1)

                    params = {'area': area_code, 'professional_role': code, 'currency_code': currency_code,
                              'per_page': item_per_page, 'page': i, 'search_period': search_period,
                              'order_by': order_by}

                    vac_page = _fetch_json(session, url_vacancies, params, request_timeout)
                    if vac_page is None:
                        continue

                    items = vac_page.get('items') or []
                    if not items:
                        # An empty page means this role has no further results.
                        break

                    for item in items:
                        vacancy_id = item.get('id')

                        # The search listing is abridged; fetch the full vacancy card.
                        vac = _fetch_json(session, f"{url_vacancies}/{vacancy_id}", None, request_timeout)
                        if vac is None:
                            continue

                        row = _vacancy_to_row(vacancy_id, vac, columns)
                        records.append(row)
                        # Message text kept unchanged so existing log consumers still match it.
                        logging.info(f'Successful concatenation. Shape of dataframe: {(len(records), len(all_columns))}')
finally:
    # Build the frame once (growing it with pd.concat per row is quadratic). Doing it in
    # `finally` keeps the rows collected so far if the run is interrupted.
    # dtype=object matches the dtypes the row-by-row concat used to produce.
    df = pd.DataFrame(records, columns=all_columns, dtype=object)


Professional role 4, 1/270: 100%|███████████████| 20/20 [01:35<00:00,  4.76s/it]
Professional role 5, 2/270: 100%|███████████████| 20/20 [01:28<00:00,  4.42s/it]
Professional role 62, 3/270: 100%|██████████████| 20/20 [01:33<00:00,  4.67s/it]
Professional role 70, 4/270: 100%|██████████████| 20/20 [01:30<00:00,  4.51s/it]
Professional role 8, 5/270: 100%|███████████████| 20/20 [01:34<00:00,  4.71s/it]
Professional role 33, 6/270: 100%|██████████████| 20/20 [01:39<00:00,  4.98s/it]
Professional role 58, 7/270: 100%|██████████████| 20/20 [01:41<00:00,  5.08s/it]
Professional role 76, 8/270: 100%|██████████████| 20/20 [01:40<00:00,  5.03s/it]
Professional role 84, 9/270: 100%|██████████████| 20/20 [01:41<00:00,  5.07s/it]
Professional role 88, 10/270: 100%|█████████████| 20/20 [01:41<00:00,  5.06s/it]
Professional role 93, 11/270: 100%|█████████████| 20/20 [01:37<00:00,  4.90s/it]
Professional role 110, 12/270: 100%|████████████| 20/20 [01:36<00:00,  4.84s/it]
Professional role 22, 13/270

Professional role 24, 102/270: 100%|████████████| 20/20 [01:34<00:00,  4.70s/it]
Professional role 29, 103/270: 100%|████████████| 20/20 [01:42<00:00,  5.11s/it]
Professional role 42, 104/270: 100%|████████████| 20/20 [01:43<00:00,  5.19s/it]
Professional role 168, 105/270: 100%|███████████| 20/20 [01:42<00:00,  5.13s/it]
Professional role 64, 106/270: 100%|████████████| 20/20 [01:40<00:00,  5.01s/it]
Professional role 65, 107/270: 100%|████████████| 20/20 [01:42<00:00,  5.12s/it]
Professional role 79, 108/270: 100%|████████████| 20/20 [01:40<00:00,  5.02s/it]
Professional role 151, 109/270: 100%|███████████| 20/20 [01:39<00:00,  4.96s/it]
Professional role 133, 110/270: 100%|███████████| 20/20 [01:42<00:00,  5.11s/it]
Professional role 17, 111/270: 100%|████████████| 20/20 [01:43<00:00,  5.17s/it]
Professional role 23, 112/270: 100%|████████████| 20/20 [01:38<00:00,  4.92s/it]
Professional role 168, 113/270: 100%|███████████| 20/20 [01:32<00:00,  4.62s/it]
Professional role 167, 114/2

Professional role 134, 203/270: 100%|███████████| 20/20 [01:42<00:00,  5.11s/it]
Professional role 11, 204/270: 100%|████████████| 20/20 [00:48<00:00,  2.44s/it]
Professional role 91, 205/270: 100%|████████████| 20/20 [01:10<00:00,  3.51s/it]
Professional role 122, 206/270: 100%|███████████| 20/20 [01:43<00:00,  5.16s/it]
Professional role 6, 207/270: 100%|█████████████| 20/20 [01:45<00:00,  5.26s/it]
Professional role 14, 208/270: 100%|████████████| 20/20 [01:42<00:00,  5.14s/it]
Professional role 154, 209/270: 100%|███████████| 20/20 [01:38<00:00,  4.93s/it]
Professional role 27, 210/270: 100%|████████████| 20/20 [01:41<00:00,  5.06s/it]
Professional role 30, 211/270: 100%|████████████| 20/20 [01:36<00:00,  4.83s/it]
Professional role 34, 212/270: 100%|████████████| 20/20 [01:37<00:00,  4.87s/it]
Professional role 47, 213/270: 100%|████████████| 20/20 [01:37<00:00,  4.87s/it]
Professional role 45, 214/270: 100%|████████████| 20/20 [01:40<00:00,  5.03s/it]
Professional role 46, 215/27

In [13]:
# Peek at the first record of the scraper log to confirm logging works.
with open('hh_logs.log', 'r') as file:
    logs = file.readlines()

# Guard against an empty log so the cell does not fail with IndexError.
print(logs[0].strip() if logs else 'hh_logs.log is empty')

2024-05-04 14:40:35,778 - INFO - Successful concatenation. Shape of dataframe: (1, 14)


In [20]:
# Count distinct vacancy ids (a missing id, if any, counts as one value).
df['vacancy_id'].nunique(dropna=False)

18572

In [22]:
# Keep only rows that have a name, then keep a single row per vacancy_id.
df_new = (
    df
    .dropna(subset=['name'])
    .drop_duplicates(subset=['vacancy_id'])
)

In [23]:
# Display the cleaned frame.
df_new

Unnamed: 0,vacancy_id,name,company_id,keySkills,compensation_from,compensation_to,compensation_currencyCode,area_id,employment,workSchedule,workExperience,description,published_at,alternate_url
0,97665787,Шиномонтажник - Автомойщик,5804402,"[Работа в команде, грамотная речь на русском я...",70000,250000,RUR,1,full,fullDay,between1And3,Обязанности: - Шиномонтаж - все виды работ - С...,2024-05-05 00:45:33,https://hh.ru/vacancy/97665787
1,96659947,Автомойщик,9555546,"[Ответственность, Обучаемость]",35000,,RUR,24,full,fullDay,between1And3,Обязанности: Мойка автомобилей Требования: ...,2024-05-04 23:58:21,https://hh.ru/vacancy/96659947
2,97154604,Автомойщик,4211440,[],40000,50000,RUR,26,full,flexible,noExperience,Автомойщики на автомойки в Северном районе тре...,2024-05-04 20:58:49,https://hh.ru/vacancy/97154604
3,98394231,Автомойщик,1104595,[],45000,70000,RUR,17,part,fullDay,noExperience,Приглашаем на работу на автомоечные комплексы ...,2024-05-04 20:33:31,https://hh.ru/vacancy/98394231
4,97657513,Автомойщик,9256296,[],2000,,RUR,104,full,fullDay,between1And3,Обязанности: Мойка легковых автомобилей Подде...,2024-05-04 20:19:09,https://hh.ru/vacancy/97657513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20309,97184914,"Курьер (доставка документов, посылок)",10593319,[],80000,125000,RUR,79,part,shift,noExperience,"Требуются Курьеры на доставку писем, пысылок, ...",2024-05-05 08:54:34,https://hh.ru/vacancy/97184914
20310,97184918,"Курьер (доставка документов, посылок)",10593319,[],80000,125000,RUR,54,part,shift,noExperience,"Требуются Курьеры на доставку писем, пысылок, ...",2024-05-05 08:54:34,https://hh.ru/vacancy/97184918
20311,97184919,"Курьер (доставка документов, посылок)",10593319,[],80000,125000,RUR,56,part,shift,noExperience,"Требуются Курьеры на доставку писем, пысылок, ...",2024-05-05 08:54:34,https://hh.ru/vacancy/97184919
20312,97184921,"Курьер (доставка документов, посылок)",10593319,[],80000,125000,RUR,1440,part,shift,noExperience,"Требуются Курьеры на доставку писем, пысылок, ...",2024-05-05 08:54:34,https://hh.ru/vacancy/97184921


In [24]:
# Append the new rows to the dataset file. The header is written only when the
# file is new or empty; in append mode to_csv would otherwise repeat the header
# on every run, and those header lines would be read back as data rows.
output_path = Path(filename)
write_header = not output_path.exists() or output_path.stat().st_size == 0
df_new.to_csv(output_path, index=False, mode='a', header=write_header)

In [25]:
# Read the saved CSV back and check its dimensions.
new_data = pd.read_csv(filename)
new_data.shape

(18572, 14)

In [None]:
# For incremental updates, a smaller search_period can be used.

In [219]:
# Draft to check fields: re-fetch the vacancy held in `row` (left over from the
# collection loop) and print one field of the full payload.
# The timeout keeps the cell from hanging on a stalled connection.
response = requests.get(f"{url_vacancies}/{row['vacancy_id']}", timeout=30)

if response.status_code == 200:
    vac = response.json()
    print(vac['alternate_url'])
else:
    # Report the failure instead of finishing silently with no output.
    print('Error:', response.status_code, response.text)

https://hh.ru/vacancy/92767727
